Permalink
Switch branches/tags
Nothing to show
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
257 lines (208 sloc) 9.44 KB
#!/usr/bin/env python3
# -*- coding: utf-8; mode: python -*-
# Module metadata: script version, date stamp and maintainer contact.
# NOTE(review): __date__ looks like YYYYMMDD — confirm.
__version__ = "1.0"
__date__ = "20180321"
__author__ = "kthoden@mpiwg-berlin.mpg.de"
import os
import sys
import argparse
import logging
import re
import pickle
import difflib
import libeoaconvert
import bibtexparser
from bibtexparser.bwriter import BibTexWriter
from bibtexparser.bibdatabase import BibDatabase
# Configure the root logger at import time: emit everything from DEBUG
# upwards, each record prefixed with a timestamp and its level name.
logging.basicConfig(level=logging.DEBUG, format=' %(asctime)s - %(levelname)s - %(message)s')
# Directory that receives the files written by this script (prelim.bib,
# fixed.bib, unassigned_formatted_entries.txt). The path contains no "~",
# so expanduser() returns it unchanged and it is resolved relative to the
# current working directory.
TMP_DIR = os.path.expanduser("tmp_files")
def write_report(report):
    """Print a summary of the matching run to standard output.

    Args:
        report: Mapping with these keys:

            * ``found_formatted`` - number of entries found in the formatted
              bibliography (printed as-is).
            * ``found_citekeys`` - number of citekeys found in the
              publication (printed as-is).
            * ``unmatched`` - sequence of citekeys for which no formatted
              entry was assigned.
            * ``formatted_notremoved`` - sequence of formatted entries that
              were not assigned to any citekey.

    Raises:
        KeyError: If one of the keys above is missing from ``report``.
    """
    rule = "=" * 60
    indent = " " * 4
    unmatched = report["unmatched"]
    leftover = report["formatted_notremoved"]

    print(rule)
    print(indent, "Report")
    print("-" * 60)
    print("Number of entries in the formatted bibliography: {}".format(report["found_formatted"]))
    print("Number of unique citekeys in the publication: {}".format(report["found_citekeys"]))

    if unmatched:
        print("{} {} found in the publication could not be matched to entries from the formatted bibliography.".format(len(unmatched), libeoaconvert.plural(len(unmatched), "citekey")))
        for item in unmatched:
            print(indent, item.rstrip())
    else:
        print("All citekeys found in the publication could be linked successfully.")

    if leftover:
        print("{} {} in the formatted bibliography could not be matched to a citekey found in the publication:".format(len(leftover), libeoaconvert.plural(len(leftover), "citation")))
        for item in leftover:
            # Formatted entries are raw lines read from the bibliography file
            # and still end in a newline; strip it so that print() does not
            # emit an empty line after every entry.
            print(indent, item.rstrip())
    else:
        print("All entries from the formatted bibliography could be matched to a citekey.")

    print(rule)
# def write_report ends here
def tmp_citations(formatted_bibliography):
    """Create temporary citekeys out of a formatted bibliography.

    Every non-blank line of the file is treated as one formatted citation.
    A line of the shape ``Author, ... (Year) ...`` gets the temporary citekey
    ``Author_Year``, where ``Author`` is the text before the first comma and
    ``Year`` is a four-digit year starting with 10-19 or 20 (optionally
    followed by one lowercase letter) or a word starting with ``forth``.
    Lines that do not have this shape are stored under ``None_001``,
    ``None_002``, ...

    If a temporary citekey is already taken, ``_2``, ``_3``, ... is appended
    so that no citation is silently overwritten.

    Args:
        formatted_bibliography: Path of a text file with one formatted
            citation per line.

    Returns:
        A dictionary mapping each temporary citekey to the full citation
        line exactly as read from the file (including its line ending).
    """
    # The year alternatives live inside the named group, so both parts of the
    # key can be fetched by name.
    authoryear_pattern = re.compile(
        r"^(?P<author>.+?),.+?"
        r"\((?P<year>(?:1[0-9]|20)[0-9]{2}[a-z]?|forth.*?)\).*?$"
    )
    tmp_citekey_dict = {}
    unparsed_count = 0

    with open(formatted_bibliography, "r") as formbib:
        for citation in formbib:
            # Whitespace-only lines carry no citation.
            if not citation.strip():
                logging.debug("Skipping blank line")
                continue

            logging.debug("Creating temporary citekey from %s.", citation)
            matches = authoryear_pattern.match(citation)
            if matches is None:
                unparsed_count += 1
                base_key = "None_{:03d}".format(unparsed_count)
            else:
                base_key = "{}_{}".format(matches.group("author"), matches.group("year"))

            # Same author and year may occur more than once; keep every
            # citation by giving later ones a numeric suffix.
            tmp_citekey = base_key
            occurrence = 1
            while tmp_citekey in tmp_citekey_dict:
                occurrence += 1
                tmp_citekey = "{}_{}".format(base_key, occurrence)
            if occurrence > 1:
                logging.warning("Temporary citekey %s already in use, storing citation as %s.", base_key, tmp_citekey)

            tmp_citekey_dict[tmp_citekey] = citation

    return tmp_citekey_dict
# def tmp_citations ends here
def fix_bibentry(bib_entries):
    """Interactively correct the entry type of BibTeX entries.

    For every entry that carries a formatted citation in its ``comments``
    field, the citation and the current entry type are shown and the user
    may pick a new type by its one-letter shortcut. Pressing Enter keeps the
    current type; any other input is reported as wrong and also keeps it.
    Entries without ``comments`` are not presented but are still written.

    A non-empty ``keyword`` field is renamed to ``keywords``.

    All entries are written to ``fixed.bib`` inside ``TMP_DIR``.

    Args:
        bib_entries: List of entry dictionaries as produced by bibtexparser.
            The dictionaries are modified in place.
    """
    available_types = {
        "a": "article",
        "i": "incollection",
        "m": "misc",
        "b": "book",
        "k": "booklet",
        "r": "report",
        "t": "thesis",
        "p": "inproceedings",
        "n": "newspaper",
    }

    fixed_entries = []
    for entry in bib_entries:
        # Depending on the parser settings the field arrives as "keyword" or
        # "keywords"; it may also be missing, so never index it directly.
        if entry.get("keyword"):
            entry["keywords"] = entry.pop("keyword")

        formatted = entry.get("comments", "")
        if not formatted:
            logging.info("Skipping {}".format(entry["ID"]))
        else:
            print("{}\nEntry type is currently set to _{}_. This is the formatted entry:\n\n{}\n\nDo you want to change the entry type?".format("="*20, entry["ENTRYTYPE"], formatted))
            for shortcut, type_name in available_types.items():
                print(shortcut, " -> ", type_name)
            entrytype = input("Your selection: ")
            if entrytype in available_types:
                entry["ENTRYTYPE"] = available_types[entrytype]
            elif entrytype != "":
                # Empty input means "keep the current type"; anything else
                # is an unknown shortcut.
                print("Wrong key. Continuing.")

        # Skipped entries are kept so that the output file is complete.
        fixed_entries.append(entry)

    # Make sure the target directory exists when called on its own.
    os.makedirs(TMP_DIR, exist_ok=True)
    bibfile_path = TMP_DIR + os.path.sep + "fixed.bib"

    db = BibDatabase()
    db.entries = fixed_entries
    writer = BibTexWriter()
    writer.indent = ' ' * 4    # indent entries with 4 spaces instead of one
    with open(bibfile_path, 'w') as fakebibfile:
        fakebibfile.write(writer.write(db))
    logging.debug("Fixed the entry types and wrote %s.", bibfile_path)
# def fix_bibentry ends here
_NO_CANDIDATE = "No candidate found in formatted bibliography"


def _normalise_citekeys(raw_citekeys):
    """Strip whitespace, drop blank items and duplicates, keeping the order."""
    seen = set()
    cleaned = []
    for raw in raw_citekeys:
        citekey = raw.strip()
        if citekey and citekey not in seen:
            seen.add(citekey)
            cleaned.append(citekey)
    return cleaned


def _choose_candidate(candidates, citekey_dict):
    """Ask the user which candidate, if any, fits the current citekey.

    Returns the chosen temporary citekey, or None if there was no candidate
    or the user declined all of them.
    """
    if not candidates:
        return None

    if len(candidates) == 1:
        sys.stdout.write("Found only one match: {}\n".format(citekey_dict[candidates[0]]))
        yesno = input("""Assign that entry to the found citekey? [Y/n] """)
        return candidates[0] if yesno.lower() in ("y", "") else None

    sys.stdout.write("Found \n")
    for number, candidate in enumerate(candidates, start=1):
        # The stored citation still ends in its newline.
        sys.stdout.write("%s. %s" % (number, citekey_dict[candidate]))

    # Keep asking until the answer is usable; a typo must not crash the
    # session or leave the entry in an undefined state.
    while True:
        answer = input("""Does one of those (1 - %s) fit?\nTo abort, press 'n': """ % (len(candidates), ))
        print("The answer is", answer)
        if answer.strip().lower() == 'n':
            return None
        try:
            position = int(answer)
        except ValueError:
            position = 0
        if 1 <= position <= len(candidates):
            return candidates[position - 1]
        print("Please enter a number between 1 and %s, or 'n'." % len(candidates))


def main():
    """Match citekeys to formatted citations and write a preliminary bib file.

    The formatted bibliography given on the command line is turned into
    temporary citekeys. Each citekey of the publication (read from the file
    given with ``--citekeys``, otherwise from ``data.pickle`` in ``TMP_DIR``)
    is compared against them with ``difflib`` and the user is asked to
    confirm or pick a match. For every citekey a stub BibTeX entry is
    created: keyword ``FIXME`` and the formatted citation in ``comments``
    when a match was accepted, keyword ``FILLMEIN`` otherwise.

    Files written to ``TMP_DIR``: ``unassigned_formatted_entries.txt`` (the
    formatted citations that were not assigned) and ``prelim.bib``. With
    ``--fix_entry`` the bib file is read back and handed to ``fix_bibentry``.

    Citekeys must have the form ``author_year``; if any does not, the
    offending keys are listed and the script exits with status 1 before
    asking anything.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-k", "--keyword", help="Assign a keyword.")
    parser.add_argument("-c", "--citekeys", help="Specify a file containing one citekey per line. Otherwise use data gathered by fix_tei.")
    parser.add_argument("-f", "--fix_entry", help="In a second round, assign the correct entry type.", action="store_true")
    parser.add_argument("formattedbib", help="A text file containing a formatted bibliography.")
    args = parser.parse_args()

    if not os.path.exists(TMP_DIR):
        os.makedirs(TMP_DIR)

    citekey_dict = tmp_citations(args.formattedbib)

    if args.citekeys is not None:
        with open(args.citekeys, "r") as bf:
            entries = _normalise_citekeys(bf)
    else:
        # NOTE: unpickling can execute arbitrary code. Only use a data.pickle
        # that was produced locally by a trusted run.
        with open(os.path.join(TMP_DIR, "data.pickle"), 'rb') as f:
            data = pickle.load(f)
        entries = _normalise_citekeys(data["citekey_not_in_bib"])

    # Validate before the interactive part so that a malformed citekey does
    # not throw away answers the user has already given.
    malformed = [citekey for citekey in entries if len(citekey.split("_")) != 2]
    if malformed:
        for citekey in malformed:
            print("Could not split %s. Please fix." % citekey)
        sys.exit(1)

    report = {
        "found_formatted": len(citekey_dict),
        "found_citekeys": len(entries),
    }

    removed_items = []
    unmatched_citekeys = []
    db_list = []

    for number, citekey in enumerate(entries, start=1):
        candidates = difflib.get_close_matches(citekey, list(citekey_dict))
        sys.stdout.write("{} Citekey {} of {} {}\nLooking at {}.\n".format("="*10, number, len(entries), "="*10, citekey))

        chosen_citekey = _choose_candidate(candidates, citekey_dict)
        if chosen_citekey is None:
            if not candidates:
                sys.stdout.write("No candidate found in formatted bibliography for citekey %s.\n" % citekey)
            unmatched_citekeys.append(citekey)
            annotation = _NO_CANDIDATE
            entry_keyword = "FILLMEIN"
        else:
            # An assigned citation is no longer available for other citekeys.
            annotation = citekey_dict.pop(chosen_citekey)
            removed_items.append(annotation)
            entry_keyword = "FIXME"

        if args.keyword is not None:
            if args.keyword in entry_keyword.split(", "):
                # Nothing to add, but the entry itself must still be written.
                print("keyword already in ")
            else:
                print("option keyword")
                entry_keyword = entry_keyword + ", " + args.keyword

        author, year = citekey.split("_")
        db_list.append({
            "keywords": entry_keyword,
            "ENTRYTYPE": "book",
            "title": "Faketitle",
            "comments": annotation.rstrip(),
            "ID": citekey,
            "author": author,
            "year": year,
        })
        print("Removed items: {}".format(len(removed_items)))

    # Assigned citations were popped above, so what is left is unassigned.
    not_removed = list(citekey_dict.values())
    report["unmatched"] = unmatched_citekeys
    report["formatted_removed"] = removed_items
    report["formatted_notremoved"] = not_removed

    formatted_not_found_path = TMP_DIR + os.path.sep + "unassigned_formatted_entries.txt"
    with open(formatted_not_found_path, 'w') as unassigned:
        # The citations still carry their own line ending; strip it so the
        # join does not produce empty lines between entries.
        unassigned.write("\n".join(line.rstrip("\r\n") for line in not_removed))

    bibfile_path = TMP_DIR + os.path.sep + "prelim.bib"
    write_report(report)

    db = BibDatabase()
    db.entries = db_list
    writer = BibTexWriter()
    writer.indent = ' ' * 4    # indent entries with 4 spaces instead of one
    with open(bibfile_path, 'w') as fakebibfile:
        fakebibfile.write(writer.write(db))
    logging.debug("Wrote a preliminary bibtex file")

    if args.fix_entry:
        with open(bibfile_path) as btf:
            btb = bibtexparser.load(btf)
        fix_bibentry(btb.entries)
    else:
        logging.debug("Finished")
# def main ends here
# Run the command-line interface only when executed as a script,
# not when the module is imported.
if __name__ == "__main__":
    main()
# finis