#!/usr/bin/env python3
# -*- coding: utf-8; mode: python -*-
"""Create a preliminary BibTeX file from a formatted bibliography.

Each line of a plain-text formatted bibliography is given a temporary
``Author_Year`` key.  The citekeys of a publication are then matched
interactively against those temporary keys, and one stub BibTeX entry
per citekey is written to ``prelim.bib`` inside ``TMP_DIR``.  With
``--fix_entry`` a second interactive round lets the user correct the
entry types and writes ``fixed.bib``.
"""

__version__ = "1.0"
__date__ = "20180321"
__author__ = "kthoden@mpiwg-berlin.mpg.de"

import os
import sys
import argparse
import logging
import re
import pickle
import difflib

import bibtexparser
from bibtexparser.bwriter import BibTexWriter
from bibtexparser.bibdatabase import BibDatabase

import libeoaconvert

logging.basicConfig(level=logging.DEBUG, format=' %(asctime)s - %(levelname)s - %(message)s')

TMP_DIR = os.path.expanduser("tmp_files")

# "<author>, ... (<year>) ..." where <year> is a four-digit year between
# 1000 and 2099 with an optional letter suffix, or a word starting with
# "forth" (e.g. "forthcoming").
AUTHORYEAR_PATTERN = re.compile(
    r"^(?P<author>.+?),.+?\((?P<year>(?:1[0-9]|20)[0-9]{2}[a-z]?|forth.*?)\).*?$"
)


def write_report(report):
    """Print the matching statistics to standard output.

    Args:
        report: dictionary with the keys ``found_formatted`` and
            ``found_citekeys`` (counts), ``unmatched`` (citekeys without
            any candidate) and ``formatted_notremoved`` (formatted
            citations that were not assigned to a citekey).
    """
    indent = ' ' * 4
    unmatched = report["unmatched"]
    not_removed = report["formatted_notremoved"]

    print("=" * 60)
    print(indent, "Report")
    print("-" * 60)

    print("Number of entries in the formatted bibliography: {}".format(report["found_formatted"]))
    print("Number of unique citekeys in the publication: {}".format(report["found_citekeys"]))

    if unmatched:
        print("{} {} could not be matched to entries from the formatted bibliography.".format(
            len(unmatched), libeoaconvert.plural(len(unmatched), "citekey")))
        for item in unmatched:
            print(indent, item)
    else:
        print("All citekeys found in the publication could be linked successfully.")

    if not_removed:
        print("{} {} in the formatted bibliography could not be matched to a citekey found in the publication:".format(
            len(not_removed), libeoaconvert.plural(len(not_removed), "citation")))
        for item in not_removed:
            print(indent, item)
    else:
        print("All entries from the formatted bibliography could be matched to a citekey.")
    print("=" * 60)
# def write_report ends here


def tmp_citations(formatted_bibliography):
    """Create temporary citekeys out of a formatted bibliography.

    Every non-blank line of the file is treated as one citation.  Lines
    matching ``AUTHORYEAR_PATTERN`` get the key ``<author>_<year>``; all
    other lines get a running ``None_001``, ``None_002``, ... key.

    Args:
        formatted_bibliography: path of a text file with one formatted
            citation per line.

    Returns:
        A dictionary mapping temporary citekey to the full citation line
        (including its line ending).  If two citations produce the same
        key, the later one replaces the earlier one and a warning is
        logged.
    """
    tmp_citekey_dict = {}
    nonematcher = 1

    with open(formatted_bibliography, "r", encoding="utf-8") as formbib:
        for citation in formbib:
            if not citation.strip():
                # A blank line is not a citation.
                continue

            matches = AUTHORYEAR_PATTERN.match(citation)
            if matches is None:
                tmp_citekey = "None_{:03d}".format(nonematcher)
                nonematcher += 1
            else:
                tmp_citekey = "{}_{}".format(matches.group("author"), matches.group("year"))
                if tmp_citekey in tmp_citekey_dict:
                    logging.warning("Temporary citekey %s occurs more than once; "
                                    "keeping only the last citation.", tmp_citekey)

            tmp_citekey_dict[tmp_citekey] = citation

    return tmp_citekey_dict
# def tmp_citations ends here


def _write_bibfile(entries, bibfile_path):
    """Serialise a list of entry dictionaries as BibTeX to bibfile_path."""
    db = BibDatabase()
    db.entries = entries

    writer = BibTexWriter()
    writer.indent = '    '  # indent entries with 4 spaces instead of one
    with open(bibfile_path, 'w', encoding="utf-8") as bibfile:
        bibfile.write(writer.write(db))
# def _write_bibfile ends here


def fix_bibentry(bib_entries):
    """Fix the entry types in interactive mode.

    For every entry that carries a non-empty ``comments`` field (the
    formatted citation), the user is asked whether the entry type should
    be changed.  An empty answer keeps the current type.  The resulting
    entries are written to ``fixed.bib`` inside ``TMP_DIR``.

    Args:
        bib_entries: list of entry dictionaries as provided by
            bibtexparser.  The dictionaries are modified in place.
    """
    available_types = {"a" : "article", "i" : "incollection", "m" :
                       "misc", "b" : "book", "k" : "booklet", "r" : "report", "t" :
                       "thesis", "p" : "inproceedings", "n" : "newspaper"}

    db_list = []

    for entry in bib_entries:
        # Some bibtexparser configurations rename "keywords" to "keyword"
        # on loading; undo that.  The field may also be absent altogether.
        keyword = entry.pop("keyword", "")
        if keyword:
            entry["keywords"] = keyword

        comments = entry.get("comments", "")
        if not comments:
            logging.info("Skipping %s", entry["ID"])
        else:
            print("{}\nEntry type is currently set to _{}_. This is the formatted entry:\n\n{}\n\nDo you want to change the entry type?".format("="*20, entry["ENTRYTYPE"], comments))
            for shortcut, type_name in available_types.items():
                print(shortcut, " -> ", type_name)
            selection = input("Your selection: ").strip()

            if selection in available_types:
                entry["ENTRYTYPE"] = available_types[selection]
            elif selection:
                # An empty answer silently keeps the current type.
                print("Wrong key. Continuing.")

        db_list.append(entry)

    bibfile_path = os.path.join(TMP_DIR, "fixed.bib")
    _write_bibfile(db_list, bibfile_path)
    logging.debug("Fixed the entry types and wrote %s.", bibfile_path)
# def fix_bibentry ends here


def _load_citekeys(citekeys_path):
    """Return the list of citekeys, stripped of whitespace and de-duplicated.

    Reads one citekey per line from citekeys_path, or, if that is None,
    the ``notcited`` list from ``data.pickle`` inside ``TMP_DIR``.
    """
    if citekeys_path is not None:
        with open(citekeys_path, "r", encoding="utf-8") as bf:
            stripped = (line.strip() for line in bf)
            # dict.fromkeys drops duplicates while keeping the file order.
            return list(dict.fromkeys(key for key in stripped if key))

    # NOTE(security): pickle.load executes arbitrary code from the file.
    # Only use a data.pickle that was produced locally by a trusted tool.
    with open(os.path.join(TMP_DIR, "data.pickle"), "rb") as f:
        data = pickle.load(f)
    # Sorted so that the interactive session has a reproducible order.
    return sorted({key.strip() for key in data["notcited"] if key.strip()})
# def _load_citekeys ends here


def _choose_candidate(candidates, citekey_dict):
    """Ask the user which candidate, if any, fits the current citekey.

    Args:
        candidates: temporary citekeys proposed by difflib.
        citekey_dict: mapping of temporary citekey to formatted citation.

    Returns:
        The chosen temporary citekey, or None if there was no candidate
        or the user declined.
    """
    if not candidates:
        sys.stdout.write("No candidate found.\n")
        return None

    if len(candidates) == 1:
        sys.stdout.write("Found only one match: {}\n".format(citekey_dict[candidates[0]]))
        yesno = input("""Assign that entry to the found citekey? [Y/n]""")
        logging.debug("input %s", yesno)
        if yesno.strip().lower() in ("y", ""):
            return candidates[0]
        return None

    sys.stdout.write("Found %s candidates:\n" % len(candidates))
    for count, candidate in enumerate(candidates, start=1):
        # The citation still carries its own line ending.
        sys.stdout.write("%s. %s" % (count, citekey_dict[candidate]))

    while True:
        answer = input("""Does one of those (1 - %s) fit?\nTo abort, just press Enter: """ % (len(candidates), )).strip()
        if not answer:
            return None
        try:
            choice = int(answer)
        except ValueError:
            choice = 0
        if 1 <= choice <= len(candidates):
            return candidates[choice - 1]
        print("Please enter a number between 1 and %s." % len(candidates))
# def _choose_candidate ends here


def main():
    """Match citekeys to formatted citations and write ``prelim.bib``.

    Parses the command line, builds the temporary citekeys, walks the
    user through the matching, prints a report and writes the
    preliminary BibTeX file.  With ``--fix_entry`` the file is read back
    and passed to fix_bibentry().

    Exits with status 1 if a citekey is not of the form ``Author_Year``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-k", "--keyword", help="Assign a keyword.")
    parser.add_argument("-c", "--citekeys", help="Specify a file containing one citekey per line. Otherwise use data gathered by fix_tei.")
    parser.add_argument("-f", "--fix_entry", help="In a second round, assign the correct entry type.", action="store_true")
    parser.add_argument("formattedbib", help="A text file containing a formatted bibliography.")
    args = parser.parse_args()

    os.makedirs(TMP_DIR, exist_ok=True)

    citekey_dict = tmp_citations(args.formattedbib)
    entries = _load_citekeys(args.citekeys)

    # Validate before the interactive session starts, so that a malformed
    # citekey does not throw away answers the user has already given.
    malformed = [key for key in entries if key.count("_") != 1]
    if malformed:
        for key in malformed:
            print("Could not split %s. Please fix." % key)
        sys.exit(1)

    report = {
        "found_formatted": len(citekey_dict),
        "found_citekeys": len(entries),
    }

    removed_items = []
    unmatched_citekeys = []
    db_list = []
    total = len(entries)

    for number, citekey in enumerate(entries, start=1):
        sys.stdout.write("{} Citekey {} of {} {}\nLooking at {}.\n".format("="*10, number, total, "="*10, citekey))

        # Only citations that have not been assigned yet are offered.
        candidates = difflib.get_close_matches(citekey, list(citekey_dict))
        if not candidates:
            unmatched_citekeys.append(citekey)

        chosen_citekey = _choose_candidate(candidates, citekey_dict)
        if chosen_citekey is None:
            annotation = ""
        else:
            annotation = citekey_dict.pop(chosen_citekey)
            logging.debug("Removed item: %s", annotation.rstrip())
            removed_items.append(annotation)

        author, year = citekey.split("_")
        entrydict = {}
        if args.keyword is not None:
            entrydict["keywords"] = args.keyword
        entrydict["ENTRYTYPE"] = "book"
        entrydict["title"] = "Faketitle"
        entrydict["comments"] = annotation.rstrip()
        entrydict["ID"] = citekey
        entrydict["author"] = author
        entrydict["year"] = year

        print("Removed items: {}".format(len(removed_items)))
        db_list.append(entrydict)

    report["unmatched"] = unmatched_citekeys
    report["formatted_removed"] = removed_items
    report["formatted_notremoved"] = [x for x in citekey_dict.values() if x not in removed_items]

    write_report(report)

    bibfile_path = os.path.join(TMP_DIR, "prelim.bib")
    _write_bibfile(db_list, bibfile_path)
    logging.debug("Wrote a preliminary bibtex file")

    if args.fix_entry:
        with open(bibfile_path, encoding="utf-8") as btf:
            btb = bibtexparser.load(btf)
        fix_bibentry(btb.entries)
    else:
        logging.debug("Finished")
# def main ends here


if __name__ == '__main__':
    main()
# finis