Permalink
Switch branches/tags
Nothing to show
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
311 lines (255 sloc) 11.9 KB
#!/usr/bin/env python3
# -*- coding: utf-8; mode: python -*-
"""A helper script that creates a temporary bibtex file from a
formatted list of references.
In cases where authors hand in a formatted version of the bibliography
(rather than a reference database), this tool can help create a
database in BibTeX format.
It is being used in conjunction with the DocX workflow where citekeys
are already present in the source file. The script receives a list of
formatted references as input (one entry per line). Optionally, a
keyword can be assigned, for example the name of the chapter author,
or the entry type can be fixed.
Because we require authors to use shortcuts in their docx manuscript
when citing, including the use of a citekey (`LASTNAME_YEAR`), there
should already be citekeys in the manuscript. When running `fix_tei.py`,
these citekeys are gathered together and can be used as an input to
this tool.
The tool creates temporary citekeys out of the formatted bibliography
and in an interactive session, the user selects the most likely entry.
With this, rudimentary entries can be created.
"""
# Module metadata: script version, date string and maintainer contact.
__version__ = "1.0"
__date__ = "20180321"
__author__ = "kthoden@mpiwg-berlin.mpg.de"
import os
import sys
import argparse
import logging
import re
import pickle
import difflib
import libeoaconvert
import bibtexparser
from bibtexparser.bwriter import BibTexWriter
from bibtexparser.bibdatabase import BibDatabase
# Configure the root logger: emit everything from DEBUG upwards, each record
# prefixed with a timestamp and its level name.
logging.basicConfig(level=logging.DEBUG, format=' %(asctime)s - %(levelname)s - %(message)s')
# Directory that receives the files written by this script. The path does
# not start with "~", so expanduser() returns it unchanged and it is
# resolved relative to the current working directory.
TMP_DIR = os.path.expanduser("tmp_files")
def write_report(report):
    """Print a summary of the matching run to standard output.

    ``report`` is a dictionary. ``found_formatted`` and
    ``found_citekeys`` are printed as counts; ``unmatched`` and
    ``formatted_notremoved`` are collections that are either listed item
    by item or, when empty, acknowledged with a success message.
    """
    frame = "=" * 60
    pad = " " * 4

    print(frame)
    print(pad, "Report")
    print("-" * 60)
    print("Number of entries in the formatted bibliography: {}".format(report["found_formatted"]))
    print("Number of unique citekeys in the publication: {}".format(report["found_citekeys"]))

    # Citekeys from the publication that found no partner.
    orphan_keys = report["unmatched"]
    key_count = len(orphan_keys)
    if key_count == 0:
        print("All citekeys found in the publication could be linked successfully.")
    else:
        print("{} {} found in the publication could not be matched to entries from the formatted bibliography.".format(key_count, libeoaconvert.plural(key_count, "citekey")))
        for orphan in orphan_keys:
            print(pad, orphan)

    # Formatted citations that were never assigned to a citekey.
    orphan_citations = report["formatted_notremoved"]
    citation_count = len(orphan_citations)
    if citation_count == 0:
        print("All entries from the formatted bibliography could be matched to a citekey.")
    else:
        print("{} {} in the formatted bibliography could not be matched to a citekey found in the publication:".format(citation_count, libeoaconvert.plural(citation_count, "citation")))
        for orphan in orphan_citations:
            print(pad, orphan)

    print(frame)
# def write_report ends here
def tmp_citations(formatted_bibliography):
    """Create temporary citekeys out of a formatted bibliography.

    The file is read line by line, one formatted reference per line.
    From each reference the text before the first comma (the author) and
    the parenthesised year (a four-digit year starting with 1 or 20,
    optionally followed by a lowercase letter, or a word beginning with
    ``forth``) are combined into a key of the form ``AUTHOR_YEAR``.

    Args:
        formatted_bibliography: path of the text file to read.

    Returns:
        A dictionary mapping each temporary citekey to the full citation
        line (including its line ending). References that do not fit the
        author/year pattern are stored under ``None_001``, ``None_002``,
        and so on. When several references yield the same key, the later
        ones get a ``-2``, ``-3``, ... suffix so that none is lost.
    """
    # The year alternatives live inside the named group, so both parts of
    # the key can be addressed by name.
    authoryear_pattern = re.compile(
        r"^(?P<author>.+?),.+?"
        r"\((?P<year>(?:1[0-9]|20)[0-9]{2}[a-z]?|forth.*?)\).*?$"
    )
    tmp_citekey_dict = {}
    nonematcher = 1

    # utf-8-sig reads plain UTF-8 as well and drops a leading byte order
    # mark, which would otherwise become part of the first author name.
    with open(formatted_bibliography, "r", encoding="utf-8-sig") as formbib:
        for citation in formbib:
            if len(citation) <= 2 or not citation.strip():
                logging.debug("Skipping blank line")
                continue

            logging.debug("Creating temporary citekey from %s.", citation)
            matches = authoryear_pattern.match(citation)
            if matches is None:
                tmp_citekey = "None_{:03d}".format(nonematcher)
                nonematcher += 1
            else:
                tmp_citekey = "{}_{}".format(matches.group("author"), matches.group("year"))
                if tmp_citekey in tmp_citekey_dict:
                    # Same author and year as an earlier reference: keep
                    # both instead of overwriting the first one.
                    suffix = 2
                    while "{}-{}".format(tmp_citekey, suffix) in tmp_citekey_dict:
                        suffix += 1
                    unique_citekey = "{}-{}".format(tmp_citekey, suffix)
                    logging.warning("Temporary citekey %s already in use, storing entry as %s.", tmp_citekey, unique_citekey)
                    tmp_citekey = unique_citekey

            tmp_citekey_dict[tmp_citekey] = citation

    return tmp_citekey_dict
# def tmp_citations ends here
def fix_bibentry(bib_entries):
    """Interactively revise the entry type of bibliography entries.

    For every entry with a non-empty ``comments`` field (the formatted
    citation), the citation and the current entry type are shown and the
    user may choose a new type by its one-letter shortcut. Pressing
    return keeps the current type; any other input is reported and the
    type is left unchanged. Entries without ``comments`` are not shown
    to the user but are still written.

    A non-empty ``keyword`` field is renamed to ``keywords``. Entries are
    modified in place, and all of them are written to ``fixed.bib``
    inside ``TMP_DIR``.

    Args:
        bib_entries: iterable of entry dictionaries with at least the
            keys ``ID`` and ``ENTRYTYPE``.
    """
    available_types = {
        "a": "article",
        "i": "incollection",
        "m": "misc",
        "b": "book",
        "k": "booklet",
        "r": "report",
        "t": "thesis",
        "p": "inproceedings",
        "n": "newspaper",
    }

    db_list = []
    for entry in bib_entries:
        # Depending on how the entries were parsed, the field may already
        # be called "keywords", so a missing "keyword" is not an error.
        if entry.get("keyword"):
            entry["keywords"] = entry.pop("keyword")

        comments = entry.get("comments", "")
        if not comments:
            logging.info("Skipping {}".format(entry["ID"]))
        else:
            print("{}\nEntry type is currently set to _{}_. This is the formatted entry:\n\n{}\n\nDo you want to change the entry type?".format("="*20, entry["ENTRYTYPE"], comments))
            for shortcut, type_name in available_types.items():
                print(shortcut, " -> ", type_name)
            entrytype = input("Your selection: ")
            if entrytype in available_types:
                entry["ENTRYTYPE"] = available_types[entrytype]
            elif entrytype != "":
                # An empty answer silently keeps the current type; anything
                # else is reported but the type is left unchanged as well.
                print("Wrong key. Continuing.")
        db_list.append(entry)

    # Make sure the output directory exists when called on its own.
    os.makedirs(TMP_DIR, exist_ok=True)
    bibfile_path = os.path.join(TMP_DIR, "fixed.bib")

    db = BibDatabase()
    db.entries = db_list
    writer = BibTexWriter()
    writer.indent = ' ' * 4  # indent entries with 4 spaces instead of one
    with open(bibfile_path, 'w', encoding="utf-8") as fakebibfile:
        fakebibfile.write(writer.write(db))
    logging.debug("Fixed the entry types and wrote %s.", bibfile_path)
# def fix_bibentry ends here
_NO_CANDIDATE = "No candidate found in formatted bibliography"


def _pick_candidate(candidates, citekey_dict):
    """Ask the user which candidate fits; return its temporary key or None."""
    if len(candidates) == 1:
        only = candidates[0]
        sys.stdout.write("Found only one match: {}\n".format(citekey_dict[only]))
        yesno = input("""Assign that entry to the found citekey? [Y/n] """)
        return only if yesno.strip().lower() in ("y", "yes", "") else None

    sys.stdout.write("Found \n")
    for number, candidate in enumerate(candidates, start=1):
        # The citation text still carries its own line ending.
        sys.stdout.write("%s. %s" % (number, citekey_dict[candidate]))

    # Keep asking until the answer is 'n' or a valid candidate number.
    while True:
        answer = input("""Does one of those (1 - %s) fit?\nTo abort, press 'n': """ % (len(candidates), )).strip()
        print("The answer is", answer)
        if answer.lower() == 'n':
            return None
        try:
            selection = int(answer)
        except ValueError:
            selection = 0
        if 1 <= selection <= len(candidates):
            return candidates[selection - 1]
        print("Please enter a number between 1 and %s, or 'n'." % len(candidates))


def _make_entry(citekey, annotation, entry_keyword, extra_keyword=None):
    """Build the rudimentary BibTeX entry dictionary for one citekey."""
    if extra_keyword is not None:
        if extra_keyword in entry_keyword.split(", "):
            print("keyword already in ")
        else:
            print("option keyword")
            entry_keyword = entry_keyword + ", " + extra_keyword

    try:
        author, year = citekey.split("_")
    except ValueError:
        print("Could not split %s. Please fix." % citekey)
        sys.exit(1)

    return {
        "keywords": entry_keyword,
        "ENTRYTYPE": "book",
        "title": "Faketitle",
        "comments": annotation.rstrip(),
        "ID": citekey,
        "author": author,
        "year": year,
    }


def main():
    """Create a preliminary BibTeX file from a formatted bibliography.

    The citekeys to process come from one of three sources:

    * ``--makekeyfile``: the temporary citekeys derived from the
      formatted bibliography are used directly, without interaction;
    * ``--citekeys FILE``: one citekey per line is read from FILE;
    * otherwise ``data.pickle`` in ``TMP_DIR`` is loaded and its
      ``citekey_not_in_bib`` entry is used.

    Unless ``--makekeyfile`` is given, each citekey is compared with the
    temporary citekeys and the user is asked to confirm or choose the
    matching formatted citation. The result is written to ``prelim.bib``
    and the formatted citations that were not assigned to
    ``unassigned_formatted_entries.txt``, both inside ``TMP_DIR``. A
    report is printed, and with ``--fix_entry`` the entry types are
    revised interactively afterwards.

    Raises:
        SystemExit: if a citekey cannot be split into author and year at
            a single underscore, or if no citekey data is available.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-k", "--keyword", help="Assign a keyword.")
    parser.add_argument("-f", "--fix_entry", help="In a second round, assign the correct entry type.", action="store_true")
    group = parser.add_mutually_exclusive_group()
    group.add_argument("-c", "--citekeys", help="Specify a file containing one citekey per line. Otherwise use data gathered by fix_tei.")
    group.add_argument("-m", "--makekeyfile", help="If there is no citekey file.", action="store_true")
    parser.add_argument("formattedbib", help="A text file containing a formatted bibliography.")
    args = parser.parse_args()

    os.makedirs(TMP_DIR, exist_ok=True)

    citekey_dict = tmp_citations(args.formattedbib)

    # --makekeyfile is a store_true flag: it is False, never None, when
    # absent, so it has to be tested for truth.
    if args.makekeyfile:
        entries = list(citekey_dict)
    elif args.citekeys is not None:
        with open(args.citekeys, "r", encoding="utf-8") as bf:
            # Drop line endings and blank lines so that they neither
            # disturb the matching nor end up in the year field.
            entries = [line.strip() for line in bf if line.strip()]
    else:
        pickle_path = os.path.join(TMP_DIR, "data.pickle")
        try:
            # NOTE: pickle can execute arbitrary code while loading; only
            # use a data.pickle produced by a trusted run of fix_tei.
            with open(pickle_path, 'rb') as f:
                data = pickle.load(f)
        except FileNotFoundError:
            sys.exit("No citekey data found at {}. Run fix_tei first or use --citekeys or --makekeyfile.".format(pickle_path))
        entries = list(set(data["citekey_not_in_bib"]))

    # Taken before the loop, which removes assigned citations again.
    report = {
        "found_formatted": len(citekey_dict),
        "found_citekeys": len(entries),
    }

    removed_items = []
    unmatched_citekeys = []
    db_list = []
    total = len(entries)

    for position, e in enumerate(entries, start=1):
        citekey = e.rstrip()

        if args.makekeyfile:
            annotation = citekey_dict[e]
            entry_keyword = "FILLMEIN"
            db_list.append(_make_entry(citekey, annotation, entry_keyword, args.keyword))
            continue

        sys.stdout.write("{} Citekey {} of {} {}\nLooking at {}.\n".format("="*10, position, total, "="*10, citekey))
        candidates = difflib.get_close_matches(citekey, list(citekey_dict))

        if candidates:
            chosen_citekey = _pick_candidate(candidates, citekey_dict)
        else:
            sys.stdout.write("No candidate found in formatted bibliography for citekey %s.\n" % citekey)
            chosen_citekey = None

        if chosen_citekey is None:
            unmatched_citekeys.append(citekey)
            annotation = _NO_CANDIDATE
            entry_keyword = "FILLMEIN"
        else:
            # An assigned citation is no longer offered for later citekeys.
            annotation = citekey_dict.pop(chosen_citekey)
            removed_items.append(annotation)
            entry_keyword = "FIXME"

        entrydict = _make_entry(citekey, annotation, entry_keyword, args.keyword)
        print("Removed items: {}".format(len(removed_items)))
        db_list.append(entrydict)

    not_removed = [x for x in citekey_dict.values() if x not in removed_items]
    report["unmatched"] = unmatched_citekeys
    report["formatted_removed"] = removed_items
    report["formatted_notremoved"] = not_removed

    formatted_not_found_path = os.path.join(TMP_DIR, "unassigned_formatted_entries.txt")
    with open(formatted_not_found_path, 'w', encoding="utf-8") as unassigned:
        unassigned.write("\n".join(not_removed))

    bibfile_path = os.path.join(TMP_DIR, "prelim.bib")

    write_report(report)

    db = BibDatabase()
    db.entries = db_list
    writer = BibTexWriter()
    writer.indent = ' ' * 4  # indent entries with 4 spaces instead of one
    with open(bibfile_path, 'w', encoding="utf-8") as fakebibfile:
        fakebibfile.write(writer.write(db))
    logging.debug("Wrote a preliminary bibtex file")

    if args.fix_entry:
        with open(bibfile_path, encoding="utf-8") as btf:
            btb = bibtexparser.load(btf)
        fix_bibentry(btb.entries)
    else:
        logging.debug("Finished")
# def main ends here
# Run the command-line tool only when this file is executed as a script.
if __name__ == "__main__":
    main()
# finis