Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
EOASkripts/src/create_tmpbib.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
310 lines (255 sloc)
11.9 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# -*- coding: utf-8; mode: python -*- | |
"""A helper script that creates a temporary bibtex file from a | |
formatted list of references. | |
In cases where authors hand in a formatted version of the bibliography
(rather than a reference database), this tool can help to create a
database in BibTeX format.
It is being used in conjunction with the DocX workflow where citekeys | |
are already present in the source file. The script receives a list of | |
formatted references as input (one entry per line). Optionally, a | |
keyword can be assigned, for example the name of the chapter author, | |
or the entry type can be fixed. | |
We require authors to use shortcuts in their docx manuscript when
citing, including the use of a citekey (`LASTNAME_YEAR`), so there
should already be citekeys in the manuscript. When running `fix_tei.py`,
these citekeys are gathered together and can be used as input to
this tool.
The tool creates temporary citekeys out of the formatted bibliography
and, in an interactive session, the user selects the most likely entry.
With this, rudimentary entries can be created.
""" | |
# Module metadata: release number, date stamp and maintainer contact.
__version__ = '1.0'
__date__ = '20180321'
__author__ = 'kthoden@mpiwg-berlin.mpg.de'
import os | |
import sys | |
import argparse | |
import logging | |
import re | |
import pickle | |
import difflib | |
import libeoaconvert | |
import bibtexparser | |
from bibtexparser.bwriter import BibTexWriter | |
from bibtexparser.bibdatabase import BibDatabase | |
# Emit every message from DEBUG upwards, prefixed with timestamp and level.
logging.basicConfig(
    format=' %(asctime)s - %(levelname)s - %(message)s',
    level=logging.DEBUG,
)

# Directory (relative to the working directory) that receives all
# intermediate and generated files of this script.
TMP_DIR = os.path.expanduser("tmp_files")
def write_report(report):
    """Print the statistics of a matching run to standard output.

    ``report`` is a mapping that must provide the keys
    ``found_formatted`` and ``found_citekeys`` (values are printed as
    they are) as well as ``unmatched`` and ``formatted_notremoved``
    (sized iterables whose items are listed one per line).
    """
    frame = "=" * 60
    indent = ' ' * 4

    def list_leftovers(leftovers, noun, headline, all_clear):
        """Print either the success line or the headline plus every leftover."""
        if len(leftovers) == 0:
            print(all_clear)
            return
        amount = len(leftovers)
        print(headline.format(amount, libeoaconvert.plural(amount, noun)))
        for leftover in leftovers:
            print(indent, leftover)

    print(frame)
    print(indent, "Report")
    print("-" * 60)
    print("Number of entries in the formatted bibliography: {}".format(report["found_formatted"]))
    print("Number of unique citekeys in the publication: {}".format(report["found_citekeys"]))

    # Citekeys from the publication that found no partner in the bibliography.
    list_leftovers(
        report["unmatched"],
        "citekey",
        "{} {} found in the publication could not be matched to entries from the formatted bibliography.",
        "All citekeys found in the publication could be linked successfully.",
    )
    # Bibliography entries that were never assigned to a citekey.
    list_leftovers(
        report["formatted_notremoved"],
        "citation",
        "{} {} in the formatted bibliography could not be matched to a citekey found in the publication:",
        "All entries from the formatted bibliography could be matched to a citekey.",
    )
    print(frame)
# def write_report ends here
def tmp_citations(formatted_bibliography):
    """Create temporary citekeys out of a formatted bibliography.

    The file is read as UTF-8, one reference per line.  For a line of
    the shape ``Lastname, ... (YEAR) ...`` the temporary citekey is
    ``Lastname_YEAR``, where YEAR is a four-digit year between 1000 and
    2099 with an optional letter suffix, or a word starting with
    ``forth`` (e.g. "forthcoming").  Lines that do not fit the pattern
    get the keys ``None_001``, ``None_002``, ...  Lines with fewer than
    two non-whitespace-trimmed characters are skipped.

    If two lines yield the same key, the later one is stored under
    ``KEY-2``, ``KEY-3``, ... so that no reference is silently
    overwritten.

    Args:
        formatted_bibliography: path of the text file to read.

    Returns:
        A dictionary mapping each temporary citekey to the unmodified
        line it was derived from (including its line ending).
    """
    authoryear_pattern = re.compile(
        r"^(?P<author>.+?),.+?\((?P<year>(?:1[0-9]|20)[0-9]{2}[a-z]?|forth.*?)\).*?$"
    )
    tmp_citekey_dict = {}
    unparsed_count = 0

    with open(formatted_bibliography, "r", encoding="utf-8") as formbib:
        for citation in formbib:
            if len(citation.strip()) < 2:
                logging.debug("Skipping blank line")
                continue

            logging.debug("Creating temporary citekey from %s.", citation.rstrip())
            matches = authoryear_pattern.match(citation)
            if matches is None:
                unparsed_count += 1
                tmp_citekey = "None_{:03d}".format(unparsed_count)
            else:
                tmp_citekey = "{}_{}".format(matches.group("author"), matches.group("year"))

            if tmp_citekey in tmp_citekey_dict:
                # Same first author and year as an earlier line: keep both.
                # A dash (not an underscore) is used so that the key still
                # contains exactly one underscore.
                base_key = tmp_citekey
                suffix = 2
                while "{}-{}".format(base_key, suffix) in tmp_citekey_dict:
                    suffix += 1
                tmp_citekey = "{}-{}".format(base_key, suffix)
                logging.warning("Temporary citekey %s already in use, storing entry as %s.", base_key, tmp_citekey)

            tmp_citekey_dict[tmp_citekey] = citation

    return tmp_citekey_dict
# def tmp_citations ends here
def fix_bibentry(bib_entries):
    """Interactively set the entry type of each entry and write ``fixed.bib``.

    For every entry that carries a non-empty ``comments`` field (the
    formatted reference), the current entry type and the formatted
    reference are shown and the user may pick another type by its
    one-letter shortcut.  An empty answer or an unknown shortcut keeps
    the current type.  Entries without ``comments`` are not prompted
    for but are still written to the output.

    Args:
        bib_entries: list of entry dictionaries as found in
            ``BibDatabase.entries``; the dictionaries are modified in
            place.

    Side effects:
        Reads answers from standard input and writes the resulting
        database to ``fixed.bib`` inside ``TMP_DIR``.
    """
    available_types = {"a" : "article", "i" : "incollection", "m" :
                       "misc", "b" : "book", "k" : "booklet", "r" : "report", "t" :
                       "thesis", "p" : "inproceedings", "n" : "newspaper"}

    db_list = []
    for entry in bib_entries:
        # NOTE(review): presumably some bibtexparser configurations rename
        # "keywords" to "keyword" while parsing - confirm. The field is only
        # moved back when it is present, so entries that kept "keywords" no
        # longer raise KeyError.
        if entry.get("keyword"):
            entry["keywords"] = entry.pop("keyword")

        formatted_reference = entry.get("comments", "")
        if not formatted_reference:
            logging.info("Skipping %s", entry.get("ID"))
        else:
            print("{}\nEntry type is currently set to _{}_. This is the formatted entry:\n\n{}\n\nDo you want to change the entry type?".format("="*20, entry["ENTRYTYPE"], formatted_reference))
            for shortcut, type_name in available_types.items():
                print(shortcut, " -> ", type_name)
            entrytype = input("Your selection: ")
            if entrytype in available_types:
                entry["ENTRYTYPE"] = available_types[entrytype]
            elif entrytype != "":
                # An empty answer silently keeps the current type.
                print("Wrong key. Continuing.")
        db_list.append(entry)

    os.makedirs(TMP_DIR, exist_ok=True)
    bibfile_path = os.path.join(TMP_DIR, "fixed.bib")

    db = BibDatabase()
    db.entries = db_list
    writer = BibTexWriter()
    writer.indent = ' ' * 4  # indent entries with 4 spaces instead of one
    with open(bibfile_path, 'w', encoding="utf-8") as fakebibfile:
        fakebibfile.write(writer.write(db))
    logging.debug("Fixed the entry types and wrote %s.", bibfile_path)
# def fix_bibentry ends here
# Annotation stored in the "comments" field when no reference was assigned.
_NO_CANDIDATE_NOTE = "No candidate found in formatted bibliography"


def _build_entry(citekey, annotation, entry_keyword, extra_keyword=None):
    """Return a placeholder BibTeX entry dictionary for ``citekey``.

    ``extra_keyword`` is appended to ``entry_keyword`` unless it is
    already part of it.  Exits the program if ``citekey`` does not
    consist of exactly two parts separated by one underscore.
    """
    if extra_keyword is not None:
        if extra_keyword in entry_keyword.split(", "):
            logging.debug("keyword already in ")
        else:
            logging.debug("option keyword")
            entry_keyword = entry_keyword + ", " + extra_keyword

    try:
        author, year = citekey.split("_")
    except ValueError:
        # Message goes to stderr and the exit status is non-zero.
        sys.exit("Could not split %s. Please fix." % citekey)

    return {
        "keywords": entry_keyword,
        "ENTRYTYPE": "book",
        "title": "Faketitle",
        "comments": annotation.rstrip(),
        "ID": citekey,
        "author": author,
        "year": year,
    }


def _choose_candidate(citekey, candidates, citekey_dict):
    """Ask the user which of ``candidates`` belongs to ``citekey``.

    Returns the chosen temporary citekey, or None if there is no
    candidate or the user rejects all of them.
    """
    if not candidates:
        sys.stdout.write("No candidate found in formatted bibliography for citekey %s.\n" % citekey)
        return None

    if len(candidates) == 1:
        sys.stdout.write("Found only one match: {}\n".format(citekey_dict[candidates[0]].rstrip()))
        yesno = input("""Assign that entry to the found citekey? [Y/n] """)
        if yesno.strip().lower() in ("y", ""):
            return candidates[0]
        return None

    sys.stdout.write("Found \n")
    for number, candidate in enumerate(candidates, start=1):
        sys.stdout.write("%s. %s\n" % (number, citekey_dict[candidate].rstrip()))
    # Keep asking until the answer is 'n' or a number that is on offer.
    while True:
        answer = input("""Does one of those (1 - %s) fit?\nTo abort, press 'n': """ % (len(candidates), )).strip()
        if answer.lower() == 'n':
            return None
        if answer.isdigit() and 1 <= int(answer) <= len(candidates):
            return candidates[int(answer) - 1]
        print("Please enter a number between 1 and %s, or 'n'." % len(candidates))


def main():
    """Create a preliminary BibTeX file from a formatted bibliography.

    The citekeys to create entries for come from one of three sources:

    * ``--makekeyfile``: the temporary citekeys derived from the
      formatted bibliography itself (no interaction),
    * ``--citekeys FILE``: a text file with one citekey per line,
    * otherwise ``data.pickle`` in ``TMP_DIR`` (key
      ``citekey_not_in_bib``).

    In the latter two cases each citekey is fuzzily matched against the
    temporary citekeys and the user confirms or selects the matching
    reference.  The result is written to ``prelim.bib`` in ``TMP_DIR``,
    references that were not assigned go to
    ``unassigned_formatted_entries.txt``, and a report is printed.  With
    ``--fix_entry`` the preliminary file is read back and passed to
    ``fix_bibentry``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-k", "--keyword", help="Assign a keyword.")
    parser.add_argument("-f", "--fix_entry", help="In a second round, assign the correct entry type.", action="store_true")
    group = parser.add_mutually_exclusive_group()
    group.add_argument("-c", "--citekeys", help="Specify a file containing one citekey per line. Otherwise use data gathered by fix_tei.")
    group.add_argument("-m", "--makekeyfile", help="If there is no citekey file.", action="store_true")
    parser.add_argument("formattedbib", help="A text file containing a formatted bibliography.")
    args = parser.parse_args()

    os.makedirs(TMP_DIR, exist_ok=True)

    report = {}
    citekey_dict = tmp_citations(args.formattedbib)

    # --makekeyfile is a store_true flag: it is False (never None) when
    # absent, so it has to be tested for truth.
    if args.makekeyfile:
        entries = list(citekey_dict)
    elif args.citekeys is not None:
        with open(args.citekeys, "r", encoding="utf-8") as bf:
            entries = [line.strip() for line in bf if line.strip()]
    else:
        # NOTE: unpickling executes code embedded in the file; only use a
        # data.pickle that was produced by a trusted run of the workflow.
        with open(os.path.join(TMP_DIR, "data.pickle"), 'rb') as f:
            data = pickle.load(f)
        entries = sorted(set(data["citekey_not_in_bib"]))

    # Both numbers are taken before matched references are removed below.
    report["found_formatted"] = len(citekey_dict)
    report["found_citekeys"] = len(entries)

    removed_items = []
    unmatched_citekeys = []
    db_list = []
    total = len(entries)

    for position, raw_key in enumerate(entries, start=1):
        citekey = raw_key.strip()
        if args.makekeyfile:
            # The key stems from the bibliography itself, so the
            # reference is known without asking.
            annotation = citekey_dict[raw_key]
            entry_keyword = "FILLMEIN"
        else:
            sys.stdout.write("{} Citekey {} of {} {}\nLooking at {}.\n".format("="*10, position, total, "="*10, citekey))
            candidates = difflib.get_close_matches(citekey, list(citekey_dict))
            chosen_citekey = _choose_candidate(citekey, candidates, citekey_dict)
            if chosen_citekey is None:
                unmatched_citekeys.append(citekey)
                annotation = _NO_CANDIDATE_NOTE
                entry_keyword = "FILLMEIN"
            else:
                # Remove the reference so it is not offered again.
                annotation = citekey_dict.pop(chosen_citekey)
                removed_items.append(annotation)
                entry_keyword = "FIXME"
            print("Removed items: {}".format(len(removed_items)))

        db_list.append(_build_entry(citekey, annotation, entry_keyword, args.keyword))

    not_removed = [x for x in citekey_dict.values() if x not in removed_items]
    report["unmatched"] = unmatched_citekeys
    report["formatted_removed"] = removed_items
    report["formatted_notremoved"] = not_removed

    formatted_not_found_path = os.path.join(TMP_DIR, "unassigned_formatted_entries.txt")
    with open(formatted_not_found_path, 'w', encoding="utf-8") as unassigned:
        unassigned.write("\n".join(not_removed))

    write_report(report)

    bibfile_path = os.path.join(TMP_DIR, "prelim.bib")
    db = BibDatabase()
    db.entries = db_list
    writer = BibTexWriter()
    writer.indent = ' ' * 4  # indent entries with 4 spaces instead of one
    with open(bibfile_path, 'w', encoding="utf-8") as fakebibfile:
        fakebibfile.write(writer.write(db))
    logging.debug("Wrote a preliminary bibtex file")

    if args.fix_entry:
        with open(bibfile_path, encoding="utf-8") as btf:
            btb = bibtexparser.load(btf)
        fix_bibentry(btb.entries)
    else:
        logging.debug("Finished")
# def main ends here
# Start the workflow only when the file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
# finis