Skip to content
Permalink
4c960b8f72
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
256 lines (208 sloc) 9.44 KB
#!/usr/bin/env python3
# -*- coding: utf-8; mode: python -*-
__version__ = "1.0"
__date__ = "20180321"
__author__ = "kthoden@mpiwg-berlin.mpg.de"
import os
import sys
import argparse
import logging
import re
import pickle
import difflib
import libeoaconvert
import bibtexparser
from bibtexparser.bwriter import BibTexWriter
from bibtexparser.bibdatabase import BibDatabase
# Configure the root logger once at import time: show everything from DEBUG
# upwards, each record prefixed with a timestamp and its level name.
logging.basicConfig(level=logging.DEBUG, format=' %(asctime)s - %(levelname)s - %(message)s')
# Directory for the files this script writes (prelim.bib, fixed.bib,
# unassigned_formatted_entries.txt). The path is relative, so it resolves
# against the current working directory; expanduser() leaves it unchanged
# because it does not start with "~".
TMP_DIR = os.path.expanduser("tmp_files")
def write_report(report):
    """Print the statistics of a matching run to standard output.

    ``report`` is a dictionary with the keys ``found_formatted`` and
    ``found_citekeys`` (values are printed as given) and ``unmatched`` and
    ``formatted_notremoved`` (sized iterables whose items are listed one
    per line when non-empty).
    """
    frame = "=" * 60
    indent = ' ' * 4

    # Header and overall counts.
    print(frame)
    print(indent, "Report")
    print("-" * 60)
    print("Number of entries in the formatted bibliography: {}".format(report["found_formatted"]))
    print("Number of unique citekeys in the publication: {}".format(report["found_citekeys"]))

    # Citekeys from the publication that were not linked to a citation.
    unmatched = report["unmatched"]
    if not unmatched:
        print("All citekeys found in the publication could be linked successfully.")
    else:
        how_many = len(unmatched)
        noun = libeoaconvert.plural(len(unmatched), "citekey")
        print("{} {} found in the publication could not be matched to entries from the formatted bibliography.".format(how_many, noun))
        for citekey in unmatched:
            print(indent, citekey)

    # Formatted citations that were not linked to any citekey.
    leftover = report["formatted_notremoved"]
    if not leftover:
        print("All entries from the formatted bibliography could be matched to a citekey.")
    else:
        how_many = len(leftover)
        noun = libeoaconvert.plural(len(leftover), "citation")
        print("{} {} in the formatted bibliography could not be matched to a citekey found in the publication:".format(how_many, noun))
        for citation in leftover:
            print(indent, citation)

    print(frame)
# def write_report ends here
def tmp_citations(formatted_bibliography):
    """Create temporary citekeys out of a formatted bibliography.

    Every non-blank line of the file is treated as one formatted citation.
    A line of the shape ``Author, ... (YEAR) ...`` gets the temporary
    citekey ``Author_YEAR``, where YEAR is a four-digit year starting with
    1 or 20 (optionally followed by one lowercase letter) or a word
    starting with ``forth``. Lines that do not have this shape get the
    keys ``None_001``, ``None_002``, ... in order of appearance.

    If two citations would end up with the same key, the later ones get a
    numeric suffix (``Author_YEAR_2``, ``Author_YEAR_3``, ...) so that no
    citation is silently lost.

    Args:
        formatted_bibliography: Path of a UTF-8 text file containing one
            formatted citation per line.

    Returns:
        A dictionary mapping each temporary citekey to the citation line
        exactly as read from the file (including its line ending).
    """
    # The year is captured in a real named group; the non-greedy author
    # part stops at the first comma of the line.
    authoryear_pattern = re.compile(
        r"^(?P<author>.+?),.+?"
        r"\((?P<year>(?:1[0-9]|20)[0-9]{2}[a-z]?|forth.*?)\).*?$"
    )

    tmp_citekey_dict = {}
    nonematcher = 1

    with open(formatted_bibliography, "r", encoding="utf-8") as formbib:
        for citation in formbib:
            # Very short lines were always skipped; whitespace-only lines
            # carry no citation either.
            if len(citation) <= 2 or not citation.strip():
                logging.debug("Skipping blank line")
                continue

            logging.debug("Creating temporary citekey from %s.", citation)
            matches = authoryear_pattern.match(citation)
            if matches is None:
                tmp_citekey = "None_{:03d}".format(nonematcher)
                nonematcher += 1
            else:
                tmp_citekey = "{}_{}".format(matches.group("author"), matches.group("year"))

            if tmp_citekey in tmp_citekey_dict:
                # Same author and year as an earlier line: keep both.
                base_key = tmp_citekey
                suffix = 2
                while tmp_citekey in tmp_citekey_dict:
                    tmp_citekey = "{}_{}".format(base_key, suffix)
                    suffix += 1
                logging.warning("Temporary citekey %s is already in use, using %s instead.", base_key, tmp_citekey)

            tmp_citekey_dict[tmp_citekey] = citation

    return tmp_citekey_dict
# def tmp_citations ends here
def fix_bibentry(bib_entries):
    """Interactively fix the entry type of each bibliography entry.

    For every entry that carries a non-empty ``comments`` field (the
    formatted citation), the current entry type and the citation are
    shown and the user may pick a different type by its one-letter
    shortcut. Pressing return, or typing an unknown shortcut, keeps the
    current type. Entries without comments are kept unchanged. All
    entries are then written to ``fixed.bib`` inside ``TMP_DIR``.

    Args:
        bib_entries: List of entry dictionaries as found in
            ``BibDatabase.entries``. The dictionaries are modified in
            place.
    """
    available_types = {
        "a": "article",
        "i": "incollection",
        "m": "misc",
        "b": "book",
        "k": "booklet",
        "r": "report",
        "t": "thesis",
        "p": "inproceedings",
        "n": "newspaper",
    }

    db_list = []

    for entry in bib_entries:
        # The field may come back as "keyword" or "keywords", or be
        # missing altogether; only rename it when it has content.
        if entry.get("keyword"):
            entry["keywords"] = entry.pop("keyword")

        comments = entry.get("comments", "")
        if not comments:
            logging.info("Skipping {}".format(entry["ID"]))
        else:
            print("{}\nEntry type is currently set to _{}_. This is the formatted entry:\n\n{}\n\nDo you want to change the entry type?".format("="*20, entry["ENTRYTYPE"], comments))
            for shortcut, type_name in available_types.items():
                print(shortcut, " -> ", type_name)

            entrytype = input("Your selection: ")
            if entrytype in available_types:
                entry["ENTRYTYPE"] = available_types[entrytype]
            elif entrytype != "":
                # Empty input means "keep as is"; anything else is a typo.
                print("Wrong key. Continuing.")

        # NOTE(review): the indentation of the original was lost; appending
        # at loop level keeps skipped entries in the output file - confirm.
        db_list.append(entry)

    # Make sure the output directory exists when called on its own.
    os.makedirs(TMP_DIR, exist_ok=True)
    bibfile_path = os.path.join(TMP_DIR, "fixed.bib")

    db = BibDatabase()
    db.entries = db_list

    writer = BibTexWriter()
    writer.indent = '    '  # indent entries with 4 spaces instead of one

    with open(bibfile_path, 'w', encoding="utf-8") as fakebibfile:
        fakebibfile.write(writer.write(db))

    logging.debug("Fixed the entry types and wrote %s.", bibfile_path)
# def fix_bibentry ends here
def main():
    """Interactively link citekeys to the entries of a formatted bibliography.

    Reads the formatted bibliography given on the command line, determines
    the citekeys to process (from the file given with ``--citekeys`` or
    from ``data.pickle`` in ``TMP_DIR``) and asks the user, citekey by
    citekey, which formatted citation belongs to it. The result is written
    to ``TMP_DIR`` as:

    * ``prelim.bib`` - one placeholder entry per citekey, with the chosen
      citation stored in the ``comments`` field,
    * ``unassigned_formatted_entries.txt`` - citations nobody claimed.

    A report is printed, and with ``--fix_entry`` the preliminary file is
    loaded again and handed to ``fix_bibentry``.

    Exits with status 1 if a citekey is not of the form ``author_year``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-k", "--keyword", help="Assign a keyword.")
    parser.add_argument("-c", "--citekeys", help="Specify a file containing one citekey per line. Otherwise use data gathered by fix_tei.")
    parser.add_argument("-f", "--fix_entry", help="In a second round, assign the correct entry type.", action="store_true")
    parser.add_argument("formattedbib", help="A text file containing a formatted bibliography.")
    args = parser.parse_args()

    os.makedirs(TMP_DIR, exist_ok=True)

    citekey_dict = tmp_citations(args.formattedbib)
    entries = _load_citekeys(args.citekeys)

    report = {
        "found_formatted": len(citekey_dict),
        "found_citekeys": len(entries),
    }

    # Validate every citekey before the interactive session starts, so that
    # a malformed key cannot throw away answers the user has already given.
    author_year = {}
    for citekey in entries:
        try:
            author, year = citekey.split("_")
        except ValueError:
            print("Could not split %s. Please fix." % citekey)
            sys.exit(1)
        author_year[citekey] = (author, year)

    removed_items = []
    unmatched_citekeys = []
    db_list = []

    for position, citekey in enumerate(entries, start=1):
        sys.stdout.write("{} Citekey {} of {} {}\nLooking at {}.\n".format("="*10, position, len(entries), "="*10, citekey))

        citation = _pick_citation(citekey, citekey_dict)
        if citation is None:
            unmatched_citekeys.append(citekey)
            annotation = "No candidate found in formatted bibliography"
            entry_keyword = "FILLMEIN"
        else:
            removed_items.append(citation)
            annotation = citation
            entry_keyword = "FIXME"

        if args.keyword is not None:
            if args.keyword in entry_keyword.split(", "):
                # Nothing to add, but the entry itself must still be written.
                print("keyword already in ")
            else:
                print("option keyword")
                entry_keyword = entry_keyword + ", " + args.keyword

        author, year = author_year[citekey]
        db_list.append({
            "keywords": entry_keyword,
            "ENTRYTYPE": "book",
            "title": "Faketitle",
            "comments": annotation.rstrip(),
            "ID": citekey,
            "author": author,
            "year": year,
        })
        print("Removed items: {}".format(len(removed_items)))

    # Chosen citations were popped from citekey_dict, so whatever is left
    # is exactly the set of citations that nobody claimed.
    unassigned_citations = list(citekey_dict.values())

    report["unmatched"] = unmatched_citekeys
    report["formatted_removed"] = removed_items
    report["formatted_notremoved"] = unassigned_citations

    formatted_not_found_path = os.path.join(TMP_DIR, "unassigned_formatted_entries.txt")
    with open(formatted_not_found_path, 'w', encoding="utf-8") as unassigned:
        unassigned.write("\n".join(unassigned_citations))

    write_report(report)

    bibfile_path = os.path.join(TMP_DIR, "prelim.bib")
    db = BibDatabase()
    db.entries = db_list
    writer = BibTexWriter()
    writer.indent = '    '  # indent entries with 4 spaces instead of one
    with open(bibfile_path, 'w', encoding="utf-8") as fakebibfile:
        fakebibfile.write(writer.write(db))
    logging.debug("Wrote a preliminary bibtex file")

    if args.fix_entry:
        with open(bibfile_path, encoding="utf-8") as btf:
            btb = bibtexparser.load(btf)
        fix_bibentry(btb.entries)
    else:
        logging.debug("Finished")
# def main ends here


def _load_citekeys(citekey_file):
    """Return the unique, non-empty citekeys to process, without whitespace.

    With a file name, the citekeys are read one per line and keep their
    order of first appearance. Without one, they are taken from the
    ``citekey_not_in_bib`` item of ``data.pickle`` in ``TMP_DIR`` and
    sorted, so that repeated runs ask in the same order.
    """
    if citekey_file is not None:
        with open(citekey_file, "r", encoding="utf-8") as bf:
            raw_keys = [line.strip() for line in bf]
    else:
        # NOTE: unpickling executes whatever the file dictates - only use
        # a data.pickle that was produced locally by a trusted tool.
        with open(os.path.join(TMP_DIR, "data.pickle"), 'rb') as f:
            data = pickle.load(f)
        # assumes the citekeys are strings, as the rest of main() does
        raw_keys = sorted({key.strip() for key in data["citekey_not_in_bib"]})

    # dict.fromkeys() removes duplicates and keeps the order.
    return list(dict.fromkeys(key for key in raw_keys if key))
# def _load_citekeys ends here


def _pick_citation(citekey, citekey_dict):
    """Ask the user which formatted citation belongs to ``citekey``.

    Candidates are the temporary keys in ``citekey_dict`` that difflib
    considers close to ``citekey``. The chosen citation is removed from
    ``citekey_dict`` and returned; ``None`` is returned if there is no
    candidate or the user declines.
    """
    candidates = difflib.get_close_matches(citekey, list(citekey_dict))

    if not candidates:
        sys.stdout.write("No candidate found in formatted bibliography for citekey %s.\n" % citekey)
        return None

    if len(candidates) == 1:
        only_key = candidates[0]
        sys.stdout.write("Found only one match: {}\n".format(citekey_dict[only_key]))
        yesno = input("""Assign that entry to the found citekey? [Y/n] """)
        if yesno.lower() in ["y", ""]:
            return citekey_dict.pop(only_key)
        return None

    sys.stdout.write("Found \n")
    for number, candidate in enumerate(candidates, start=1):
        sys.stdout.write("%s. %s" % (number, citekey_dict[candidate]))

    # Keep asking until the answer is usable; a typo must neither crash
    # the session nor silently discard the candidates.
    while True:
        answer = input("""Does one of those (1 - %s) fit?\nTo abort, press 'n': """ % (len(candidates), )).strip()
        print("The answer is", answer)
        if answer.lower() == 'n':
            return None
        try:
            choice = int(answer)
        except ValueError:
            choice = 0
        if 1 <= choice <= len(candidates):
            return citekey_dict.pop(candidates[choice - 1])
        print("Please enter a number between 1 and %s, or 'n'." % len(candidates))
# def _pick_citation ends here
if __name__ == "__main__":
    # Start the interactive workflow only when run as a script, not when
    # the module is imported.
    main()
# finis