-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Helper tool for creating a BibTeX database
- Loading branch information
Klaus Thoden
committed
Mar 22, 2018
1 parent
f557c91
commit fd2613e
Showing
1 changed file
with
236 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,236 @@ | ||
#!/usr/bin/env python3
# -*- coding: utf-8; mode: python -*-
"""Helper tool for creating a BibTeX database.

The citekeys used in a publication are matched interactively against a
formatted bibliography (a text file with one citation per line).  For
every citekey a placeholder BibTeX entry is written that carries the
matched formatted citation in its ``comments`` field.  An optional second
pass lets the user correct the entry type of each placeholder entry.
"""

__version__ = "1.0"
__date__ = "20180321"
__author__ = "kthoden@mpiwg-berlin.mpg.de"
|
||
import os | ||
import sys | ||
import argparse | ||
import logging | ||
import re | ||
import pickle | ||
import difflib | ||
import libeoaconvert | ||
import bibtexparser | ||
from bibtexparser.bwriter import BibTexWriter | ||
from bibtexparser.bibdatabase import BibDatabase | ||
|
||
# Emit every log record from DEBUG upwards, prefixed with a timestamp and
# the record's level name.
logging.basicConfig(level=logging.DEBUG, format=' %(asctime)s - %(levelname)s - %(message)s')

# Directory for intermediate and generated files.  The path is relative, so
# it is resolved against the current working directory; expanduser() leaves
# it unchanged because it does not start with "~".
TMP_DIR = os.path.expanduser("tmp_files")
|
||
def write_report(report):
    """Print the matching statistics to standard output.

    Args:
        report: Dictionary providing these keys:
            ``found_formatted`` and ``found_citekeys`` are printed as given;
            ``unmatched`` and ``formatted_notremoved`` are sized collections
            whose items are listed one per line when they are not empty.

    Raises:
        KeyError: If one of the four keys is missing from ``report``.
    """
    unmatched = report["unmatched"]
    leftover = report["formatted_notremoved"]
    indent = ' ' * 4

    print("=" * 60)
    print(indent, "Report")
    print("-" * 60)

    print("Number of entries in the formatted bibliography: {}".format(report["found_formatted"]))
    print("Number of unique citekeys in the publication: {}".format(report["found_citekeys"]))

    # Citekeys for which no formatted citation was found.
    if unmatched:
        # NOTE(review): libeoaconvert.plural presumably returns the noun in
        # singular or plural form depending on the count - confirm there.
        print("{} {} could not be matched to entries from the formatted bibliography.".format(len(unmatched), libeoaconvert.plural(len(unmatched), "citekey")))
        for item in unmatched:
            print(indent, item)
    else:
        print("All citekeys found in the publication could be linked successfully.")

    # Formatted citations that were not assigned to any citekey.
    if leftover:
        print("{} {} in the formatted bibliography could not be matched to a citekey found in the publication:".format(len(leftover), libeoaconvert.plural(len(leftover), "citation")))
        for item in leftover:
            print(indent, item)
    else:
        print("All entries from the formatted bibliography could be matched to a citekey.")
    print("=" * 60)
# def write_report ends here
|
||
def tmp_citations(formatted_bibliography):
    """Create temporary citekeys out of a formatted bibliography.

    Every non-blank line of the file is treated as one citation.  A line of
    the shape ``Author, ... (year) ...`` gets the key ``<author>_<year>``,
    where the author is the text before the first comma and the year is
    either a four-digit number from 1000 to 2099 with an optional lowercase
    letter suffix, or a text starting with ``forth``.  Any other line gets
    the key ``None_NNN`` with a running three-digit counter.

    Args:
        formatted_bibliography: Path of the text file to read.

    Returns:
        Dictionary mapping each temporary citekey to the full citation line
        (including its line ending).  If two lines yield the same key, the
        later one wins and a warning is logged.
    """
    # The year alternatives live inside the named group so that both parts
    # can be addressed by name.
    authoryear_pattern = re.compile(
        r"^(?P<author>.+?),.+?\((?P<year>(?:1[0-9]|20)[0-9]{2}[a-z]?|forth.*?)\).*?$"
    )
    tmp_citekey_dict = {}
    nonematcher = 1

    with open(formatted_bibliography, "r") as formbib:
        citations = formbib.readlines()

    for citation in citations:
        # Blank lines are separators, not citations; counting them would
        # inflate the statistics with empty "unmatched" entries.
        if not citation.strip():
            continue

        matches = authoryear_pattern.match(citation)
        if matches is None:
            tmp_citekey = "None_{:03d}".format(nonematcher)
            nonematcher += 1
        else:
            tmp_citekey = "{}_{}".format(matches.group("author"), matches.group("year"))
            if tmp_citekey in tmp_citekey_dict:
                logging.warning(
                    "Temporary citekey %s occurs more than once; keeping the last citation.",
                    tmp_citekey,
                )

        tmp_citekey_dict[tmp_citekey] = citation

    return tmp_citekey_dict
# def tmp_citations ends here
|
||
def fix_bibentry(bib_entries):
    """Interactively correct entry types and write ``fixed.bib``.

    For every entry that carries a formatted citation in its ``comments``
    field, the citation is shown and the user may pick a different entry
    type by its one-letter shortcut.  Pressing Enter keeps the current
    type; an unknown shortcut is reported and the type is left unchanged.
    Entries without a formatted citation are not prompted for but are still
    written.  All entries are written to ``fixed.bib`` inside ``TMP_DIR``.

    Args:
        bib_entries: List of entry dictionaries with at least the keys
            ``ID`` and ``ENTRYTYPE``.  The dictionaries are modified in
            place.
    """
    available_types = {"a" : "article", "i" : "incollection", "m" :
                       "misc", "b" : "book", "k" : "booklet", "r" : "report", "t" :
                       "thesis", "p" : "inproceedings", "n" : "newspaper"}

    for entry in bib_entries:
        # Rename a non-empty "keyword" field to "keywords".  The field is
        # optional, so it must not be accessed unconditionally.
        if entry.get("keyword"):
            entry["keywords"] = entry.pop("keyword")

        # The formatted citation may be missing altogether, not just empty.
        comments = entry.get("comments", "")
        if not comments:
            logging.info("Skipping {}".format(entry["ID"]))
            continue

        print("{}\nEntry type is currently set to _{}_. This is the formatted entry:\n\n{}\n\nDo you want to change the entry type?".format("="*20, entry["ENTRYTYPE"], comments))
        for shortcut, entry_type in available_types.items():
            print(shortcut, " -> ", entry_type)
        entrytype = input("Your selection: ").strip()

        if entrytype in available_types:
            entry["ENTRYTYPE"] = available_types[entrytype]
        elif entrytype:
            # Empty input keeps the current type silently; anything else
            # is a typo worth mentioning.
            print("Wrong key. Continuing.")

    # The directory may not exist when this function is used on its own.
    os.makedirs(TMP_DIR, exist_ok=True)
    bibfile_path = os.path.join(TMP_DIR, "fixed.bib")

    db = BibDatabase()
    db.entries = list(bib_entries)

    writer = BibTexWriter()
    writer.indent = ' ' * 4  # indent entries with 4 spaces instead of one
    with open(bibfile_path, 'w') as fakebibfile:
        fakebibfile.write(writer.write(db))
    logging.debug("Fixed the entry types and wrote %s.", bibfile_path)
# def fix_bibentry ends here
|
||
def _read_citekeys(citekey_file):
    """Return the citekeys to process, free of surrounding whitespace."""
    if citekey_file is not None:
        with open(citekey_file, "r") as bf:
            # Strip the line endings here, once, so that they can neither
            # skew the fuzzy matching nor leak into the BibTeX fields.
            # Blank lines carry no citekey and are dropped.
            return [line.strip() for line in bf if line.strip()]

    # NOTE(security): unpickling can execute arbitrary code.  Only use a
    # data.pickle that was produced by a trusted run of the toolchain.
    with open(os.path.join(TMP_DIR, "data.pickle"), "rb") as f:
        data = pickle.load(f)
    # Sorted so that the interactive session has a reproducible order.
    return sorted(set(data["notcited"]))


def _select_candidate(candidates, citekey_dict):
    """Ask the user which candidate fits; return its temporary key or None."""
    if len(candidates) == 1:
        sys.stdout.write("Found only one match: {}\n".format(citekey_dict[candidates[0]]))
        yesno = input("""Assign that entry to the found citekey? [Y/n]""")
        logging.debug("input %s", yesno)
        if yesno.strip().lower() in ("y", ""):
            return candidates[0]
        return None

    sys.stdout.write("Found \n")
    for number, candidate in enumerate(candidates, start=1):
        sys.stdout.write("%s. %s" % (number, citekey_dict[candidate]))

    # Ask again on invalid input instead of crashing or silently picking
    # the wrong candidate through a zero or negative index.
    while True:
        answer = input("""Does one of those (1 - %s) fit?\nTo abort, just press Enter: """ % (len(candidates), )).strip()
        if not answer:
            return None
        try:
            choice = int(answer)
        except ValueError:
            choice = 0
        if 1 <= choice <= len(candidates):
            return candidates[choice - 1]
        print("Please enter a number between 1 and {}.".format(len(candidates)))


def main():
    """Match citekeys to formatted citations and write ``prelim.bib``.

    Reads the formatted bibliography named on the command line, obtains the
    citekeys either from the file given with ``--citekeys`` or from
    ``data.pickle`` in ``TMP_DIR``, and asks the user for each citekey
    which of the closest formatted citations belongs to it.  A placeholder
    BibTeX entry per citekey is written to ``prelim.bib`` in ``TMP_DIR``
    and a report is printed.  With ``--fix_entry`` the written file is
    loaded again and passed to ``fix_bibentry``.

    Citekeys must have the form ``<author>_<year>``.  All citekeys are
    checked before the interactive session starts; if one is malformed the
    program exits with status 1.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-k", "--keyword", help="Assign a keyword.")
    parser.add_argument("-c", "--citekeys", help="Specify a file containing one citekey per line. Otherwise use data gathered by fix_tei.")
    parser.add_argument("-f", "--fix_entry", help="In a second round, assign the correct entry type.", action="store_true")
    parser.add_argument("formattedbib", help="A text file containing a formatted bibliography.")
    args = parser.parse_args()

    os.makedirs(TMP_DIR, exist_ok=True)

    citekey_dict = tmp_citations(args.formattedbib)
    entries = _read_citekeys(args.citekeys)

    # Fail before the interactive session, not in the middle of it, so that
    # no answers are lost because of a malformed citekey further down.
    malformed = [citekey for citekey in entries if len(citekey.split("_")) != 2]
    if malformed:
        for citekey in malformed:
            print("Could not split %s. Please fix." % citekey)
        sys.exit(1)

    # Counted before any citation is removed from citekey_dict below.
    report = {
        "found_formatted": len(citekey_dict),
        "found_citekeys": len(entries),
    }

    removed_items = []
    unmatched_citekeys = []
    db_list = []
    total = len(entries)

    for position, citekey in enumerate(entries, start=1):
        sys.stdout.write("{} Citekey {} of {} {}\nLooking at {}.\n".format("="*10, position, total, "="*10, citekey))

        # Citations assigned earlier have been popped from citekey_dict and
        # are therefore no longer offered.
        candidates = difflib.get_close_matches(citekey, citekey_dict.keys())

        annotation = ""
        if candidates:
            chosen_citekey = _select_candidate(candidates, citekey_dict)
            if chosen_citekey is not None:
                annotation = citekey_dict.pop(chosen_citekey)
                logging.debug("Removed from the formatted bibliography: %s", annotation.rstrip())
                removed_items.append(annotation)
        else:
            sys.stdout.write("No candidate found.\n")
            unmatched_citekeys.append(citekey)

        author, year = citekey.split("_")
        entrydict = {
            "ENTRYTYPE": "book",
            "title": "Faketitle",
            "comments": annotation.rstrip(),
            "ID": citekey,
            "author": author,
            "year": year,
        }
        if args.keyword is not None:
            entrydict["keywords"] = args.keyword

        print("Removed items: {}".format(len(removed_items)))
        db_list.append(entrydict)

    report["unmatched"] = unmatched_citekeys
    report["formatted_removed"] = removed_items
    report["formatted_notremoved"] = [x for x in citekey_dict.values() if x not in removed_items]

    write_report(report)

    bibfile_path = os.path.join(TMP_DIR, "prelim.bib")

    db = BibDatabase()
    db.entries = db_list

    writer = BibTexWriter()
    writer.indent = ' ' * 4  # indent entries with 4 spaces instead of one
    with open(bibfile_path, 'w') as fakebibfile:
        fakebibfile.write(writer.write(db))
    logging.debug("Wrote a preliminary bibtex file")

    if args.fix_entry:
        with open(bibfile_path) as btf:
            btb = bibtexparser.load(btf)
        fix_bibentry(btb.entries)
    else:
        logging.debug("Finished")
# def main ends here
|
||
# Run the interactive session only when executed as a script, not when the
# module is imported.
if __name__ == '__main__':
    main()
# finis