Skip to content

Commit

Permalink
Helper tool for creating a BibTeX database
Browse files Browse the repository at this point in the history
  • Loading branch information
Klaus Thoden committed Mar 22, 2018
1 parent f557c91 commit fd2613e
Showing 1 changed file with 236 additions and 0 deletions.
236 changes: 236 additions & 0 deletions create_tmpbib.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,236 @@
#!/usr/bin/env python3
# -*- coding: utf-8; mode: python -*-

__version__ = "1.0"
__date__ = "20180321"
__author__ = "kthoden@mpiwg-berlin.mpg.de"

import os
import sys
import argparse
import logging
import re
import pickle
import difflib
import libeoaconvert
import bibtexparser
from bibtexparser.bwriter import BibTexWriter
from bibtexparser.bibdatabase import BibDatabase

# Configure the root logger once at import time: every record from DEBUG
# upwards is emitted, prefixed with a timestamp and the level name.
logging.basicConfig(level=logging.DEBUG, format=' %(asctime)s - %(levelname)s - %(message)s')

# Directory that receives the generated files (prelim.bib, fixed.bib).
# The string contains no leading "~", so expanduser() returns it unchanged;
# the path is therefore relative to the current working directory.
TMP_DIR = os.path.expanduser("tmp_files")

def write_report(report):
    """Print a summary of the matching run to standard output.

    ``report`` is a dictionary read under the keys ``found_formatted``,
    ``found_citekeys``, ``unmatched`` and ``formatted_notremoved``.  The
    first two are printed as-is; the last two are sized with ``len`` and
    listed one item per line when they are not empty.
    """

    heavy_rule = "=" * 60
    item_indent = ' ' * 4

    print(heavy_rule)
    print(item_indent, "Report")
    print("-" * 60)

    print("Number of entries in the formatted bibliography: {}".format(report["found_formatted"]))
    print("Number of unique citekeys in the publication: {}".format(report["found_citekeys"]))

    # Citekeys for which no formatted entry was found.
    unmatched = report["unmatched"]
    if len(unmatched) == 0:
        print("All citekeys found in the publication could be linked successfully.")
    else:
        how_many = len(unmatched)
        noun = libeoaconvert.plural(how_many, "citekey")
        print("{} {} could not be matched to entries from the formatted bibliography.".format(how_many, noun))
        for citekey in unmatched:
            print(item_indent, citekey)

    # Formatted entries that were never assigned to a citekey.
    leftovers = report["formatted_notremoved"]
    if len(leftovers) == 0:
        print("All entries from the formatted bibliography could be matched to a citekey.")
    else:
        how_many = len(leftovers)
        noun = libeoaconvert.plural(how_many, "citation")
        print("{} {} in the formatted bibliography could not be matched to a citekey found in the publication:".format(how_many, noun))
        for citation in leftovers:
            print(item_indent, citation)

    print(heavy_rule)
# def write_report ends here

def tmp_citations(formatted_bibliography):
    """Create temporary citekeys out of a formatted bibliography.

    The file at ``formatted_bibliography`` is read as UTF-8, one citation
    per line.  Lines of the form ``Author, ... (YEAR) ...`` get the key
    ``Author_YEAR``, where YEAR is a four-digit year from 1000 to 2099
    with an optional letter suffix, or a word starting with ``forth``.
    Lines that do not fit that shape get the key ``None_001``,
    ``None_002``, ...  Whitespace-only lines are skipped.

    If two citations produce the same key, the later one is stored under
    ``Author_YEAR_2`` (``_3``, ...) instead of overwriting the earlier one.

    Return a dictionary mapping each temporary citekey to the full
    citation line (including its line ending).
    """

    # The year is captured by the named group itself; the inner
    # alternative for the century is non-capturing.
    authoryear_pattern = re.compile(
        r"^(?P<author>.+?),.+?\((?P<year>(?:1[0-9]|20)[0-9]{2}[a-z]?|forth.*?)\).*?$"
    )
    tmp_citekey_dict = {}
    nonematcher = 1

    with open(formatted_bibliography, "r", encoding="utf-8") as formbib:
        for citation in formbib:
            if not citation.strip():
                # An empty line is not a bibliography entry.
                continue

            matches = authoryear_pattern.match(citation)
            if matches is None:
                tmp_citekey = "None_{:03d}".format(nonematcher)
                nonematcher += 1
            else:
                tmp_citekey = "{}_{}".format(matches.group("author"), matches.group("year"))

            if tmp_citekey in tmp_citekey_dict:
                # Same author and year twice: keep both citations.
                base_key = tmp_citekey
                suffix = 2
                while tmp_citekey in tmp_citekey_dict:
                    tmp_citekey = "{}_{}".format(base_key, suffix)
                    suffix += 1
                logging.warning("Temporary citekey %s already in use, storing citation as %s.", base_key, tmp_citekey)

            tmp_citekey_dict[tmp_citekey] = citation

    return tmp_citekey_dict
# def tmp_citations ends here

def fix_bibentry(bib_entries):
    """Fix the entry type of each bibliography entry in interactive mode.

    ``bib_entries`` is an iterable of entry dictionaries (as found in
    ``BibDatabase.entries``).  For every entry that carries a non-empty
    ``comments`` field, the formatted citation is shown and the user may
    pick a new entry type by its one-letter shortcut; pressing Enter
    keeps the current type.  Entries without ``comments`` are kept
    unchanged.  All entries are then written to ``fixed.bib`` inside
    ``TMP_DIR``.

    The dictionaries in ``bib_entries`` are modified in place.
    """

    available_types = {"a" : "article", "i" : "incollection", "m" :
                       "misc", "b" : "book", "k" : "booklet", "r" : "report", "t" :
                       "thesis", "p" : "inproceedings", "n" : "newspaper"}

    db = BibDatabase()
    db_list = []

    for entry in bib_entries:
        # Rename a "keyword" field back to "keywords".
        # NOTE(review): presumably some bibtexparser configurations
        # rename "keywords" to "keyword" on load -- confirm.  The field
        # may be absent altogether, so it must not be indexed directly.
        if entry.get("keyword"):
            entry["keywords"] = entry.pop("keyword")

        formatted_entry = entry.get("comments", "")
        if not formatted_entry:
            logging.info("Skipping {}".format(entry["ID"]))
        else:
            print("{}\nEntry type is currently set to _{}_. This is the formatted entry:\n\n{}\n\nDo you want to change the entry type?".format("="*20, entry["ENTRYTYPE"], formatted_entry))
            for shortcut, type_name in available_types.items():
                print(shortcut, " -> ", type_name)
            entrytype = input("Your selection: ")

            if entrytype in available_types:
                entry["ENTRYTYPE"] = available_types[entrytype]
            elif entrytype != "":
                # Unknown shortcut: leave the entry type as it is.
                print("Wrong key. Continuing.")

        db_list.append(entry)

    # Make sure the output directory exists when called on its own.
    os.makedirs(TMP_DIR, exist_ok=True)
    bibfile_path = os.path.join(TMP_DIR, "fixed.bib")

    db.entries = db_list

    writer = BibTexWriter()
    writer.indent = '    '  # indent entries with 4 spaces instead of one
    with open(bibfile_path, 'w', encoding="utf-8") as fakebibfile:
        fakebibfile.write(writer.write(db))
    logging.debug("Fixed the entry types and wrote %s.", bibfile_path)
# def fix_bibentry ends here

def _read_citekeys(citekey_file):
    """Return the citekeys to process, stripped of surrounding whitespace.

    With ``citekey_file`` given, read one citekey per line from it.
    Otherwise take the ``notcited`` list from ``data.pickle`` in TMP_DIR.
    Empty lines are dropped.
    """

    if citekey_file is not None:
        with open(citekey_file, "r", encoding="utf-8") as bf:
            raw_keys = bf.readlines()
    else:
        # NOTE(security): pickle.load can execute arbitrary code.  This is
        # only acceptable because data.pickle is expected to be a local
        # file written by an earlier step of the same toolchain; never
        # point this at a file from an untrusted source.
        with open(os.path.join(TMP_DIR, "data.pickle"), "rb") as f:
            data = pickle.load(f)
        # Sorted so that the order of the questions is reproducible.
        raw_keys = sorted(set(data["notcited"]))

    return [key.strip() for key in raw_keys if key.strip()]


def _pick_candidate(candidates, citekey_dict):
    """Ask the user which candidate key to accept; return it or None."""

    if len(candidates) == 1:
        sys.stdout.write("Found only one match: {}\n".format(citekey_dict[candidates[0]]))
        yesno = input("""Assign that entry to the found citekey? [Y/n]""")
        if yesno.strip().lower() in ("y", ""):
            return candidates[0]
        return None

    sys.stdout.write("Found \n")
    for count, candidate in enumerate(candidates, start=1):
        sys.stdout.write("%s. %s" % (count, citekey_dict[candidate]))

    # Keep asking until the answer is empty or a valid number, so that a
    # typo does not crash the session and "0" cannot pick the last item.
    while True:
        answer = input("""Does one of those (1 - %s) fit?\nTo abort, just press Enter: """ % (len(candidates), )).strip()
        if not answer:
            return None
        try:
            choice = int(answer)
        except ValueError:
            choice = 0
        if 1 <= choice <= len(candidates):
            return candidates[choice - 1]
        print("Please enter a number between 1 and {}, or just press Enter.".format(len(candidates)))


def main():
    """Match citekeys to a formatted bibliography and write prelim.bib.

    Command line: a positional text file with the formatted bibliography,
    optionally ``--citekeys FILE`` (one citekey per line), ``--keyword``
    (stored in every entry) and ``--fix_entry`` (second interactive round
    to set entry types).

    For every citekey the closest temporary keys derived from the
    bibliography are offered; the accepted citation is stored in the
    ``comments`` field of a placeholder BibTeX entry.  A report is
    printed and the entries are written to ``prelim.bib`` in TMP_DIR.

    Exits with status 1 before asking any question if a citekey is not of
    the form ``author_year`` (exactly one underscore).
    """

    parser = argparse.ArgumentParser()
    parser.add_argument("-k", "--keyword", help="Assign a keyword.")
    parser.add_argument("-c", "--citekeys", help="Specify a file containing one citekey per line. Otherwise use data gathered by fix_tei.")
    parser.add_argument("-f", "--fix_entry", help="In a second round, assign the correct entry type.", action="store_true")
    parser.add_argument("formattedbib", help="A text file containing a formatted bibliography.")
    args = parser.parse_args()

    os.makedirs(TMP_DIR, exist_ok=True)

    citekey_dict = tmp_citations(args.formattedbib)
    entries = _read_citekeys(args.citekeys)

    # Validate all citekeys up front: failing in the middle of the
    # interactive loop would throw away the answers given so far.
    malformed = [citekey for citekey in entries if citekey.count("_") != 1]
    if malformed:
        for citekey in malformed:
            print("Could not split %s. Please fix." % citekey)
        sys.exit(1)

    report = {}
    report["found_formatted"] = len(citekey_dict)
    report["found_citekeys"] = len(entries)

    removed_items = []
    unmatched_citekeys = []

    db = BibDatabase()
    db_list = []

    for position, citekey in enumerate(entries, start=1):

        # citekey_dict shrinks as citations are assigned, so candidates
        # are always drawn from the citations that are still available.
        candidates = difflib.get_close_matches(citekey, citekey_dict.keys())
        sys.stdout.write("{} Citekey {} of {} {}\nLooking at {}.\n".format("="*10, position, len(entries), "="*10, citekey))

        annotation = ""
        if candidates:
            chosen_citekey = _pick_candidate(candidates, citekey_dict)
            if chosen_citekey is not None:
                annotation = citekey_dict.pop(chosen_citekey)
                logging.debug("Assigned citation: %s", annotation.rstrip())
                removed_items.append(annotation)
        else:
            sys.stdout.write("No candidate found.\n")
            unmatched_citekeys.append(citekey)

        author, year = citekey.split("_")
        entrydict = {}
        if args.keyword is not None:
            entrydict["keywords"] = args.keyword
        entrydict["ENTRYTYPE"] = "book"
        entrydict["title"] = "Faketitle"
        entrydict["comments"] = annotation.rstrip()
        entrydict["ID"] = citekey
        entrydict["author"] = author
        entrydict["year"] = year

        print("Removed items: {}".format(len(removed_items)))
        db_list.append(entrydict)

    report["unmatched"] = unmatched_citekeys
    report["formatted_removed"] = removed_items
    report["formatted_notremoved"] = [x for x in citekey_dict.values() if x not in removed_items]

    bibfile_path = os.path.join(TMP_DIR, "prelim.bib")

    write_report(report)

    db.entries = db_list

    writer = BibTexWriter()
    writer.indent = '    '  # indent entries with 4 spaces instead of one
    with open(bibfile_path, 'w', encoding="utf-8") as fakebibfile:
        fakebibfile.write(writer.write(db))
    logging.debug("Wrote a preliminary bibtex file")

    if args.fix_entry:
        # Second round: reload what was just written and let the user
        # correct the entry types.
        with open(bibfile_path, encoding="utf-8") as btf:
            btb = bibtexparser.load(btf)
        fix_bibentry(btb.entries)
    else:
        logging.debug("Finished")
# def main ends here

if __name__ == "__main__":
    # Run the command-line tool only when executed as a script,
    # not when the module is imported.
    main()
# finis

0 comments on commit fd2613e

Please sign in to comment.