create_tmpbib.py

#!/usr/bin/env python3
# -*- coding: utf-8; mode: python -*-

"""A helper script that creates a temporary bibtex file from a
formatted list of references.

In cases where authors hand in a formatted version of the bibliography
(rather than a reference database), this tool can help to create a
database in BibTeX format.

It is being used in conjunction with the DocX workflow where citekeys
are already present in the source file. The script receives a list of
formatted references as input (one entry per line). Optionally, a
keyword can be assigned, for example the name of the chapter author,
or the entry type can be fixed.

We require authors to use shortcuts in their docx manuscript when
citing, including the use of a citekey (`LASTNAME_YEAR`), so there
should already be citekeys in the manuscript. When running
`fix_tei.py`, these citekeys are gathered together and can be used as
an input to this tool.

The tool creates temporary citekeys out of the formatted bibliography
and in an interactive session, the user selects the most likely entry.
With this, rudimentary entries can be created.
"""

__version__ = "1.0"
__date__ = "20180321"
__author__ = "kthoden@mpiwg-berlin.mpg.de"

import os
import sys
import argparse
import logging
import re
import pickle
import difflib
import libeoaconvert
import bibtexparser
from bibtexparser.bwriter import BibTexWriter
from bibtexparser.bibdatabase import BibDatabase

# Configure the root logger: show everything from DEBUG upwards, each
# message prefixed with a timestamp and its level name.
logging.basicConfig(level=logging.DEBUG, format=' %(asctime)s - %(levelname)s - %(message)s')

# Directory for intermediate and output files.  The path is relative, so
# it is resolved against the current working directory; it contains no
# "~", hence expanduser() returns it unchanged.
TMP_DIR = os.path.expanduser("tmp_files")

def write_report(report):
    """Print the statistics of a matching run to standard output.

    `report` is a dictionary providing the counts `found_formatted`
    and `found_citekeys` as well as the two lists `unmatched` and
    `formatted_notremoved`.  A non-empty list is announced with its
    length and then printed item by item; an empty list yields a
    one-line success message instead.
    """

    heavy_rule = "=" * 60
    margin = ' ' * 4

    print(heavy_rule)
    print(margin, "Report")
    print("-" * 60)

    print("Number of entries in the formatted bibliography: {}".format(report["found_formatted"]))
    print("Number of unique citekeys in the publication: {}".format(report["found_citekeys"]))

    # Citekeys from the publication that found no partner.
    unmatched = report["unmatched"]
    if not unmatched:
        print("All citekeys found in the publication could be linked successfully.")
    else:
        how_many = len(unmatched)
        print("{} {} found in the publication could not be matched to entries from the formatted bibliography.".format(how_many, libeoaconvert.plural(how_many, "citekey")))
        for citekey in unmatched:
            print(margin, citekey)

    # Formatted references that nobody claimed.
    leftover = report["formatted_notremoved"]
    if not leftover:
        print("All entries from the formatted bibliography could be matched to a citekey.")
    else:
        how_many = len(leftover)
        print("{} {} in the formatted bibliography could not be matched to a citekey found in the publication:".format(how_many, libeoaconvert.plural(how_many, "citation")))
        for citation in leftover:
            print(margin, citation)

    print(heavy_rule)
# def write_report ends here

def tmp_citations(formatted_bibliography):
    """Create temporary citekeys out of a formatted bibliography.

    The file `formatted_bibliography` is read line by line, one
    formatted reference per line.  Blank lines (whitespace only, or at
    most two characters long including the newline) are skipped.

    For every other line a temporary citekey `AUTHOR_YEAR` is built
    from the text before the first comma and the year given in
    parentheses: four digits starting with 1 or 20, optionally followed
    by a lowercase letter, or a word starting with `forth`.  Lines
    that do not fit this pattern are stored under the keys
    `None_001`, `None_002`, and so on.

    If two references produce the same temporary citekey, the later one
    replaces the earlier one and a warning is logged.

    Return a dictionary mapping each temporary citekey to the full
    citation line, including its trailing newline.
    """

    # The year is captured by the named group itself, so author and
    # year can be addressed by name rather than by group position.
    authoryear_pattern = re.compile(
        r"^(?P<author>.+?),.+?"
        r"\((?P<year>(?:1[0-9]|20)[0-9]{2}[a-z]?|forth.*?)\).*?$")
    tmp_citekey_dict = {}
    nonematcher = 1

    with open(formatted_bibliography, "r") as formbib:
        for citation in formbib:
            # The length test keeps the historic behaviour for very short
            # lines; the strip() test additionally drops lines that only
            # consist of whitespace.
            if len(citation) <= 2 or not citation.strip():
                logging.debug("Skipping blank line")
                continue

            logging.debug("Creating temporary citekey from %s.", citation)
            matches = authoryear_pattern.match(citation)
            if matches is None:
                tmp_citekey = "None_{:03d}".format(nonematcher)
                nonematcher += 1
            else:
                tmp_citekey = "{}_{}".format(matches.group("author"), matches.group("year"))

            if tmp_citekey in tmp_citekey_dict:
                # Same author and year twice: make the loss visible
                # instead of overwriting silently.
                logging.warning("Temporary citekey %s occurs more than once; keeping the last entry.", tmp_citekey)
            tmp_citekey_dict[tmp_citekey] = citation

    return tmp_citekey_dict
# def tmp_citations ends here

def fix_bibentry(bib_entries):
    """Interactively correct the entry type of preliminary BibTeX entries.

    `bib_entries` is a list of entry dictionaries (keys such as `ID`,
    `ENTRYTYPE` and `comments`).  The dictionaries are modified in
    place.

    For every entry that carries a formatted citation in its
    `comments` field, the citation is shown together with the current
    entry type and the user may choose another type by its one-letter
    shortcut.  Pressing return keeps the current type; an unknown
    shortcut is reported and keeps it as well.  Entries without a
    formatted citation are passed through without a prompt.

    All entries are finally written to `fixed.bib` inside `TMP_DIR`.
    """

    available_types = {"a" : "article", "i" : "incollection", "m" :
    "misc", "b" : "book", "k" : "booklet", "r" : "report", "t" :
    "thesis", "p" : "inproceedings", "n" : "newspaper"}

    db = BibDatabase()
    db_list = []

    for entry in bib_entries:
        # Normalise a singular `keyword` field to `keywords`.  The field
        # may be absent altogether (main() writes `keywords`), so it must
        # not be indexed directly.
        if entry.get("keyword"):
            entry["keywords"] = entry.pop("keyword")

        formatted_entry = entry.get("comments", "")
        if not formatted_entry:
            # Nothing to show the user, so there is nothing to decide.
            logging.info("Skipping %s", entry["ID"])
        else:
            print("{}\nEntry type is currently set to _{}_. This is the formatted entry:\n\n{}\n\nDo you want to change the entry type?".format("="*20, entry["ENTRYTYPE"], formatted_entry))
            for shortcut, type_name in available_types.items():
                print(shortcut, " -> ", type_name)
            entrytype = input("Your selection: ").strip()

            if entrytype in available_types:
                entry["ENTRYTYPE"] = available_types[entrytype]
            elif entrytype:
                # Empty input means "keep as is"; anything else is a typo.
                print("Wrong key. Continuing.")

        db_list.append(entry)

    # Make sure the output directory exists when this function is used
    # without main() having created it.
    os.makedirs(TMP_DIR, exist_ok=True)
    bibfile_path = os.path.join(TMP_DIR, "fixed.bib")

    db.entries = db_list

    writer = BibTexWriter()
    writer.indent = '    '     # indent entries with 4 spaces instead of one
    with open(bibfile_path, 'w') as fakebibfile:
        fakebibfile.write(writer.write(db))
    logging.debug("Fixed the entry types and wrote %s.", bibfile_path)
# def fix_bibentry ends here

def _load_citekeys(args, citekey_dict):
    """Return the list of citekeys to process, depending on the options."""
    # --makekeyfile is a store_true flag: it is False (never None) when
    # absent, so it has to be tested for truth.
    if args.makekeyfile:
        return list(citekey_dict)

    if args.citekeys is not None:
        with open(args.citekeys, "r") as keyfile:
            # Drop line endings and blank lines; a trailing newline would
            # otherwise end up in the year field of the entry.
            return [line.strip() for line in keyfile if line.strip()]

    # NOTE(review): unpickling can execute arbitrary code.  Only use a
    # data.pickle that was produced by your own workflow (presumably by
    # fix_tei.py, as the command line help says - confirm).
    with open(os.path.join(TMP_DIR, "data.pickle"), 'rb') as picklefile:
        data = pickle.load(picklefile)
    # Sorted so that the interactive session runs in a reproducible order.
    return sorted(set(data["citekey_not_in_bib"]))


def _select_candidate(candidates, citekey_dict):
    """Ask the user which candidate fits; return its key or None."""
    if not candidates:
        return None

    if len(candidates) == 1:
        sys.stdout.write("Found only one match: {}\n".format(citekey_dict[candidates[0]]))
        yesno = input("""Assign that entry to the found citekey? [Y/n] """)
        return candidates[0] if yesno.strip().lower() in ("y", "") else None

    sys.stdout.write("Found \n")
    for number, candidate in enumerate(candidates, start=1):
        # The citation still carries its own newline.
        sys.stdout.write("%s. %s" % (number, citekey_dict[candidate]))

    # Ask again on invalid input instead of crashing in int() or carrying
    # on with the values of the previous citekey.
    while True:
        answer = input("""Does one of those (1 - %s) fit?\nTo abort, press 'n': """ % (len(candidates), )).strip()
        if answer.lower() == 'n':
            return None
        try:
            choice = int(answer)
        except ValueError:
            choice = 0
        if 1 <= choice <= len(candidates):
            return candidates[choice - 1]
        print("Please enter a number between 1 and {} or 'n'.".format(len(candidates)))


def _make_entry(citekey, annotation, status_keyword, extra_keyword=None):
    """Build a rudimentary entry dictionary; ValueError on a malformed citekey."""
    keywords = status_keyword.split(", ")
    # An already present keyword is simply not added a second time; the
    # entry itself must not be dropped because of that.
    if extra_keyword is not None and extra_keyword not in keywords:
        keywords.append(extra_keyword)

    author, year = citekey.split("_")

    return {
        "keywords": ", ".join(keywords),
        "ENTRYTYPE": "book",
        "title": "Faketitle",
        "comments": annotation.rstrip(),
        "ID": citekey,
        "author": author,
        "year": year,
    }


def main():
    """Parse the command line and create a preliminary BibTeX file.

    The formatted bibliography given on the command line is turned into
    temporary citekeys.  The citekeys to be processed are

    - those temporary citekeys themselves (`--makekeyfile`),
    - read from a file with one citekey per line (`--citekeys`), or
    - otherwise taken from `data.pickle` in `TMP_DIR` (entry
      `citekey_not_in_bib`).

    Unless `--makekeyfile` is given, close matches among the temporary
    citekeys are looked up for each citekey and the user decides
    interactively which formatted reference belongs to it.  Matched
    entries get the keyword `FIXME`, all others `FILLMEIN`; the value
    of `--keyword` is added if given.

    The function writes `unassigned_formatted_entries.txt` and
    `prelim.bib` into `TMP_DIR` and prints a report.  With
    `--fix_entry`, `prelim.bib` is read back and its entries are
    handed to fix_bibentry().

    Exits with status 1 if a citekey does not have the form
    `AUTHOR_YEAR`.
    """

    parser = argparse.ArgumentParser()
    parser.add_argument("-k", "--keyword", help="Assign a keyword.")
    parser.add_argument("-f", "--fix_entry", help="In a second round, assign the correct entry type.", action="store_true")

    group = parser.add_mutually_exclusive_group()
    group.add_argument("-c", "--citekeys", help="Specify a file containing one citekey per line. Otherwise use data gathered by fix_tei.")
    group.add_argument("-m", "--makekeyfile", help="If there is no citekey file.", action="store_true")

    parser.add_argument("formattedbib", help="A text file containing a formatted bibliography.")
    args = parser.parse_args()

    os.makedirs(TMP_DIR, exist_ok=True)

    citekey_dict = tmp_citations(args.formattedbib)
    entries = _load_citekeys(args, citekey_dict)

    # Counted before the loop, because matched references are removed
    # from citekey_dict along the way.
    report = {
        "found_formatted": len(citekey_dict),
        "found_citekeys": len(entries),
    }

    removed_items = []
    unmatched_citekeys = []
    db_list = []

    for position, citekey in enumerate(entries, start=1):
        if args.makekeyfile:
            # Every formatted reference is its own citekey, no matching.
            annotation = citekey_dict[citekey]
            status_keyword = "FILLMEIN"
        else:
            sys.stdout.write("{} Citekey {} of {} {}\nLooking at {}.\n".format("="*10, position, len(entries), "="*10, citekey))
            candidates = difflib.get_close_matches(citekey, list(citekey_dict))
            if not candidates:
                sys.stdout.write("No candidate found in formatted bibliography for citekey %s.\n" % citekey)

            chosen_citekey = _select_candidate(candidates, citekey_dict)
            if chosen_citekey is None:
                unmatched_citekeys.append(citekey)
                annotation = "No candidate found in formatted bibliography"
                status_keyword = "FILLMEIN"
            else:
                # A reference can only be assigned once.
                annotation = citekey_dict.pop(chosen_citekey)
                removed_items.append(annotation)
                status_keyword = "FIXME"

            print("Removed items: {}".format(len(removed_items)))

        try:
            db_list.append(_make_entry(citekey, annotation, status_keyword, args.keyword))
        except ValueError:
            print("Could not split %s. Please fix." % citekey)
            sys.exit(1)

    unassigned_citations = [x for x in citekey_dict.values() if x not in removed_items]

    report["unmatched"] = unmatched_citekeys
    report["formatted_removed"] = removed_items
    report["formatted_notremoved"] = unassigned_citations

    formatted_not_found_path = os.path.join(TMP_DIR, "unassigned_formatted_entries.txt")
    with open(formatted_not_found_path, 'w') as unassigned:
        unassigned.write("\n".join(unassigned_citations))

    bibfile_path = os.path.join(TMP_DIR, "prelim.bib")

    write_report(report)

    db = BibDatabase()
    db.entries = db_list

    writer = BibTexWriter()
    writer.indent = '    '     # indent entries with 4 spaces instead of one
    with open(bibfile_path, 'w') as fakebibfile:
        fakebibfile.write(writer.write(db))
    logging.debug("Wrote a preliminary bibtex file")

    if args.fix_entry:
        with open(bibfile_path) as btf:
            btb = bibtexparser.load(btf)
        fix_bibentry(btb.entries)
    else:
        logging.debug("Finished")
# def main ends here

if __name__ == '__main__':
    main()
# finis
	#!/usr/bin/env python3
	# -*- coding: utf-8; mode: python -*-

	"""A helper script that creates a temporary bibtex file from a
	formatted list of references.

	In cases where authors hand in a formatted version of the bibliography
	(rather than a reference database), this tool can help to create a
	database in BibTeX format.

	It is being used in conjunction with the DocX workflow where citekeys
	are already present in the source file. The script receives a list of
	formatted references as input (one entry per line). Optionally, a
	keyword can be assigned, for example the name of the chapter author,
	or the entry type can be fixed.

	We require authors to use shortcuts in their docx manuscript when
	citing, including the use of a citekey (`LASTNAME_YEAR`), so there
	should already be citekeys in the manuscript. When running
	`fix_tei.py`, these citekeys are gathered together and can be used as
	an input to this tool.

	The tool creates temporary citekeys out of the formatted bibliography
	and in an interactive session, the user selects the most likely entry.
	With this, rudimentary entries can be created.
	"""

	__version__ = "1.0"
	__date__ = "20180321"
	__author__ = "kthoden@mpiwg-berlin.mpg.de"

	import os
	import sys
	import argparse
	import logging
	import re
	import pickle
	import difflib
	import libeoaconvert
	import bibtexparser
	from bibtexparser.bwriter import BibTexWriter
	from bibtexparser.bibdatabase import BibDatabase

	# Configure the root logger: show everything from DEBUG upwards, each
	# message prefixed with a timestamp and its level name.
	logging.basicConfig(level=logging.DEBUG, format=' %(asctime)s - %(levelname)s - %(message)s')

	# Directory for intermediate and output files.  The path is relative, so
	# it is resolved against the current working directory; it contains no
	# "~", hence expanduser() returns it unchanged.
	TMP_DIR = os.path.expanduser("tmp_files")

	def write_report(report):
	    """Print the statistics of a matching run to standard output.

	    `report` is a dictionary providing the counts `found_formatted`
	    and `found_citekeys` as well as the two lists `unmatched` and
	    `formatted_notremoved`.  A non-empty list is announced with its
	    length and then printed item by item; an empty list yields a
	    one-line success message instead.
	    """

	    heavy_rule = "=" * 60
	    margin = ' ' * 4

	    print(heavy_rule)
	    print(margin, "Report")
	    print("-" * 60)

	    print("Number of entries in the formatted bibliography: {}".format(report["found_formatted"]))
	    print("Number of unique citekeys in the publication: {}".format(report["found_citekeys"]))

	    # Citekeys from the publication that found no partner.
	    unmatched = report["unmatched"]
	    if not unmatched:
	        print("All citekeys found in the publication could be linked successfully.")
	    else:
	        how_many = len(unmatched)
	        print("{} {} found in the publication could not be matched to entries from the formatted bibliography.".format(how_many, libeoaconvert.plural(how_many, "citekey")))
	        for citekey in unmatched:
	            print(margin, citekey)

	    # Formatted references that nobody claimed.
	    leftover = report["formatted_notremoved"]
	    if not leftover:
	        print("All entries from the formatted bibliography could be matched to a citekey.")
	    else:
	        how_many = len(leftover)
	        print("{} {} in the formatted bibliography could not be matched to a citekey found in the publication:".format(how_many, libeoaconvert.plural(how_many, "citation")))
	        for citation in leftover:
	            print(margin, citation)

	    print(heavy_rule)
	# def write_report ends here

	def tmp_citations(formatted_bibliography):
	    """Create temporary citekeys out of a formatted bibliography.

	    The file `formatted_bibliography` is read line by line, one
	    formatted reference per line.  Blank lines (whitespace only, or at
	    most two characters long including the newline) are skipped.

	    For every other line a temporary citekey `AUTHOR_YEAR` is built
	    from the text before the first comma and the year given in
	    parentheses: four digits starting with 1 or 20, optionally followed
	    by a lowercase letter, or a word starting with `forth`.  Lines
	    that do not fit this pattern are stored under the keys
	    `None_001`, `None_002`, and so on.

	    If two references produce the same temporary citekey, the later one
	    replaces the earlier one and a warning is logged.

	    Return a dictionary mapping each temporary citekey to the full
	    citation line, including its trailing newline.
	    """

	    # The year is captured by the named group itself, so author and
	    # year can be addressed by name rather than by group position.
	    authoryear_pattern = re.compile(
	        r"^(?P<author>.+?),.+?"
	        r"\((?P<year>(?:1[0-9]|20)[0-9]{2}[a-z]?|forth.*?)\).*?$")
	    tmp_citekey_dict = {}
	    nonematcher = 1

	    with open(formatted_bibliography, "r") as formbib:
	        for citation in formbib:
	            # The length test keeps the historic behaviour for very short
	            # lines; the strip() test additionally drops lines that only
	            # consist of whitespace.
	            if len(citation) <= 2 or not citation.strip():
	                logging.debug("Skipping blank line")
	                continue

	            logging.debug("Creating temporary citekey from %s.", citation)
	            matches = authoryear_pattern.match(citation)
	            if matches is None:
	                tmp_citekey = "None_{:03d}".format(nonematcher)
	                nonematcher += 1
	            else:
	                tmp_citekey = "{}_{}".format(matches.group("author"), matches.group("year"))

	            if tmp_citekey in tmp_citekey_dict:
	                # Same author and year twice: make the loss visible
	                # instead of overwriting silently.
	                logging.warning("Temporary citekey %s occurs more than once; keeping the last entry.", tmp_citekey)
	            tmp_citekey_dict[tmp_citekey] = citation

	    return tmp_citekey_dict
	# def tmp_citations ends here

	def fix_bibentry(bib_entries):
	    """Interactively correct the entry type of preliminary BibTeX entries.

	    `bib_entries` is a list of entry dictionaries (keys such as `ID`,
	    `ENTRYTYPE` and `comments`).  The dictionaries are modified in
	    place.

	    For every entry that carries a formatted citation in its
	    `comments` field, the citation is shown together with the current
	    entry type and the user may choose another type by its one-letter
	    shortcut.  Pressing return keeps the current type; an unknown
	    shortcut is reported and keeps it as well.  Entries without a
	    formatted citation are passed through without a prompt.

	    All entries are finally written to `fixed.bib` inside `TMP_DIR`.
	    """

	    available_types = {"a" : "article", "i" : "incollection", "m" :
	    "misc", "b" : "book", "k" : "booklet", "r" : "report", "t" :
	    "thesis", "p" : "inproceedings", "n" : "newspaper"}

	    db = BibDatabase()
	    db_list = []

	    for entry in bib_entries:
	        # Normalise a singular `keyword` field to `keywords`.  The field
	        # may be absent altogether (main() writes `keywords`), so it must
	        # not be indexed directly.
	        if entry.get("keyword"):
	            entry["keywords"] = entry.pop("keyword")

	        formatted_entry = entry.get("comments", "")
	        if not formatted_entry:
	            # Nothing to show the user, so there is nothing to decide.
	            logging.info("Skipping %s", entry["ID"])
	        else:
	            print("{}\nEntry type is currently set to _{}_. This is the formatted entry:\n\n{}\n\nDo you want to change the entry type?".format("="*20, entry["ENTRYTYPE"], formatted_entry))
	            for shortcut, type_name in available_types.items():
	                print(shortcut, " -> ", type_name)
	            entrytype = input("Your selection: ").strip()

	            if entrytype in available_types:
	                entry["ENTRYTYPE"] = available_types[entrytype]
	            elif entrytype:
	                # Empty input means "keep as is"; anything else is a typo.
	                print("Wrong key. Continuing.")

	        db_list.append(entry)

	    # Make sure the output directory exists when this function is used
	    # without main() having created it.
	    os.makedirs(TMP_DIR, exist_ok=True)
	    bibfile_path = os.path.join(TMP_DIR, "fixed.bib")

	    db.entries = db_list

	    writer = BibTexWriter()
	    writer.indent = '    '     # indent entries with 4 spaces instead of one
	    with open(bibfile_path, 'w') as fakebibfile:
	        fakebibfile.write(writer.write(db))
	    logging.debug("Fixed the entry types and wrote %s.", bibfile_path)
	# def fix_bibentry ends here

	def _load_citekeys(args, citekey_dict):
	    """Return the list of citekeys to process, depending on the options."""
	    # --makekeyfile is a store_true flag: it is False (never None) when
	    # absent, so it has to be tested for truth.
	    if args.makekeyfile:
	        return list(citekey_dict)

	    if args.citekeys is not None:
	        with open(args.citekeys, "r") as keyfile:
	            # Drop line endings and blank lines; a trailing newline would
	            # otherwise end up in the year field of the entry.
	            return [line.strip() for line in keyfile if line.strip()]

	    # NOTE(review): unpickling can execute arbitrary code.  Only use a
	    # data.pickle that was produced by your own workflow (presumably by
	    # fix_tei.py, as the command line help says - confirm).
	    with open(os.path.join(TMP_DIR, "data.pickle"), 'rb') as picklefile:
	        data = pickle.load(picklefile)
	    # Sorted so that the interactive session runs in a reproducible order.
	    return sorted(set(data["citekey_not_in_bib"]))


	def _select_candidate(candidates, citekey_dict):
	    """Ask the user which candidate fits; return its key or None."""
	    if not candidates:
	        return None

	    if len(candidates) == 1:
	        sys.stdout.write("Found only one match: {}\n".format(citekey_dict[candidates[0]]))
	        yesno = input("""Assign that entry to the found citekey? [Y/n] """)
	        return candidates[0] if yesno.strip().lower() in ("y", "") else None

	    sys.stdout.write("Found \n")
	    for number, candidate in enumerate(candidates, start=1):
	        # The citation still carries its own newline.
	        sys.stdout.write("%s. %s" % (number, citekey_dict[candidate]))

	    # Ask again on invalid input instead of crashing in int() or carrying
	    # on with the values of the previous citekey.
	    while True:
	        answer = input("""Does one of those (1 - %s) fit?\nTo abort, press 'n': """ % (len(candidates), )).strip()
	        if answer.lower() == 'n':
	            return None
	        try:
	            choice = int(answer)
	        except ValueError:
	            choice = 0
	        if 1 <= choice <= len(candidates):
	            return candidates[choice - 1]
	        print("Please enter a number between 1 and {} or 'n'.".format(len(candidates)))


	def _make_entry(citekey, annotation, status_keyword, extra_keyword=None):
	    """Build a rudimentary entry dictionary; ValueError on a malformed citekey."""
	    keywords = status_keyword.split(", ")
	    # An already present keyword is simply not added a second time; the
	    # entry itself must not be dropped because of that.
	    if extra_keyword is not None and extra_keyword not in keywords:
	        keywords.append(extra_keyword)

	    author, year = citekey.split("_")

	    return {
	        "keywords": ", ".join(keywords),
	        "ENTRYTYPE": "book",
	        "title": "Faketitle",
	        "comments": annotation.rstrip(),
	        "ID": citekey,
	        "author": author,
	        "year": year,
	    }


	def main():
	    """Parse the command line and create a preliminary BibTeX file.

	    The formatted bibliography given on the command line is turned into
	    temporary citekeys.  The citekeys to be processed are

	    - those temporary citekeys themselves (`--makekeyfile`),
	    - read from a file with one citekey per line (`--citekeys`), or
	    - otherwise taken from `data.pickle` in `TMP_DIR` (entry
	      `citekey_not_in_bib`).

	    Unless `--makekeyfile` is given, close matches among the temporary
	    citekeys are looked up for each citekey and the user decides
	    interactively which formatted reference belongs to it.  Matched
	    entries get the keyword `FIXME`, all others `FILLMEIN`; the value
	    of `--keyword` is added if given.

	    The function writes `unassigned_formatted_entries.txt` and
	    `prelim.bib` into `TMP_DIR` and prints a report.  With
	    `--fix_entry`, `prelim.bib` is read back and its entries are
	    handed to fix_bibentry().

	    Exits with status 1 if a citekey does not have the form
	    `AUTHOR_YEAR`.
	    """

	    parser = argparse.ArgumentParser()
	    parser.add_argument("-k", "--keyword", help="Assign a keyword.")
	    parser.add_argument("-f", "--fix_entry", help="In a second round, assign the correct entry type.", action="store_true")

	    group = parser.add_mutually_exclusive_group()
	    group.add_argument("-c", "--citekeys", help="Specify a file containing one citekey per line. Otherwise use data gathered by fix_tei.")
	    group.add_argument("-m", "--makekeyfile", help="If there is no citekey file.", action="store_true")

	    parser.add_argument("formattedbib", help="A text file containing a formatted bibliography.")
	    args = parser.parse_args()

	    os.makedirs(TMP_DIR, exist_ok=True)

	    citekey_dict = tmp_citations(args.formattedbib)
	    entries = _load_citekeys(args, citekey_dict)

	    # Counted before the loop, because matched references are removed
	    # from citekey_dict along the way.
	    report = {
	        "found_formatted": len(citekey_dict),
	        "found_citekeys": len(entries),
	    }

	    removed_items = []
	    unmatched_citekeys = []
	    db_list = []

	    for position, citekey in enumerate(entries, start=1):
	        if args.makekeyfile:
	            # Every formatted reference is its own citekey, no matching.
	            annotation = citekey_dict[citekey]
	            status_keyword = "FILLMEIN"
	        else:
	            sys.stdout.write("{} Citekey {} of {} {}\nLooking at {}.\n".format("="*10, position, len(entries), "="*10, citekey))
	            candidates = difflib.get_close_matches(citekey, list(citekey_dict))
	            if not candidates:
	                sys.stdout.write("No candidate found in formatted bibliography for citekey %s.\n" % citekey)

	            chosen_citekey = _select_candidate(candidates, citekey_dict)
	            if chosen_citekey is None:
	                unmatched_citekeys.append(citekey)
	                annotation = "No candidate found in formatted bibliography"
	                status_keyword = "FILLMEIN"
	            else:
	                # A reference can only be assigned once.
	                annotation = citekey_dict.pop(chosen_citekey)
	                removed_items.append(annotation)
	                status_keyword = "FIXME"

	            print("Removed items: {}".format(len(removed_items)))

	        try:
	            db_list.append(_make_entry(citekey, annotation, status_keyword, args.keyword))
	        except ValueError:
	            print("Could not split %s. Please fix." % citekey)
	            sys.exit(1)

	    unassigned_citations = [x for x in citekey_dict.values() if x not in removed_items]

	    report["unmatched"] = unmatched_citekeys
	    report["formatted_removed"] = removed_items
	    report["formatted_notremoved"] = unassigned_citations

	    formatted_not_found_path = os.path.join(TMP_DIR, "unassigned_formatted_entries.txt")
	    with open(formatted_not_found_path, 'w') as unassigned:
	        unassigned.write("\n".join(unassigned_citations))

	    bibfile_path = os.path.join(TMP_DIR, "prelim.bib")

	    write_report(report)

	    db = BibDatabase()
	    db.entries = db_list

	    writer = BibTexWriter()
	    writer.indent = '    '     # indent entries with 4 spaces instead of one
	    with open(bibfile_path, 'w') as fakebibfile:
	        fakebibfile.write(writer.write(db))
	    logging.debug("Wrote a preliminary bibtex file")

	    if args.fix_entry:
	        with open(bibfile_path) as btf:
	            btb = bibtexparser.load(btf)
	        fix_bibentry(btb.entries)
	    else:
	        logging.debug("Finished")
	# def main ends here

	if __name__ == '__main__':
	main()
	# finis