#!/usr/bin/env python3
# -*- coding: utf-8; mode: python -*-

"""Replace Zotero citation processing instructions in TEI by bibl elements.

Using a Zotero database for publications and their word plugin,
users don't need to put citekey shorthands into the word
manuscripts. They can use the functionality of the plugin, which also
allows citing only a year, adding page ranges and, if needed, adding a
prefix or a suffix.

The TEI converter renders the Zotero data as a processing instruction
that contains JSON code. This module parses out the relevant data and
replaces the JSON with a TEI element.

One last step to overcome is the citekey. When exporting the Zotero
database to bibtex, a citekey is created (following the rules in
https://github.com/zotero/translators/blob/master/BibTeX.js, see also
https://tex.stackexchange.com/questions/398521/custom-citation-keys-in-programs-like-zotero-or-mendeley/398749).
While this is present in the exported bibtex data, it is not present
in the JSON file.

The solution is a Zotero translation server
(https://github.com/zotero/translation-server, also described in
https://forums.zotero.org/discussion/73694/is-there-a-zbib-api-that-returns-bibtex-entry-string).
This is a translation service (can be run locally) that can handle a
Zotero API JSON and is able to export into biblatex.

Although BetterBibTeX
(https://github.com/retorquere/zotero-better-bibtex) allows for the
creation of unique citekeys across the whole Zotero database, the JSON
export does not contain this citekey.

"""

__version__ = "1.0"
__date__ = "20190607"
__author__ = "kthoden@mpiwg-berlin.mpg.de"

import argparse
import json
import logging
import os
import sys

import bibtexparser
import requests
from lxml import etree

import utils.libeoaconvert as libeoaconvert

logging.basicConfig(level=logging.DEBUG, format=' %(asctime)s - %(levelname)s - %(message)s')

ns_tei = "http://www.tei-c.org/ns/1.0"
NS_MAP = {"t": ns_tei}
ZOTERO_CSL_STRING = "ADDIN ZOTERO_ITEM CSL_CITATION"
CITATION_SEPARATOR = ";"
TRANSLATOR_URL = "http://127.0.0.1:1969"
# Seconds to wait for the translation server; without a timeout
# requests.post() can block forever on an unresponsive service.
TRANSLATOR_TIMEOUT = 30
BIBTEX_FILE = "bibliography.bib"


def find_citation_pis(xmltree):
    """Find the Zotero citation processing instructions in an XML tree.

    All processing instructions with the target 'biblio' are collected;
    only those whose text starts with ZOTERO_CSL_STRING are returned.
    """

    biblio_pis = xmltree.xpath("//processing-instruction('biblio')")
    logging.info(f"Found {libeoaconvert.plural(len(biblio_pis), 'processing instruction')} called 'biblio'.")

    # retain only citations; a processing instruction without content
    # has no text to inspect and is skipped
    citation_pis = [
        pi for pi in biblio_pis
        if pi.text and pi.text.startswith(ZOTERO_CSL_STRING)
    ]
    logging.info(f"Found {libeoaconvert.plural(len(citation_pis), 'zotero citation')}.")

    return citation_pis
# def find_citation_pis ends here


def parse_citation_pi(pi_text):
    """Return the JSON part of a Zotero processing instruction's text.

    The text consists of the marker string ZOTERO_CSL_STRING followed
    by JSON code. There can be more than one citation in the JSON's
    citationItems, e.g. for a citation rendered as
    (Bulatovic et al. 2016).
    """

    # Strip the marker once only, so that an identical string inside
    # the JSON payload is left untouched.
    json_part = pi_text.replace(ZOTERO_CSL_STRING, "", 1)

    return json_part.strip()
# def parse_citation_pi ends here


def parse_json(citation_json):
    """Parse string into JSON object"""

    return json.loads(citation_json)
# def parse_json ends here


def _first_uri(citation_item):
    """Return the first Zotero URI stored in a citation item.

    Raises ValueError if the item carries no URI.
    """

    # NOTE(review): the original code only read "uri"; some Zotero
    # versions appear to store the same list under "uris" - confirm.
    uris = citation_item.get("uri") or citation_item.get("uris")
    if not uris:
        raise ValueError("Citation item does not contain a Zotero URI.")

    return uris[0]
# def _first_uri ends here


def get_info_from_json(parsed_json, item_index=0):
    """Extract the relevant parts from JSON object.

    parsed_json is the parsed content of one processing instruction,
    item_index selects the entry of its citationItems list that the
    item-specific values (page range, prefix, suffix, URI, year) are
    taken from. It defaults to the first item.

    Returns a dictionary with the keys formatted, pagerange, prefix,
    suffix, zotero_url and year. The year is None if the item has no
    usable issue date.
    """

    properties = parsed_json.get("properties") or {}
    item = parsed_json.get("citationItems")[item_index]
    itemdata = item.get("itemData") or {}
    issued = itemdata.get("issued") or {}

    try:
        year = issued["date-parts"][0][0]
    except (KeyError, IndexError):
        # dates that could not be parsed are kept as a literal string
        year = issued.get("literal")

    citation_dict = {
        "formatted": properties.get("plainCitation"),
        "pagerange": item.get("locator"),
        "prefix": item.get("prefix"),
        "suffix": item.get("suffix"),
        "zotero_url": _first_uri(item),
        "year": year,
    }

    return citation_dict
# def get_info_from_json ends here


def convert_to_csl_json(items_dict):
    """Write the item data of a citation item to a CSL JSON file.

    JSON data from a citation item in the processing instruction needs
    to be modified to the CSL JSON format. Currently, this JSON is
    written to a temporary file in the current working directory. The
    name of that file is returned; removing it is up to the caller.

    The data consists of the itemData block, with the ID being the
    URI:

    {"id": "http://zotero.org/users/915539/items/4W8TZXCQ",
     "type": "paper-conference",
     "title": "Discussion on Radiation",
     "container-title": "Report of the Eighty-Third Meeting of the
         British Association for the Advancement of Science.
         Birmingham: 1913, September 10-17",
     "publisher": "John Murray",
     "publisher-place": "London",
     "page": "376–386",
     "event-place": "London",
     "author": [ { "literal": "Anonymous" } ],
     "issued": { "date-parts": [ [ "1914" ] ] }}
    """

    zotero_id = _first_uri(items_dict)

    # work on a copy so that the caller's parsed JSON stays unchanged
    itemdata = dict(items_dict.get("itemData"))
    itemdata["id"] = zotero_id

    # the last path segment of the URI is the Zotero item key
    file_id = zotero_id.split("/")[-1]
    tmp_filename = f"{file_id}.json"

    with open(tmp_filename, "w", encoding="utf-8") as data_file:
        json.dump(itemdata, data_file, indent=2)

    logging.debug(f"Wrote {tmp_filename}.")

    return tmp_filename
# def convert_to_csl_json ends here


def _post_to_translator(endpoint, **request_kwargs):
    """POST to an endpoint of the translation server, return the response.

    The script is terminated with exit status 1 if the server cannot
    be reached or answers with an HTTP error status.
    """

    url = f"{TRANSLATOR_URL}/{endpoint}"
    logging.debug(f"Trying to communicate with {TRANSLATOR_URL}")

    try:
        response = requests.post(url, timeout=TRANSLATOR_TIMEOUT, **request_kwargs)
    except requests.exceptions.RequestException:
        logging.error(f"No connection possible to {TRANSLATOR_URL}. Maybe the translation service is down? Exiting.")
        sys.exit(1)

    if not response.ok:
        logging.error(f"Translation service answered with status {response.status_code} for {url}. Exiting.")
        sys.exit(1)

    return response
# def _post_to_translator ends here


def import_csl_json(csl_json):
    """Convert CSL JSON to Zotero API JSON

    csl_json is the name of a file containing CSL JSON. Its content is
    sent to the import endpoint of the translation server and the raw
    response body (bytes) is returned.

    The format looks like this:

    [ { "key": "PXKZK2WF",
        "version": 0,
        "itemType": "conferencePaper",
        "creators": [ { "name": "Anonymous", "creatorType": "author" } ],
        "tags": [],
        "title": "Discussion on Radiation",
        "proceedingsTitle": "Report of the Eighty-Third Meeting of the
            British Association for the Advancement of Science.
            Birmingham: 1913, September 10-17",
        "publisher": "John Murray",
        "place": "London",
        "pages": "376–386",
        "date": "1914" }]
    """

    # curl --data-binary @cslstylefile.json -H 'Content-Type: text/plain' 'http://127.0.0.1:1969/import'

    headers = {'Content-Type': 'text/plain'}
    with open(csl_json, 'rb') as csl_file:
        data = csl_file.read()

    response = _post_to_translator("import", headers=headers, data=data)

    return response.content
# def import_csl_json ends here


def create_bibtex(zotero_api_json):
    """Get bibtex entry from Zotero translation server

    zotero_api_json is the Zotero API JSON as returned by
    import_csl_json. The entry is requested in biblatex format and
    returned as a string.
    """

    # translation made by https://curl.trillworks.com/
    # source: curl -d @items.json -H 'Content-Type: application/json' 'http://127.0.0.1:1969/export?format=bibtex'

    headers = {'Content-Type': 'application/json'}
    params = (('format', 'biblatex'),)

    response = _post_to_translator("export", headers=headers, params=params, data=zotero_api_json)

    return response.content.decode('utf-8')
# def create_bibtex ends here


def write_to_bibfile(bibtex_entry):
    """Append entry to bibfile"""

    # explicit encoding: entries regularly contain non-ASCII characters
    with open(BIBTEX_FILE, "a", encoding="utf-8") as bibfile:
        bibfile.write(bibtex_entry)
# def write_to_bibfile ends here


def get_citekey(bibtex_entry):
    """Parse bibtex entry for citekey

    Raises ValueError if no entry can be parsed from the string.
    """

    parser = bibtexparser.bparser.BibTexParser()
    # be a bit lax about nonstandard entry types
    parser.ignore_nonstandard_types = False

    parsed_entry = bibtexparser.loads(bibtex_entry, parser=parser)
    if not parsed_entry.entries:
        raise ValueError("No bibtex entry found in the data returned by the translation server.")

    return parsed_entry.entries[0]["ID"]
# def get_citekey ends here


def create_citation_element(citation_dict, total_items, index_item):
    """Create an XML element with zotero data

    citation_dict is a dictionary as returned by get_info_from_json,
    optionally extended by the key citekey. total_items is the number
    of items in the citation and index_item the position of the
    current one.

    Returns a bibl element that contains a ref element and, if a page
    range is given, a citedRange element inside of it.
    """

    whole_citation = citation_dict['formatted'] or ""
    formatted_citation = whole_citation

    bibl = etree.Element("bibl")

    if total_items > 1:
        parts = whole_citation.split(CITATION_SEPARATOR)
        # fall back to the complete string if the plain citation has
        # fewer separators than the citation has items
        if index_item < len(parts):
            formatted_citation = parts[index_item]
        if index_item + 1 != total_items:
            bibl.tail = CITATION_SEPARATOR

    bibl.text = formatted_citation
    ref = etree.Element("ref")
    bibl.insert(0, ref)

    # add year or authoryear, need a good heuristic here: the item is
    # a year-only citation if nothing but the year remains after
    # removing the surrounding parentheses
    bare_citation = formatted_citation.strip().strip("()").strip()
    if bare_citation == str(citation_dict['year']):
        ref.set("type", "year")
    else:
        ref.set("type", "authoryear")

    if citation_dict.get("citekey"):
        ref.set("url", f"#{citation_dict['citekey']}")
    else:
        ref.set("url", f"#{citation_dict['zotero_url']}")

    if citation_dict["pagerange"]:
        citedrange = etree.Element("citedRange")
        citedrange.text = citation_dict["pagerange"]
        ref.insert(0, citedrange)

    # TODO: prefix and suffix of the citation are not written yet

    return bibl
# def create_citation_element ends here


def citation_item_to_bibl(citation_item, parsed_json, citekey_list, number_of_items, item_position):
    """Wrapper function for citation items.

    This function makes use of the Zotero translation server
    (https://github.com/zotero/translation-server) for format
    conversion.

    The JSON found in each citationItem of the processing
    instruction is
    - modified to CSL JSON format
    - translated into Zotero API JSON format
    - exported to BibTeX

    A tei:bibl element is written that will replace the processing
    instruction in the XML. The BibTeX entry is appended to the bibfile
    unless its citekey is already contained in citekey_list; new
    citekeys are added to that list.

    Returns a tuple of the bibl element and the formatted citation
    string of the whole processing instruction.
    """

    citation_dict = get_info_from_json(parsed_json, item_position)
    citation_dict["id"] = parsed_json.get("citationID")

    # convert csl json to zotero api json
    csl_json_filename = convert_to_csl_json(citation_item)
    try:
        # first call to translation server
        zotero_api_json = import_csl_json(csl_json_filename)
    finally:
        # remove the temporary file even if the call aborts the script
        os.unlink(csl_json_filename)

    # second call to translation server
    bibtex_entry = create_bibtex(zotero_api_json)
    citekey = get_citekey(bibtex_entry)
    citation_dict["citekey"] = citekey

    if citekey not in citekey_list:
        citekey_list.append(citekey)
        write_to_bibfile(bibtex_entry)

    citation_element = create_citation_element(citation_dict, number_of_items, item_position)

    return citation_element, citation_dict['formatted']
# def citation_item_to_bibl ends here


def turn_pi_into_bibl(pi, citekey_list):
    """Wrapper function for the conversion steps.

    The processing instruction pi is replaced in its parent by a
    temporary tmp_bib element that contains one bibl element per
    citation item. The formatted citation is removed from the text
    following the processing instruction.
    """

    # wrap a temporary element around citations
    tmp_element = etree.Element("tmp_bib")
    parsed_json = parse_json(parse_citation_pi(pi.text))

    # there can be more than one citation in one processing instruction
    items = parsed_json.get("citationItems") or []
    number_of_items = len(items)
    logging.info(f"Found {libeoaconvert.plural(number_of_items, 'item')} in this zotero citation.")

    formatted_citation = ""
    # enumerate yields the real position, also for identical items
    for item_position, citation_item in enumerate(items):
        citation_element, formatted_citation = citation_item_to_bibl(
            citation_item, parsed_json, citekey_list, number_of_items, item_position)
        tmp_element.append(citation_element)

    # remove formatted citation from tail
    pi_tail = pi.tail or ""
    if formatted_citation:
        pi_tail = pi_tail.replace(formatted_citation, "", 1)
    tmp_element.tail = pi_tail

    # replace processing instruction with bibl elements
    parent_element = pi.getparent()
    if parent_element is None:
        # a processing instruction outside of the root element cannot
        # be replaced by an element
        logging.warning("Processing instruction has no parent element, leaving it in place.")
        return
    parent_element.replace(pi, tmp_element)
# def turn_pi_into_bibl ends here


def add_bib_to_header(xmltree, BIBTEX_FILE):
    """Add a reference to bibfile to header

    An ab element of type database containing a ref element that
    points to BIBTEX_FILE is appended to the first sourceDesc element
    of the TEI header.
    """

    sourcedesc = xmltree.xpath("/t:TEI/t:teiHeader/t:fileDesc/t:sourceDesc", namespaces=NS_MAP)[0]

    ab_element = etree.Element("ab", type="database")
    ref = etree.SubElement(ab_element, "ref")
    ref.set("target", BIBTEX_FILE)
    # placeholder value, to be adjusted manually in the resulting file
    ref.set("type", "please-specify-anthology-or-monograph")

    sourcedesc.append(ab_element)
# def add_bib_to_header ends here


def cleanup_xml(xmltree):
    """Perform some cleanups

    The temporary tmp_bib wrappers are dissolved (their content is
    kept) and the rend attribute is removed from all elements where it
    has the value Literaturverzeichnis1.
    """

    etree.strip_tags(xmltree, "tmp_bib")

    for element in xmltree.xpath("//*[@rend='Literaturverzeichnis1']"):
        element.attrib.pop("rend")
# def cleanup_xml ends here


def write_xml_output(tree, filename):
    """Write modified tree to file"""

    tree.write(filename, pretty_print=True, xml_declaration=True, encoding="utf-8")
    logging.info(f"Wrote {filename}.")
# def write_xml_output ends here


def _output_filename(xmlfile):
    """Derive the name of the output file from the input file name.

    '-biblrefs' is inserted in front of the file extension. An input
    name without extension gets '.xml' appended, so the result never
    equals the input name.
    """

    root, extension = os.path.splitext(xmlfile)

    return f"{root}-biblrefs{extension or '.xml'}"
# def _output_filename ends here


def main():
    """The main bit"""

    parser = argparse.ArgumentParser()
    parser.add_argument("xmlfile", help="XML file converted from Word, containing Zotero citations.")
    args = parser.parse_args()

    xmltree = etree.parse(args.xmlfile)
    citation_pis = find_citation_pis(xmltree)
    citekey_list = []

    for pi in citation_pis:
        turn_pi_into_bibl(pi, citekey_list)

    cleanup_xml(xmltree)
    add_bib_to_header(xmltree, BIBTEX_FILE)
    write_xml_output(xmltree, _output_filename(args.xmlfile))
# def main ends here


if __name__ == '__main__':
    main()
# finis

# TODO: prefix and suffix