New tool added: parsezotero

EditionOpenAccess · Jun 12, 2019 · 2113e98 · 2113e98
1 parent 6c738b8
commit 2113e98
Showing 1 changed file with 383 additions and 0 deletions.
diff --git a/parsezotero.py b/parsezotero.py
@@ -0,0 +1,383 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8; mode: python -*-
+
+"""Using a Zotero database for publications and its Word plugin,
+users don't need to put citekey shorthands into the Word
+manuscripts. They can use the functionality of the plugin, which also
+allows citing only a year, adding page ranges and, if needed, adding a
+prefix or a suffix.
+
+The TEI converter renders the Zotero data as a processing instruction
+that contains JSON code. This module parses out the relevant data and
+replaces the JSON with a TEI element.
+
+One last step to overcome is the citekey. When exporting the Zotero
+database to bibtex, a citekey is created (following the rules in
+https://github.com/zotero/translators/blob/master/BibTeX.js, see also
+https://tex.stackexchange.com/questions/398521/custom-citation-keys-in-programs-like-zotero-or-mendeley/398749).
+While this is present in the exported bibtex data, it is not present
+in the JSON file.
+
+The solution is a Zotero translation server
+(https://github.com/zotero/translation-server, also described in
+https://forums.zotero.org/discussion/73694/is-there-a-zbib-api-that-returns-bibtex-entry-string).
+This is a translation service (can be run locally) that can handle a
+Zotero API JSON and is able to export into biblatex.
+
+Although BetterBibTeX
+(https://github.com/retorquere/zotero-better-bibtex) allows for the
+creation of unique citekeys across the whole Zotero database, the JSON
+export does not contain this citekey.
+
+"""
+
+__version__ = "1.0"
+__date__ = "20190607"
+__author__ = "kthoden@mpiwg-berlin.mpg.de"
+
+import argparse
+import logging
+import json
+import os
+import sys
+import requests
+import bibtexparser
+from lxml import etree
+import utils.libeoaconvert as libeoaconvert
+
+# Configure the root logger when the module is loaded: DEBUG level,
+# messages prefixed with a timestamp and the level name.
+logging.basicConfig(level=logging.DEBUG, format=' %(asctime)s - %(levelname)s - %(message)s')
+
+# TEI namespace and the prefix map ("t") used for namespaced XPath queries.
+ns_tei = "http://www.tei-c.org/ns/1.0"
+NS_MAP = {"t" : ns_tei}
+# Text a 'biblio' processing instruction has to start with to count as a citation.
+ZOTERO_CSL_STRING = "ADDIN ZOTERO_ITEM CSL_CITATION"
+# Character that separates the items of a formatted multi-item citation.
+CITATION_SEPARATOR = ";"
+# Base URL of the Zotero translation server; '/import' and '/export' are appended.
+TRANSLATOR_URL = "http://127.0.0.1:1969"
+# File that the BibTeX entries are appended to.
+BIBTEX_FILE = "bibliography.bib"
+
+
+
+def find_citation_pis(xmltree):
+    """Collect the Zotero citations among the processing instructions.
+
+    All processing instructions with the target 'biblio' are looked
+    up in xmltree; those whose text starts with ZOTERO_CSL_STRING are
+    returned as a list. Both counts are logged.
+    """
+
+    candidates = xmltree.xpath("//processing-instruction('biblio')")
+    logging.info(f"Found {libeoaconvert.plural(len(candidates), 'processing instruction')} called 'biblio'.")
+
+    # keep the instructions that carry the citation marker
+    zotero_citations = []
+    for candidate in candidates:
+        if candidate.text.startswith(ZOTERO_CSL_STRING):
+            zotero_citations.append(candidate)
+    logging.info(f"Found {libeoaconvert.plural(len(zotero_citations), 'zotero citation')}.")
+
+    return zotero_citations
+# def find_citation_pis ends here
+
+
+def parse_citation_pi(pi_text):
+    """Parse the text of a Zotero processing instruction.
+
+    It contains a string and json code. There can be more than one
+    citation in citationItems:
+
+    <?biblio ADDIN ZOTERO_ITEM CSL_CITATION {"citationID":"xcRF1gM9","properties":{"formattedCitation":"(Bulatovic et al. 2016)","plainCitation":"(Bulatovic et al. 2016)","noteIndex":0},"citationItems":[{"id":7569,"uris":["http://zotero.org/users/915539/items/GBBVGF4J"],"uri":["http://zotero.org/users/915539/items/GBBVGF4J"],"itemData":{"id":7569,"type":"report","title":"Usability von DH-Tools und -Services (DARIAH2 R 1.2.3)","publisher-place":"Göttingen","page":"Deutsch","event-place":"Göttingen","URL":"https://wiki.de.dariah.eu/download/attachments/14651583/AP1.2.3_Usability_von_DH-Tools_und-Services_final.pdf","number":"1.2.3","author":[{"family":"Bulatovic","given":"Natasa"},{"family":"Gnadt","given":"Timo"},{"family":"Romanello","given":"Matteo"},{"family":"Schmitt","given":"Viola"},{"family":"Stiller","given":"Juliane"},{"family":"Thoden","given":"Klaus"}],"issued":{"date-parts":[["2016"]]}}}],"schema":"https://github.com/citation-style-language/schema/raw/master/csl-citation.json"}?>(Bulatovic et al. 2016) und
+
+    Returns the text without the ZOTERO_CSL_STRING marker and without
+    surrounding whitespace, i.e. the JSON code as a string.
+    """
+
+    # Remove the marker only once: a second occurrence would be part
+    # of the JSON data (e.g. of a title) and has to stay intact.
+    json_part = pi_text.replace(ZOTERO_CSL_STRING, "", 1)
+
+    return json_part.strip()
+# def parse_citation_pi ends here
+
+
+def parse_json(citation_json):
+    """Decode the JSON code of a citation and return the resulting object."""
+
+    return json.loads(citation_json)
+# def parse_json ends here
+
+
+def get_info_from_json(parsed_json):
+    """Extract the relevant parts from the JSON object of a citation.
+
+    Only the first entry of "citationItems" is evaluated.
+
+    Returns a dictionary with the keys "formatted" (plain citation
+    text), "pagerange" (locator), "prefix", "suffix", "zotero_url"
+    (first URI of the item) and "year". A value is None if the input
+    does not provide it.
+    """
+
+    citation_dict = {}
+
+    # every level may be absent in the input, fall back to empty data
+    properties = parsed_json.get("properties") or {}
+    items = (parsed_json.get("citationItems") or [{}])[0]
+    itemdata = items.get('itemData') or {}
+    issued = itemdata.get('issued') or {}
+    uris = items.get("uri") or items.get("uris") or [None]
+
+    # The date is either structured ("date-parts") or a free-form
+    # string ("literal" or "raw"); undated items have none of these.
+    try:
+        year = issued['date-parts'][0][0]
+    except (KeyError, IndexError, TypeError):
+        year = issued.get('literal', issued.get('raw'))
+
+    citation_dict["formatted"] = properties.get("plainCitation")
+    citation_dict["pagerange"] = items.get("locator")
+    citation_dict["prefix"] = items.get("prefix")
+    citation_dict["suffix"] = items.get("suffix")
+    citation_dict["zotero_url"] = uris[0]
+    citation_dict["year"] = year
+
+    return citation_dict
+# def get_info_from_json ends here
+
+
+def convert_to_csl_json(items_dict):
+    """Write the data of one citation item to a file as CSL JSON.
+
+    JSON data from a citation item in the processing instruction needs
+    to be modified to the CSL JSON format. Currently, this JSON is
+    written to a temporary file in the current working directory. The
+    file is named after the last path segment of the item's URI and
+    is not removed by this function.
+
+    The data consists of the itemData block, with the ID being the
+    URI. Example in list form (this function writes a single object):
+
+    [{"id": "http://zotero.org/users/915539/items/4W8TZXCQ", "type": "paper-conference", "title": "Discussion on Radiation", "container-title": "Report of the Eighty-Third Meeting of the British Association for the Advancement of Science. Birmingham: 1913, September 10-17", "publisher": "John Murray", "publisher-place": "London", "page": "376–386", "event-place": "London", "author": [ { "literal": "Anonymous" } ], "issued": { "date-parts": [ [ "1914" ] ] }}]
+
+    Returns the name of the file that was written.
+    """
+
+    zotero_id = items_dict.get("uri")[0]
+    # work on a copy, the data of the caller is left untouched
+    itemdata = dict(items_dict.get('itemData'))
+    itemdata["id"] = zotero_id
+
+    file_id = zotero_id.split("/")[-1]
+    tmp_filename = f"{file_id}.json"
+
+    # write itemdata to new file
+    with open(tmp_filename, "w", encoding="utf-8") as data_file:
+        json.dump(itemdata, data_file, indent=2)
+
+    logging.debug(f"Wrote {tmp_filename}.")
+
+    return tmp_filename
+# def convert_to_csl_json ends here
+
+
+def import_csl_json(csl_json):
+    """Convert CSL JSON to Zotero API JSON
+
+    The content of the file named csl_json is sent to the '/import'
+    endpoint of the translation server at TRANSLATOR_URL. The body of
+    the response is returned as bytes.
+
+    The format looks like this:
+
+    [ { "key": "PXKZK2WF", "version": 0, "itemType": "conferencePaper", "creators": [ { "name": "Anonymous", "creatorType": "author" } ], "tags": [], "title": "Discussion on Radiation", "proceedingsTitle": "Report of the Eighty-Third Meeting of the British Association for the Advancement of Science. Birmingham: 1913, September 10-17", "publisher": "John Murray", "place": "London", "pages": "376–386", "date": "1914" }]
+
+    The program is terminated with exit status 1 if the server cannot
+    be reached or answers with an HTTP error status.
+    """
+
+    # curl --data-binary @cslstylefile.json -H 'Content-Type: text/plain' 'http://127.0.0.1:1969/import'
+
+    headers = { 'Content-Type': 'text/plain', }
+    with open(csl_json, 'rb') as csl_file:
+        data = csl_file.read()
+    logging.debug(f"Trying to communicate with {TRANSLATOR_URL}")
+
+    try:
+        response = requests.post(f'{TRANSLATOR_URL}/import', headers=headers, data=data)
+    except requests.exceptions.RequestException:
+        logging.error(f"No connection possible to {TRANSLATOR_URL}. Maybe the translation service is down? Exiting.")
+        # a failure has to show in the exit status
+        sys.exit(1)
+
+    if not response.ok:
+        # an error page must not be passed on as if it were item data
+        logging.error(f"{TRANSLATOR_URL}/import answered with HTTP status {response.status_code}. Exiting.")
+        sys.exit(1)
+
+    return response.content
+# def import_csl_json ends here
+
+
+def create_bibtex(zotero_api_json):
+    """Get a biblatex entry from the Zotero translation server.
+
+    zotero_api_json is sent as request body to the '/export' endpoint
+    of the server at TRANSLATOR_URL, asking for the format 'biblatex'.
+    The body of the response is returned, decoded as UTF-8.
+
+    The program is terminated with exit status 1 if the server cannot
+    be reached or answers with an HTTP error status.
+    """
+
+    # translation made by https://curl.trillworks.com/
+    # source: curl -d @items.json -H 'Content-Type: application/json' 'http://127.0.0.1:1969/export?format=bibtex'
+    # (the format requested below is biblatex, not bibtex)
+
+    headers = { 'Content-Type': 'application/json', }
+    params = ( ('format', 'biblatex'), )
+    data = zotero_api_json
+    try:
+        response = requests.post(f'{TRANSLATOR_URL}/export', headers=headers, params=params, data=data)
+    except requests.exceptions.RequestException:
+        logging.error(f"No connection possible to {TRANSLATOR_URL}. Maybe the translation service is down? Exiting.")
+        # a failure has to show in the exit status
+        sys.exit(1)
+
+    if not response.ok:
+        # an error page must not end up in the bibliography file
+        logging.error(f"{TRANSLATOR_URL}/export answered with HTTP status {response.status_code}. Exiting.")
+        sys.exit(1)
+
+    bibtex = response.content.decode('utf-8')
+
+    return bibtex
+# def create_bibtex ends here
+
+
+def write_to_bibfile(bibtex_entry):
+    """Append entry to bibfile
+
+    bibtex_entry is written to the end of BIBTEX_FILE, which is
+    created if it does not exist.
+    """
+
+    # Fixed encoding: entries can contain non-ASCII characters and the
+    # default encoding of the platform is not necessarily UTF-8.
+    with open(BIBTEX_FILE, "a", encoding="utf-8") as bibfile:
+        bibfile.write(bibtex_entry)
+# def write_to_bibfile ends here
+
+
+def get_citekey(bibtex_entry):
+    """Return the citekey, i.e. the "ID" of the first entry in a bibtex string."""
+
+    # nonstandard entry types are not to be ignored by the parser
+    lenient_parser = bibtexparser.bparser.BibTexParser()
+    lenient_parser.ignore_nonstandard_types = False
+
+    bib_database = bibtexparser.loads(bibtex_entry, parser=lenient_parser)
+    first_entry = bib_database.entries[0]
+
+    return first_entry["ID"]
+# def get_citekey ends here
+
+
+def create_citation_element(citation_dict, total_items, index_item):
+    """Create an XML element with zotero data
+
+    citation_dict has to provide "formatted" and "year"; "citekey",
+    "pagerange" and "zotero_url" are used as far as they are present.
+    total_items is the number of items of the whole citation,
+    index_item the zero-based position of this item among them.
+
+    Returns a bibl element whose text is the formatted citation (for
+    citations with several items: the part belonging to this item)
+    and which contains a ref element pointing to the publication.
+    """
+
+    formatted = citation_dict['formatted']
+    bibl = etree.Element("bibl")
+
+    if total_items > 1:
+        parts = formatted.split(CITATION_SEPARATOR)
+        # Use the complete text if the formatted citation does not
+        # split into one part per item.
+        formatted_citation = parts[index_item] if index_item < len(parts) else formatted
+        if not index_item + 1 == total_items:
+            bibl.tail = CITATION_SEPARATOR
+    else:
+        formatted_citation = formatted
+
+    bibl.text = formatted_citation
+    ref = etree.Element("ref")
+    bibl.insert(0, ref)
+    # add year or authoryear, need a good heuristic here
+    if formatted == f"({citation_dict['year']})":
+        ref.set("type", "year")
+    else:
+        ref.set("type", "authoryear")
+
+    # without a citekey, the zotero URL identifies the publication
+    if citation_dict.get("citekey"):
+        ref.set("url", f"#{citation_dict['citekey']}")
+    else:
+        ref.set("url", f"#{citation_dict['zotero_url']}")
+    if citation_dict.get("pagerange"):
+        citedrange = etree.Element("citedRange")
+        citedrange.text = citation_dict["pagerange"]
+        ref.insert(0, citedrange)
+
+    # TODO: "prefix" and "suffix" of the citation are not rendered yet
+    return bibl
+# def create_citation_element ends here
+
+
+def citation_item_to_bibl(citation_item, parsed_json, citekey_list, number_of_items, item_position):
+    """Wrapper function for citation items.
+
+    This function makes use of the Zotero translation server
+    (https://github.com/zotero/translation-server) for format
+    conversion.
+
+    The JSON found in each citationItem of the
+    processing instruction is
+    - modified to CSL JSON format
+    - translated into Zotero API JSON format
+    - exported to BibTeX
+
+    The BibTeX entry is appended to the bibliography file unless its
+    citekey is already contained in citekey_list; new citekeys are
+    added to that list.
+
+    A bibl element is created that will replace the processing
+    instruction in the XML. It is returned together with the formatted
+    text of the whole citation.
+    """
+
+    # Evaluate the data of this very item: get_info_from_json reads
+    # the first entry of "citationItems" only.
+    item_json = dict(parsed_json)
+    item_json["citationItems"] = [citation_item]
+    citation_dict = get_info_from_json(item_json)
+    citation_dict["id"] = parsed_json.get("citationID")
+
+    # write csl json to a temporary file
+    csl_json_filename = convert_to_csl_json(citation_item)
+    try:
+        # first call to translation server: csl json to zotero api json
+        zotero_api_json = import_csl_json(csl_json_filename)
+    finally:
+        # the temporary file is also removed if the call ends the program
+        os.unlink(csl_json_filename)
+    # second call to translation server
+    bibtex_entry = create_bibtex(zotero_api_json)
+    citekey = get_citekey(bibtex_entry)
+    citation_dict["citekey"] = citekey
+    if citekey not in citekey_list:
+        citekey_list.append(citekey)
+        write_to_bibfile(bibtex_entry)
+
+    citation_element = create_citation_element(citation_dict, number_of_items, item_position)
+
+    return citation_element, citation_dict['formatted']
+# def citation_item_to_bibl ends here
+
+
+def turn_pi_into_bibl(pi, citekey_list):
+    """Wrapper function for the conversion steps.
+
+    The processing instruction pi is replaced in its parent by a
+    temporary tmp_bib element that contains one bibl element per
+    citation item. The formatted citation is removed from the text
+    following the processing instruction. citekey_list collects the
+    citekeys that were already written to the bibliography file.
+
+    A processing instruction without citation items is left unchanged.
+    """
+
+    pi_json = parse_citation_pi(pi.text)
+    parsed_json = parse_json(pi_json)
+    # there can be more than one citation in one processing instruction
+    items = parsed_json.get("citationItems") or []
+    number_of_items = len(items)
+    logging.info(f"Found {libeoaconvert.plural(number_of_items, 'item')} in this zotero citation.")
+    if not items:
+        logging.warning("Zotero citation without items, leaving the processing instruction unchanged.")
+        return
+
+    # wrap a temporary element around citations
+    tmp_element = etree.Element("tmp_bib")
+    # enumerate yields the true position, also for items that are equal
+    for item_position, citation_item in enumerate(items):
+        citation_element, formatted_citation = citation_item_to_bibl(citation_item, parsed_json, citekey_list, number_of_items, item_position)
+        tmp_element.append(citation_element)
+
+    # Remove formatted citation from tail. Only the first occurrence is
+    # the rendering of this citation; there may be no tail at all.
+    pi_tail = pi.tail
+    if pi_tail and formatted_citation:
+        pi_tail = pi_tail.replace(formatted_citation, "", 1)
+    tmp_element.tail = pi_tail
+    # replace processing instruction with bibl elements
+    parent_element = pi.getparent()
+    parent_element.replace(pi, tmp_element)
+# def turn_pi_into_bibl ends here
+
+
+def add_bib_to_header(xmltree, BIBTEX_FILE):
+    """Register the bibliography file in the header.
+
+    An ab element of type "database" with a ref child pointing to
+    BIBTEX_FILE is appended to the first sourceDesc of the TEI header.
+    """
+
+    database_block = etree.Element("ab", type="database")
+    bib_reference = etree.SubElement(database_block, "ref")
+    bib_reference.set("target", BIBTEX_FILE)
+    bib_reference.set("type", "please-specify-anthology-or-monograph")
+
+    source_descriptions = xmltree.xpath("/t:TEI/t:teiHeader/t:fileDesc/t:sourceDesc", namespaces=NS_MAP)
+    source_descriptions[0].append(database_block)
+# def add_bib_to_header ends here
+
+
+def cleanup_xml(xmltree):
+    """Tidy up the tree after the conversion.
+
+    The temporary tmp_bib tags are stripped and the rend attribute is
+    removed from all elements where it is 'Literaturverzeichnis1'.
+    """
+
+    etree.strip_tags(xmltree, "tmp_bib")
+    for marked_element in xmltree.xpath("//*[@rend='Literaturverzeichnis1']"):
+        marked_element.attrib.pop("rend")
+# def cleanup_xml ends here
+
+
+def write_xml_output(tree, filename):
+    """Serialise the modified tree to filename and log the file name.
+
+    The output is pretty printed, UTF-8 encoded and carries an XML
+    declaration.
+    """
+
+    serialisation_options = {
+        "pretty_print": True,
+        "xml_declaration": True,
+        "encoding": "utf-8",
+    }
+    tree.write(filename, **serialisation_options)
+    logging.info(f"Wrote {filename}.")
+# def write_xml_output ends here
+
+
+def main():
+    """The main bit
+
+    Reads the XML file given on the command line, replaces the Zotero
+    citations by bibl elements, adds a reference to the bibliography
+    file to the header and writes the result to a new file that has
+    '-biblrefs' inserted in front of the file extension.
+    """
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("xmlfile", help="XML file converted from Word, containing Zotero citations.")
+    args = parser.parse_args()
+
+    xmltree = etree.parse(args.xmlfile)
+    citation_pis = find_citation_pis(xmltree)
+    citekey_list = []
+
+    for pi in citation_pis:
+        turn_pi_into_bibl(pi, citekey_list)
+
+    cleanup_xml(xmltree)
+    add_bib_to_header(xmltree, BIBTEX_FILE)
+
+    # Split off the extension instead of replacing '.xml' anywhere in
+    # the path: for other extensions the output would get the name of
+    # the input file and overwrite it.
+    root, extension = os.path.splitext(args.xmlfile)
+    write_xml_output(xmltree, f"{root}-biblrefs{extension}")
+# def main ends here
+
+
+if __name__ == '__main__':
+    main()
+# finis
+
+# Still to do:
+# prefix and suffix