Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
New tool added: parsezotero
  • Loading branch information
kthoden committed Jun 12, 2019
1 parent 6c738b8 commit 2113e98
Showing 1 changed file with 383 additions and 0 deletions.
383 changes: 383 additions & 0 deletions parsezotero.py
@@ -0,0 +1,383 @@
#!/usr/bin/env python3
# -*- coding: utf-8; mode: python -*-

"""Using a Zotero database for publications and their word plugin,
users don't need to put citekey shorthands into the Word
manuscripts. They can use the functionality of the plugin, which also
allows citing only a year, adding page ranges and, if needed, adding a
prefix or a suffix.
The TEI converter renders the Zotero data as a processing instruction
that contains JSON code. This module parses out the relevant data and
replaces the JSON with a TEI element.
One last step to overcome is the citekey. When exporting the Zotero
database to bibtex, a citekey is created (following the rules in
https://github.com/zotero/translators/blob/master/BibTeX.js, see also
https://tex.stackexchange.com/questions/398521/custom-citation-keys-in-programs-like-zotero-or-mendeley/398749).
While this is present in the exported bibtex data, it is not present
in the JSON file.
The solution is a Zotero translation server
(https://github.com/zotero/translation-server, also described in
https://forums.zotero.org/discussion/73694/is-there-a-zbib-api-that-returns-bibtex-entry-string).
This is a translation service (can be run locally) that can handle a
Zotero API JSON and is able to export into biblatex.
Although BetterBibTeX
(https://github.com/retorquere/zotero-better-bibtex) allows for the
creation of unique citekeys across the whole Zotero database, the JSON
export does not contain this citekey.
"""

__version__ = "1.0"
__date__ = "20190607"
__author__ = "kthoden@mpiwg-berlin.mpg.de"

import argparse
import logging
import json
import os
import sys
import requests
import bibtexparser
from lxml import etree
import utils.libeoaconvert as libeoaconvert

# Root logger: show everything from DEBUG upwards with timestamp and level.
logging.basicConfig(
    level=logging.DEBUG,
    format=' %(asctime)s - %(levelname)s - %(message)s',
)

# TEI namespace and the prefix mapping handed to XPath queries.
ns_tei = "http://www.tei-c.org/ns/1.0"
NS_MAP = dict(t=ns_tei)

# Text with which a Zotero citation processing instruction starts.
ZOTERO_CSL_STRING = "ADDIN ZOTERO_ITEM CSL_CITATION"
# Character separating the single citations inside a formatted citation.
CITATION_SEPARATOR = ";"
# Base URL of the Zotero translation server.
TRANSLATOR_URL = "http://127.0.0.1:1969"
# File the exported bibliography entries are appended to.
BIBTEX_FILE = "bibliography.bib"



def find_citation_pis(xmltree):
    """Return the Zotero citations among the 'biblio' processing instructions.

    Every processing instruction with the target ``biblio`` is looked
    up in the tree. Only those whose text starts with
    ``ZOTERO_CSL_STRING`` are returned; the number of hits of both
    steps is logged.
    """

    all_biblio = xmltree.xpath("//processing-instruction('biblio')")
    biblio_count = libeoaconvert.plural(len(all_biblio), 'processing instruction')
    logging.info(f"Found {biblio_count} called 'biblio'.")

    # keep the citations, drop every other kind of 'biblio' instruction
    zotero_citations = []
    for instruction in all_biblio:
        if instruction.text.startswith(ZOTERO_CSL_STRING):
            zotero_citations.append(instruction)
    citation_count = libeoaconvert.plural(len(zotero_citations), 'zotero citation')
    logging.info(f"Found {citation_count}.")

    return zotero_citations
# def find_citation_pis ends here


def parse_citation_pi(pi_text):
    """Return the JSON part of the text of a Zotero processing instruction.

    The text consists of the marker ``ZOTERO_CSL_STRING`` followed by
    JSON code. There can be more than one citation in citationItems:

    <?biblio ADDIN ZOTERO_ITEM CSL_CITATION {"citationID":"xcRF1gM9","properties":{"formattedCitation":"(Bulatovic et al. 2016)","plainCitation":"(Bulatovic et al. 2016)","noteIndex":0},"citationItems":[{"id":7569,"uris":["http://zotero.org/users/915539/items/GBBVGF4J"],"uri":["http://zotero.org/users/915539/items/GBBVGF4J"],"itemData":{"id":7569,"type":"report","title":"Usability von DH-Tools und -Services (DARIAH2 R 1.2.3)", ...,"issued":{"date-parts":[["2016"]]}}}],"schema":"https://github.com/citation-style-language/schema/raw/master/csl-citation.json"}?>(Bulatovic et al. 2016) und

    :param pi_text: text content of the processing instruction
    :returns: the text without the marker and without surrounding
        whitespace
    """

    # Remove the marker only once: a second occurrence would be part of
    # the JSON payload (e.g. inside a title) and has to survive.
    json_part = pi_text.replace(ZOTERO_CSL_STRING, "", 1)

    return json_part.strip()
# def parse_citation_pi ends here


def parse_json(citation_json):
    """Decode the JSON string of a citation and return the resulting object."""

    return json.loads(citation_json)
# def parse_json ends here


def _year_from_issued(issued):
    """Return the year of a CSL 'issued' entry, or None if there is none."""

    if not issued:
        return None
    try:
        return issued['date-parts'][0][0]
    except (KeyError, IndexError, TypeError):
        # no usable structured date, fall back to the free text forms
        return issued.get('literal') or issued.get('raw')
# def _year_from_issued ends here


def get_info_from_json(parsed_json, item_index=0):
    """Extract the relevant parts from JSON object.

    :param parsed_json: decoded citation JSON with the keys
        ``properties`` and ``citationItems``
    :param item_index: position of the citation item to read the item
        specific values from; defaults to the first item
    :returns: dictionary with the keys ``formatted`` (plain citation
        of the whole citation), ``pagerange``, ``prefix``, ``suffix``,
        ``zotero_url`` and ``year``. Values that are not present in
        the JSON are None.
    :raises IndexError: if there is no citation item at ``item_index``
    """

    properties = parsed_json.get("properties") or {}
    item = parsed_json.get("citationItems")[item_index]
    itemdata = item.get('itemData') or {}
    # the sample data carries the same list under 'uri' and 'uris'
    uris = item.get("uri") or item.get("uris") or [None]

    citation_dict = {}
    citation_dict["formatted"] = properties.get("plainCitation")
    citation_dict["pagerange"] = item.get("locator")
    citation_dict["prefix"] = item.get("prefix")
    citation_dict["suffix"] = item.get("suffix")
    citation_dict["zotero_url"] = uris[0]
    # undated items have no 'issued' entry at all
    citation_dict["year"] = _year_from_issued(itemdata.get('issued'))

    return citation_dict
# def get_info_from_json ends here


def convert_to_csl_json(items_dict):
    """Write the data of one citation item to a CSL JSON file.

    JSON data from a citation item in the processing instruction needs
    to be modified to the CSL JSON format. Currently, this JSON is
    written to a temporary file in the current working directory.

    The data consists of the itemData block, with the ID being the
    URI:

    {"id": "http://zotero.org/users/915539/items/4W8TZXCQ", "type": "paper-conference", "title": "Discussion on Radiation", "publisher": "John Murray", "publisher-place": "London", "page": "376–386", "author": [ { "literal": "Anonymous" } ], "issued": { "date-parts": [ [ "1914" ] ] }}

    :param items_dict: one entry of ``citationItems``
    :returns: name of the written file; it is the last path segment of
        the item URI followed by ``.json``
    """

    zotero_id = items_dict.get("uri")[0]
    # work on a copy so the caller's citation item is left unmodified
    itemdata = dict(items_dict.get('itemData') or {})
    itemdata["id"] = zotero_id

    file_id = zotero_id.split("/")[-1]
    tmp_filename = f"{file_id}.json"

    # write itemdata to new file
    with open(tmp_filename, "w", encoding="utf-8") as data_file:
        json.dump(itemdata, data_file, indent=2)

    logging.debug(f"Wrote {tmp_filename}.")

    return tmp_filename
# def convert_to_csl_json ends here


def import_csl_json(csl_json):
    """Convert CSL JSON to Zotero API JSON

    The content of the file is posted to the ``/import`` endpoint of
    the translation server. The format of the answer looks like this:

    [ { "key": "PXKZK2WF", "version": 0, "itemType": "conferencePaper", "creators": [ { "name": "Anonymous", "creatorType": "author" } ], "tags": [], "title": "Discussion on Radiation", "proceedingsTitle": "Report of the Eighty-Third Meeting of the British Association for the Advancement of Science. Birmingham: 1913, September 10-17", "publisher": "John Murray", "place": "London", "pages": "376–386", "date": "1914" }]

    :param csl_json: name of the file containing the CSL JSON
    :returns: body of the server response as bytes
    :raises SystemExit: with status 1 if the server cannot be reached
        or answers with an error status
    """

    # curl --data-binary @cslstylefile.json -H 'Content-Type: text/plain' 'http://127.0.0.1:1969/import'

    headers = {'Content-Type': 'text/plain'}
    # read inside a context manager so the file handle is closed again
    with open(csl_json, 'rb') as csl_file:
        data = csl_file.read()
    logging.debug(f"Trying to communicate with {TRANSLATOR_URL}")

    try:
        response = requests.post(f'{TRANSLATOR_URL}/import', headers=headers, data=data)
    except requests.exceptions.RequestException:
        logging.error(f"No connection possible to {TRANSLATOR_URL}. Maybe the translation service is down? Exiting.")
        # a failure must not be reported as success to the calling shell
        sys.exit(1)

    if not response.ok:
        logging.error(f"Import at {TRANSLATOR_URL} failed with HTTP status {response.status_code}. Exiting.")
        sys.exit(1)

    return response.content
# def import_csl_json ends here


def create_bibtex(zotero_api_json):
    """Get a bibliography entry from the Zotero translation server.

    The Zotero API JSON is posted to the ``/export`` endpoint of the
    translation server, asking for the format ``biblatex``.

    :param zotero_api_json: Zotero API JSON, as returned by the
        ``/import`` endpoint
    :returns: the exported entry as string (response decoded as UTF-8)
    :raises SystemExit: with status 1 if the server cannot be reached
        or answers with an error status
    """

    # translation made by https://curl.trillworks.com/
    # source: curl -d @items.json -H 'Content-Type: application/json' 'http://127.0.0.1:1969/export?format=biblatex'

    headers = {'Content-Type': 'application/json'}
    params = {'format': 'biblatex'}

    try:
        response = requests.post(f'{TRANSLATOR_URL}/export', headers=headers, params=params, data=zotero_api_json)
    except requests.exceptions.RequestException:
        logging.error(f"No connection possible to {TRANSLATOR_URL}. Maybe the translation service is down? Exiting.")
        # a failure must not be reported as success to the calling shell
        sys.exit(1)

    if not response.ok:
        logging.error(f"Export at {TRANSLATOR_URL} failed with HTTP status {response.status_code}. Exiting.")
        sys.exit(1)

    return response.content.decode('utf-8')
# def create_bibtex ends here


def write_to_bibfile(bibtex_entry):
    """Append entry to bibfile

    :param bibtex_entry: the entry as string; it is appended to
        ``BIBTEX_FILE`` as it is
    """

    # Entries may contain non-ASCII characters. Fix the encoding instead
    # of relying on the platform default.
    with open(BIBTEX_FILE, "a", encoding="utf-8") as bibfile:
        bibfile.write(bibtex_entry)
# def write_to_bibfile ends here


def get_citekey(bibtex_entry):
    """Return the citekey (field ``ID``) of the first entry in a bibtex string."""

    # be a bit lax about nonstandard entry types
    lenient_parser = bibtexparser.bparser.BibTexParser()
    lenient_parser.ignore_nonstandard_types = False

    database = bibtexparser.loads(bibtex_entry, parser=lenient_parser)
    first_entry = database.entries[0]

    return first_entry["ID"]
# def get_citekey ends here


def create_citation_element(citation_dict, total_items, index_item):
    """Create an XML element with zotero data

    The result is a ``bibl`` element containing the formatted citation
    as text and a ``ref`` child. The ``ref`` points to the citekey (or
    to the Zotero URL if there is no citekey) and contains a
    ``citedRange`` element if a page range was given.

    :param citation_dict: dictionary with the keys ``formatted``,
        ``year``, ``citekey``, ``zotero_url`` and ``pagerange``
    :param total_items: number of items in the whole citation
    :param index_item: position of this item, counted from 0
    :returns: the ``bibl`` element
    """

    formatted = citation_dict.get("formatted") or ""
    bibl = etree.Element("bibl")

    if total_items > 1:
        # every item gets its own part of the formatted citation
        pieces = formatted.split(CITATION_SEPARATOR)
        if index_item < len(pieces):
            formatted_citation = pieces[index_item]
        else:
            # fewer parts than items: the style does not separate the
            # items with CITATION_SEPARATOR
            logging.warning(f"No separate text for item {index_item + 1} in citation '{formatted}'.")
            formatted_citation = ""
        if index_item + 1 != total_items:
            bibl.tail = CITATION_SEPARATOR
    else:
        formatted_citation = formatted

    bibl.text = formatted_citation
    ref = etree.SubElement(bibl, "ref")
    # add year or authoryear, need a good heuristic here
    if formatted == f"({citation_dict.get('year')})":
        ref.set("type", "year")
    else:
        ref.set("type", "authoryear")

    citekey = citation_dict.get("citekey")
    if citekey:
        ref.set("url", f"#{citekey}")
    else:
        ref.set("url", f"#{citation_dict.get('zotero_url')}")

    pagerange = citation_dict.get("pagerange")
    if pagerange:
        citedrange = etree.SubElement(ref, "citedRange")
        citedrange.text = pagerange

    # TODO: prefix and suffix of the citation are not written yet
    return bibl
# def create_citation_element ends here


def citation_item_to_bibl(citation_item, parsed_json, citekey_list, number_of_items, item_position):
    """Wrapper function for citation items.

    This function makes use of the Zotero translation server
    (https://github.com/zotero/translation-server) for format
    conversion.

    The JSON found in each citationItem of the processing instruction
    is
    - modified to CSL JSON format
    - translated into Zotero API JSON format
    - exported to BibTeX

    A tei:bibl element is written that will replace the processing
    instruction in the XML.

    :param citation_item: the entry of ``citationItems`` to convert
    :param parsed_json: the decoded JSON of the whole citation
    :param citekey_list: list of the citekeys written so far; a new
        citekey is appended to it
    :param number_of_items: number of items in the citation
    :param item_position: position of ``citation_item``, counted
        from 0
    :returns: tuple of the bibl element and the formatted text of the
        whole citation
    """

    # get_info_from_json reads the first entry of citationItems. Hand
    # over a view that contains only the current item, otherwise every
    # item would get page range, URL and year of the first one.
    single_item_json = {**parsed_json, "citationItems": [citation_item]}
    citation_dict = get_info_from_json(single_item_json)
    citation_dict["id"] = parsed_json.get("citationID")

    # write the item as CSL JSON to a temporary file
    csl_json_filename = convert_to_csl_json(citation_item)
    try:
        # first call to translation server
        zotero_api_json = import_csl_json(csl_json_filename)
    finally:
        # remove the temporary file even if the import gave up
        os.unlink(csl_json_filename)
    # second call to translation server
    bibtex_entry = create_bibtex(zotero_api_json)
    citekey = get_citekey(bibtex_entry)
    citation_dict["citekey"] = citekey
    # write every publication only once to the bibliography
    if citekey not in citekey_list:
        citekey_list.append(citekey)
        write_to_bibfile(bibtex_entry)

    citation_element = create_citation_element(citation_dict, number_of_items, item_position)

    return citation_element, citation_dict['formatted']
# def citation_item_to_bibl ends here


def turn_pi_into_bibl(pi, citekey_list):
    """Wrapper function for the conversion steps.

    The citation items of the processing instruction are converted to
    bibl elements. These are collected in a temporary element
    ``tmp_bib`` which takes the place of the processing instruction.
    The formatted citation is removed from the text following the
    processing instruction.

    :param pi: the processing instruction, modified in place
    :param citekey_list: list of the citekeys written so far
    """

    # wrap a temporary element around citations
    tmp_element = etree.Element("tmp_bib")
    pi_json = parse_citation_pi(pi.text)
    parsed_json = parse_json(pi_json)
    # there can be more than one citation in one processing instruction
    items = parsed_json.get("citationItems") or []
    number_of_items = len(items)
    logging.info(f"Found {libeoaconvert.plural(number_of_items, 'item')} in this zotero citation.")
    if not items:
        logging.warning("Zotero citation without items, leaving the processing instruction untouched.")
        return

    formatted_citation = None
    # enumerate gives the right position even if the same item is
    # cited twice in one citation
    for item_position, citation_item in enumerate(items):
        citation_element, formatted_citation = citation_item_to_bibl(citation_item, parsed_json, citekey_list, number_of_items, item_position)
        tmp_element.append(citation_element)

    # remove formatted citation from tail; only the first occurrence is
    # the rendered citation, and there may be no tail at all
    pi_tail = pi.tail
    if pi_tail and formatted_citation:
        pi_tail = pi_tail.replace(formatted_citation, "", 1)
    tmp_element.tail = pi_tail
    # replace processing instruction with bibl elements
    parent_element = pi.getparent()
    parent_element.replace(pi, tmp_element)
# def turn_pi_into_bibl ends here


def add_bib_to_header(xmltree, BIBTEX_FILE):
    """Register the bibliography file in the TEI header.

    An ``ab`` element of type ``database`` is appended to the first
    ``sourceDesc`` of the header. It contains a ``ref`` whose target
    is the given file name; the type of the ``ref`` is a placeholder.
    """

    matches = xmltree.xpath("/t:TEI/t:teiHeader/t:fileDesc/t:sourceDesc", namespaces=NS_MAP)
    source_description = matches[0]

    database_block = etree.Element("ab")
    database_block.set("type", "database")

    bib_reference = etree.Element("ref")
    bib_reference.set("target", BIBTEX_FILE)
    bib_reference.set("type", "please-specify-anthology-or-monograph")
    database_block.append(bib_reference)

    source_description.append(database_block)
# def add_bib_to_header ends here


def cleanup_xml(xmltree):
    """Tidy up the tree after the citations have been converted.

    The ``tmp_bib`` tags are stripped, their content stays in place.
    The attribute ``rend`` is removed from every element on which it
    has the value ``Literaturverzeichnis1``.
    """

    etree.strip_tags(xmltree, "tmp_bib")
    for styled_element in xmltree.xpath("//*[@rend='Literaturverzeichnis1']"):
        del styled_element.attrib["rend"]
# def cleanup_xml ends here


def write_xml_output(tree, filename):
    """Serialise the tree to the given file and log the file name.

    The output is pretty printed, encoded as UTF-8 and starts with an
    XML declaration.
    """

    serialisation = dict(pretty_print=True, xml_declaration=True, encoding="utf-8")
    tree.write(filename, **serialisation)
    logging.info(f"Wrote {filename}.")
# def write_xml_output ends here


def main():
    """The main bit

    Reads the XML file given on the command line, replaces every
    Zotero citation by bibl elements, adds a reference to the
    bibliography file to the header and writes the result to a new
    file next to the input file (``name.xml`` becomes
    ``name-biblrefs.xml``).
    """

    parser = argparse.ArgumentParser()
    parser.add_argument("xmlfile", help="XML file converted from Word, containing Zotero citations.")
    args = parser.parse_args()

    xmltree = etree.parse(args.xmlfile)
    citation_pis = find_citation_pis(xmltree)
    citekey_list = []

    for pi in citation_pis:
        turn_pi_into_bibl(pi, citekey_list)

    cleanup_xml(xmltree)
    add_bib_to_header(xmltree, BIBTEX_FILE)

    # Build the output name from the extension only. A plain string
    # replacement of ".xml" would also hit directory names and would
    # return the input name (and overwrite the input) for files that
    # do not contain ".xml".
    root, extension = os.path.splitext(args.xmlfile)
    output_file = f"{root}-biblrefs{extension or '.xml'}"
    write_xml_output(xmltree, output_file)
# def main ends here


# Run the conversion only when the file is executed as a script.
if __name__ == "__main__":
    main()
# finis

# Still to do:
# prefix and suffix

0 comments on commit 2113e98

Please sign in to comment.