Skip to content
Permalink
master
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
#!/usr/bin/env python3
# -*- coding: utf-8; mode: python -*-
"""Using a Zotero database for publications and its Word plugin,
users don't need to put citekey shorthands into their Word
manuscripts. They can use the functionality of the plugin, which also
allows citing only a year, adding page ranges and, if needed, adding a
prefix or a suffix.
The TEI converter renders the Zotero data as a processing instruction
that contains JSON code. This module parses out the relevant data and
replaces the JSON with a TEI element.
One last step to overcome is the citekey. When exporting the Zotero
database to bibtex, a citekey is created (following the rules in
https://github.com/zotero/translators/blob/master/BibTeX.js, see also
https://tex.stackexchange.com/questions/398521/custom-citation-keys-in-programs-like-zotero-or-mendeley/398749).
While this is present in the exported bibtex data, it is not present
in the JSON file.
The solution is a Zotero translation server
(https://github.com/zotero/translation-server, also described in
https://forums.zotero.org/discussion/73694/is-there-a-zbib-api-that-returns-bibtex-entry-string).
This is a translation service (can be run locally) that can handle a
Zotero API JSON and is able to export into biblatex.
Although BetterBibTeX
(https://github.com/retorquere/zotero-better-bibtex) allows for the
creation of unique citekeys across the whole Zotero database, the JSON
export does not contain this citekey.
"""
__version__ = "1.0"
__date__ = "20190607"
__author__ = "kthoden@mpiwg-berlin.mpg.de"
import argparse
import logging
import json
import os
import sys
import requests
import bibtexparser
from lxml import etree
import utils.libeoaconvert as libeoaconvert
# Configure the root logger: DEBUG level, with timestamp, level and message.
logging.basicConfig(level=logging.DEBUG, format=' %(asctime)s - %(levelname)s - %(message)s')
# TEI namespace and the prefix map ("t") used in XPath expressions.
ns_tei = "http://www.tei-c.org/ns/1.0"
NS_MAP = {"t" : ns_tei}
# Marker that starts the text of a Zotero citation processing instruction.
ZOTERO_CSL_STRING = "ADDIN ZOTERO_ITEM CSL_CITATION"
# Character on which a formatted citation with several items is split.
CITATION_SEPARATOR = ";"
# Base URL of the Zotero translation server ("/import" and "/export" are appended).
TRANSLATOR_URL = "http://127.0.0.1:1969"
# Bibliography file that entries are appended to and that the TEI header points at.
BIBTEX_FILE = "bibliography.bib"
# Two-character string: index 0 is the opening, index 1 the closing bracket.
BRACKET = "()"
def find_citation_pis(xmltree):
    """Collect the Zotero citation processing instructions of a tree.

    All processing instructions named 'biblio' are looked up first; of
    those, only the ones whose text starts with ZOTERO_CSL_STRING are
    returned. Both counts are logged.
    """
    all_biblio = xmltree.xpath("//processing-instruction('biblio')")
    logging.info(f"Found {libeoaconvert.plural(len(all_biblio), 'processing instruction')} called 'biblio'.")

    # retain only citations
    zotero_citations = []
    for candidate in all_biblio:
        if candidate.text.startswith(ZOTERO_CSL_STRING):
            zotero_citations.append(candidate)
    logging.info(f"Found {libeoaconvert.plural(len(zotero_citations), 'zotero citation')}.")

    return zotero_citations
# def find_citation_pis ends here
def parse_citation_pi(pi_text):
    """Return the JSON portion of a Zotero processing instruction.

    The text of such a processing instruction consists of the marker
    ZOTERO_CSL_STRING followed by JSON code, for example (abbreviated):

    ADDIN ZOTERO_ITEM CSL_CITATION {"citationID":"xcRF1gM9","properties":{"formattedCitation":"(Bulatovic et al. 2016)","plainCitation":"(Bulatovic et al. 2016)","noteIndex":0},"citationItems":[{"id":7569,"uris":[...],"uri":[...],"itemData":{...}}],"schema":"..."}

    There can be more than one citation in citationItems. The marker
    is removed and surrounding whitespace is stripped; the remaining
    string is returned unparsed.
    """
    return pi_text.replace(ZOTERO_CSL_STRING, "").strip()
# def parse_citation_pi ends here
def parse_json(citation_json):
    """Decode a JSON string and return the resulting Python object."""
    return json.loads(citation_json)
# def parse_json ends here
def _issued_year(issued):
    """Return the year of a CSL 'issued' object, or None if there is none."""
    if not issued:
        return None
    try:
        return issued["date-parts"][0][0]
    except (KeyError, IndexError):
        # no usable date-parts: fall back to the free-text date, if any
        return issued.get("literal")


def get_info_from_json(parsed_json):
    """Extract the relevant parts from JSON object.

    Only the first entry of "citationItems" is inspected.

    Returns a dict with the keys
    - "formatted": the "plainCitation" of the citation properties
    - "pagerange": the item's "locator" (None if absent)
    - "prefix", "suffix": the item's prefix and suffix (None if absent)
    - "zotero_url": first entry of the item's "uri" list
    - "year": first date part of "issued", else its "literal" value,
      else None when the item carries no date information
    """
    properties = parsed_json.get("properties")
    first_item = parsed_json.get("citationItems")[0]
    # items without itemData or without an issued date must not abort the run
    itemdata = first_item.get("itemData") or {}

    return {
        "formatted": properties.get("plainCitation"),
        "pagerange": first_item.get("locator"),
        "prefix": first_item.get("prefix"),
        "suffix": first_item.get("suffix"),
        "zotero_url": first_item.get("uri")[0],
        "year": _issued_year(itemdata.get("issued")),
    }
# def get_info_from_json ends here
def convert_to_csl_json(items_dict):
    """Write the CSL JSON of one citation item to a temporary file.

    JSON data from a citation item in the processing instruction needs
    to be modified to the CSL JSON format: the data consists of the
    itemData block, with the ID being the item's URI, e.g.

    {"id": "http://zotero.org/users/915539/items/4W8TZXCQ", "type": "paper-conference", "title": "Discussion on Radiation", "publisher": "John Murray", "author": [ { "literal": "Anonymous" } ], "issued": { "date-parts": [ [ "1914" ] ] }}

    The file is created in the current working directory and named
    after the last path segment of the URI. The caller's itemData is
    left untouched.

    Returns the name of the written file; removing it is up to the
    caller.
    """
    zotero_id = items_dict.get("uri")[0]
    # work on a copy so that the id of the caller's itemData is not overwritten
    itemdata = dict(items_dict.get('itemData'), id=zotero_id)

    file_id = zotero_id.split("/")[-1]
    tmp_filename = f"{file_id}.json"

    # write itemdata to new file
    with open(tmp_filename, "w", encoding="utf-8") as data_file:
        json.dump(itemdata, data_file, indent=2)
    logging.debug(f"Wrote {tmp_filename}.")

    return tmp_filename
# def convert_to_csl_json ends here
def import_csl_json(csl_json):
    """Convert CSL JSON to Zotero API JSON

    The content of the file named by csl_json is posted to the
    "/import" endpoint of the translation server at TRANSLATOR_URL.

    The format of the answer looks like this:
    [ { "key": "PXKZK2WF", "version": 0, "itemType": "conferencePaper", "creators": [ { "name": "Anonymous", "creatorType": "author" } ], "tags": [], "title": "Discussion on Radiation", "publisher": "John Murray", "place": "London", "pages": "376–386", "date": "1914" }]

    Returns the raw response body (bytes). If the request cannot be
    sent, an error is logged and the program exits with status 1.
    """
    # curl --data-binary @cslstylefile.json -H 'Content-Type: text/plain' 'http://127.0.0.1:1969/import'
    headers = { 'Content-Type': 'text/plain', }

    with open(csl_json, 'rb') as csl_file:
        data = csl_file.read()

    logging.debug(f"Trying to communicate with {TRANSLATOR_URL}")
    try:
        response = requests.post(f'{TRANSLATOR_URL}/import', headers=headers, data=data)
    except requests.exceptions.RequestException:
        # only network-level failures are meant here; Ctrl-C and
        # programming errors must not be reported as a dead server
        logging.error(f"No connection possible to {TRANSLATOR_URL}. Maybe the translation service is down? Exiting.")
        sys.exit(1)

    return response.content
# def import_csl_json ends here
def create_bibtex(zotero_api_json):
    """Get a bibliography entry from the Zotero translation server.

    The Zotero API JSON is posted to the "/export" endpoint of the
    translation server at TRANSLATOR_URL; the requested export format
    is biblatex.

    Returns the response body decoded as UTF-8. If the request cannot
    be sent, an error is logged and the program exits with status 1.
    """
    # translation made by https://curl.trillworks.com/
    # source: curl -d @items.json -H 'Content-Type: application/json' 'http://127.0.0.1:1969/export?format=bibtex'
    headers = { 'Content-Type': 'application/json', }
    params = ( ('format', 'biblatex'), )

    try:
        response = requests.post(f'{TRANSLATOR_URL}/export', headers=headers, params=params, data=zotero_api_json)
    except requests.exceptions.RequestException:
        # only network-level failures are meant here; Ctrl-C and
        # programming errors must not be reported as a dead server
        logging.error(f"No connection possible to {TRANSLATOR_URL}. Maybe the translation service is down? Exiting.")
        sys.exit(1)

    return response.content.decode('utf-8')
# def create_bibtex ends here
def write_to_bibfile(bibtex_entry):
    """Append entry to bibfile

    The entry is appended to BIBTEX_FILE as it is. The file is written
    as UTF-8 regardless of the platform's default encoding, so that
    entries containing non-ASCII characters can always be stored.
    """
    with open(BIBTEX_FILE, "a", encoding="utf-8") as bibfile:
        bibfile.write(bibtex_entry)
# def write_to_bibfile ends here
def get_citekey(bibtex_entry):
    """Return the citation key ("ID") of the first entry in a bibtex string."""
    lenient_parser = bibtexparser.bparser.BibTexParser()
    # be a bit lax about nonstandard entry types
    lenient_parser.ignore_nonstandard_types = False

    database = bibtexparser.loads(bibtex_entry, parser=lenient_parser)
    first_entry = database.entries[0]

    return first_entry["ID"]
# def get_citekey ends here
def modify_preceding_text(bibl_element, prefix_text, BRACKET="[]"):
    """Retrieve and modify preceding text with bibliographic prefix

    The prefix, a space and the opening bracket (BRACKET[0]) are
    appended to the text that directly precedes bibl_element. That
    text is the tail of the previous sibling or, if there is no
    previous sibling, the text of the parent element.

    Returns None; the tree is modified in place.
    """
    addition = f"{prefix_text} {BRACKET[0]}"

    preceding_element = bibl_element.getprevious()
    if preceding_element is None:
        # first child: the text in front of it belongs to the parent
        parent_element = bibl_element.getparent()
        parent_element.text = f"{parent_element.text or ''}{addition}"
    else:
        preceding_element.tail = f"{preceding_element.tail or ''}{addition}"
# def modify_preceding_text ends here
def create_citation_element(citation_dict, total_items, index_item, print_formatted_citation):
    """Build the bibl element for one cited item.

    The element contains a ref child whose "type" is "year" or
    "authoryear" and whose "target" points to the citekey (or, if that
    is empty, to the Zotero URL). A citedRange child is added when a
    page range is present. For citations with several items, the
    formatted citation is split at CITATION_SEPARATOR and every bibl
    but the last gets the separator as tail.
    """
    whole_citation = citation_dict['formatted']
    bibl = etree.Element("bibl")

    if total_items > 1:
        formatted_citation = whole_citation.split(CITATION_SEPARATOR)[index_item]
        is_last_item = index_item + 1 == total_items
        if not is_last_item:
            bibl.tail = CITATION_SEPARATOR
    else:
        formatted_citation = whole_citation

    ref = etree.Element("ref")
    if print_formatted_citation:
        ref.tail = formatted_citation
    bibl.insert(0, ref)

    # add year or authoryear, need a good heuristic here
    year_only = citation_dict["formatted"] == f"({citation_dict['year']})"
    ref.set("type", "year" if year_only else "authoryear")

    target = citation_dict["citekey"] or citation_dict["zotero_url"]
    ref.set("target", f"#{target}")

    pagerange = citation_dict["pagerange"]
    if pagerange:
        etree.SubElement(bibl, "citedRange").text = pagerange

    return bibl
# def create_citation_element ends here
def citation_item_to_bibl(citation_item, parsed_json, citekey_list, number_of_items, item_position, print_formatted_citation):
    """Wrapper function for citation items.

    This function makes use of the Zotero translation server
    (https://github.com/zotero/translation-server) for format
    conversion.

    The JSON found in each citationItem of the processing instruction
    is
    - modified to CSL JSON format
    - translated into Zotero API JSON format
    - exported to BibTeX

    The exported entry is appended to the bibliography file unless its
    citekey is already contained in citekey_list; new citekeys are
    added to that list.

    Returns a tuple of the bibl element that will replace the
    processing instruction in the XML and the dictionary of citation
    data it was built from.
    """
    citation_dict = get_info_from_json(parsed_json)
    citation_dict["id"] = parsed_json.get("citationID")
    # get_info_from_json only looks at the first item of the citation;
    # page range and URI belong to the item that is handled here
    citation_dict["pagerange"] = citation_item.get("locator")
    citation_dict["zotero_url"] = citation_item.get("uri")[0]

    # convert csl json to zotero api json
    csl_json_filename = convert_to_csl_json(citation_item)
    try:
        # first call to translation server
        zotero_api_json = import_csl_json(csl_json_filename)
    finally:
        # remove the temporary file even if the server call fails or exits
        os.unlink(csl_json_filename)

    # second call to translation server
    bibtex_entry = create_bibtex(zotero_api_json)

    citekey = get_citekey(bibtex_entry)
    citation_dict["citekey"] = citekey
    if citekey not in citekey_list:
        citekey_list.append(citekey)
        write_to_bibfile(bibtex_entry)

    citation_element = create_citation_element(citation_dict, number_of_items, item_position, print_formatted_citation)

    return citation_element, citation_dict
# def citation_item_to_bibl ends here
def turn_pi_into_bibl(pi, citekey_list, print_formatted_citation):
    """Wrapper function for the conversion steps.

    The processing instruction is replaced by a temporary tmp_bib
    element that holds one bibl element per cited item. The prefix and
    the opening bracket are added to the text before the citation; in
    the text after it, the formatted citation is replaced by the
    closing bracket and the suffix.

    A processing instruction without citation items is left untouched.
    """
    # wrap a temporary element around citations
    tmp_element = etree.Element("tmp_bib")

    parsed_json = parse_json(parse_citation_pi(pi.text))

    # there can be more than one citation in one processing instruction
    items = parsed_json.get("citationItems") or []
    number_of_items = len(items)
    logging.info(f"Found {libeoaconvert.plural(number_of_items, 'item')} in this zotero citation.")
    if not items:
        logging.warning("Zotero citation without citation items, leaving processing instruction as it is.")
        return

    # enumerate yields the true position even if the same item is cited twice
    for item_position, citation_item in enumerate(items):
        citation_element, citation_dict = citation_item_to_bibl(citation_item, parsed_json, citekey_list, number_of_items, item_position, print_formatted_citation)
        tmp_element.append(citation_element)

    formatted_citation = citation_dict['formatted']
    prefix_text = citation_dict["prefix"] or ""
    suffix_text = citation_dict["suffix"] or ""

    # remove formatted citation from tail; there may be no text after the pi
    pi_tail = pi.tail or ""
    if formatted_citation not in pi_tail:
        logging.warning(f"Formatted citation {formatted_citation!r} not found after the processing instruction, no closing bracket inserted.")
    tmp_element.tail = pi_tail.replace(formatted_citation, f"{BRACKET[1]}{suffix_text}")

    # replace processing instruction with bibl elements
    parent_element = pi.getparent()
    parent_element.replace(pi, tmp_element)

    modify_preceding_text(tmp_element, prefix_text, BRACKET)
# def turn_pi_into_bibl ends here
def add_bib_to_header(xmltree, BIBTEX_FILE):
    """Record the bibliography file in the TEI header.

    An ab element of type "database" with a ref child pointing to
    BIBTEX_FILE is appended to the first sourceDesc of the header.
    """
    sourcedesc_path = "/t:TEI/t:teiHeader/t:fileDesc/t:sourceDesc"
    sourcedesc = xmltree.xpath(sourcedesc_path, namespaces=NS_MAP)[0]

    database = etree.SubElement(sourcedesc, "ab", type="database")
    bibfile_ref = etree.SubElement(database, "ref")
    bibfile_ref.set("target", BIBTEX_FILE)
    bibfile_ref.set("type", "please-specify-anthology-or-monograph")
# def add_bib_to_header ends here
def cleanup_xml(xmltree):
    """Tidy up the tree after the citations have been converted.

    The temporary tmp_bib wrappers are dissolved (their content stays
    in place) and the rend attribute is dropped from every element
    whose rend is 'Literaturverzeichnis1'.
    """
    etree.strip_tags(xmltree, "tmp_bib")

    for bibliography_element in xmltree.xpath("//*[@rend='Literaturverzeichnis1']"):
        del bibliography_element.attrib["rend"]
# def cleanup_xml ends here
def write_xml_output(tree, filename):
    """Serialize the tree to filename (UTF-8, pretty-printed, with XML declaration)."""
    write_options = {
        "pretty_print": True,
        "xml_declaration": True,
        "encoding": "utf-8",
    }
    tree.write(filename, **write_options)

    logging.info(f"Wrote {filename}.")
# def write_xml_output ends here
def main():
    """The main bit

    Parses the command line, converts every Zotero citation of the
    given XML file into bibl elements, registers the bibliography file
    in the TEI header and writes the result next to the input file,
    with "-biblrefs.xml" replacing the file extension.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("xmlfile", help="XML file converted from Word, containing Zotero citations.")
    parser.add_argument("-f", "--format-citations", help="Print formatted citation in XML.", action="store_true")
    args = parser.parse_args()

    xmltree = etree.parse(args.xmlfile)

    citation_pis = find_citation_pis(xmltree)

    citekey_list = []
    for pi in citation_pis:
        turn_pi_into_bibl(pi, citekey_list, args.format_citations)

    cleanup_xml(xmltree)
    add_bib_to_header(xmltree, BIBTEX_FILE)

    # Only the extension is exchanged: a plain str.replace would also
    # touch ".xml" inside directory names and would leave the name
    # unchanged (overwriting the input) for files not ending in ".xml".
    basename, _extension = os.path.splitext(args.xmlfile)
    write_xml_output(xmltree, f"{basename}-biblrefs.xml")
# def main ends here
# Run the conversion only when the file is executed as a script.
if __name__ == '__main__':
    main()
# finis
# Still to do:
# prefix and suffix