Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
EOASkripts/src/parsezotero.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
422 lines (307 sloc)
15.1 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# -*- coding: utf-8; mode: python -*- | |
"""Using a Zotero database for publications and their word plugin, | |
users don't need to put citekey shorthands into the Word
manuscripts. They can use the functionality of the plugin, which also
allows citing only a year, adding page ranges and, if needed, adding a
prefix or a suffix.
The TEI converter renders the Zotero data as a processing instruction | |
that contains JSON code. This module parses out the relevant data and | |
replaces the JSON with a TEI element. | |
One last step to overcome is the citekey. When exporting the Zotero | |
database to bibtex, a citekey is created (following the rules in | |
https://github.com/zotero/translators/blob/master/BibTeX.js, see also | |
https://tex.stackexchange.com/questions/398521/custom-citation-keys-in-programs-like-zotero-or-mendeley/398749). | |
While this is present in the exported bibtex data, it is not present | |
in the JSON file. | |
The solution is a Zotero translation server | |
(https://github.com/zotero/translation-server, also described in | |
https://forums.zotero.org/discussion/73694/is-there-a-zbib-api-that-returns-bibtex-entry-string). | |
This is a translation service (can be run locally) that can handle a | |
Zotero API JSON and is able to export into biblatex. | |
Although BetterBibTeX | |
(https://github.com/retorquere/zotero-better-bibtex) allows for the | |
creation of unique citekeys across the whole Zotero database, the JSON | |
export does not contain this citekey. | |
""" | |
__version__ = "1.0" | |
__date__ = "20190607" | |
__author__ = "kthoden@mpiwg-berlin.mpg.de" | |
import argparse | |
import logging | |
import json | |
import os | |
import sys | |
import requests | |
import bibtexparser | |
from lxml import etree | |
import utils.libeoaconvert as libeoaconvert | |
# Log everything from DEBUG upwards; each record starts with time and level.
logging.basicConfig(
    level=logging.DEBUG,
    format=' %(asctime)s - %(levelname)s - %(message)s',
)

# TEI namespace and the prefix map handed to xpath() calls.
ns_tei = "http://www.tei-c.org/ns/1.0"
NS_MAP = {"t": ns_tei}

# Marker at the start of the text of a Zotero citation processing instruction.
ZOTERO_CSL_STRING = "ADDIN ZOTERO_ITEM CSL_CITATION"
# Character between the single citations of a formatted multi-item citation.
CITATION_SEPARATOR = ";"
# Base URL of the Zotero translation server.
TRANSLATOR_URL = "http://127.0.0.1:1969"
# File to which the exported bibliography entries are appended.
BIBTEX_FILE = "bibliography.bib"
# Opening and closing character written around a citation.
BRACKET = "()"
def find_citation_pis(xmltree):
    """Find the Zotero citation processing instructions in an XML tree.

    Every processing instruction named 'biblio' is looked up; only those
    whose text starts with ZOTERO_CSL_STRING are returned.

    xmltree: parsed XML tree offering an xpath() method.
    Returns a list of the matching processing instruction nodes.
    """
    biblio_pis = xmltree.xpath("//processing-instruction('biblio')")
    logging.info(f"Found {libeoaconvert.plural(len(biblio_pis), 'processing instruction')} called 'biblio'.")

    # retain only citations; a processing instruction whose text is None
    # (no content) is skipped instead of raising AttributeError
    citation_pis = [
        pi for pi in biblio_pis
        if (pi.text or "").startswith(ZOTERO_CSL_STRING)
    ]
    logging.info(f"Found {libeoaconvert.plural(len(citation_pis), 'zotero citation')}.")

    return citation_pis
# def find_citation_pis ends here
def parse_citation_pi(pi_text):
    """Parse the text of a Zotero processing instruction.

    The text consists of the marker ZOTERO_CSL_STRING followed by JSON
    code. There can be more than one citation in citationItems.
    Abbreviated example of such a processing instruction:

    <?biblio ADDIN ZOTERO_ITEM CSL_CITATION {"citationID":"xcRF1gM9","properties":{"formattedCitation":"(Bulatovic et al. 2016)","plainCitation":"(Bulatovic et al. 2016)","noteIndex":0},"citationItems":[{"id":7569,"uris":["http://zotero.org/users/915539/items/GBBVGF4J"],"uri":["http://zotero.org/users/915539/items/GBBVGF4J"],"itemData":{"id":7569,"type":"report","title":"Usability von DH-Tools und -Services (DARIAH2 R 1.2.3)","issued":{"date-parts":[["2016"]]}}}],"schema":"https://github.com/citation-style-language/schema/raw/master/csl-citation.json"}?>(Bulatovic et al. 2016) und

    pi_text: text content of the processing instruction.
    Returns the JSON part as a string, without surrounding whitespace.
    """
    # Remove only the first occurrence of the marker: the same character
    # sequence inside the JSON payload (e.g. in a title) must stay intact.
    json_part = pi_text.replace(ZOTERO_CSL_STRING, "", 1)

    return json_part.strip()
# def parse_citation_pi ends here
def parse_json(citation_json):
    """Decode the JSON text of a citation.

    citation_json: JSON document as a string.
    Returns the decoded Python value.
    """
    return json.loads(citation_json)
# def parse_json ends here
def get_info_from_json(parsed_json):
    """Extract the relevant parts from the JSON object of a citation.

    Only the first entry of "citationItems" is inspected.

    parsed_json: decoded citation JSON (a dict with the keys
    "properties" and "citationItems").

    Returns a dict with the keys "formatted", "pagerange", "prefix",
    "suffix", "zotero_url" and "year". Values that are not present in
    the input are None.
    """
    properties = parsed_json.get("properties") or {}
    item = parsed_json.get("citationItems")[0]
    itemdata = item.get("itemData") or {}
    issued = itemdata.get("issued") or {}

    try:
        year = issued["date-parts"][0][0]
    except (KeyError, IndexError, TypeError):
        # no usable date-parts: fall back to the literal date, if any
        year = issued.get("literal")

    # "uri" is the key read so far; fall back to "uris" when it is absent
    # (presumably some Zotero versions only write "uris" - TODO confirm)
    uri_list = item.get("uri") or item.get("uris")

    citation_dict = {
        "formatted": properties.get("plainCitation"),
        "pagerange": item.get("locator"),
        "prefix": item.get("prefix"),
        "suffix": item.get("suffix"),
        "zotero_url": uri_list[0] if uri_list else None,
        "year": year,
    }

    return citation_dict
# def get_info_from_json ends here
def convert_to_csl_json(items_dict):
    """Write the item data of a citation item as CSL JSON to a file.

    The itemData block of a citation item in the processing instruction
    is modified so that its "id" is the Zotero URI of the item, and is
    then written to a temporary file in the current working directory.
    The file is named after the last path segment of that URI, e.g.
    "4W8TZXCQ.json". Written data looks like this:

    {"id": "http://zotero.org/users/915539/items/4W8TZXCQ", "type": "paper-conference", "title": "Discussion on Radiation", "publisher": "John Murray", "author": [ { "literal": "Anonymous" } ], "issued": { "date-parts": [ [ "1914" ] ] }}

    items_dict: one entry of "citationItems".
    Returns the name of the file that was written.
    """
    # "uri" is the key read so far; fall back to "uris" when it is absent
    # (presumably some Zotero versions only write "uris" - TODO confirm)
    uri_list = items_dict.get("uri") or items_dict.get("uris")
    zotero_id = uri_list[0]

    # work on a shallow copy so that the caller's citation data keeps
    # its original id
    itemdata = dict(items_dict.get('itemData'))
    itemdata["id"] = zotero_id

    file_id = zotero_id.split("/")[-1]
    tmp_filename = f"{file_id}.json"

    # write itemdata to new file
    with open(tmp_filename, "w", encoding="utf-8") as data_file:
        json.dump(itemdata, data_file, indent=2)
    logging.debug(f"Wrote {tmp_filename}.")

    return tmp_filename
# def convert_to_csl_json ends here
def import_csl_json(csl_json):
    """Convert CSL JSON to Zotero API JSON.

    The content of the file is posted to the import endpoint of the
    translation server at TRANSLATOR_URL. The format of the answer
    looks like this:

    [ { "key": "PXKZK2WF", "version": 0, "itemType": "conferencePaper", "creators": [ { "name": "Anonymous", "creatorType": "author" } ], "tags": [], "title": "Discussion on Radiation", "proceedingsTitle": "Report of the Eighty-Third Meeting of the British Association for the Advancement of Science. Birmingham: 1913, September 10-17", "publisher": "John Murray", "place": "London", "pages": "376–386", "date": "1914" }]

    csl_json: name of the file that contains the CSL JSON.
    Returns the body of the server response as bytes. Exits the program
    with status 1 if the request to the server fails.
    """
    # curl --data-binary @cslstylefile.json -H 'Content-Type: text/plain' 'http://127.0.0.1:1969/import'
    headers = {'Content-Type': 'text/plain'}

    # read inside a context manager so the file handle is closed again
    with open(csl_json, 'rb') as csl_file:
        data = csl_file.read()

    logging.debug(f"Trying to communicate with {TRANSLATOR_URL}")
    try:
        response = requests.post(f'{TRANSLATOR_URL}/import', headers=headers, data=data)
    except requests.exceptions.RequestException:
        logging.error(f"No connection possible to {TRANSLATOR_URL}. Maybe the translation service is down? Exiting.")
        # a failed conversion must not report success to the calling shell
        sys.exit(1)

    return response.content
# def import_csl_json ends here
def create_bibtex(zotero_api_json):
    """Get a bibliography entry from the Zotero translation server.

    The data is posted to the export endpoint of the translation server
    at TRANSLATOR_URL, requesting the format 'biblatex'.

    zotero_api_json: Zotero API JSON, as returned by the import endpoint.
    Returns the body of the server response, decoded as UTF-8. Exits the
    program with status 1 if the request to the server fails.
    """
    # translation made by https://curl.trillworks.com/
    # source: curl -d @items.json -H 'Content-Type: application/json' 'http://127.0.0.1:1969/export?format=bibtex'
    headers = {'Content-Type': 'application/json'}
    params = (('format', 'biblatex'),)

    try:
        response = requests.post(f'{TRANSLATOR_URL}/export', headers=headers, params=params, data=zotero_api_json)
    except requests.exceptions.RequestException:
        logging.error(f"No connection possible to {TRANSLATOR_URL}. Maybe the translation service is down? Exiting.")
        # a failed conversion must not report success to the calling shell
        sys.exit(1)

    return response.content.decode('utf-8')
# def create_bibtex ends here
def write_to_bibfile(bibtex_entry):
    """Append an entry to the bibliography file BIBTEX_FILE.

    bibtex_entry: text of the entry to append.
    """
    # Entries can contain non-ASCII characters; write UTF-8 explicitly
    # instead of relying on the platform's default encoding.
    with open(BIBTEX_FILE, "a", encoding="utf-8") as bibfile:
        bibfile.write(bibtex_entry)
# def write_to_bibfile ends here
def get_citekey(bibtex_entry):
    """Return the citekey of a bibtex entry.

    bibtex_entry: bibtex source text; the "ID" of its first parsed entry
    is returned.
    """
    lenient_parser = bibtexparser.bparser.BibTexParser()
    # be a bit lax about nonstandard entry types
    lenient_parser.ignore_nonstandard_types = False

    database = bibtexparser.loads(bibtex_entry, parser=lenient_parser)
    first_entry = database.entries[0]

    return first_entry["ID"]
# def get_citekey ends here
def modify_preceding_text(bibl_element, prefix_text, BRACKET="[]"):
    """Append the bibliographic prefix and an opening bracket to the
    text that precedes an element.

    If the element has a preceding sibling, the addition goes to the end
    of that sibling's tail; otherwise it goes to the end of the parent's
    text.

    bibl_element: element offering getprevious() and getparent().
    prefix_text: prefix of the citation, may be an empty string.
    BRACKET: pair of characters, the first one is used as opening bracket.
    """
    addition = f"{prefix_text} {BRACKET[0]}"

    preceding_element = bibl_element.getprevious()
    if preceding_element is None:
        # first child: the preceding text is stored on the parent
        parent_element = bibl_element.getparent()
        parent_element.text = f"{parent_element.text or ''}{addition}"
    else:
        preceding_element.tail = f"{preceding_element.tail or ''}{addition}"

    return
# def modify_preceding_text ends here
def create_citation_element(citation_dict, total_items, index_item, print_formatted_citation):
    """Create an XML element with zotero data.

    citation_dict: dict with the keys "formatted", "year", "citekey",
    "zotero_url" and "pagerange".
    total_items: number of items in the citation this element belongs to.
    index_item: position of this item within the citation, counted from 0.
    print_formatted_citation: if true, the formatted citation is put
    after the ref element.

    Returns a bibl element containing a ref element and, if a page range
    is given, a citedRange element.
    """
    bibl = etree.Element("bibl")

    formatted_citation = citation_dict['formatted']
    if total_items > 1:
        parts = formatted_citation.split(CITATION_SEPARATOR) if formatted_citation else []
        # Keep the complete formatted citation when it cannot be split
        # into one part per item (e.g. the citation style uses a
        # different separator) instead of failing with an IndexError.
        if index_item < len(parts):
            formatted_citation = parts[index_item]
        if index_item + 1 != total_items:
            bibl.tail = CITATION_SEPARATOR

    ref = etree.Element("ref")
    if print_formatted_citation:
        ref.tail = formatted_citation
    bibl.insert(0, ref)

    # add year or authoryear, need a good heuristic here
    if citation_dict["formatted"] == f"({citation_dict['year']})":
        ref.set("type", "year")
    else:
        ref.set("type", "authoryear")

    # point to the citekey, or to the zotero URL if there is no citekey
    if citation_dict["citekey"]:
        ref.set("target", f"#{citation_dict['citekey']}")
    else:
        ref.set("target", f"#{citation_dict['zotero_url']}")

    if citation_dict["pagerange"]:
        citedrange = etree.Element("citedRange")
        citedrange.text = citation_dict["pagerange"]
        bibl.append(citedrange)

    return bibl
# def create_citation_element ends here
def _apply_item_fields(citation_dict, citation_item):
    """Overwrite page range, zotero URL and year in citation_dict with
    the values found in citation_item."""
    citation_dict["pagerange"] = citation_item.get("locator")

    item_uris = citation_item.get("uri")
    if item_uris:
        citation_dict["zotero_url"] = item_uris[0]

    issued = (citation_item.get("itemData") or {}).get("issued") or {}
    try:
        citation_dict["year"] = issued["date-parts"][0][0]
    except (KeyError, IndexError, TypeError):
        # no usable date-parts: take the literal date, else keep the
        # year that is already in the dict
        citation_dict["year"] = issued.get("literal", citation_dict.get("year"))
# def _apply_item_fields ends here


def citation_item_to_bibl(citation_item, parsed_json, citekey_list, number_of_items, item_position, print_formatted_citation):
    """Wrapper function for citation items.

    This function makes use of the Zotero translation server
    (https://github.com/zotero/translation-server) for format
    conversion.

    The JSON found in each citationItem of the processing instruction is
    - modified to CSL JSON format
    - translated into Zotero API JSON format
    - exported to BibTeX

    A tei:bibl element is written that will replace the processing
    instruction in the XML.

    citation_item: the entry of "citationItems" to convert.
    parsed_json: the decoded JSON of the whole citation.
    citekey_list: list of citekeys already written to the bibliography
    file; a new citekey is appended to it.
    number_of_items: number of items in the citation.
    item_position: position of citation_item in the citation, from 0.
    print_formatted_citation: passed on to create_citation_element.

    Returns a tuple of the bibl element and the dict with the citation
    data.
    """
    citation_dict = get_info_from_json(parsed_json)
    # get_info_from_json only looks at the first citation item. Page
    # range, URL and year have to describe the item converted here,
    # otherwise every item of a multiple citation carries the values of
    # the first one. Prefix and suffix are left as they are.
    _apply_item_fields(citation_dict, citation_item)
    citation_dict["id"] = parsed_json.get("citationID")

    # convert csl json to zotero api json
    zotero_api_json_filename = convert_to_csl_json(citation_item)
    try:
        # first call to translation server
        zotero_api_json = import_csl_json(zotero_api_json_filename)
    finally:
        # remove the temporary file even if the server call fails
        os.unlink(zotero_api_json_filename)

    # second call to translation server
    bibtex_entry = create_bibtex(zotero_api_json)
    citekey = get_citekey(bibtex_entry)
    citation_dict["citekey"] = citekey

    # write every entry only once to the bibliography file
    if citekey not in citekey_list:
        citekey_list.append(citekey)
        write_to_bibfile(bibtex_entry)

    citation_element = create_citation_element(citation_dict, number_of_items, item_position, print_formatted_citation)

    return citation_element, citation_dict
# def citation_item_to_bibl ends here
def turn_pi_into_bibl(pi, citekey_list, print_formatted_citation):
    """Wrapper function for the conversion steps.

    The processing instruction is replaced in its parent by a temporary
    element "tmp_bib" that contains one bibl element per citation item.
    In the text after the processing instruction, the formatted citation
    is replaced by the closing bracket and the suffix; the prefix and the
    opening bracket are added to the text before it.

    pi: processing instruction node with a Zotero citation.
    citekey_list: list of citekeys already written to the bibliography
    file, passed on to citation_item_to_bibl.
    print_formatted_citation: passed on to citation_item_to_bibl.
    """
    # wrap a temporary element around citations
    tmp_element = etree.Element("tmp_bib")

    pi_json = parse_citation_pi(pi.text)
    parsed_json = parse_json(pi_json)

    # there can be more than one citation in one processing instruction
    items = parsed_json.get("citationItems") or []
    number_of_items = len(items)
    logging.info(f"Found {libeoaconvert.plural(number_of_items, 'item')} in this zotero citation.")

    if not items:
        # nothing to convert; without this guard citation_dict below
        # would be undefined
        logging.warning("Zotero citation without citation items, leaving the processing instruction untouched.")
        return

    # enumerate instead of items.index(): index() returns the position
    # of the first equal item, which is wrong if a work is cited twice
    for item_position, citation_item in enumerate(items):
        citation_element, citation_dict = citation_item_to_bibl(citation_item, parsed_json, citekey_list, number_of_items, item_position, print_formatted_citation)
        tmp_element.append(citation_element)

    formatted_citation = citation_dict['formatted']
    prefix_text = citation_dict["prefix"] or ""
    suffix_text = citation_dict["suffix"] or ""

    # remove formatted citation from tail; the tail is None if no text
    # follows the processing instruction
    pi_tail = pi.tail or ""
    if formatted_citation:
        # only the first occurrence belongs to this citation
        tmp_element.tail = pi_tail.replace(formatted_citation, f"{BRACKET[1]}{suffix_text}", 1)
    else:
        tmp_element.tail = pi_tail

    # replace processing instruction with bibl elements
    parent_element = pi.getparent()
    parent_element.replace(pi, tmp_element)

    modify_preceding_text(tmp_element, prefix_text, BRACKET)
# def turn_pi_into_bibl ends here
def add_bib_to_header(xmltree, BIBTEX_FILE):
    """Add a reference to the bibliography file to the TEI header.

    An element ab of type "database" with a ref child pointing to
    BIBTEX_FILE is appended to the first sourceDesc of the header.

    xmltree: parsed TEI document.
    BIBTEX_FILE: value for the target attribute of the ref element.
    """
    source_descriptions = xmltree.xpath(
        "/t:TEI/t:teiHeader/t:fileDesc/t:sourceDesc",
        namespaces=NS_MAP,
    )
    first_sourcedesc = source_descriptions[0]

    database_block = etree.Element("ab", type="database")
    bib_reference = etree.SubElement(database_block, "ref")
    bib_reference.set("target", BIBTEX_FILE)
    bib_reference.set("type", "please-specify-anthology-or-monograph")

    first_sourcedesc.append(database_block)
# def add_bib_to_header ends here
def cleanup_xml(xmltree):
    """Perform some cleanups.

    The temporary tmp_bib wrappers are dissolved (their content stays in
    place), and the attribute rend is removed from every element where
    it has the value 'Literaturverzeichnis1'.
    """
    etree.strip_tags(xmltree, "tmp_bib")

    for marked_element in xmltree.xpath("//*[@rend='Literaturverzeichnis1']"):
        del marked_element.attrib["rend"]
# def cleanup_xml ends here
def write_xml_output(tree, filename):
    """Write the modified tree to a file.

    tree: object offering a write() method, e.g. a parsed XML tree.
    filename: name of the output file.
    """
    output_options = {
        "pretty_print": True,
        "xml_declaration": True,
        "encoding": "utf-8",
    }
    tree.write(filename, **output_options)

    logging.info(f"Wrote {filename}.")
# def write_xml_output ends here
def main():
    """The main bit.

    Reads the XML file given on the command line, turns every Zotero
    citation found in it into bibl elements, adds a reference to the
    bibliography file to the header and writes the result to a new file
    whose name carries the suffix "-biblrefs" before the extension.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("xmlfile", help="XML file converted from Word, containing Zotero citations.")
    parser.add_argument("-f", "--format-citations", help="Print formatted citation in XML.", action="store_true")
    args = parser.parse_args()

    xmltree = etree.parse(args.xmlfile)

    citation_pis = find_citation_pis(xmltree)

    citekey_list = []
    for pi in citation_pis:
        turn_pi_into_bibl(pi, citekey_list, args.format_citations)

    cleanup_xml(xmltree)
    add_bib_to_header(xmltree, BIBTEX_FILE)

    # Build the output name from the split path: replacing ".xml" in the
    # whole string would leave a name without ".xml" unchanged (and so
    # overwrite the input file) and would also alter directory names
    # that contain ".xml".
    root, extension = os.path.splitext(args.xmlfile)
    write_xml_output(xmltree, f"{root}-biblrefs{extension}")
# def main ends here
# start the conversion only when this file is run as a script
if __name__ == "__main__":
    main()
# finis | |
# Still to do:
# prefix and suffix