Permalink
Switch branches/tags
Nothing to show
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
949 lines (747 sloc) 38.2 KB
#!/usr/bin/python3
# -*- coding: utf-8; mode: python -*-
__version__ = "1.0"
__date__ = "20180116"
__author__ = "kthoden@mpiwg-berlin.mpg.de"
__doc__ = """A converter from TEI to Django."""
import os
import sys
import logging
import json
import subprocess
import pickle
import shlex
import configparser
import libeoaconvert
from datetime import datetime
from bs4 import BeautifulSoup
from lxml import etree, objectify
from lxml.html import soupparser
# things to be done
# assign ids top to bottom for the following elements:
# div1 div2 div3 note item table EOAfigure EOAequation formula theorem
# Locate the configuration file next to this script.  The script path is made
# absolute first: when the script is started by bare name, dirname() of
# sys.argv[0] is empty and plain concatenation would produce the root path
# "/config/eoaconvert.cfg" instead of a path beside the script.
CONFIG_FILE = os.path.join(os.path.dirname(os.path.abspath(sys.argv[0])), "config", "eoaconvert.cfg")
# Reading the configuration file
CONFIG = configparser.ConfigParser()
CONFIG.read(CONFIG_FILE)
logging.basicConfig(level=logging.INFO, format=' %(asctime)s - %(levelname)s - %(message)s')
# TEI namespace and the prefix map used by the XPath queries in this module
ns_tei = "http://www.tei-c.org/ns/1.0"
NS_MAP = {"t" : ns_tei}
# Working directories; both are relative to the current working directory
TMP_DIR = os.path.expanduser("tmp_files")
OUTPUT_DIR = os.path.expanduser("CONVERT")
# Citation style file handed to pandoc via --csl
CSL_FILE = CONFIG['Auxiliaries']['CSL_FILE']
def get_publication_info(xml_tree):
    """Collect the publication metadata from the header of a TEI document.

    Single-valued fields are read with ``xml_tree.xpath`` and hold the first
    hit, or an empty string when an optional field is absent.  Multi-valued
    fields are read with ``xml_tree.findall`` and hold a list with the
    ``text`` of every hit.  A missing mandatory field ends the program via
    ``sys.exit``.

    Returns a dictionary keyed by ``eoa_*`` field names.
    """
    ns_tei = "http://www.tei-c.org/ns/1.0"
    ns_cc = "http://web.resource.org/cc/"
    ns_rdf = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
    NS_MAP = {"t" : ns_tei, "c" : ns_cc, "r" : ns_rdf}

    def get_field(xml_tree, query_path, mandatory=False, findall=False):
        """Run one metadata query.

        With ``findall`` the text of all matches is returned as a list.
        Otherwise the first XPath result is returned; if there is none, the
        program exits for mandatory fields and "" is returned for optional
        ones.
        """
        if findall is True:
            return [hit.text for hit in xml_tree.findall(query_path, namespaces=NS_MAP)]
        hits = xml_tree.xpath(query_path, namespaces=NS_MAP)
        if len(hits) > 0:
            return hits[0]
        if mandatory is True:
            sys.exit("Field stored in %s is mandatory. Exiting." % query_path)
        return ""
    # def get_field ends here

    # Mandatory values (according to database schema)
    mandatory_fields = (
        ('eoa_publicationdate', "//t:teiHeader/t:fileDesc/t:publicationStmt/t:date/@when"),
        ('eoa_language', "//t:teiHeader/t:profileDesc/t:langUsage/t:language/@ident"),
        ('eoa_license', "//t:teiHeader/t:fileDesc/t:publicationStmt/t:availability/t:licence/text()"),
        ('eoa_number', "//t:teiHeader/t:fileDesc/t:titleStmt/t:title[@level='s']/@n"),
        ('eoa_series', "//t:teiHeader/t:fileDesc/t:titleStmt/t:title[@level='s']/text()"),
        ('eoa_title', "//t:teiHeader/t:fileDesc/t:titleStmt/t:title[@type='main']/text()"),
    )
    # Optional (according to database schema)
    optional_fields = (
        ('eoa_subtitle', "//t:teiHeader/t:fileDesc/t:titleStmt/t:title[@type='sub']/text()"),
        ('eoa_isbn', "//t:teiHeader/t:fileDesc/t:publicationStmt/t:idno[@type='ISBN']/text()"),
        ('eoa_price', "//t:teiHeader/t:fileDesc/t:extent/t:measure[@unit='EUR']/@quantity"),
        ('eoa_shoplink_url', "//t:teiHeader/t:fileDesc/t:publicationStmt/t:distributor/@xml:base"),
        ('eoa_shoplink_id', "//t:teiHeader/t:fileDesc/t:publicationStmt/t:distributor/@xml:id"),
        ('eoa_shoplink_text', "//t:teiHeader/t:fileDesc/t:publicationStmt/t:distributor/text()"),
        ('eoa_brief_desc', "//t:teiHeader/t:fileDesc/t:sourceDesc/t:ab[@type='BriefDescription']/text()"),
        ('eoa_detail_desc', "//t:teiHeader/t:fileDesc/t:sourceDesc/t:ab[@type='DetailedDescription']/text()"),
        ('eoa_additional_info', "//t:teiHeader/t:fileDesc/t:sourceDesc/t:ab[@type='additionalinformation']/text()"),
        ('eoa_dedication', "//t:teiHeader/t:fileDesc/t:sourceDesc/t:ab[@type='dedication']/text()"),
    )
    # Fields that may occur several times; each becomes a list of strings
    repeated_fields = (
        ('eoa_submitters', "//t:teiHeader/t:fileDesc/t:titleStmt/t:editor[@role='submitter']"),
        ('eoa_publicationmanagers', "//t:teiHeader/t:fileDesc/t:titleStmt/t:editor[@role='publicationmanager']"),
        ('eoa_publicationassistants', "//t:teiHeader/t:fileDesc/t:titleStmt/t:editor[@role='publicationassistant']"),
        ('eoa_editorialcoordinators', "//t:teiHeader/t:fileDesc/t:titleStmt/t:editor[@role='editorialcoordinator']"),
        ('eoa_copyeditors', "//t:teiHeader/t:fileDesc/t:titleStmt/t:editor[@role='copyeditor']"),
        ('eoa_translators', "//t:teiHeader/t:fileDesc/t:titleStmt/t:editor[@role='translator']"),
        ('eoa_keywords', "//t:teiHeader/t:profileDesc/t:textClass/t:keywords/t:list/t:item"),
        ('eoa_authors', "//t:teiHeader/t:fileDesc/t:titleStmt/t:author"),
        ('eoa_editors', "//t:teiHeader/t:fileDesc/t:titleStmt/t:editor"),
    )

    info_dict = {}
    for field_name, query in mandatory_fields:
        info_dict[field_name] = get_field(xml_tree, query, mandatory=True)
    for field_name, query in optional_fields:
        info_dict[field_name] = get_field(xml_tree, query)
    for field_name, query in repeated_fields:
        info_dict[field_name] = get_field(xml_tree, query, findall=True)
    return info_dict
# def get_publication_info ends here
def make_publication_cfg(info_dict):
    """Write ``publication.cfg`` into OUTPUT_DIR from the collected metadata.

    ``info_dict`` is the dictionary returned by get_publication_info().  The
    file has the sections ``Technical``, ``General`` and ``Authors``.

    The program exits (after printing a message) when more than 8 keywords
    are given, when both authors and editors are present, when neither is
    present, or when there are more than 5 of either.  A publication date
    that is not in ``YYYY-MM-DD`` form raises ValueError from strptime.
    """
    config = configparser.ConfigParser(delimiters=(':',))
    # https://stackoverflow.com/questions/1611799/preserve-case-in-configparser
    config.optionxform = str

    # set up three main bits
    config['Technical'] = {}
    technical_config = config['Technical']
    config['General'] = {}
    general_config = config['General']
    config['Authors'] = {}
    authors_config = config['Authors']

    date_object = datetime.strptime(info_dict['eoa_publicationdate'], "%Y-%m-%d")

    # fill in the fields
    technical_config['Serie'] = info_dict['eoa_series']
    technical_config['Number'] = info_dict['eoa_number']
    technical_config['Title'] = info_dict['eoa_title']
    technical_config['Subtitle'] = info_dict['eoa_subtitle']
    technical_config['PublicationDate'] = info_dict['eoa_publicationdate']
    technical_config['PublicationYear'] = date_object.strftime("%Y")
    technical_config['ISBN'] = info_dict['eoa_isbn']
    technical_config['Price'] = info_dict['eoa_price']
    technical_config['Shoplink'] = """<a href="{0}{1}">{2}</a>""".format(info_dict['eoa_shoplink_url'], info_dict['eoa_shoplink_id'].replace("id_", ""), info_dict['eoa_shoplink_text'])
    technical_config['Language'] = info_dict['eoa_language']
    technical_config['License'] = info_dict['eoa_license']

    general_config['BriefDescription'] = info_dict['eoa_brief_desc']
    general_config['Submitter'] = ", ".join(info_dict['eoa_submitters'])
    general_config['PublicationManagment'] = ", ".join(info_dict['eoa_publicationmanagers'])
    general_config['PublicationAssistants'] = ", ".join(info_dict['eoa_publicationassistants'])

    if len(info_dict['eoa_keywords']) > 8:
        sys.exit("Too many Keywords. Up to 8 are allowed. Exiting.")
    # Number the keywords by position.  Looking the position up with
    # list.index() would give duplicate keywords the same label.
    for position, keyword in enumerate(info_dict['eoa_keywords'], start=1):
        general_config["Keyword" + str(position)] = keyword

    general_config['DetailedDescription'] = info_dict['eoa_detail_desc']
    general_config['AdditionalInformation'] = info_dict['eoa_additional_info']
    general_config['EditorialCoordination'] = ", ".join(info_dict['eoa_editorialcoordinators'])
    general_config['Copyediting'] = ", ".join(info_dict['eoa_copyeditors'])
    general_config['Dedication'] = info_dict['eoa_dedication']
    general_config['Translator'] = ", ".join(info_dict['eoa_translators'])

    number_of_authors = len(info_dict['eoa_authors'])
    number_of_editors = len(info_dict['eoa_editors'])

    if number_of_authors > 0 and number_of_editors > 0:
        print("Found both editor and authors. This is not permitted. Exiting")
        sys.exit()
    elif number_of_authors == 0 and number_of_editors == 0:
        print("Found neither editor nor authors. Please fill in. Exiting")
        sys.exit()
    elif number_of_authors > 5 or number_of_editors > 5:
        print("Only a maximum of 5 authors or editors allowed. Exiting")
        sys.exit()

    # After the checks above exactly one of the two lists holds 1 to 5 names.
    edited_volume = number_of_authors == 0
    if edited_volume:
        names = info_dict['eoa_editors']
        # singular or plural editor abbreviation in the publication language
        abbreviations = libeoaconvert.dict_ed if number_of_editors == 1 else libeoaconvert.dict_eds
        zusatz = "({})".format(abbreviations[info_dict['eoa_language']].capitalize())
    else:
        names = info_dict['eoa_authors']
        zusatz = ""

    # Always write Author1 to Author5; unused slots stay empty.
    for position in range(5):
        author_label = "Author" + str(position + 1)
        authors_config[author_label] = names[position] if position < len(names) else ""
        if position == 0:
            # keeps 'Zusatz' directly after 'Author1' in the written file
            authors_config['Zusatz'] = zusatz

    output_filename = OUTPUT_DIR + os.path.sep + "publication.cfg"
    with open(output_filename, 'w') as configfile:
        config.write(configfile)
    print("Wrote", output_filename)
# def make_publication_cfg ends here
def render_reference(list_of_xml_elements, cited_data):
    """Attach pre-formatted citation strings to reference elements.

    For every element two ``abbr`` children are appended: one with
    ``type="authoryear"`` whose text is item 1 of the citekey's entry in
    ``cited_data``, and one with ``type="title"`` whose text is item 2.  The
    citekey is the ``target`` of the element's ``t:ref`` child without its
    first character.

    Meant for output formats that have no bibliographic formatter of their
    own.
    """
    abbr_sources = (("authoryear", 1), ("title", 2))
    for reference in list_of_xml_elements:
        target = reference.xpath("t:ref/@target", namespaces=NS_MAP)[0]
        citekey = target[1:]
        for abbr_type, position in abbr_sources:
            abbr = etree.SubElement(reference, "abbr", type=abbr_type)
            abbr.text = cited_data[citekey][position]
# def render_reference ends here
def write_citation_markdown(used_citekeys, citations_filename):
    """Write the markdown file that pandoc renders into formatted citations.

    The file is created in TMP_DIR.  After a YAML header it lists every
    citekey three times, once per section, each time in a different pandoc
    citation syntax, and ends with an empty ``References`` heading.
    """
    # (section heading, per-citekey line template)
    sections = (
        ("# citeauthoryear\n", "[@%s]\n"),
        ("\n# citeyear\n", "[-@%s]\n"),
        ("\n# yearparen\n", "@%s\n"),
    )
    markdown_path = TMP_DIR + os.path.sep + citations_filename
    with open(markdown_path, "w") as citation_formatter:
        citation_formatter.write("---\nlang: en\ntitle: Citations\n...\n\n")
        for heading, line_template in sections:
            citation_formatter.write(heading)
            for entry in used_citekeys:
                citation_formatter.write(line_template % entry)
        citation_formatter.write("\n# References\n")
    logging.info("Wrote citation formatter.")
# def write_citation_markdown ends here
def format_reference_list(used_citekeys, html_file):
    """Return the reference list element from a pandoc HTML file.

    ``html_file`` is read from TMP_DIR and parsed; the first ``div`` with
    ``class="references"`` is returned.  ``used_citekeys`` is accepted but
    not used.
    """
    html_path = TMP_DIR + os.path.sep + html_file
    with open(html_path, "r") as html_handle:
        parsed_html = soupparser.fromstring(html_handle, features="html.parser")
    reference_divs = parsed_html.xpath("//div[@class='references']")
    return reference_divs[0]
# def format_reference_list ends here
def format_citations(used_citekeys, bibdata, html_file):
    """Return the formatted citation strings for every used citekey.

    Args:
        used_citekeys: iterable of citekeys to look up.
        bibdata: list of bibliography records (dicts with at least ``id``;
            ``title`` is read for the records that are actually cited).
        html_file: name of the pandoc-rendered HTML file inside TMP_DIR.

    Returns:
        dict mapping each citekey to a tuple
        ``(authoryear_citation, year_citation, title)``.  The two citation
        strings are taken from the rendered HTML with their first and last
        character removed.

    Exits the program when a citekey has no record in ``bibdata``.
    """
    with open(TMP_DIR + os.path.sep + html_file, "r") as ding:
        cites = BeautifulSoup(ding, "html.parser")

    # Index the records once.  As with a sequential scan, a later record
    # with the same id wins.
    records_by_id = {record["id"]: record for record in bibdata}

    citation_dict = {}
    for entry in used_citekeys:
        # Without this check an unknown citekey would silently inherit the
        # title of the previously processed entry (or fail on an unbound
        # name for the very first one).
        if entry not in records_by_id:
            print("Citekey %s was not found in the references. Exiting." % entry)
            sys.exit()
        title = records_by_id[entry]["title"]
        authoryear_citation = cites.select("#citeauthoryear ~ p > span[data-cites='%s']" % entry)[0].text[1:-1]
        year_citation = cites.select("#citeyear ~ p > span[data-cites='%s']" % entry)[0].text[1:-1]
        citation_dict[entry] = (authoryear_citation, year_citation, title)
    return citation_dict
# def format_citations ends here
def format_pagerange(pagerange_start, pagerange_end):
    """Format the ``from``/``to`` values of a citedRange as a page range.

    Either argument may be None.  With both values present they are joined
    by an en dash ("37" and "44" give "37–44"); with only one value that
    value is returned unchanged; with none the result is "".
    """
    # Joining without a separator would turn pages 37 to 44 into "3744".
    present = [value for value in (pagerange_start, pagerange_end) if value is not None]
    return "\u2013".join(present)
# def format_pagerange ends here
def format_authors(list_author_id, publang):
    """Retrieve author names from respStmt entries and format them.

    Args:
        list_author_id: references of the form ``#id``; the first character
            is dropped and the rest is matched against the ``xml:id`` of a
            ``respStmt``.  Blank entries are ignored.
        publang: key into ``libeoaconvert.dict_and`` for the word joining
            the last two names.

    Returns:
        "" for no authors, "Forename Surname" for one, and otherwise the
        names separated by ", " with the last one joined by the "and" word.

    Exits the program when an id has no matching ``respStmt``.

    NOTE: the names are looked up in the module-level ``xml_tree`` that the
    ``__main__`` block creates; it is not passed in as an argument.
    """
    formatted_list = []
    for author in list_author_id:
        author_ref = author.strip()
        if not author_ref:
            # e.g. produced by splitting "" or a doubled space
            continue
        tmp_xpath = "//t:respStmt[@xml:id='%s']" % author_ref[1:]
        author_xml = xml_tree.xpath(tmp_xpath, namespaces=NS_MAP)
        if len(author_xml) == 0:
            print("No respStmt found for author reference %s. Exiting." % author_ref)
            sys.exit()
        surname = author_xml[0].find("t:persName/t:surname", namespaces=NS_MAP).text
        forename = author_xml[0].find("t:persName/t:forename", namespaces=NS_MAP).text
        formatted_list.append("{} {}".format(forename, surname))

    if len(formatted_list) == 0:
        return ""
    if len(formatted_list) == 1:
        return formatted_list[0]
    # "A and B" for two names, "A, B and C" for more
    leading_names = ", ".join(formatted_list[0:-1])
    return "{} {} {}".format(leading_names, libeoaconvert.dict_and[publang], formatted_list[-1])
# def format_authors ends here
def transform_body(xml_tree, cited_data, publang):
    """Transform the body of the TEI document into intermediate XML.

    The tree is modified in place, one kind of element after the other:
    document structure, paragraphs, citations, footnotes, figures, ``hi``
    elements and cross references.  All queries start with ``//`` and thus
    search the whole document that ``xml_tree`` belongs to.

    Args:
        xml_tree: element of the parsed TEI document (the ``__main__`` block
            passes the ``body`` element).
        cited_data: mapping from citekey to a sequence of three strings as
            built by format_citations(): item 0 is used as citation text,
            item 1 is used instead for refs of type ``inline``, item 2
            becomes the ``data-content`` attribute.
        publang: language code; set as ``language`` attribute of chapters
            and passed on to format_authors().

    Returns:
        ``xml_tree``.

    Exits the program when a citedRange has both text and a ``from``
    attribute, when a citekey is unknown, when a footnote is not the child
    of a ``p`` element, or when a ``ref`` has no target.
    """
    logging.info("Performing XML transformations of the body.")

    ######################
    # Document structure #
    ######################
    eoa_chapters = xml_tree.xpath("//t:div[@type='chapter']", namespaces=NS_MAP)
    for chapter in eoa_chapters:
        chapter.tag = "div1"
        chapter.set("language", publang)
        chapter_title = chapter.find("t:head", namespaces=NS_MAP)
        # resp is optional: a missing attribute yields no authors instead of
        # a crash, and split() without argument drops blank entries.
        list_author_id = (chapter.get("resp") or "").split()
        if list_author_id:
            author_string = format_authors(list_author_id, publang)
            eoa_author = etree.Element("EOAauthor")
            eoa_author.text = author_string
            chapter_title.insert(0, eoa_author)
        # make the head the first child of the chapter
        chapter.insert(0, chapter_title)

    eoa_sections = xml_tree.xpath("//t:div[@type='section']", namespaces=NS_MAP)
    for section in eoa_sections:
        section.tag = "div2"

    eoa_subsections = xml_tree.xpath("//t:div[@type='subsection']", namespaces=NS_MAP)
    for subsection in eoa_subsections:
        subsection.tag = "div3"

    eoa_subsubsections = xml_tree.xpath("//t:div[@type='subsubsection']", namespaces=NS_MAP)
    for subsubsection in eoa_subsubsections:
        subsubsection.tag = "div4"

    ##############
    # Paragraphs #
    ##############
    eoa_paragraphs = xml_tree.xpath("//t:p[not(@rend='footnote text')]", namespaces=NS_MAP)
    for paragraph in eoa_paragraphs:
        # assigning the bare name removes the TEI namespace from the tag
        paragraph.tag = "p"
        if paragraph.get("rend") == "Quote":
            paragraph.set("rend", "quoted")

    #############
    # Citations #
    #############
    # Each bibl becomes a popover span, e.g.
    # <span rel="popover" class="citation" citekey="monti_tradizione_2011"
    #       data-toggle="popover" html="true" data-placement="bottom"
    #       data-title="Monti " data-content="La tradizione ...">Monti </span>
    eoa_citations = xml_tree.xpath("//t:bibl", namespaces=NS_MAP)
    for citation in eoa_citations:
        pagerange = ""
        cited_range = citation.xpath("t:citedRange", namespaces=NS_MAP)
        citeref = citation.xpath("t:ref", namespaces=NS_MAP)
        cite_render = citeref[0].get("type")
        citekey = citeref[0].get("target")[1:]
        # marked here, removed later with strip_tags by the caller
        citeref[0].tag = "tagtobestripped"

        citation.tag = "span"
        citation.set("rel", "popover")
        citation.set("class", "citation")
        citation.set("citekey", citekey)
        citation.set("data-toggle", "popover")
        citation.set("html", "true")
        citation.set("data-placement", "bottom")

        if len(cited_range) > 0:
            range_text = cited_range[0].text
            range_from = cited_range[0].get("from")
            if range_text is not None and range_from is not None:
                print("You must not use 'from' attribute and text in citedRange at the same time. Exiting.")
                sys.exit()
            elif range_text is not None:
                # might contain markup!
                pagerange = ", {}".format(range_text)
                # clear the text
                cited_range[0].text = ""
            elif range_from is not None:
                pagerange = ", " + format_pagerange(range_from, cited_range[0].get("to"))
            cited_range[0].tag = "tagtobestripped"

        try:
            citation_forms = cited_data[citekey]
        except KeyError:
            print("Citekey %s was not found in the references. Exiting." % citekey)
            sys.exit()
        if cite_render == 'inline':
            formatted_citation = citation_forms[1] + pagerange
        else:
            formatted_citation = citation_forms[0] + pagerange

        citation.text = formatted_citation
        citation.set("data-title", formatted_citation)
        citation.set("data-content", citation_forms[2])

    #############
    # Footnotes #
    #############
    # TEI:          <note place="bottom" xml:id="ftn2" n="2">
    # intermediate: <note id-text="34" id="uid40" place="Inline"><p>One reads</note>
    eoa_footnotes = xml_tree.xpath("//t:note[@place='bottom']", namespaces=NS_MAP)
    for footnote in eoa_footnotes:
        # re-assign tag here to get rid of namespace
        footnote.tag = "note"
        footnote.set("place", "Inline")
        footnote.set("id-text", footnote.get("n"))

        # we assert here that the parent of a footnote is always a paragraph
        fn_parent = footnote.getparent()
        if fn_parent.tag != "p":
            # the id is only needed for the message, so a footnote without
            # xml:id does not fail when its parent is fine
            footnote_id = footnote.get("{http://www.w3.org/XML/1998/namespace}id")
            print("This footnote's parent is not a p element: %s. Exiting." % footnote_id)
            sys.exit()

        fn_paragraphs = footnote.xpath("t:p", namespaces=NS_MAP)
        for fn_paragraph in fn_paragraphs:
            fn_paragraph.tag = "p"
            fn_paragraph.attrib.pop("rend", None)

    ###########
    # Figures #
    ###########
    # TEI:
    # <figure><graphic url="figures/Fig.3.png"/><head>Latin inscription</head></figure>
    # intermediate:
    # <EOAfigure id="uid21">
    #   <anchor id-text="1" id="uid21"/>
    #   <p>
    #     <caption>An example of the titles</caption>
    #     <file>images/Figure1-1_BenedettiSignature.jpg</file>
    #     <width>60</width>
    #   </p>
    # </EOAfigure>
    eoa_figures = xml_tree.xpath("//t:figure", namespaces=NS_MAP)
    for figure in eoa_figures:
        figure.tag = "EOAfigure"
        # placeholder; the caller assigns the real ids afterwards
        figure.set("id", "anotheruid")
        etree.SubElement(figure, "anchor")

        # NOTE(review): figures of type "hionly" get no caption/file/width
        # here; this nesting was reconstructed from a source without
        # indentation - confirm against the repository.
        if figure.get("type") != "hionly":
            # careful, caption can contain markup!
            caption_element = figure.xpath("t:head", namespaces=NS_MAP)[0]
            caption_element.tag = "caption"
            fig_p_element = etree.SubElement(figure, "p")
            etree.SubElement(fig_p_element, "file").text = figure.xpath("t:graphic/@url", namespaces=NS_MAP)[0]
            etree.SubElement(fig_p_element, "width").text = "60" #whatever
            fig_p_element.append(caption_element)
            etree.strip_elements(figure, "{%s}graphic" % ns_tei)

    ##############
    # Hi-Element #
    ##############
    eoa_hi = xml_tree.xpath("//t:hi", namespaces=NS_MAP)
    for hi in eoa_hi:
        rend_attribute = hi.get("rend")
        if rend_attribute == "italic":
            hi.set("rend", "it")
        elif rend_attribute == "sup":
            hi.tag = "EOAup"
            del hi.attrib["rend"]
        elif rend_attribute == "sub":
            hi.tag = "EOAdown"
            del hi.attrib["rend"]
        else:
            logging.info("The rend attribute in hi has the value %s. This is not supported" % rend_attribute)

    ##############
    # References #
    ##############
    eoa_ref = xml_tree.xpath("//t:body//t:ref", namespaces=NS_MAP)
    for ref in eoa_ref:
        # Refs that belong to a citation are not cross references.  The tag
        # has to be compared (an element never equals a string); by now a
        # bibl has been renamed to a span of class "citation".
        ref_parent = ref.getparent()
        if ref_parent is not None:
            if ref_parent.tag in ("bibl", "{%s}bibl" % ns_tei):
                continue
            if ref_parent.tag == "span" and ref_parent.get("class") == "citation":
                continue

        target_attribute = ref.get("target")
        # covers both a missing and an empty target attribute
        if not target_attribute:
            print("Found a ref element without target. Exiting.")
            sys.exit()

        if ref.get("type") == "url":
            del ref.attrib["type"]
            del ref.attrib["target"]
            ref.tag = "xref"
            ref.set("url", target_attribute)
        else:
            ref.tag = "EOAref"
            del ref.attrib["target"]
            etree.SubElement(ref, "ref", teitarget=target_attribute)
            etree.SubElement(ref, "Label").text = target_attribute

    return xml_tree
# def transform_body ends here
def assign_ids(xml_tree, data):
    """Number chapters, figures and (sub)sections and record the numbers.

    Walks all ``div1`` elements of ``xml_tree``.  ``id-text`` attributes are
    set on numbered chapters, on figure anchors, and on numbered ``div2`` and
    ``div3`` elements.  The assigned numbers are also collected in
    dictionaries keyed by the elements' ``id`` attribute, which are stored in
    ``data`` under ``chapterdict``, ``figdict``, ``eqdict``, ``fndict``,
    ``listdict``, ``pagelabeldict``, ``secdict``, ``tabdict`` and
    ``theoremdict``.  Several of these stay empty (see the comment near the
    end).

    Returns the tuple ``(xml_tree, data)``.

    NOTE(review): the nesting of the counter increments below was
    reconstructed from a copy of the source without indentation - confirm
    against the repository.
    """
    chapterdict = {}
    figdict = {}
    eqdict = {}
    fndict = {}
    listdict = {}
    pagelabeldict = {}
    secdict = {}
    tabdict = {}
    theoremdict = {}
    chapter_counter = 1
    xml_chapters = xml_tree.xpath("//div1")
    for chapter in xml_chapters:
        # per-chapter counters; only section_counter is used further down
        equation_counter = 1
        footnote_counter = 1
        list_counter = 1
        section_counter = 1
        table_counter = 1
        theorem_counter = 1
        # chapters marked rend="nonumber" get neither id-text nor a dict entry
        if chapter.get('rend') != "nonumber":
            chapter.set("id-text", str(chapter_counter))
            chapterdict[chapter.get("id")] = str(chapter_counter)
        # figures are numbered "<chapter>.<figure>", restarting in each chapter
        figure_anchors = chapter.findall(".//EOAfigure/anchor")
        figure_counter = 1
        for anchor in figure_anchors:
            figure_number = "%d.%d" % (chapter_counter, figure_counter)
            anchor.set("id-text", figure_number)
            figure_counter += 1
            # the figure element takes over the id of its anchor
            figure_element = anchor.getparent()
            figure_element.set("id", anchor.get("id"))
            figdict[anchor.get("id")] = figure_number
        # footnotes keep the number given in their n attribute
        footnotes = chapter.findall(".//note")
        for footnote in footnotes:
            fndict[footnote.get("id")] = footnote.get("n")
        # sections are numbered "<chapter>.<section>"
        sections = chapter.findall(".//div2")
        section_counter = 1
        for section in sections:
            if section.get('rend') != "nonumber":
                section_number = "%d.%d" % (chapter_counter, section_counter)
                section.set("id-text", section_number)
                secdict[section.get("id")] = section_number
                # subsections are numbered "<chapter>.<section>.<subsection>"
                subsection_counter = 1
                subsections = section.findall(".//div3")
                for subsection in subsections:
                    if subsection.get('rend') != "nonumber":
                        subsection_number = "%d.%d.%d" % (chapter_counter, section_counter, subsection_counter)
                        subsection.set("id-text", subsection_number)
                        secdict[subsection.get("id")] = subsection_number
                        subsection_counter += 1
                section_counter += 1
        chapter_counter += 1
    # not implemented yet: equation, list, pagelabel, tab, theorem
    data["chapterdict"] = chapterdict
    data["figdict"] = figdict
    data["eqdict"] = eqdict
    data["fndict"] = fndict
    data["listdict"] = listdict
    data["pagelabeldict"] = pagelabeldict
    data["secdict"] = secdict
    data["tabdict"] = tabdict
    data["theoremdict"] = theoremdict
    return xml_tree, data
# def assign_ids ends here
def update_ids(xml_tree):
    """Point every EOAref at the ``id`` of the element it refers to.

    For each ``EOAref`` the text of its ``Label`` child, without the first
    character, is taken as an ``xml:id``.  The element carrying that
    ``xml:id`` is looked up and its ``id`` attribute is written to the
    ``target`` attribute of the EOAref's ``ref`` child.

    Returns ``xml_tree``.  Exits the program when an ``xml:id`` is not found
    or occurs more than once.
    """
    for xml_reference in xml_tree.findall(".//EOAref"):
        eoa_reference = xml_reference.find("ref")
        label_text = xml_reference.find("Label").text[1:]
        logging.debug("label text is %s" % label_text)

        matching_elements = xml_tree.xpath("//*[@xml:id='{}']".format(label_text))
        if len(matching_elements) == 0:
            print("There seems to be no corresponding xml:id for %s. Exiting." % label_text)
            sys.exit()
        if len(matching_elements) > 1:
            # report the offending id, not the list of matched elements
            print("The xml:id %s has been assigned more than once. This is not allowed. Exiting." % label_text)
            sys.exit()

        eoa_reference.set("target", matching_elements[0].get("id"))
    return xml_tree
# def update_ids ends here
def prepare_bibliography(bib_data):
    """Create a JSON version of bibliography data, using pandoc-citeproc.

    Runs ``pandoc-citeproc --bib2json`` on ``bib_data["source"]``, stores
    the raw output as ``tmp-bib.json`` in TMP_DIR and returns the parsed
    JSON.

    Exits the program when pandoc-citeproc reports a non-zero exit status.
    """
    interim_bib_json_file = TMP_DIR + os.path.sep + "tmp-bib.json"

    # An argument list keeps a file name with spaces in one piece, and run()
    # waits for the process so that its exit status can be checked.
    citeproc_arguments = ["pandoc-citeproc", "--bib2json", bib_data["source"]]
    citeproc_process = subprocess.run(citeproc_arguments, stdout=subprocess.PIPE)
    if citeproc_process.returncode != 0:
        print("pandoc-citeproc failed for %s with exit status %d. Exiting." % (bib_data["source"], citeproc_process.returncode))
        sys.exit()

    citeproc_json = citeproc_process.stdout.decode('utf-8')
    citations_json = json.loads(citeproc_json)

    # write with an explicit encoding instead of the locale default
    with open(interim_bib_json_file, 'w', encoding='utf-8') as json_file:
        json_file.write(citeproc_json)
    logging.info("Wrote json file")
    return citations_json
# def prepare_bibliography ends here
def add_bibliography_monograph(xml_tree, refs_for_bib_chapter):
    """Insert an unnumbered bibliography chapter into the document.

    A ``div1`` with the head "Bibliography" and two nested ``div`` elements
    is built.  Every ``div`` found in ``refs_for_bib_chapter`` is turned
    into a ``p`` of class ``bibliography`` (inner ``p`` tags stripped,
    ``em`` renamed to ``i``) and moved into the inner ``div``.  The chapter
    is inserted into the root element at index "number of div1 elements + 1".

    Returns the root element of ``xml_tree``.
    """
    root_element = xml_tree.getroot()
    number_of_chapters = len(root_element.xpath("//div1"))

    # this needs to be configurable by language
    bibliography_chapter = etree.Element("div1", rend="nonumber", language="english")
    heading = etree.SubElement(bibliography_chapter, "head")
    heading.text = "Bibliography"
    outer_div = etree.SubElement(bibliography_chapter, "div")
    inner_div = etree.SubElement(outer_div, "div")

    for reference in refs_for_bib_chapter.findall(".//div"):
        reference.set("class", "bibliography")
        etree.strip_tags(reference, "p")
        reference.tag = "p"
        for emphasis in reference.findall(".//em"):
            emphasis.tag = "i"
        inner_div.append(reference)

    root_element.insert(number_of_chapters + 1, bibliography_chapter)
    return root_element
# def add_bibliography_monograph ends here
def add_bibliography_anthology(xml_tree, formatted_references_dict):
    """Put each chapter's reference list in place of its bibliography marker.

    The keys of ``formatted_references_dict`` have the form
    ``dict_<chapter id>``.  Inside the ``div1`` with that ``xml:id`` every
    ``eoa`` processing instruction whose text is ``printbibliography`` is
    replaced: a new ``div`` holding the references (after fix_bib_entries)
    is appended to the instruction's parent and the instruction is removed.

    Resulting structure, for example:
    <div><div><p id="ref-adami_storia_1737" class="bibliography">
    Adami, Andrea (1737). <i>Storia Di Volseno ...</i>. ...
    </p>

    Returns ``xml_tree``.
    """
    for dict_key, chapter_references in formatted_references_dict.items():
        chapter_id = dict_key.replace("dict_", "")
        pi_query = "//div1[@xml:id='%s']//processing-instruction('eoa')" % chapter_id
        for eoa_pi in xml_tree.xpath(pi_query):
            if eoa_pi.text != "printbibliography":
                continue
            # assuming there's only one
            bibliography_parent = eoa_pi.getparent()
            fixed_references = fix_bib_entries(chapter_references)
            extra_div = etree.SubElement(bibliography_parent, "div")
            extra_div.insert(1, fixed_references)
            bibliography_parent.remove(eoa_pi)
    return xml_tree
# def add_bibliography_anthology ends here
def fix_bib_entries(div_snippet):
    """Adapt the reference list HTML produced by pandoc-citeproc.

    Every ``div`` below ``div_snippet`` gets ``class="bibliography"``, loses
    its inner ``p`` tags (their content is kept) and is renamed to ``p``;
    ``em`` elements inside become ``i``.  The snippet is changed in place
    and returned.
    """
    for reference in div_snippet.findall(".//div"):
        reference.set("class", "bibliography")
        etree.strip_tags(reference, "p")
        reference.tag = "p"
        for emphasis in reference.findall(".//em"):
            emphasis.tag = "i"
    return div_snippet
# def fix_bib_entries ends here
if __name__ == '__main__':
    # Exactly one argument, the TEI file, is accepted.
    if len(sys.argv) == 1:
        print("You must specify an input file!")
        sys.exit()
    elif len(sys.argv) > 2:
        print("You can work with only one publication at a time!")
        sys.exit()

    if not os.path.exists(TMP_DIR):
        os.mkdir(os.path.expanduser(TMP_DIR))

    # NOTE(review): data.pickle must already exist in TMP_DIR; for an
    # anthology it has to provide data["citekeys_by_chapter"].  pickle must
    # only be used on files from a trusted source.
    with open(TMP_DIR + os.path.sep + 'data.pickle', 'rb') as f:
        data = pickle.load(f)

    tei_document = sys.argv[-1]
    # format_authors() reads this module-level name
    xml_tree = etree.parse(tei_document)
    publication_language = xml_tree.xpath("//t:teiHeader/t:profileDesc/t:langUsage/t:language/@ident", namespaces=NS_MAP)[0]

    bib_data = {}
    bib_data["source"] = xml_tree.xpath("//t:teiHeader/t:fileDesc/t:sourceDesc/t:ab[@type='bibliography']/t:ref/@target", namespaces=NS_MAP)[0]
    bib_data["type"] = xml_tree.xpath("//t:teiHeader/t:fileDesc/t:sourceDesc/t:ab[@type='bibliography']/t:ref/@type", namespaces=NS_MAP)[0]
    logging.info("The bibfile is %s and this publication type is %s." % (bib_data["source"], bib_data["type"]))
    if bib_data["type"] not in ["monograph", "anthology", "monograph-numeric", "anthology-numeric"]:
        print("The bibliography type %s is not allowed." % bib_data["type"])
        # stop here; continuing would only fail later on an undefined name
        sys.exit()

    citations_json = prepare_bibliography(bib_data)

    # collect the citekeys in document order, without duplicates
    all_citations = xml_tree.xpath("//t:bibl/t:ref", namespaces=NS_MAP)
    all_citekeys = []
    for citation in all_citations:
        citekey = citation.get("target")[1:]
        if citekey not in all_citekeys:
            all_citekeys.append(citekey)
    used_citekeys = all_citekeys

    # let pandoc render all citations once for the whole publication
    citations_filename_markdown = "used_citations.md"
    citations_filename_html = "formatted_citations.html"
    write_citation_markdown(used_citekeys, citations_filename_markdown)
    markdown_command = "pandoc -o %s -t html --filter=pandoc-citeproc --bibliography=%s --csl=%s %s" % (TMP_DIR + os.path.sep + citations_filename_html, bib_data["source"], CSL_FILE, TMP_DIR + os.path.sep + citations_filename_markdown)
    arguments = shlex.split(markdown_command)
    logging.info("Using external command pandoc: %s." % markdown_command)
    subprocess.call(arguments)
    logging.info("Finished processing the bibtex file.")

    logging.info("Formatting citations now.")
    cited_dict = format_citations(used_citekeys, citations_json, citations_filename_html)

    if bib_data["type"] == "monograph":
        refs_for_bib_chapter = format_reference_list(used_citekeys, citations_filename_html)
    elif bib_data["type"] == "anthology":
        # one reference list per chapter
        formatted_references_dict = {}
        all_chapter_ids = xml_tree.xpath("//t:div[@type='chapter']/@xml:id", namespaces=NS_MAP)
        for chapter_id in all_chapter_ids:
            used_citekeys_per_chapter = data["citekeys_by_chapter"][chapter_id]
            citations_filename_markdown = chapter_id + ".md"
            citations_filename_html = "formatted_citations_" + chapter_id + ".html"
            write_citation_markdown(used_citekeys_per_chapter, citations_filename_markdown)
            markdown_command = "pandoc -o %s -t html --filter=pandoc-citeproc --bibliography=%s --csl=%s %s" % (TMP_DIR + os.path.sep + citations_filename_html, bib_data["source"], CSL_FILE, TMP_DIR + os.path.sep + citations_filename_markdown)
            arguments = shlex.split(markdown_command)
            logging.info("Using external command pandoc: %s." % markdown_command)
            subprocess.call(arguments)
            refs_for_bib_chapter = format_reference_list(used_citekeys_per_chapter, citations_filename_html)
            tmp_dict_key = "dict_" + chapter_id
            # create a dictionary entry containing the formatted references
            formatted_references_dict[tmp_dict_key] = refs_for_bib_chapter

    tei_body = xml_tree.xpath("//t:body", namespaces=NS_MAP)[0]
    body_transformed = transform_body(tei_body, cited_dict, publang=publication_language)
    resulting_tree = etree.ElementTree(body_transformed)

    tmp_output_filename = TMP_DIR + os.path.sep + "after_transformation.xml"
    resulting_tree.write(tmp_output_filename, pretty_print=True, xml_declaration=True, encoding="utf-8")
    logging.info("Wrote %s for debugging purposes." % tmp_output_filename)

    if bib_data["type"] == "monograph":
        xml_add_bib = add_bibliography_monograph(resulting_tree, refs_for_bib_chapter)
    elif bib_data["type"] == "anthology":
        xml_add_bib = add_bibliography_anthology(resulting_tree, formatted_references_dict)
    else:
        # the "-numeric" types pass the check above but have no handling
        print("The bibliography type %s is not supported yet. Exiting." % bib_data["type"])
        sys.exit()

    etree.strip_tags(xml_add_bib, "tagtobestripped")

    # give every element that can be referenced a running id
    elements_with_ids = xml_add_bib.xpath("//div1 | //div2 | //div3 | //note | //item | //table | //EOAfigure/anchor | //EOAequation | //formula | //theorem")
    element_counter = 1
    for element in elements_with_ids:
        element.set("id", "uid" + str(element_counter))
        element_counter += 1

    assigned_ids, data_to_pickle = assign_ids(resulting_tree, data)
    updated_xml_tree = update_ids(assigned_ids)

    xml_root = updated_xml_tree.getroot()
    xml_root.tag = "Book"
    final_tree = etree.ElementTree(xml_root)

    with open(TMP_DIR + os.path.sep + 'data.pickle', 'wb') as f:
        # Pickle the 'data' dictionary using the highest protocol available.
        pickle.dump(data_to_pickle, f, pickle.HIGHEST_PROTOCOL)

    if not os.path.exists(OUTPUT_DIR):
        os.mkdir(os.path.expanduser(OUTPUT_DIR))

    publication_info = get_publication_info(xml_tree)
    make_publication_cfg(publication_info)

    output_filename = TMP_DIR + os.path.sep + "IntermediateXMLFile.xml"
    final_tree.write(output_filename, pretty_print=True, xml_declaration=True, encoding="utf-8")
    logging.info("Wrote %s." % output_filename)

    # Remove namespace info (brute force solution).  The file was written as
    # UTF-8 above, so it is read and rewritten as UTF-8 as well instead of
    # relying on the locale's default encoding.
    bad_ns_string = ' xmlns="http://www.tei-c.org/ns/1.0"'
    with open(output_filename, 'r', encoding='utf-8') as textfile:
        xml_as_string = textfile.read()
    removed_namespace = xml_as_string.replace(bad_ns_string, "")
    with open(output_filename, 'w', encoding='utf-8') as amended_textfile:
        amended_textfile.write(removed_namespace)
# finis