Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
EOASkripts/tei2imxml.py
Go to file
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
691 lines (536 sloc)
27.1 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python3
# -*- coding: utf-8; mode: python -*-
"""A converter from TEI to Django."""

# Module metadata.
__version__ = "1.0"
__date__ = "20180116"
__author__ = "kthoden@mpiwg-berlin.mpg.de"
import configparser
import copy
import json
import logging
import os
import pickle
import shlex
import subprocess
import sys
from datetime import datetime

from bs4 import BeautifulSoup
from lxml import etree, objectify
from lxml.html import soupparser
# things to be done
# assign ids top to bottom for the following elements:
# div1 div2 div3 note item table EOAfigure EOAequation formula theorem
# Location of the converter configuration, resolved relative to the script.
# os.path.join is used instead of string concatenation: when the script is
# started as "python3 tei2imxml.py" the dirname is empty, and concatenating
# "" + os.path.sep + "config" would point at the absolute path
# "/config/eoaconvert.cfg" instead of the relative "config/eoaconvert.cfg".
CONFIG_FILE = os.path.join(os.path.dirname(sys.argv[0]), "config", "eoaconvert.cfg")

# Reading the configuration file
# (ConfigParser.read() skips files it cannot open, so a missing file only
# shows up as a KeyError in the CSL_FILE lookup below.)
CONFIG = configparser.ConfigParser()
CONFIG.read(CONFIG_FILE)

logging.basicConfig(level=logging.INFO, format=' %(asctime)s - %(levelname)s - %(message)s')

# TEI namespace and the prefix map used by the XPath queries in this module.
ns_tei = "http://www.tei-c.org/ns/1.0"
NS_MAP = {"t" : ns_tei}

# Working directories, both relative to the current working directory.
TMP_DIR = os.path.expanduser("tmp_files")
OUTPUT_DIR = os.path.expanduser("CONVERT")

# Citation style file handed to pandoc via --csl.
CSL_FILE = CONFIG['Auxiliaries']['CSL_FILE']
def get_publication_info(xml_tree):
    """Query the TEI document for metadata fields.

    xml_tree must offer ``xpath(path, namespaces=...)`` and
    ``findall(path, namespaces=...)`` (the caller passes the parsed TEI
    document).

    Return a dictionary with 24 ``eoa_*`` keys:

    * single-valued fields hold the first XPath match, or "" when an
      optional field is absent,
    * list-valued fields (editors by role, keywords, authors) hold the
      text of every matching element; an element without text
      contributes "" so that the lists can always be joined as strings.

    The program exits through sys.exit() when a mandatory field is
    missing.
    """

    ns_tei = "http://www.tei-c.org/ns/1.0"
    ns_cc = "http://web.resource.org/cc/"
    ns_rdf = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
    NS_MAP = {"t" : ns_tei, "c" : ns_cc, "r" : ns_rdf}

    def get_field(xml_tree, query_path, mandatory=False, findall=False):
        """Query XML for metadata fields.

        Default behaviour is if it fails, move on, if mandatory is set
        to True, exit the program
        """

        if findall:
            matches = xml_tree.findall(query_path, namespaces=NS_MAP)
            # .text is None for elements without leading text
            return [match.text or "" for match in matches]

        matches = xml_tree.xpath(query_path, namespaces=NS_MAP)
        if len(matches) > 0:
            return matches[0]
        if mandatory:
            sys.exit("Field stored in %s is mandatory. Exiting." % query_path)
        return ""
    # def get_field ends here

    file_desc = "//t:teiHeader/t:fileDesc"
    title_stmt = file_desc + "/t:titleStmt"
    publication_stmt = file_desc + "/t:publicationStmt"
    source_desc = file_desc + "/t:sourceDesc"
    profile_desc = "//t:teiHeader/t:profileDesc"

    # Mandatory values (according to database schema)
    mandatory_fields = (
        ('eoa_publicationdate', publication_stmt + "/t:date/@when"),
        ('eoa_language', profile_desc + "/t:langUsage/t:language/@ident"),
        ('eoa_license', publication_stmt + "/t:availability/t:licence/text()"),
        ('eoa_number', title_stmt + "/t:title[@level='s']/@n"),
        ('eoa_series', title_stmt + "/t:title[@level='s']/text()"),
        ('eoa_title', title_stmt + "/t:title[@type='main']/text()"),
    )

    # Optional (according to database schema)
    optional_fields = (
        ('eoa_subtitle', title_stmt + "/t:title[@type='sub']/text()"),
        ('eoa_isbn', publication_stmt + "/t:idno[@type='ISBN']/text()"),
        ('eoa_price', file_desc + "/t:extent/t:measure[@unit='EUR']/@quantity"),
        ('eoa_shoplink_url', publication_stmt + "/t:distributor/@xml:base"),
        ('eoa_shoplink_id', publication_stmt + "/t:distributor/@xml:id"),
        ('eoa_shoplink_text', publication_stmt + "/t:distributor/text()"),
        ('eoa_brief_desc', source_desc + "/t:ab[@type='BriefDescription']/text()"),
        ('eoa_detail_desc', source_desc + "/t:ab[@type='DetailedDescription']/text()"),
        ('eoa_additional_info', source_desc + "/t:ab[@type='additionalinformation']/text()"),
        ('eoa_dedication', source_desc + "/t:ab[@type='dedication']/text()"),
    )

    # Fields that may occur several times
    list_fields = (
        ('eoa_submitters', title_stmt + "/t:editor[@role='submitter']"),
        ('eoa_publicationmanagers', title_stmt + "/t:editor[@role='publicationmanager']"),
        ('eoa_publicationassistants', title_stmt + "/t:editor[@role='publicationassistant']"),
        ('eoa_editorialcoordinators', title_stmt + "/t:editor[@role='editorialcoordinator']"),
        ('eoa_copyeditors', title_stmt + "/t:editor[@role='copyeditor']"),
        ('eoa_translators', title_stmt + "/t:editor[@role='translator']"),
        ('eoa_keywords', profile_desc + "/t:textClass/t:keywords/t:list/t:item"),
        ('eoa_authors', title_stmt + "/t:author"),
    )

    info_dict = {}
    for key, query_path in mandatory_fields:
        info_dict[key] = get_field(xml_tree, query_path, mandatory=True)
    for key, query_path in optional_fields:
        info_dict[key] = get_field(xml_tree, query_path)
    for key, query_path in list_fields:
        info_dict[key] = get_field(xml_tree, query_path, findall=True)

    return info_dict
# def get_publication_info ends here
def make_publication_cfg(info_dict):
    """Write the publication metadata to OUTPUT_DIR/publication.cfg.

    info_dict is the dictionary returned by get_publication_info().  The
    file gets three sections, 'Technical', 'General' and 'Authors', with
    ':' as delimiter and case-preserving option names.

    Exits through sys.exit() when more than 8 keywords or more than 5
    authors are given.  datetime.strptime() raises ValueError when the
    publication date is not of the form YYYY-MM-DD.
    """

    # delimiters has to be a sequence of strings, hence the one-element tuple
    config = configparser.ConfigParser(delimiters=(':',))
    # https://stackoverflow.com/questions/1611799/preserve-case-in-configparser
    config.optionxform = str

    # set up three main bits
    config['Technical'] = {}
    technical_config = config['Technical']
    config['General'] = {}
    general_config = config['General']
    config['Authors'] = {}
    authors_config = config['Authors']

    date_object = datetime.strptime(info_dict['eoa_publicationdate'], "%Y-%m-%d")

    # fill in the fields
    technical_config['Serie'] = info_dict['eoa_series']
    technical_config['Number'] = info_dict['eoa_number']
    technical_config['Title'] = info_dict['eoa_title']
    technical_config['Subtitle'] = info_dict['eoa_subtitle']
    technical_config['PublicationDate'] = info_dict['eoa_publicationdate']
    technical_config['PublicationYear'] = date_object.strftime("%Y")
    technical_config['ISBN'] = info_dict['eoa_isbn']
    technical_config['Price'] = info_dict['eoa_price']
    technical_config['Shoplink'] = """<a href="{0}{1}">{2}</a>""".format(
        info_dict['eoa_shoplink_url'],
        info_dict['eoa_shoplink_id'].replace("id_", ""),
        info_dict['eoa_shoplink_text'])
    technical_config['Language'] = info_dict['eoa_language']
    technical_config['License'] = info_dict['eoa_license']

    general_config['BriefDescription'] = info_dict['eoa_brief_desc']
    general_config['Submitter'] = ", ".join(info_dict['eoa_submitters'])
    general_config['PublicationManagment'] = ", ".join(info_dict['eoa_publicationmanagers'])
    general_config['PublicationAssistants'] = ", ".join(info_dict['eoa_publicationassistants'])

    if len(info_dict['eoa_keywords']) > 8:
        sys.exit("Too many Keywords. Up to 8 are allowed. Exiting.")
    # Number the keywords by position.  Looking the position up with
    # list.index() would hand the same label to repeated keywords.
    for position, keyword in enumerate(info_dict['eoa_keywords'], start=1):
        general_config["Keyword" + str(position)] = keyword

    general_config['DetailedDescription'] = info_dict['eoa_detail_desc']
    general_config['AdditionalInformation'] = info_dict['eoa_additional_info']
    general_config['EditorialCoordination'] = ", ".join(info_dict['eoa_editorialcoordinators'])
    general_config['Copyediting'] = ", ".join(info_dict['eoa_copyeditors'])
    general_config['Dedication'] = info_dict['eoa_dedication']
    general_config['Translator'] = ", ".join(info_dict['eoa_translators'])

    authors = list(info_dict['eoa_authors'])
    if len(authors) > 5:
        sys.exit("Too many authors. Up to 5 are allowed. Exiting.")
    # Author1 to Author5 are always written, unused slots stay empty
    authors += [""] * (5 - len(authors))
    for position, author in enumerate(authors, start=1):
        authors_config["Author" + str(position)] = author
    authors_config['Zusatz'] = ""

    os.makedirs(OUTPUT_DIR, exist_ok=True)
    output_filename = os.path.join(OUTPUT_DIR, "publication.cfg")
    # explicit encoding: titles and names regularly contain non-ASCII characters
    with open(output_filename, 'w', encoding="utf-8") as configfile:
        config.write(configfile)
    print("Wrote", output_filename)
# def make_publication_cfg ends here
def render_reference(list_of_xml_elements, cited_data):
    """Provide an attribute for a formatted version of Reference.

    This will be used for output formats that don't have a bibliographic
    formatter themselves.

    Every element of list_of_xml_elements receives two new ``abbr``
    children, one with type="authoryear" and one with type="title".
    Their text is taken from cited_data, which is indexed by citekey.
    The citekey is the ``target`` attribute of the element's ``t:ref``
    child without its first character.

    Raises IndexError for an element without ``t:ref/@target`` and
    KeyError for a citekey that is missing in cited_data.
    """

    for reference in list_of_xml_elements:
        citekey = reference.xpath("t:ref/@target", namespaces=NS_MAP)[0][1:]
        citation_forms = cited_data[citekey]

        # here we need to get a formatted version of the entry, like it
        # would appear in the typeset version.
        # looked at: bibulous
        # pandoc-citeproc, maybe

        # NOTE(review): format_citations() stores the year-only form at
        # index 1 and the author-year form at index 0 -- confirm that
        # index 1 is really what is wanted for type="authoryear".
        authoryear_element = etree.SubElement(reference, "abbr", type="authoryear")
        authoryear_element.text = citation_forms[1]

        title_element = etree.SubElement(reference, "abbr", type="title")
        title_element.text = citation_forms[2]
# def render_reference ends here
def write_citation_markdown(used_citekeys):
    """Write markdown file with citekeys for bibliography rendering.

    The file TMP_DIR/used_citations.md lists every citekey three times,
    once per citation style, below the headings "citeauthoryear",
    "citeyear" and "yearparen", followed by an empty "References"
    heading.

    used_citekeys -- iterable of citekey strings
    """

    md_file_header = "---\nlang: en\ntitle: Citations\n...\n\n"

    # (heading, template for one citation line)
    sections = (
        ("# citeauthoryear\n", "[@%s]\n"),
        ("\n# citeyear\n", "[-@%s]\n"),
        # sentencestyle
        ("\n# yearparen\n", "@%s\n"),
    )

    # the keys are written three times, so a one-shot iterable has to be
    # materialised first
    citekeys = list(used_citekeys)

    # pandoc expects UTF-8 input, independent of the locale
    output_path = os.path.join(TMP_DIR, "used_citations.md")
    with open(output_path, "w", encoding="utf-8") as citation_formatter:
        citation_formatter.write(md_file_header)
        for heading, template in sections:
            citation_formatter.write(heading)
            citation_formatter.writelines(template % entry for entry in citekeys)
        citation_formatter.write("\n# References\n")

    logging.info("Wrote citation formatter.")
# def write_citation_markdown ends here
def format_citations(used_citekeys, bibdata):
    """Return a formatted entry of the used citations.

    Reads TMP_DIR/formatted_citations.html and returns a tuple
    ``(citation_dict, references)``:

    * citation_dict maps each citekey that has an entry in bibdata to
      ``(authoryear_citation, year_citation, title)``,
    * references is the first ``div`` of class "references" in that
      file, as an lxml element.

    used_citekeys -- iterable of citekey strings
    bibdata -- list of dictionaries with at least an "id" key; "title"
               is used when present

    Raises IndexError when the HTML lacks the references div or the
    span of a used citekey.
    """

    html_path = os.path.join(TMP_DIR, "formatted_citations.html")
    # read once and hand the text to both parsers
    with open(html_path, "r", encoding="utf-8") as html_file:
        html_text = html_file.read()

    cites = BeautifulSoup(html_text, "html.parser")
    reference_list = soupparser.fromstring(html_text, features="html.parser")
    references = reference_list.xpath("//div[@class='references']")[0]

    # one lookup table instead of scanning bibdata for every citekey;
    # as in a linear scan, the last entry wins for repeated ids
    entries_by_id = {bib_entry["id"]: bib_entry for bib_entry in bibdata}

    citation_dict = {}
    for citekey in used_citekeys:
        bib_entry = entries_by_id.get(citekey)
        if bib_entry is None:
            # left out on purpose: the missing key is reported when the
            # citation is looked up
            continue

        title = bib_entry.get("title", "")
        if not title:
            logging.warning("Bibliography entry %s has no title.", citekey)

        authoryear_citation = cites.select("#citeauthoryear ~ p > span[data-cites='%s']" % citekey)[0].text
        year_citation = cites.select("#citeyear ~ p > span[data-cites='%s']" % citekey)[0].text
        citation_dict[citekey] = (authoryear_citation, year_citation, title)

    return citation_dict, references
# def format_citations ends here
def format_pagerange(pagerange_start, pagerange_end):
    """Parse values of citedRange attributes. Return formatted string.

    Both arguments are strings or None.  The given values are joined by
    an en dash; a value that is None is left out, so a range with only
    one end does not get a dangling dash and two missing values yield
    the empty string.
    """

    given_values = [value for value in (pagerange_start, pagerange_end) if value is not None]

    return "–".join(given_values)
# def format_pagerange ends here
def _transform_divisions(xml_tree, authors, publang):
    """Turn TEI chapter/section divs into div1 to div4 and add the chapter author."""

    for chapter in xml_tree.xpath("//t:div[@type='chapter']", namespaces=NS_MAP):
        chapter.tag = "div1"
        chapter.set("language", publang)

        chapter_title = chapter.find("t:head", namespaces=NS_MAP)
        if chapter_title is None:
            logging.warning("Found a chapter without head element. No title or author added.")
            continue

        if authors is not None:
            if len(authors) == 1:
                # Work on a copy: lxml moves an element that is inserted
                # somewhere else.  Inserting the original would take the
                # author out of the TEI header and leave it only in the
                # last chapter.
                eoa_author = copy.deepcopy(authors[0])
                eoa_author.tag = "EOAauthor"
                chapter_title.insert(0, eoa_author)
            elif len(authors) > 1:
                print("Found more than one author. Please advise")

        # the head has to be the first child of the chapter
        chapter.insert(0, chapter_title)

    lower_levels = (
        ("section", "div2"),
        ("subsection", "div3"),
        ("subsubsection", "div4"),
    )
    for div_type, new_tag in lower_levels:
        for division in xml_tree.xpath("//t:div[@type='%s']" % div_type, namespaces=NS_MAP):
            division.tag = new_tag
# def _transform_divisions ends here

def _transform_paragraphs(xml_tree):
    """Drop the namespace of all paragraphs except footnote text and rename the Quote rendition."""

    for paragraph in xml_tree.xpath("//t:p[not(@rend='footnote text')]", namespaces=NS_MAP):
        paragraph.tag = "p"
        if paragraph.get("rend") == "Quote":
            paragraph.set("rend", "quoted")
# def _transform_paragraphs ends here

def _transform_citations(xml_tree, cited_data):
    """Turn bibl elements into popover spans carrying the formatted citation."""

    # we need some data of the references here!
    #
    # <!--
    # <span rel="popover" class="citation" data-toggle="popover" html="true" data-placement="bottom" data-title="Descartes 1644, 37–44" data-content="Principia philosophiae.">Descartes 1644, 37–44</span>
    # -->
    #
    # Intermediate XML:
    # <span rel="popover" class="citation" citekey="monti_tradizione_2011" data-toggle="popover" html="true" data-placement="bottom" data-title="Monti " data-content="La tradizione galileiana e lo sperimentalismo naturalistico d’Età Moderna. Pratiche, teorie, linguaggi.">Monti </span>

    for citation in xml_tree.xpath("//t:bibl", namespaces=NS_MAP):
        cited_range = citation.xpath("t:citedRange", namespaces=NS_MAP)
        citeref = citation.xpath("t:ref", namespaces=NS_MAP)

        cite_render = citeref[0].get("type")
        # the target starts with a character that is not part of the key
        citekey = citeref[0].get("target")[1:]
        # elements with this tag are stripped by the caller later on
        citeref[0].tag = "tagtobestripped"

        citation.tag = "span"
        citation.set("rel", "popover")
        citation.set("class", "citation")
        citation.set("citekey", citekey)
        citation.set("data-toggle", "popover")
        citation.set("html", "true")
        citation.set("data-placement", "bottom")

        pagerange = ""
        if len(cited_range) > 0:
            formatted_range = format_pagerange(cited_range[0].get("from"), cited_range[0].get("to"))
            # no dangling ", " for a citedRange without from and to
            if formatted_range:
                pagerange = ", " + formatted_range
            cited_range[0].tag = "tagtobestripped"

        try:
            citation_forms = cited_data[citekey]
        except KeyError:
            # a message as argument gives a non-zero exit status
            sys.exit("Citekey %s was not found in the references. Exiting." % citekey)

        # index 1 for inline citations, index 0 otherwise, index 2 for the popover
        if cite_render == 'inline':
            formatted_citation = citation_forms[1] + pagerange
        else:
            formatted_citation = citation_forms[0] + pagerange

        citation.text = formatted_citation
        citation.set("data-title", formatted_citation)
        citation.set("data-content", citation_forms[2])
# def _transform_citations ends here

def _transform_footnotes(xml_tree):
    """Turn bottom notes into inline notes and clean up their paragraphs."""

    # <note place="bottom" xml:id="ftn2" n="2">
    #
    # <note id-text="34" id="uid40" place="Inline"><p>One reads</note>

    for footnote in xml_tree.xpath("//t:note[@place='bottom']", namespaces=NS_MAP):
        # re-assign tag here to get rid of namespace
        footnote.tag = "note"
        footnote.set("place", "Inline")
        footnote.set("id-text", footnote.get("n"))

        # The parent of a footnote has to be a paragraph that has already
        # been retagged.  Raised explicitly because an assert statement
        # is dropped when Python runs with -O.
        fn_parent = footnote.getparent()
        if fn_parent.tag != "p":
            raise AssertionError("Footnote %s is not the child of a paragraph." % footnote.get("n"))

        for fn_paragraph in footnote.xpath("t:p", namespaces=NS_MAP):
            fn_paragraph.tag = "p"
            fn_paragraph.attrib.pop("rend", None)
# def _transform_footnotes ends here

def _transform_figures(xml_tree):
    """Turn TEI figures into EOAfigure elements."""

    # <figure><graphic url="figures/Fig.3CarceresaccidentalTraceFirenze2017.png"/><head>Latin inscription on a wall in Caceres, Spain. CIL II 697</head></figure>
    #
    # <EOAfigure id="uid21">
    # <anchor id-text="1" id="uid21"/>
    # <p>
    # <caption>An example of the titles</caption>
    # <file>images/Figure1-1_BenedettiSignature.jpg</file>
    # <width>60</width>
    # </p>
    # </EOAfigure>

    for figure in xml_tree.xpath("//t:figure", namespaces=NS_MAP):
        figure.tag = "EOAfigure"
        # placeholder id
        figure.set("id", "anotheruid")
        etree.SubElement(figure, "anchor")

        # careful, caption can contain markup!
        caption_element = figure.xpath("t:head", namespaces=NS_MAP)[0]
        caption_element.tag = "caption"

        fig_p_element = etree.SubElement(figure, "p")
        etree.SubElement(fig_p_element, "file").text = figure.xpath("t:graphic/@url", namespaces=NS_MAP)[0]
        etree.SubElement(fig_p_element, "width").text = "60" #whatever
        fig_p_element.append(caption_element)

        etree.strip_elements(figure, "{%s}graphic" % ns_tei)
# def _transform_figures ends here

def _transform_highlights(xml_tree):
    """Map the rend values of hi elements to their EOA counterparts."""

    for hi in xml_tree.xpath("//t:hi", namespaces=NS_MAP):
        rend_attribute = hi.get("rend")
        if rend_attribute == "italic":
            hi.set("rend", "it")
        elif rend_attribute == "sup":
            hi.tag = "EOAup"
            del hi.attrib["rend"]
        elif rend_attribute == "sub":
            hi.tag = "EOAdown"
            del hi.attrib["rend"]
        else:
            logging.info("The rend attribute in hi has the value %s. This is not supported", rend_attribute)
# def _transform_highlights ends here

def transform_body(xml_tree, cited_data, authors, publang):
    """Transform the body of XML document into EOADjango file.

    The tree is modified in place, one helper per kind of markup, in
    this order: document structure, paragraphs, citations, footnotes,
    figures, hi elements.  The order matters: the footnote step expects
    that ordinary paragraphs have been retagged already.

    xml_tree -- lxml element (the caller passes the TEI body)
    cited_data -- dictionary indexed by citekey; of each value, index 0
                  is used for regular citations, index 1 for citations
                  of type "inline" and index 2 as popover content
    authors -- list of author elements or None; the author is only
               added to the chapter heads when there is exactly one
    publang -- value of the "language" attribute set on every chapter

    Returns xml_tree.  Exits through sys.exit() when a cited key is not
    in cited_data.
    """

    ######################
    # Document structure #
    ######################
    _transform_divisions(xml_tree, authors, publang)

    ##############
    # Paragraphs #
    ##############
    _transform_paragraphs(xml_tree)

    #############
    # Citations #
    #############
    _transform_citations(xml_tree, cited_data)

    #############
    # Footnotes #
    #############
    _transform_footnotes(xml_tree)

    ###########
    # Figures #
    ###########
    _transform_figures(xml_tree)

    ##############
    # Hi-Element #
    ##############
    _transform_highlights(xml_tree)

    return xml_tree
# def transform_body ends here
def assign_ids(xml_tree, data):
    """Walk the xml tree again. Assign ids to xml and put them into dicts, as well.

    Sets the "id-text" attribute on chapters (div1), figure anchors,
    sections (div2) and subsections (div3), and copies the id of each
    figure anchor to its parent element.  The numbers are recorded,
    keyed by the "id" attribute of the element, in dictionaries that are
    stored in data under the keys chapterdict, figdict, eqdict, fndict,
    listdict, pagelabeldict, secdict, tabdict and theoremdict.

    xml_tree -- object with an xpath() method (the caller passes an lxml
                ElementTree)
    data -- dictionary, updated in place

    Returns the tuple (xml_tree, data).
    """

    chapterdict = {}
    figdict = {}
    fndict = {}
    secdict = {}

    # NOTE(review): chapters with rend="nonumber" get no number of their
    # own but still take up a position in the count -- confirm that this
    # is intended for unnumbered chapters that are not the last one.
    for chapter_counter, chapter in enumerate(xml_tree.xpath("//div1"), start=1):
        if chapter.get('rend') != "nonumber":
            chapter.set("id-text", str(chapter_counter))
            chapterdict[chapter.get("id")] = str(chapter_counter)

        figure_anchors = chapter.findall(".//EOAfigure/anchor")
        for figure_counter, anchor in enumerate(figure_anchors, start=1):
            figure_number = "%d.%d" % (chapter_counter, figure_counter)
            anchor.set("id-text", figure_number)
            # the figure takes over the id of its anchor
            anchor.getparent().set("id", anchor.get("id"))
            figdict[anchor.get("id")] = figure_number

        # footnotes keep the number given in their "n" attribute
        for footnote in chapter.findall(".//note"):
            fndict[footnote.get("id")] = footnote.get("n")

        for section_counter, section in enumerate(chapter.findall(".//div2"), start=1):
            section_number = "%d.%d" % (chapter_counter, section_counter)
            section.set("id-text", section_number)
            secdict[section.get("id")] = section_number

            for subsection_counter, subsection in enumerate(section.findall(".//div3"), start=1):
                subsection_number = "%d.%d.%d" % (chapter_counter, section_counter, subsection_counter)
                subsection.set("id-text", subsection_number)
                secdict[subsection.get("id")] = subsection_number

    data["chapterdict"] = chapterdict
    data["figdict"] = figdict
    # not implemented yet: equation, list, pagelabel, tab, theorem
    data["eqdict"] = {}
    data["fndict"] = fndict
    data["listdict"] = {}
    data["pagelabeldict"] = {}
    data["secdict"] = secdict
    data["tabdict"] = {}
    data["theoremdict"] = {}

    return xml_tree, data
# def assign_ids ends here
def add_bibliography(xml_tree, refs_for_bib_chapter, heading="Bibliography", language="english"):
    """Add another chapter containing the bibliography.

    Builds an unnumbered div1 with a head and two nested divs.  Every
    div found below refs_for_bib_chapter is moved into the inner div
    after it has been turned into a paragraph of class "bibliography"
    (p tags inside are dissolved, em becomes i).

    xml_tree -- lxml ElementTree that receives the chapter
    refs_for_bib_chapter -- lxml element containing the formatted entries
    heading -- text of the chapter head
    language -- value of the "language" attribute of the chapter

    Returns the root element of xml_tree.
    """

    root_element = xml_tree.getroot()
    number_of_chapters = len(root_element.xpath("//div1"))

    bibliography_chapter = etree.Element("div1", rend="nonumber", language=language)
    etree.SubElement(bibliography_chapter, "head").text = heading
    outer_div = etree.SubElement(bibliography_chapter, "div")
    inner_div = etree.SubElement(outer_div, "div")

    for entry in refs_for_bib_chapter.findall(".//div"):
        entry.set("class", "bibliography")
        # the entry itself becomes the paragraph
        etree.strip_tags(entry, "p")
        entry.tag = "p"
        for markup in entry.findall(".//em"):
            markup.tag = "i"
        inner_div.append(entry)

    # insert behind the existing chapters
    root_element.insert(number_of_chapters + 1, bibliography_chapter)

    return root_element
# def add_bibliography ends here
def _run_external(arguments, **run_options):
    """Run an external program and leave with an error message if it fails."""

    completed = subprocess.run(arguments, **run_options)
    if completed.returncode != 0:
        sys.exit("External command %s failed with exit status %d. Exiting." % (arguments[0], completed.returncode))
    return completed
# def _run_external ends here

def main():
    """Convert the TEI file named on the command line into intermediate XML.

    Expects TMP_DIR/data.pickle to exist.  Writes TMP_DIR/tmp-bib.json,
    the citation files used by pandoc, OUTPUT_DIR/publication.cfg,
    TMP_DIR/IntermediateXMLFile.xml and an updated TMP_DIR/data.pickle.
    """

    # a message as argument of sys.exit gives a non-zero exit status
    if len(sys.argv) == 1:
        sys.exit("You must specify an input file!")
    if len(sys.argv) > 2:
        sys.exit("You can work with only one publication at a time!")

    pickle_file = os.path.join(TMP_DIR, 'data.pickle')
    # NOTE(review): unpickling can execute arbitrary code.  This is only
    # safe as long as data.pickle comes from a trusted earlier step.
    with open(pickle_file, 'rb') as f:
        data = pickle.load(f)
    used_citekeys = set(data["citekeys"])

    tei_document = sys.argv[-1]
    xml_tree = etree.parse(tei_document)

    # Read the metadata before the tree gets modified by the transformation.
    publication_info = get_publication_info(xml_tree)

    publication_language = xml_tree.xpath("//t:teiHeader/t:profileDesc/t:langUsage/t:language/@ident", namespaces=NS_MAP)[0]
    bibliography_ref = "//t:teiHeader/t:fileDesc/t:sourceDesc/t:ab[@type='bibliography']/t:ref"
    bib_data = {
        "source": xml_tree.xpath(bibliography_ref + "/@target", namespaces=NS_MAP)[0],
        "type": xml_tree.xpath(bibliography_ref + "/@type", namespaces=NS_MAP)[0],
    }
    logging.info("The bibfile is %s." % bib_data["source"])

    authors = xml_tree.xpath("//t:teiHeader/t:fileDesc/t:titleStmt/t:author", namespaces=NS_MAP)

    # json
    # The arguments are passed as a list, so that file names containing
    # spaces are not split up.
    interim_bib_json_file = os.path.join(TMP_DIR, "tmp-bib.json")
    citeproc_process = _run_external(["pandoc-citeproc", "--bib2json", bib_data["source"]], stdout=subprocess.PIPE)
    citeproc_json = citeproc_process.stdout
    citations_json = json.loads(citeproc_json)
    with open(interim_bib_json_file, 'w', encoding="utf-8") as json_file:
        json_file.write(citeproc_json.decode('utf-8'))
    logging.info("Wrote json file")

    # an unknown type is only reported, the conversion goes on
    if bib_data["type"] not in ["monograph", "anthology", "monograph-numeric", "anthology-numeric"]:
        print("The bibliography type %s is not allowed." % bib_data["type"])

    # refs for bib_chapter contains formatted reference entries
    write_citation_markdown(used_citekeys)
    arguments = [
        "pandoc",
        "-o", os.path.join(TMP_DIR, "formatted_citations.html"),
        "-t", "html",
        "--filter=pandoc-citeproc",
        "--bibliography=%s" % bib_data["source"],
        "--csl=%s" % CSL_FILE,
        os.path.join(TMP_DIR, "used_citations.md"),
    ]
    logging.info("Using external command pandoc: %s." % " ".join(arguments))
    _run_external(arguments)
    cited_dict, refs_for_bib_chapter = format_citations(used_citekeys, citations_json)

    tei_body = xml_tree.xpath("//t:body", namespaces=NS_MAP)[0]
    body_transformed = transform_body(tei_body, cited_dict, authors, publang=publication_language)
    resulting_tree = etree.ElementTree(body_transformed)
    xml_add_bib = add_bibliography(resulting_tree, refs_for_bib_chapter)
    etree.strip_tags(xml_add_bib, "tagtobestripped")

    # assign ids top to bottom
    elements_with_ids = xml_add_bib.xpath("//div1 | //div2 | //div3 | //note | //item | //table | //EOAfigure/anchor | //EOAequation | //formula | //theorem")
    for element_counter, element in enumerate(elements_with_ids, start=1):
        element.set("id", "uid" + str(element_counter))

    assigned_ids, data_to_pickle = assign_ids(resulting_tree, data)

    xml_root = assigned_ids.getroot()
    xml_root.tag = "Book"
    final_tree = etree.ElementTree(xml_root)

    with open(pickle_file, 'wb') as f:
        # Pickle the 'data' dictionary using the highest protocol available.
        pickle.dump(data_to_pickle, f, pickle.HIGHEST_PROTOCOL)

    os.makedirs(OUTPUT_DIR, exist_ok=True)
    make_publication_cfg(publication_info)

    os.makedirs(TMP_DIR, exist_ok=True)
    output_filename = os.path.join(TMP_DIR, "IntermediateXMLFile.xml")
    final_tree.write(output_filename, pretty_print=True, xml_declaration=True, encoding="utf-8")
    logging.info("Wrote %s." % output_filename)

    # Remove namespace info (brute force solution)
    # The file has just been written as UTF-8, so it is read and written
    # back with that encoding and not with the one of the locale.
    bad_ns_string = 'xmlns="http://www.tei-c.org/ns/1.0"'
    with open(output_filename, 'r', encoding="utf-8") as textfile:
        xml_as_string = textfile.read()
    removed_namespace = xml_as_string.replace(bad_ns_string, "")
    with open(output_filename, 'w', encoding="utf-8") as amended_textfile:
        amended_textfile.write(removed_namespace)
# def main ends here

if __name__ == '__main__':
    main()
# finis