Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
EOASkripts/src/tei2imxml.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
executable file
1963 lines (1606 sloc)
78.7 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# -*- coding: utf-8; mode: python -*- | |
"""A converter from TEI to customized DocBook XML. | |
This program is used to get a TEI XML file into the EOAv1 workflow. | |
Out of the resulting files, the existing programs can be used to | |
create the output formats EPUB and Django. | |
""" | |
__version__ = "1.0" | |
__date__ = "20180116" | |
__author__ = "kthoden@mpiwg-berlin.mpg.de" | |
# Standard library
import argparse
import configparser
import csv
import json
import logging
import os
import pickle
import re
import shlex
import shutil
import string
import subprocess
import sys
from copy import deepcopy
from datetime import datetime
from pathlib import Path

# Third-party
from bs4 import BeautifulSoup
from lxml import etree  # , objectify
# from lxml.html import soupparser

# Project-local
from utils.load_config import load_config
import utils.libeoaconvert as libeoaconvert
import utils.bib2html as bib2html
# things to be done | |
# assign ids top to bottom for the following elements: | |
# div1 div2 div3 note item table EOAfigure EOAequation formula theorem | |
BIB2HTML_FILENAME = "temp" | |
BASE_DIR = Path( os.path.realpath(__file__) ).parent | |
SCRIPT_NAME = Path( __file__).stem | |
DEFAULT_INPUT_DIR = \ | |
Path(os.environ['INPUT_DIR'] if 'INPUT_DIR' in os.environ else './input') | |
DEFAULT_OUTPUT_DIR = \ | |
Path(os.environ['OUTPUT_DIR'] if 'OUTPUT_DIR' in os.environ else './output') | |
DEFAULT_DEPENDENCIES_DIR = \ | |
Path(os.environ['DEPENDENCIES_DIR'] if 'DEPENDENCIES_DIR' in os.environ else './dependencies') | |
# EOA_SCRIPTS_DIR = \ | |
# Path(os.environ['EOA_SCRIPTS_DIR']) | |
PDFCROP_EXEC = "pdfcrop" # (part of texlive distribution): | |
GM_PATH = "gm" | |
ns_tei = "http://www.tei-c.org/ns/1.0" | |
NS_MAP = {"t" : ns_tei} | |
def process_inline_equations(xml_tree, xml_chapters, template_path, temp_dir, output_dir):
    """Convert every inline TeX formula in the body into a PNG image.

    Collects all <formula rend='inline' notation='tex'> elements per
    chapter, typesets them in a single xelatex run (one formula per
    page), bursts the resulting PDF into pages, crops each page and
    converts it to a PNG in the output ``items`` directory.  Each
    formula element is rewritten in place into an <EOAineq> element
    carrying the original TeX in ``TeX`` and the image name in ``src``.

    Side effects: creates ``temp_dir/formulas2png`` and
    ``output_dir/items``, writes ``xelatex-run.log`` into ``temp_dir``
    and runs the external tools xelatex, pdfcrop and gm.
    """
    eoa_ineq_running_order = 1
    # running order (int) -> image base filename, e.g. "EOAineq_2_5"
    dict_eoa_ineqs = {}
    # accumulated TeX source; each formula is followed by \newpage so
    # that pdf_burst later yields one formula per page
    tex_equation = ""
    all_ineq = xml_tree.xpath("//t:body//t:formula[@rend='inline' and @notation='tex']", namespaces=NS_MAP)
    if len(all_ineq) > 0:
        logging.info("Found " + str(len(all_ineq)) + " formulas")
        for chapter_number, xml_chapter in enumerate(xml_chapters, start=1):
            logging.info("Chapter " + str(chapter_number))
            inline_equations = xml_chapter.xpath(".//t:formula[@rend='inline' and @notation='tex']", namespaces=NS_MAP)
            inline_equation_number = 1
            for equation in inline_equations:
                tex_formula = equation.text
                libeoaconvert.progress(inline_equation_number, len(inline_equations),"Processing EOAineq %s of %s." % (inline_equation_number, len(inline_equations)))
                # drop empty lines inside the formula
                tex_formula = os.linesep.join([s for s in tex_formula.splitlines() if s])
                # this occurred once in sources 11
                tex_formula = tex_formula.replace(r"\@root", r"\root")
                tex_equation = f"""{tex_equation}${tex_formula}$\n\\newpage\n"""
                # Add intEOAineqRunningOrder : Filename to dictionary
                equation_filename = f"EOAineq_{str(chapter_number)}_{str(inline_equation_number)}"
                dict_eoa_ineqs[eoa_ineq_running_order] = equation_filename
                # Rewrite the TEI element into an EOAineq placeholder,
                # preserving its tail text.
                equation_tail = equation.tail
                equation.clear()
                equation.tag = "EOAineq"
                equation.tail = equation_tail
                equation.set("src", f"{equation_filename}.png")
                equation.set("TeX", tex_formula)
                # increment integers
                eoa_ineq_running_order += 1
                inline_equation_number += 1
        # Rebind commands that were disarmed with |...| markers in the
        # TEI source.  BUGFIX: the patterns are now raw strings; the
        # previous plain strings only worked because Python keeps the
        # invalid escape "\|" verbatim.  The old "\slashed\|" pattern
        # began with "\s" (regex: any whitespace) and therefore could
        # never match the literal "|slashed|" marker — presumably a
        # typo for "\|slashed\|".
        dict_rebound_commands = {
            r"\|ket\|" : r"\\ket",
            r"\|braket\|" : r"\\braket",
            r"\|bra\|" : r"\\bra",
            r"\|Bra\|" : r"\\Bra",
            r"\|Ket\|" : r"\\Ket",
            r"\|slashed\|" : r"\\slashed"
        }
        for strCommand in dict_rebound_commands.keys():
            tex_equation = re.sub(strCommand, dict_rebound_commands[strCommand], tex_equation)
        # Fill the LaTeX template with the collected formulas.
        with open(template_path / "formula.tex", "r") as formula_file:
            template = formula_file.read()
        formula_tmp_dir = temp_dir / "formulas2png"
        formula_tmp_dir.mkdir(exist_ok=True)
        # Make directory items if it doesn't already exist
        items_dir = output_dir / "items"
        items_dir.mkdir(exist_ok=True)
        eoainline_file_path = formula_tmp_dir / "EOAinline.tex"
        with open(eoainline_file_path, "w") as tmp:
            tmp.write(string.Template(template).substitute(DERINHALT=tex_equation))
        logging.info("Typesetting all Inline Equations")
        xelatex_command = "xelatex --halt-on-error " + str(eoainline_file_path.absolute())
        # One shared log file for xelatex, pdfcrop and gm output;
        # closed reliably via the context manager (the old code leaked
        # the handle).
        with open(temp_dir / 'xelatex-run.log', 'w') as latex_log:
            subprocess.check_call(shlex.split(xelatex_command), cwd=formula_tmp_dir, stdout=latex_log)
            logging.info("Splitting all Inline Equations")
            libeoaconvert.pdf_burst("EOAinline.pdf", formula_tmp_dir)
            logging.info("Converting %s split pages into PNG-Images" % len(dict_eoa_ineqs.keys()))
            counter_dict_eoa_ineqs = 1
            for intRunningOrder in dict_eoa_ineqs.keys():
                libeoaconvert.progress(counter_dict_eoa_ineqs, len(dict_eoa_ineqs.keys()),"Splitting all inline equations, image %s of %s" % (counter_dict_eoa_ineqs, len(dict_eoa_ineqs.keys())))
                # Crop the page to the formula's bounding box ...
                pdf_crop_command = "{cmd} {arg1} {arg2}".format(
                    cmd = PDFCROP_EXEC,
                    arg1 = (formula_tmp_dir / ("EOAformulas_" + str(intRunningOrder) + ".pdf")).absolute(),
                    arg2 = (formula_tmp_dir / (dict_eoa_ineqs[intRunningOrder] + ".pdf")).absolute()
                )
                subprocess.check_call(shlex.split(pdf_crop_command), cwd=formula_tmp_dir, stdout=latex_log)
                # ... then rasterize it into the items directory.
                convert_command = "{cmd} convert -density 144 {arg1} {arg2}".format(
                    cmd = GM_PATH,
                    arg1 = (formula_tmp_dir / (dict_eoa_ineqs[intRunningOrder] + ".pdf")).absolute(),
                    arg2 = (items_dir / (dict_eoa_ineqs[intRunningOrder] + ".png")).absolute()
                )
                subprocess.check_call(shlex.split(convert_command), cwd=formula_tmp_dir, stdout=latex_log)
                counter_dict_eoa_ineqs += 1
    else:
        logging.info("Found no EOAineq. Continuing")
# def process_inline_equations ends here
def get_publication_info(xml_tree, translation_file):
    """Query the TEI document for metadata fields.

    Reads the teiHeader of *xml_tree* and returns a flat dictionary of
    ``eoa_*`` keys.  Fields queried with ``mandatory=True`` abort the
    program when missing; all other missing fields yield "".
    *translation_file* is passed through to format_authors for
    localizing name lists.
    """
    info_dict = {}
    # Local namespace map shadows the module-level one and adds the
    # Creative Commons and RDF namespaces.
    ns_tei = "http://www.tei-c.org/ns/1.0"
    ns_cc = "http://web.resource.org/cc/"
    ns_rdf = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
    NS_MAP = {"t" : ns_tei, "c" : ns_cc, "r" : ns_rdf}
    def get_field(xml_tree, query_path, mandatory=False, findall=False, noformat=False):
        """Query XML for metadata fields.

        Default behaviour is if it fails, move on, if mandatory is set
        to True, exit the program.

        findall=True returns a list (person references are formatted
        via format_authors unless noformat=True, which returns the raw
        text nodes); findall=False returns a single sanitized string.
        """
        if findall is True:
            find_several = xml_tree.xpath(query_path, namespaces=NS_MAP)
            if len(find_several) > 0:
                if noformat is True:
                    # Return plain text content, one list entry per match.
                    if len(find_several) == 1:
                        return_string = [find_several[0].text]
                    else:
                        return_string = [x.text for x in find_several]
                else:
                    # Person references: resolve each @ref against the
                    # respStmt entries and format the names in the
                    # publication language.
                    publang = xml_tree.xpath("//t:teiHeader/t:profileDesc/t:langUsage/t:language/@ident", namespaces=NS_MAP)[0]
                    if len(find_several) == 1:
                        return_string = [format_authors(find_several, publang, xml_tree, translation_file)]
                    else:
                        list_of_formatted_people = []
                        list_of_people = ([x for x in find_several])
                        for person in list_of_people:
                            formatted_person = format_authors([person], publang, xml_tree, translation_file)
                            list_of_formatted_people.append(formatted_person)
                        return_string = list_of_formatted_people
            else:
                return_string = ""
        else:
            tmp_field = xml_tree.xpath(query_path, namespaces=NS_MAP)
            if len(tmp_field) > 0:
                return_string = sanitize_data_string(tmp_field[0])
            else:
                if mandatory is True:
                    sys.exit("Field stored in %s is mandatory. Exiting." % query_path)
                else:
                    return_string = ""
        return return_string
    # def get_field ends here
    # Mandatory values (according to database schema)
    info_dict['eoa_publicationdate'] = get_field(xml_tree, "//t:teiHeader/t:fileDesc/t:publicationStmt/t:date/@when", mandatory=True)
    info_dict['eoa_language'] = get_field(xml_tree, "//t:teiHeader/t:profileDesc/t:langUsage/t:language/@ident", mandatory=True)
    info_dict['eoa_license'] = get_field(xml_tree, "//t:teiHeader/t:fileDesc/t:publicationStmt/t:availability/t:licence/@target", mandatory=True)
    info_dict['eoa_number'] = get_field(xml_tree, "//t:teiHeader/t:fileDesc/t:seriesStmt/t:idno[@type='number']/text()", mandatory=True)
    info_dict['eoa_series'] = get_field(xml_tree, "//t:teiHeader/t:fileDesc/t:seriesStmt/t:title/text()", mandatory=True)
    info_dict['eoa_title'] = get_field(xml_tree, "//t:teiHeader/t:fileDesc/t:titleStmt/t:title[@type='main']/text()", mandatory=True)
    # Optional (according to database schema)
    info_dict['eoa_subtitle'] = get_field(xml_tree, "//t:teiHeader/t:fileDesc/t:titleStmt/t:title[@type='sub']/text()")
    info_dict['eoa_isbnprint'] = get_field(xml_tree, "//t:teiHeader/t:fileDesc/t:publicationStmt/t:idno[@type='isbn']/text()")
    info_dict['eoa_isbnpdf'] = get_field(xml_tree, "//t:teiHeader/t:fileDesc/t:publicationStmt/t:idno[@type='isbnpdf']/text()")
    info_dict['eoa_isbnepub'] = get_field(xml_tree, "//t:teiHeader/t:fileDesc/t:publicationStmt/t:idno[@type='isbnepub']/text()")
    info_dict['eoa_doi'] = get_field(xml_tree, "//t:teiHeader/t:fileDesc/t:publicationStmt/t:idno[@type='doi']/text()")
    info_dict['eoa_price'] = get_field(xml_tree, "//t:teiHeader/t:fileDesc/t:extent/t:measure[@type='price']/@quantity")
    info_dict['eoa_pages'] = get_field(xml_tree, "//t:teiHeader/t:fileDesc/t:extent/t:measure[@commodity='pages']/@quantity")
    info_dict['eoa_currency'] = get_field(xml_tree, "//t:teiHeader/t:fileDesc/t:extent/t:measure[@type='price']/@unit")
    info_dict['eoa_shoplink_url'] = get_field(xml_tree, "//t:teiHeader/t:fileDesc/t:publicationStmt/t:idno[@type='shoplink']/text()")
    info_dict['eoa_shoplink_text'] = get_field(xml_tree, "//t:teiHeader/t:fileDesc/t:publicationStmt/t:distributor/t:orgName/text()")
    # NOTE(review): the abstract queries use un-prefixed /p/ rather
    # than /t:p/ — verify they actually match TEI-namespaced content.
    info_dict['eoa_brief_desc'] = get_field(xml_tree, "//t:teiHeader/t:profileDesc/t:abstract[@n='brief']/p/text()")
    info_dict['eoa_detail_desc'] = get_field(xml_tree, "//t:teiHeader/t:profileDesc/t:abstract[@n='detailed']/p/text()")
    info_dict['eoa_additional_info'] = get_field(xml_tree, "//t:teiHeader/t:profileDesc/t:abstract[@n='additional']/p/text()")
    info_dict['eoa_dedication'] = get_field(xml_tree, "//t:text/t:front/t:div[@type='dedication']/t:ab/text()")
    info_dict['eoa_landingpage'] = get_field(xml_tree, "//t:teiHeader/t:fileDesc/t:publicationStmt/t:publisher/t:orgName[@n='Press']/@ref")
    # People lists: formatted person names resolved from @ref values.
    info_dict['eoa_submitters'] = get_field(xml_tree, "//t:teiHeader/t:fileDesc/t:titleStmt/t:editor[@role='submitter']/@ref", findall=True)
    info_dict['eoa_publicationmanagers'] = get_field(xml_tree, "//t:teiHeader/t:fileDesc/t:titleStmt/t:editor[@role='publicationmanager']/@ref", findall=True)
    info_dict['eoa_publicationassistants'] = get_field(xml_tree, "//t:teiHeader/t:fileDesc/t:titleStmt/t:editor[@role='publicationassistant']/@ref", findall=True)
    info_dict['eoa_editorialcoordinators'] = get_field(xml_tree, "//t:teiHeader/t:fileDesc/t:titleStmt/t:editor[@role='editorialcoordinator']/@ref", findall=True)
    info_dict['eoa_copyeditors'] = get_field(xml_tree, "//t:teiHeader/t:fileDesc/t:titleStmt/t:editor[@role='copyeditor']/@ref", findall=True)
    info_dict['eoa_translators'] = get_field(xml_tree, "//t:teiHeader/t:fileDesc/t:titleStmt/t:editor[@role='translator']/@ref", findall=True)
    info_dict['eoa_keywords'] = get_field(xml_tree, "//t:teiHeader/t:profileDesc/t:textClass/t:keywords/t:list/t:item", findall=True, noformat=True)
    info_dict['eoa_authors'] = get_field(xml_tree, "//t:teiHeader/t:fileDesc/t:titleStmt/t:author/@ref", findall=True)
    info_dict['eoa_editors'] = get_field(xml_tree, "//t:teiHeader/t:fileDesc/t:titleStmt/t:editor[@role='volumeeditor']/@ref", findall=True)
    return info_dict
# def get_publication_info ends here
def make_publication_cfg(info_dict, translation_file):
    """Build the publication.cfg ConfigParser from the metadata dict.

    *info_dict* is the dictionary produced by get_publication_info;
    *translation_file* is used to localize the "(ed.)"/"(eds.)" suffix
    for edited volumes.  Returns a ConfigParser with the sections
    Technical, General and Authors filled in.  Exits the program when
    the author/editor combination is invalid (both present, neither
    present, or more than 5 of either).
    """
    config = configparser.ConfigParser(delimiters=(':'))
    # https://stackoverflow.com/questions/1611799/preserve-case-in-configparser
    config.optionxform = str
    # set up three main bits
    config['Technical'] = {}
    technical_config = config['Technical']
    config['General'] = {}
    general_config = config['General']
    config['Authors'] = {}
    authors_config = config['Authors']
    date_object = datetime.strptime(info_dict['eoa_publicationdate'], "%Y-%m-%d")
    # fill in the fields
    technical_config['Serie'] = info_dict['eoa_series']
    technical_config['Number'] = info_dict['eoa_number']
    technical_config['Title'] = info_dict['eoa_title']
    technical_config['Subtitle'] = info_dict['eoa_subtitle']
    technical_config['PublicationDate'] = info_dict['eoa_publicationdate']
    technical_config['PublicationYear'] = datetime.strftime(date_object, "%Y")
    technical_config['ISBN'] = info_dict['eoa_isbnprint']
    technical_config['ISBN-pdf'] = info_dict['eoa_isbnpdf']
    technical_config['ISBN-epub'] = info_dict['eoa_isbnepub']
    technical_config['DOI'] = info_dict['eoa_doi']
    technical_config['Price'] = "{} {}".format(info_dict['eoa_price'], info_dict['eoa_currency'])
    technical_config['Shoplink'] = """<a href="{0}">{1}</a>""".format(info_dict['eoa_shoplink_url'], info_dict['eoa_shoplink_text'])
    technical_config['Language'] = info_dict['eoa_language']
    # The licence URL is expected to look like
    # http://creativecommons.org/licenses/<code>/..., so path segment 4
    # is the licence code (e.g. "by-nc-sa").
    technical_config['License'] = info_dict['eoa_license'].split("/")[4]
    technical_config['LandingPage'] = f"{info_dict['eoa_landingpage']}/{info_dict['eoa_series'].lower()}/{info_dict['eoa_number']}/index.html"
    general_config['BriefDescription'] = info_dict['eoa_brief_desc']
    if info_dict['eoa_submitters'] is not None:
        general_config['Submitter'] = ", ".join(info_dict['eoa_submitters'])
    general_config['PublicationManagment'] = ", ".join(info_dict['eoa_publicationmanagers'])
    general_config['PublicationAssistants'] = ", ".join(info_dict['eoa_publicationassistants'])
    if len(info_dict['eoa_keywords']) > 8:
        logging.warning("Too many keywords. Up to 8 are allowed. Using the first 8.")
    # BUGFIX: the slice was [:7], keeping only 7 keywords although the
    # warning above promises 8.  Also use enumerate instead of
    # list.index, which returned the wrong number for duplicate
    # keywords.
    for keyword_number, keyword in enumerate(info_dict['eoa_keywords'][:8], start=1):
        general_config["Keyword" + str(keyword_number)] = keyword
    general_config['DetailedDescription'] = info_dict['eoa_detail_desc']
    general_config['AdditionalInformation'] = info_dict['eoa_additional_info']
    general_config['EditorialCoordination'] = ", ".join(info_dict['eoa_editorialcoordinators'])
    general_config['Copyediting'] = ", ".join(info_dict['eoa_copyeditors'])
    general_config['Dedication'] = info_dict['eoa_dedication']
    general_config['Translator'] = ", ".join(info_dict['eoa_translators'])
    number_of_authors = len(info_dict['eoa_authors'])
    number_of_editors = len(info_dict['eoa_editors'])
    # A publication has either authors or volume editors, never both,
    # and at most 5 of either.
    if number_of_authors > 0 and number_of_editors > 0:
        logging.error("Found both editor and authors. This is not permitted. Exiting")
        sys.exit(1)
    elif number_of_authors == 0 and number_of_editors == 0:
        logging.error("Found neither editor nor authors. Please fill in. Exiting")
        sys.exit(1)
    elif number_of_authors > 5 or number_of_editors > 5:
        logging.error("Only a maximum of 5 authors or editors allowed. Exiting")
        sys.exit(1)
    elif number_of_authors == 0 and number_of_editors in range(1, 6):
        EDITED_VOLUME = True
    elif number_of_authors in range(1, 6) and number_of_editors == 0:
        EDITED_VOLUME = False
    else:
        logging.error("Something went wrong with the number of authors and editors. Please check. Exiting")
        sys.exit(1)
    # Always emit Author1..Author5; unused slots stay empty.
    for entry in range(0, 5):
        author_label = "Author" + str(entry + 1)
        try:
            if EDITED_VOLUME == True:
                authors_config[author_label] = info_dict['eoa_editors'][entry]
                if number_of_editors == 1:
                    authors_config['Zusatz'] = "({})".format(libeoaconvert.translate("editor-abbr", info_dict['eoa_language'], translation_file))
                else:
                    authors_config['Zusatz'] = "({})".format(libeoaconvert.translate("editors-abbr", info_dict['eoa_language'], translation_file))
            else:
                authors_config[author_label] = info_dict['eoa_authors'][entry]
                authors_config['Zusatz'] = ""
        except IndexError:
            authors_config[author_label] = ""
    return config
# def make_publication_cfg ends here
def sanitize_data_string(text_string, newline_to_space=False):
    """Remove line breaks and collapse runs of whitespace.

    CR and LF characters are deleted (or replaced by a space when
    *newline_to_space* is True), consecutive whitespace is collapsed to
    a single space, and the result is stripped.  BUGFIX: the regex is
    now a raw string; "\\s\\s+" previously only worked because Python
    passes unknown escapes through verbatim.
    """
    replacement = " " if newline_to_space else ""
    text_string = text_string.replace('\r', replacement).replace('\n', replacement)
    return_string = re.sub(r"\s\s+", " ", text_string)
    return return_string.strip()
# def sanitize_data_string ends here
def check_bibliography(xml_tree):
    """Check TEI header for bibliography data, return relevant data as dictionary."""
    query_base = "//t:teiHeader/t:fileDesc/t:sourceDesc/t:ab[@type='bibdatabase']/t:ref/@"
    bib_data = {
        "source": xml_tree.xpath(query_base + "target", namespaces=NS_MAP)[0],
        "type": xml_tree.xpath(query_base + "type", namespaces=NS_MAP)[0],
    }
    logging.info("The bibfile is %s and this publication type is %s." % (bib_data["source"], bib_data["type"]))
    # Only these four bibliography styles are supported downstream.
    allowed_types = ("monograph", "anthology", "monograph-numeric", "anthology-numeric")
    if bib_data["type"] not in allowed_types:
        logging.error(f"The bibliography type {bib_data['type']} is not allowed. Exiting")
        sys.exit(1)
    return bib_data
# def check_bibliography ends here
def render_reference(list_of_xml_elements, cited_data):
    """Provide an attribute for a formatted version of Reference.

    This will be used for output formats that don't have a bibliographic
    formatter themselves.  For each reference element, two <abbr>
    children are appended, carrying the author-year and year-only
    citation strings from *cited_data* (keyed by citekey).
    """
    for reference in list_of_xml_elements:
        # @target is "#<citekey>"; drop the leading '#'.
        target = reference.xpath("t:ref/@target", namespaces=NS_MAP)[0]
        citekey = target[1:]
        # cited_data tuple layout: index 1 = authoryear, index 2 = year.
        for abbr_type, value_index in (("authoryear", 1), ("year", 2)):
            abbr_element = etree.SubElement(reference, "abbr", type=abbr_type)
            abbr_element.text = cited_data[citekey][value_index]
# def render_reference ends here
def write_citation_markdown(used_citekeys, citations_filename):
    """Write markdown file with citekeys for bibliography rendering"""
    header = "---\nlang: en\ntitle: Citations\n...\n\n"
    # Three pandoc citation styles, one section per style:
    # [@key] author-year, [-@key] year only, @key sentence style.
    sections = (
        ("# citeauthoryear\n", "[@%s]\n"),
        ("\n# citeyear\n", "[-@%s]\n"),
        ("\n# yearparen\n", "@%s\n"),
    )
    with open(citations_filename, "w") as citation_formatter:
        citation_formatter.write(header)
        for heading, pattern in sections:
            citation_formatter.write(heading)
            for citekey in used_citekeys:
                citation_formatter.write(pattern % citekey)
        citation_formatter.write("\n# References\n")
    logging.info(f"Wrote citation formatter: {citations_filename}")
# def write_citation_markdown ends here
def format_reference_list(used_citekeys, html_file):
    """Create an HTML formatted list of references.

    Parses the rendered HTML bibliography and returns the first
    <div class='references'> element.  *used_citekeys* is currently
    unused but kept for interface stability.
    """
    logging.info("Opening %s", html_file)
    reference_tree = etree.parse(str(html_file))
    return reference_tree.xpath("//div[@class='references']")[0]
# def format_reference_list ends here
def format_citations(
        used_citekeys,
        html_file
):
    """Return a dictionary of the used citations as formatted entries.

    Parses *html_file* (the bibliography rendered to HTML, produced by
    an earlier run of this script) and extracts, for each citekey in
    *used_citekeys*:

    citation_dict[citekey] = (authoryear_citation, year_citation, title, full_citation)

    Exits the program when the HTML file or an entry is missing.
    """
    def cleanup_full_citation_with_markup(citation_element, entry):
        """Generate a cleaned variant of the full citation.

        Serializes the element, then slices out the text between the
        opening <p data-cites=...> tag and the closing </p> by string
        index, and escapes it for XML.
        """
        citation_element_string = etree.tostring(citation_element).decode('utf-8')
        sanitized_citation = sanitize_data_string(citation_element_string, newline_to_space=True)
        prequel = f"""<p data-cites="{entry}">"""
        sequel = f"</p>"
        # index() raises ValueError if the markers are absent; the
        # slice below assumes exactly one <p>...</p> wrapper.
        prequel_end = sanitized_citation.index(prequel) + len(prequel)
        sequel_begin = sanitized_citation.index(sequel)
        escaped_citation = libeoaconvert.escape_xml(sanitized_citation[prequel_end:sequel_begin], decode=False)
        return escaped_citation
    # def cleanup_full_citation_with_markup ends here
    def cleanup_full_citation_without_markup(citation_element):
        """Generate a cleaned variant of full citation, but without markup"""
        textnodes = gettext(citation_element)
        sanitized_text = sanitize_data_string(textnodes, newline_to_space=True)
        return sanitized_text
    # def cleanup_full_citation_without_markup ends here
    def gettext(xmlElement):
        """Get text nodes of element, recursively, including tails."""
        xmlText = xmlElement.text or ""
        for xmlChild in xmlElement:
            xmlText += gettext(xmlChild)
            if xmlChild.tail:
                xmlText += xmlChild.tail
        return xmlText
    # def gettext ends here
    try:
        cites = etree.parse(str(html_file))
    except OSError:
        logging.error(f"File {str(html_file)} does not exist. Try and create it by running this script without the -n option. Exiting")
        sys.exit(1)
    citation_dict = {}
    for entry in used_citekeys:
        try:
            # Each citation style lives in its own <div class=...>
            # section of the rendered HTML.
            authoryear_citation = cites.xpath(f"//div[@class='authoryear']/p/span[@data-cites='{entry}']")[0].text
            year_citation = cites.xpath(f"//div[@class='year']/p/span[@data-cites='{entry}']")[0].text
            title = cites.xpath(f"//div[@class='title']/p/span[@data-cites='{entry}']")[0].text
            full_citation = cleanup_full_citation_without_markup(cites.xpath(f"//div[@class='full']/p[@data-cites='{entry}']")[0])
        except IndexError:
            logging.error(f"Entry {entry} was not found in HTML file. Maybe you should run the tool again without -n option. Exiting.")
            sys.exit(1)
        citation_dict[entry] = (authoryear_citation, year_citation, title, full_citation)
    return citation_dict
# def format_citations ends here
def format_pagerange(pagerange_start, pagerange_end):
    """Parse values of citedRange attributes. Return formatted string.

    Joins start and end (when given) with an en dash; either bound may
    be None.
    """
    parts = []
    if pagerange_start is not None:
        parts.append(pagerange_start)
    if pagerange_end is not None:
        parts.append("–" + pagerange_end)
    return "".join(parts)
# def format_pagerange ends here
def format_authors(list_author_id, publang, xml_tree, translation_file):
    """Retrieve author names from respStmt entries and format them.

    Each id in *list_author_id* is a "#xml-id" reference to a respStmt
    element; names are joined with a localized "and" (with an Oxford
    comma for English lists of three or more).
    """
    names = []
    for author_id in list_author_id:
        # Strip the leading '#' to get the xml:id.
        query = "//t:respStmt[@xml:id='%s']" % author_id[1:]
        person = xml_tree.xpath(query, namespaces=NS_MAP)[0]
        surname = person.find("t:persName/t:surname", namespaces=NS_MAP).text
        forename = person.find("t:persName/t:forename", namespaces=NS_MAP).text
        names.append("{} {}".format(forename, surname))
    if len(names) == 1:
        return names[0]
    if len(names) == 2:
        conjunction = libeoaconvert.translate("and", publang, translation_file)
        return "{} {} {}".format(names[0], conjunction, names[1])
    if len(names) > 2:
        joined = ", ".join(names[0:-1])
        conjunction = libeoaconvert.translate("and", publang, translation_file)
        if publang == "en":
            joined += ", {} {}".format(conjunction, names[-1])
        else:
            joined += " {} {}".format(conjunction, names[-1])
        return joined
    return ""
# def format_authors ends here
def get_hi_data_xml(XML_FILE):
    """Parse the hyperimage XML file and return its element tree."""
    logging.debug("Opening %s", XML_FILE)
    return etree.parse(str(XML_FILE))
# def get_hi_data_xml ends here
def hi_lookup_code_xml(hi_xmltree, hitrue_xml_id):
    """Get hyperimage code.

    Returns the text of the <ab> child of the div identified by
    *hitrue_xml_id*; exits the program when no such div exists.
    """
    matches = hi_xmltree.xpath(f"//t:div[@xml:id='{hitrue_xml_id}']/t:ab", namespaces=NS_MAP)
    if not matches:
        logging.error("Could not find hi code %s in XML file. Exiting.", hitrue_xml_id)
        sys.exit(1)
    return matches[0].text
# def hi_lookup_code_xml ends here
def get_hitarget_xml(hi_xmltree, teitarget):
    """Find out corresponding hyperimage id for hyperimage link.

    Resolves the TEI id *teitarget* to the @corresp value of its div;
    exits the program when the div is missing.
    """
    matches = hi_xmltree.xpath(f"//t:div[@xml:id='{teitarget}']", namespaces=NS_MAP)
    if not matches:
        logging.error("Could not find hi code %s in XML file. Exiting.", teitarget)
        sys.exit(1)
    return matches[0].get("corresp")
# def get_hitarget_xml ends here
def get_hi_data_csv(CSV_FILE):
    """Get hyperimage code from CSV file.

    Reads a CSV file with the columns xmlid, hiid, layers and
    elementstring (first row is assumed to be a header and skipped)
    and returns a dict mapping each xmlid to its hiid, elementstring
    and layers.

    BUGFIX: the csv module was used without being imported, so this
    function raised NameError at runtime.  The pointless
    json.dumps/json.loads round-trip over the reader was removed and
    the file handle is now closed via a context manager.
    """
    nd = {}
    logging.debug("Opening %s", CSV_FILE)
    with open(CSV_FILE, newline='') as f:
        reader = csv.DictReader(f, fieldnames=("xmlid", "hiid", "layers", "elementstring"))
        next(reader, None)  # skip the header row (was list(reader)[1:])
        for row in reader:
            nd[row["xmlid"]] = {
                "hiid": row["hiid"],
                "elementstring": row["elementstring"],
                "layers": row["layers"],
            }
    return nd
# def get_hi_data_csv ends here
def hi_lookup_code_csv(nd, hitrue_xml_id):
    """Get hyperimage code.

    Looks up *hitrue_xml_id* in the dictionary produced by
    get_hi_data_csv and returns its element string; exits the program
    when the id is unknown.
    """
    if hitrue_xml_id not in nd:
        logging.error("Could not find hi code %s in CSV file. Exiting.", hitrue_xml_id)
        sys.exit(1)
    return nd[hitrue_xml_id]["elementstring"]
# def hi_lookup_code_csv ends here
def get_hitarget_csv(nd, teitarget):
    """Find out corresponding hyperimage id for hyperimage link.

    Looks up *teitarget* in the dictionary produced by get_hi_data_csv
    and returns its hiid; exits the program when the id is unknown.
    """
    if teitarget not in nd:
        logging.error("Could not find hi code %s. Exiting", teitarget)
        sys.exit(1)
    return nd[teitarget]["hiid"]
# def get_hitarget_csv ends here
def resolve_choice_to_expan(element_containing_choice, original_element):
    """Check whether an element contains choice and if so return expanded version.

    A TEI <choice> (expected as the first child, if present) pairs an
    abbreviation with its expansion.  When found, the <expan> child is
    re-tagged as *original_element* and hoisted into the parent, the
    parent is re-tagged "tagtobestripped" (stripped later in the
    pipeline) and the <choice> is removed.  Otherwise the element is
    returned unchanged.
    """
    element_children = element_containing_choice.getchildren()
    if element_children and element_children[0].tag == "{" + ns_tei + "}choice":
        # Keep the expanded form and give it the caller-supplied tag.
        cleaned_element = element_children[0].find("t:expan", namespaces=NS_MAP)
        cleaned_element.tag = original_element
        element_containing_choice.insert(-1, cleaned_element)
        # The wrapper element itself will be removed by a later
        # strip-tags pass; the <choice> is dropped right away.
        element_containing_choice.tag = "tagtobestripped"
        etree.strip_elements(element_containing_choice, "{%s}choice" % ns_tei)
    else:
        cleaned_element = element_containing_choice
    return cleaned_element
# def resolve_choice_to_expan ends here
def transform_body(xml_tree, cited_data, translation_file, template_path, xml_hyperimagexml_code, olddesign, publang, temp_dir, output_dir, hyperimage=False):
    """Transform the body of the TEI document into intermediate XML.

    The tree is modified in place and also returned.  The transformation
    steps are applied one after the other: document structure, paragraphs,
    verses, blockquotes, citations, footnotes, figures, inline graphics,
    hi elements, tables, lists, chapter abstracts, references, index
    entries, math and chemistry.

    :param xml_tree: parsed TEI document (lxml tree)
    :param cited_data: dict mapping citekeys to formatted citation data
        (index 1 is the year-style rendering, index 0 the default one,
        index 3 the popup content)
    :param translation_file: translations used when formatting author names
    :param template_path: template directory, handed to equation processing
    :param xml_hyperimagexml_code: hyperimage concordance (read only when
        ``hyperimage`` is True)
    :param olddesign: if True, emit the markup of the old platform design
    :param publang: publication language, written onto each chapter
    :param temp_dir: working directory for equation rendering
    :param output_dir: output directory for equation rendering
    :param hyperimage: whether the publication contains hyperimage links
    :returns: the transformed tree
    """

    def convert_n_to_rend(div_element):
        """Translate n="nonumber" into rend="nonumber" on a div."""
        if div_element.get("n") == "nonumber":
            div_element.set("rend", "nonumber")
    # def convert_n_to_rend end here

    def retain_original_contents(ref):
        """Keep a copy of the original contents in an <originalcontents> child."""
        refcontents = deepcopy(ref)
        refcontents.tail = ""
        # fix: iterate over a snapshot -- deleting keys from the attrib
        # mapping while iterating it directly is unsafe
        for attrib in list(refcontents.attrib):
            del refcontents.attrib[attrib]
        refcontents.tag = "originalcontents"
        ref.append(refcontents)
        return
    # def retain_original_contents ends here

    def handle_refs_default(ref):
        """Convert one TEI <ref> into the matching EOA reference element."""
        target_attribute = ref.get("target")
        if not target_attribute:
            logging.error("Found a ref element without target. Exiting.")
            sys.exit()
        else:
            type_attribute = ref.get("type")
            if type_attribute == "url":
                # external link
                del ref.attrib["type"]
                del ref.attrib["target"]
                ref.tag = "xref"
                ref.set("url", target_attribute)
            elif type_attribute == "hionlycollage":
                if not hyperimage:
                    logging.warning("This file contains references to Hyperimage. You should run the script with the option 'hyperimage'.")
                # <a href="#Fig142and5" class="HILink">14.2 and 14.5</a>
                ref.tag = "EOAref"
                ref.set("type", "collage")
                del ref.attrib["target"]
                ref_subelement = etree.SubElement(ref, "ref", teitarget=target_attribute)
                hitarget = get_hitarget_xml(hyperimage_data, target_attribute[1:])
                ref_subelement.set("hitarget", hitarget)
                etree.SubElement(ref, "Label").text = target_attribute[1:]
            elif type_attribute == "number-hi" or type_attribute == "number-hionly":
                if not hyperimage:
                    logging.warning("This file contains references to Hyperimage. You should run the script with the option 'hyperimage'.")
                # in this case, the hitarget has to be resolved to teitarget
                ref.tag = "EOAref"
                ref.set("type", "number")
                del ref.attrib["target"]
                ref_subelement = etree.SubElement(ref, "ref", teitarget=target_attribute)
                hitarget = get_hitarget_xml(hyperimage_data, target_attribute[1:])
                ref_subelement.set("hitarget", hitarget)
                etree.SubElement(ref, "Label").text = target_attribute[1:]
                if ref.get("select"):
                    hilayer = ref.get("select")
                    logging.debug(f"Found hyperimage annotation layer {hilayer}")
                    ref_subelement.set("data-hilayer", hilayer)
            elif type_attribute == "text-hi":
                if not hyperimage:
                    logging.warning("This file contains references to Hyperimage. You should run the script with the option 'hyperimage'.")
                # in this case, the hitarget has to be resolved to teitarget
                ref.tag = "EOAref"
                ref.set("type", "text")
                reference_children = ref.getchildren()
                reference_text = ref.text
                if reference_children or reference_text:
                    retain_original_contents(ref)
                del ref.attrib["target"]
                ref_subelement = etree.SubElement(ref, "ref", teitarget=target_attribute)
                hitarget = get_hitarget_xml(hyperimage_data, target_attribute[1:])
                ref_subelement.set("hitarget", hitarget)
                etree.SubElement(ref, "Label").text = target_attribute[1:]
                if ref.get("select"):
                    hilayer = ref.get("select")
                    logging.debug(f"Found hyperimage annotation layer {hilayer}")
                    ref_subelement.set("data-hilayer", hilayer)
            elif type_attribute == "page":
                ref.tag = "EOApageref"
                del ref.attrib["type"]
                del ref.attrib["target"]
                etree.SubElement(ref, "ref", teitarget=target_attribute)
                etree.SubElement(ref, "Label").text = target_attribute[1:]
            else:
                # plain internal cross-reference
                ref.tag = "EOAref"
                del ref.attrib["target"]
                etree.SubElement(ref, "ref", teitarget=target_attribute)
                etree.SubElement(ref, "Label").text = target_attribute[1:]
        return
    # def handle_refs_default ends here

    logging.info("Performing XML transformations of the body.")
    if hyperimage:
        # NOTE(review): hyperimage_data stays unbound when hyperimage is
        # False; the hi branches below then raise NameError -- same as the
        # original behavior, they are only reached via hi-typed input.
        hyperimage_data = get_hi_data_xml(xml_hyperimagexml_code)

    ######################
    # Document structure #
    ######################
    latex_pis = xml_tree.xpath("//processing-instruction('latex')")
    for latex_pi in latex_pis:
        # fix: raw strings -- "\E" was an invalid escape sequence
        # (DeprecationWarning, SyntaxError in future Python); runtime
        # values are unchanged
        indices = [r"\EOAprintindex", r"\EOAprintpersonindex", r"\EOAprintlocationonindex"]
        if latex_pi.text in indices:
            # replace the LaTeX index PI by an element of the same name
            parent_element = latex_pi.getparent()
            index_command = latex_pi.text[1:]
            parent_element.append(etree.Element(index_command))
        if latex_pi.getparent() is not None:
            etree.strip_tags(latex_pi.getparent(), latex_pi.tag)
        else:
            logging.warning("Found a processing instruction without parent!")

    eoa_parts = xml_tree.xpath("//t:div[@type='part']", namespaces=NS_MAP)
    for part in eoa_parts:
        part.tag = "div0"
        # move xml:id into a plain id attribute
        xml_id = part.attrib["{http://www.w3.org/XML/1998/namespace}id"]
        del part.attrib["{http://www.w3.org/XML/1998/namespace}id"]
        part.set("id", xml_id)

    eoa_chapters = xml_tree.xpath("//t:div[@type='chapteroriginal' or @type='chapter' or @type='chaptertranslation']", namespaces=NS_MAP)
    for chapter in eoa_chapters:
        chapter.tag = "div1"
        chapter.set("language", publang)
        convert_n_to_rend(chapter)
        chapter_title_element = chapter.find("t:head", namespaces=NS_MAP)
        chapter_title = resolve_choice_to_expan(chapter_title_element, "head")
        author_ids = chapter.get("resp")
        if author_ids is not None:
            list_author_id = author_ids.split(" ")
            logging.info("Found chapter author shortcuts: {}.".format(", ".join(list_author_id)))
            if len(list_author_id) > 0:
                author_string = format_authors(list_author_id, publang, xml_tree, translation_file)
                eoa_author = etree.Element("EOAauthor")
                eoa_author.text = author_string
                chapter_title.insert(0, eoa_author)
        else:
            logging.info("No chapter author.")
        # make sure the (possibly expanded) title is the first child
        chapter.insert(0, chapter_title_element)

    eoa_sections = xml_tree.xpath("//t:div[@type='section']", namespaces=NS_MAP)
    for section in eoa_sections:
        section.tag = "div2"
        convert_n_to_rend(section)

    eoa_milestones = xml_tree.xpath("//t:milestone[@type='divider']", namespaces=NS_MAP)
    for milestone in eoa_milestones:
        milestone.tag = "p"
        del milestone.attrib["type"]
        milestone.set("class", "divider")

    eoa_subsections = xml_tree.xpath("//t:div[@type='subsection']", namespaces=NS_MAP)
    for subsection in eoa_subsections:
        subsection.tag = "div3"
        convert_n_to_rend(subsection)

    eoa_subsubsections = xml_tree.xpath("//t:div[@type='subsubsection']", namespaces=NS_MAP)
    for subsubsection in eoa_subsubsections:
        subsubsection.tag = "div4"
        convert_n_to_rend(subsubsection)

    ##############
    # Paragraphs #
    ##############
    eoa_paragraphs = xml_tree.xpath("//t:p[not(@rend='footnote text')]", namespaces=NS_MAP)
    for paragraph in eoa_paragraphs:
        # re-tagging strips the TEI namespace
        paragraph.tag = "p"

    ##########
    # Verses #
    ##########
    eoa_verses = xml_tree.xpath("//t:body//t:lg[@type = 'verse']", namespaces=NS_MAP)
    for verse in eoa_verses:
        verse.tag = "EOAverse"
        for child in verse.getchildren():
            child.tag = "p"
            child.set("rend", "verse")

    ###############
    # Blockquotes #
    ###############
    eoa_blockquotes = xml_tree.xpath("//t:quote", namespaces=NS_MAP)
    for blockquote in eoa_blockquotes:
        blockquote.tag = "p"
        blockquote.set("rend", "quoted")

    #############
    # Citations #
    #############
    # we need some data of the references here!
    # Intermediate XML:
    # <span rel="popover" class="citation" citekey="monti_tradizione_2011"
    #   data-toggle="popover" html="true" data-placement="bottom"
    #   data-title="Monti " data-content="La tradizione galileiana e lo
    #   sperimentalismo naturalistico.">Monti </span>
    eoa_citations = xml_tree.xpath("//t:bibl", namespaces=NS_MAP)
    for citation in eoa_citations:
        pagerange = ""
        cited_range = citation.xpath("t:citedRange", namespaces=NS_MAP)
        citeref = citation.xpath("t:ref", namespaces=NS_MAP)
        # NOTE(review): an IndexError here means a bibl without a ref child
        cite_render = citeref[0].get("type")
        citekey = citeref[0].get("target")[1:]
        citeref[0].tag = "tagtobestripped"
        if citation.text:
            if re.match(r"^\s+$", citation.text):
                # whitespace-only text: drop it but keep the tail
                citation_tail = citation.tail
                citation.clear()
                citation.tail = citation_tail
            else:
                citation.text = citation.text.strip()
        cited_range_children = False
        if len(cited_range) > 0:
            has_content = libeoaconvert.has_text_or_children(cited_range[0])
            if has_content and cited_range[0].get("from") is not None:
                logging.error("You must not use 'from' attribute and text in citedRange at the same time. Exiting.")
                sys.exit(1)
            elif has_content:
                if len(cited_range[0].getchildren()) > 0:
                    # citedRange with child markup: keep the serialized form
                    cited_range_children = True
                    cited_range[0].tag = "tagtobestripped"
                    pagerange = f""", {etree.tostring(cited_range[0]).decode("utf-8").strip()}"""
                else:
                    pagerange = ", {}".format(cited_range[0].text)
                    # clear the text
                    cited_range[0].text = ""
            elif cited_range[0].get("from") is not None:
                pagerange_start = cited_range[0].get("from")
                pagerange_end = cited_range[0].get("to")
                pagerange = ", " + format_pagerange(pagerange_start, pagerange_end)
            cited_range[0].tag = "tagtobestripped"
        # index 1 of the citation record is the year-style rendering,
        # index 0 the default rendering (deduplicated from two identical
        # try/except blocks)
        citation_index = 1 if cite_render == 'year' else 0
        try:
            if cited_range_children:
                formatted_citation = etree.fromstring(f"<tagtobestripped>{cited_data[citekey][citation_index]}{pagerange}</tagtobestripped>")
            else:
                formatted_citation = cited_data[citekey][citation_index] + pagerange
        except KeyError:
            logging.error("Citekey %s was not found in the references. Exiting." % citekey)
            sys.exit(1)
        if not cited_range_children:
            sanitized_citation_string = sanitize_data_string(formatted_citation)
        if olddesign:
            # <span rel="popover" class="citation" citekey="DastonGalison_2010"
            #   data-toggle="popover" html="true" data-placement="bottom"
            #   data-title="Daston and Galison 2010"
            #   data-content="Objectivity">Daston and Galison 2010</span>
            citation.tag = "span"
            citation.set("rel", "popover")
            citation.set("class", "citation")
            citation.set("citekey", citekey)
            citation.set("data-toggle", "popover")
            citation.set("html", "true")
            citation.set("data-placement", "bottom")
        else:
            # <a class="publications-popup-text" data-title="Halliday and
            #   Resnick 1977, 232" data-content="Physics">Halliday and
            #   Resnick 1977, 232</a>
            # NOTE(review): when cited_range_children is True,
            # sanitized_citation_string is unbound (or stale from a previous
            # iteration) -- confirm citedRange with child markup never occurs
            # together with the new design.
            citation.tag = "a"
            citation.set("class", "publications-popup-text")
            citation.set("citekey", citekey)
            citation.set("data-title", sanitized_citation_string.strip())
            citation.set("data-content", cited_data[citekey][3])
        if cited_range_children:
            citation.append(formatted_citation)
        else:
            citation.text = sanitized_citation_string

    #############
    # Footnotes #
    #############
    # <note place="bottom" xml:id="ftn2" n="2"> becomes
    # <note id-text="34" id="uid40" place="Inline"><p>One reads</note>
    eoa_footnotes = xml_tree.xpath("//t:note[@place='bottom']", namespaces=NS_MAP)
    for footnote in eoa_footnotes:
        # re-assign tag here to get rid of namespace
        footnote.tag = "note"
        footnote.set("place", "Inline")
        footnote.set("id-text", footnote.get("n"))
        fn_parent = footnote.getparent()
        # we assert here that the parent of a footnote is always a paragraph or a quote
        try:
            footnote_id = footnote.xpath("@xml:id")[0]
        except IndexError:
            logging.error("Found footnote without xml:id. Exiting.")
            sys.exit(1)
        if fn_parent.prefix is not None:
            fn_parent_tag = fn_parent.tag.replace(fn_parent.prefix, "")
        else:
            fn_parent_tag = fn_parent.tag.replace(f"{{{ns_tei}}}", "")
        if fn_parent_tag not in ["p", "quote", "item"]:
            logging.error(f"The parent of footnote '{footnote_id}' is {fn_parent_tag}. Must be a p, quote or item. Exiting.")
            sys.exit(1)
        fn_paragraphs = footnote.xpath("t:p", namespaces=NS_MAP)
        for fn_paragraph in fn_paragraphs:
            fn_paragraph.tag = "p"
            del fn_paragraph.attrib["rend"]

    ###########
    # Figures #
    ###########
    # <figure><graphic url="..."/><head>caption</head></figure> becomes
    # <EOAfigure id="uid21"><anchor id-text="1" id="uid21"/>
    #   <p><caption>...</caption><file>...</file><width>60</width></p>
    # </EOAfigure>
    # hyperimage variant additionally carries hielement/corresp attributes
    eoa_figures = xml_tree.xpath("//t:figure", namespaces=NS_MAP)
    for figure in eoa_figures:
        # careful, caption can contain markup!
        caption_element = figure.find("t:head", namespaces=NS_MAP)
        figure_is_nonumber = figure.get("n") == "nonumber"
        figure_type = figure.get("type")
        if figure_is_nonumber:
            figure.tag = "EOAfigurenonumber"
            fig_p_element = etree.SubElement(figure, "p")
            etree.SubElement(fig_p_element, "file").text = figure.xpath("t:graphic/@url", namespaces=NS_MAP)[0]
            etree.SubElement(fig_p_element, "width").text = "60"  # whatever
        else:
            figure.tag = "EOAfigure"
            figure.set("id", "anotheruid")
            # the anchor element is used to determine whether a figure gets
            # an id and can be numbered
            etree.SubElement(figure, "anchor")
            fig_p_element = etree.SubElement(figure, "p")
            etree.SubElement(fig_p_element, "file").text = figure.xpath("t:graphic/@url", namespaces=NS_MAP)[0]
            etree.SubElement(fig_p_element, "width").text = "60"  # whatever
        if caption_element is not None:
            caption_element.tag = "caption"
            fig_p_element.append(caption_element)
        else:
            etree.SubElement(fig_p_element, "caption").text = ""
        hi_figure_types = ["hitrue", "hionly", "hionlycollage", "hionlysub"]
        if figure_type in hi_figure_types:
            # display image in hyperimage viewer, not in lightbox;
            # hitrue_xml_id needs to be looked up in hi_figures.xml
            logging.debug("Found figure for hiviewer.")
            if not hyperimage:
                logging.error("This file contains references to Hyperimage. You should run the script with the option 'hyperimage'. Exiting.")
                sys.exit(1)
            else:
                hitrue_xml_id = figure.attrib["{http://www.w3.org/XML/1998/namespace}id"]
                logging.info("Getting hielement for %s", hitrue_xml_id)
                hi_code = hi_lookup_code_xml(hyperimage_data, hitrue_xml_id)
                figure.set("hielement", hi_code)
        etree.strip_elements(figure, "{%s}graphic" % ns_tei)

    ###################
    # Inline graphics #
    ###################
    # if no scale or height is given, treat it as an EOAinline graphic,
    # else consider it a small image, for example in a footnote
    eoa_graphics = xml_tree.xpath("//t:graphic[not(ancestor::t:figure)]", namespaces=NS_MAP)
    for graphic in eoa_graphics:
        graphic_attributes = dict(graphic.attrib)
        graphic_tail = graphic.tail
        graphic.clear()
        # fix: clear() also wipes the tail; restore it so that text
        # following the graphic is not silently lost
        graphic.tail = graphic_tail
        if len(graphic_attributes) == 1:
            # only url attribute
            graphic.tag = "EOAinline"
            graphic.text = graphic_attributes["url"]
        elif len(graphic_attributes) == 2:
            graphic_parent = graphic.getparent()
            graphic_siblings = graphic_parent.getchildren()
            if graphic_parent.tag == "p" and len(graphic_siblings) == 1:
                graphic_parent.tag = "EOAfigurenonumber"
                graphic.tag = "p"
                etree.SubElement(graphic, "file").text = graphic_attributes["url"]
                # fix: narrowed from a bare except -- only the missing-key
                # case is expected here
                try:
                    size_parameter = graphic_attributes["scale"]
                except KeyError:
                    size_parameter = graphic_attributes["height"]
                etree.SubElement(graphic, "width").text = size_parameter
            else:
                logging.error("Illegal circumstances for a non-numbered figure. It should be the only element in a paragraph. Exiting.")
                logging.error(graphic_attributes)
                sys.exit(1)

    ##############
    # Hi-Element #
    ##############
    eoa_hi = xml_tree.xpath("//t:hi", namespaces=NS_MAP)
    for hi in eoa_hi:
        rend_attribute = hi.attrib["rend"]
        if rend_attribute == "italic":
            hi.set("rend", "it")
        elif rend_attribute == "superscript":
            hi.tag = "EOAup"
            del hi.attrib["rend"]
        elif rend_attribute == "subscript":
            hi.tag = "EOAdown"
            del hi.attrib["rend"]
        elif rend_attribute == "smallcaps":
            hi.tag = "EOAcaps"
            del hi.attrib["rend"]
        elif rend_attribute in ["red", "bold"]:
            pass
        else:
            logging.info("The rend attribute in hi has the value %s. This is not supported" % rend_attribute)

    ##########
    # Tables #
    ##########
    eoa_tables = xml_tree.xpath("//t:table", namespaces=NS_MAP)
    for table in eoa_tables:
        # collect rows and layout info before clearing the element
        tablechildren = table.findall("t:row", namespaces=NS_MAP)
        table_caption = table.find("t:head", namespaces=NS_MAP)
        number_of_cells = len(table.findall("t:row[1]/t:cell", namespaces=NS_MAP))
        table.tag = "EOAtable"
        if table_caption is not None:
            table_id = table.attrib["{http://www.w3.org/XML/1998/namespace}id"]
            table.clear()
            table_label = etree.Element("EOAtablelabel")
            table.append(table_label)
            table_label.text = table_id
            table_caption.tag = "EOAtablecaption"
        else:
            table.clear()
            table_caption = etree.Element("EOAtablecaption")
            table_caption.text = "nonumber"
            table.append(table_caption)
        # not sure if this is evaluated later.
        etree.SubElement(table, "EOAtablecolumns").text = "L3cm" * number_of_cells
        real_table_element = etree.SubElement(table, "table")
        if table_caption.text != "nonumber":
            table.insert(1, table_caption)
            real_table_element.set("place", table_id)
            real_table_element.attrib["{http://www.w3.org/XML/1998/namespace}id"] = table_id
        real_table_element.set("rend", "display")
        # attributes id-text and id are assigned later
        for row in tablechildren:
            if row.get("role") == "label":
                # mark the header row via a <tableheader> child in its first cell
                first_cell = row.find("t:cell", namespaces=NS_MAP)
                tableheader = etree.Element("tableheader")
                tableheader.text = "TRUE"
                if not first_cell.getchildren():
                    tableheader.tail = first_cell.text
                    first_cell.text = ""
                first_cell.insert(0, tableheader)
            # NOTE(review): rows without a role attribute raise KeyError here
            # -- presumably the TEI source always sets role; confirm
            del row.attrib["role"]
            cells = row.findall("t:cell", namespaces=NS_MAP)
            for cell in cells:
                del cell.attrib["role"]
                for linebreak in cell.findall("t:lb", namespaces=NS_MAP):
                    linebreak.tag = "br"
            real_table_element.append(row)
        libeoaconvert.wrap_into_element(etree.Element("p"), table)

    #########
    # Lists #
    #########
    eoa_lists = xml_tree.xpath("//t:body//t:list", namespaces=NS_MAP)
    for eoalist in eoa_lists:
        items = eoalist.findall("t:item", namespaces=NS_MAP)
        for listitem in items:
            listitem.tag = "p"
            libeoaconvert.wrap_into_element(etree.Element("item"), listitem)
        if eoalist.get("type") == "ordered":
            # fix: enumerate instead of items.index() (which was O(n^2) and
            # mislabels duplicate elements)
            for position, listitem in enumerate(items, start=1):
                new_item_element = listitem.getparent()
                new_item_element.set("id-text", f"{position}")
                new_item_element.set("label", f"{position}.")
        if eoalist.get("type") == "unordered":
            eoalist.set("type", "simple")
        if eoalist.get("type") == "gloss":
            eoalist.set("type", "description")

    #############
    # Epigraphs #
    #############
    # Epigraphs are handled in the subsequent steps

    #####################
    # Chapter abstracts #
    #####################
    chapter_abstracts = xml_tree.xpath("//t:body//t:ab[@type='chapterabstract']", namespaces=NS_MAP)
    for abstract in chapter_abstracts:
        logging.info("Removing chapter abstract, see https://github.molgen.mpg.de/EditionOpenAccess/django-eoapublications/issues/3")
        abstract.tag = "elementtoberemoved"

    ##############
    # References #
    ##############
    eoa_collage_refs = xml_tree.xpath("//t:body//t:ref[@type='hionlycollage']", namespaces=NS_MAP)
    # extract internal references in collage and put them directly after the collage EOAref
    for collage_ref in eoa_collage_refs:
        reference_children = collage_ref.getchildren()
        reference_text = collage_ref.text
        if reference_children or reference_text:
            retain_original_contents(collage_ref)
            original_contents = collage_ref.find(".//originalcontents")
            collage_ref.addnext(original_contents)
    eoa_ref = xml_tree.xpath("//t:body//t:ref", namespaces=NS_MAP)
    for ref in eoa_ref:
        handle_refs_default(ref)

    ############
    # Indexing #
    ############
    index_entries = xml_tree.xpath("//t:body//t:index[not(ancestor::t:index)]", namespaces=NS_MAP)
    for entry in index_entries:
        index_type = entry.get("indexName")
        tagname = "EOAindex"
        if index_type != "keyword":
            tagname += index_type
        entry.tag = tagname
        entry_tail = entry.tail
        entry_content = entry.find("t:term", namespaces=NS_MAP)
        entry_text = ""
        sortkey = entry_content.get("sortKey")
        if sortkey is None:
            logging.info("No sortkey found")
        else:
            # "sortkey@term" is the makeindex-style sort convention
            entry_text = f"{sortkey}@"
        # markup ignored for now
        entry_text += libeoaconvert.gettext(entry_content)
        # re-assemble
        entry.clear()
        entry.text = entry_text
        entry.tail = entry_tail

    ########
    # Math #
    ########
    process_inline_equations(xml_tree, eoa_chapters, template_path, temp_dir, output_dir)
    block_equations = xml_tree.xpath("//t:body//t:ab[@type='equation']", namespaces=NS_MAP)
    for equation in block_equations:
        # take care of nonumber, as well
        equation.tag = "ACHTUNGstillsomeworktodo"
    sub_equations = xml_tree.xpath("//t:body//t:ab[@type='subequations']", namespaces=NS_MAP)
    for equation in sub_equations:
        # take care of nonumber, as well
        equation.tag = "ACHTUNGstillsomeworktodo"
    equationarrays = xml_tree.xpath("//t:body//t:ab[@type='equationarray']", namespaces=NS_MAP)
    for equation in equationarrays:
        # take care of nonumber, as well
        equation.tag = "ACHTUNGstillsomeworktodo"
    # <ab type="theoremdeclaration" xml:id="theorem1">My Theorem</ab>
    # <ab type="theoreminstance" corresp="#theorem1">A statement.</ab>
    # theorems: not implemented yet

    #############
    # Chemistry #
    #############
    # <formula notation="mhchem" rend="inline">2HCO3^- + Ca^2+ <=> CaCO3 + CO2 + H2O</formula>
    inline_chemo = xml_tree.xpath("//t:body//t:formula[@rend='inline' and @notation='mhchem']", namespaces=NS_MAP)
    for chem in inline_chemo:
        # not implemented yet
        pass
    return xml_tree
# def transform_body ends here
def assign_ids(xml_tree, data, suppress_chapter_number):
    """Walk the xml tree again. Assign ids to xml and put them into dicts, as well.

    Numbers chapters, figures (including hyperimage sub-figures and
    collages), sections, subsections, tables and equations, writes the
    number into an ``id-text`` attribute and records a mapping from the
    element id to the assigned number in ``data``.

    :param xml_tree: intermediate XML tree (after transform_body)
    :param data: dict that receives the per-category id dictionaries
    :param suppress_chapter_number: if True, numbers are not prefixed
        with the chapter number (single-chapter publications)
    :returns: tuple of the (modified) tree and the updated data dict
    """
    chapterdict = {}
    figdict = {}
    eqdict = {}
    fndict = {}
    listdict = {}
    pagelabeldict = {}
    secdict = {}
    tabdict = {}
    theoremdict = {}
    chapter_counter = 1
    xml_chapters = xml_tree.xpath("//div1")
    for chapter in xml_chapters:
        # per-chapter counters (list/footnote/theorem numbering is not
        # implemented yet, see note at the end)
        equation_counter = 1
        footnote_counter = 1
        list_counter = 1
        figure_counter = 1
        collage_counter = 1
        section_counter = 1
        table_counter = 1
        theorem_counter = 1
        if chapter.get('n') != "nonumber":
            chapter.set("id-text", str(chapter_counter))
            chapterdict[chapter.get("id")] = str(chapter_counter)
        figure_anchors = chapter.findall(".//EOAfigure/anchor")
        figure_is_subfigure = False
        for anchor in figure_anchors:
            figure_number = "%d" % (figure_counter)
            figure_element = anchor.getparent()
            figure_element.set("id", anchor.get("id"))
            figure_type = figure_element.get("type")
            if figure_type == "hionlysub":
                # sub-figures share the number of the preceding figure and
                # get a letter suffix: 3a, 3b, ...
                if not figure_is_subfigure:
                    subfigure_counter = 0
                    figure_is_subfigure = True
                subfigure_counter += 1
                # fix: use the stdlib alphabet instead of rebuilding a
                # 26-element list on every anchor
                figure_number = f"{int(figure_number) - 1}{string.ascii_lowercase[subfigure_counter - 1]}"
            elif figure_type == "hionlycollage":
                figure_number = f"hicollage{chapter_counter}_{collage_counter}"
                collage_counter += 1
            else:
                subfigure_counter = 0
                figure_counter += 1
                # NOTE(review): only regular figures get the chapter prefix;
                # sub-figures keep their bare "3a" style number -- confirm
                # this is intended
                if not suppress_chapter_number:
                    figure_number = f"{chapter_counter}.{figure_number}"
            figdict[anchor.get("id")] = figure_number
            anchor.set("id-text", figure_number)
        footnotes = chapter.findall(".//note")
        for footnote in footnotes:
            fndict[footnote.get("id")] = footnote.get("n")
        sections = chapter.findall(".//div2")
        for section in sections:
            if section.get('n') != "nonumber":
                section_number = "%d.%d" % (chapter_counter, section_counter)
                section.set("id-text", section_number)
                secdict[section.get("id")] = section_number
                subsection_counter = 1
                subsections = section.findall(".//div3")
                for subsection in subsections:
                    if subsection.get('n') != "nonumber":
                        subsection_number = "%d.%d.%d" % (chapter_counter, section_counter, subsection_counter)
                        subsection.set("id-text", subsection_number)
                        secdict[subsection.get("id")] = subsection_number
                        subsection_counter += 1
                section_counter += 1
        tables = chapter.findall(".//EOAtable/table")
        for table in tables:
            if suppress_chapter_number:
                table_number = "%d" % (table_counter)
            else:
                table_number = "%d.%d" % (chapter_counter, table_counter)
            table.set("id-text", table_number)
            table_counter += 1
            tabdict[table.get("id")] = table_number
        equations = chapter.findall(".//EOAequation")
        for equation in equations:
            if suppress_chapter_number:
                equation_number = "%d" % (equation_counter)
            else:
                equation_number = "%d.%d" % (chapter_counter, equation_counter)
            equation.attrib["id-text"] = equation_number
            equation_counter += 1
            eqdict[equation.get("id")] = equation_number
        if chapter.get('n') != "nonumber":
            chapter_counter += 1
    # not implemented yet: list, pagelabel, tab, theorem
    data["chapterdict"] = chapterdict
    data["figdict"] = figdict
    data["eqdict"] = eqdict
    data["fndict"] = fndict
    data["listdict"] = listdict
    data["pagelabeldict"] = pagelabeldict
    data["secdict"] = secdict
    data["tabdict"] = tabdict
    data["theoremdict"] = theoremdict
    return xml_tree, data
# def assign_ids ends here
def update_ids(xml_tree, ignore_ref_errors):
    """Update the references in EOAref to the id value assigned in assign_ids.

    For every EOAref/EOApageref, look up the element carrying the
    referenced xml:id and write its id into the ``target`` attribute of
    the inner ``ref`` element.

    :param xml_tree: intermediate XML tree
    :param ignore_ref_errors: if True, unresolved or duplicate ids are
        logged and skipped instead of aborting the program
    :returns: the modified tree
    """
    xmlReferences = xml_tree.xpath(".//EOAref|.//EOApageref")
    logging.debug("Found %d references", len(xmlReferences))
    for xmlReference in xmlReferences:
        eoa_reference = xmlReference.find("ref")
        label_text = xmlReference.find("Label").text
        logging.debug("label text is %s" % label_text)
        corresponding_eoa_id_element = xml_tree.xpath("//*[@xml:id='{}']".format(label_text))
        logging.debug("The corresponding id element is %s", corresponding_eoa_id_element)
        if len(corresponding_eoa_id_element) == 0:
            if ignore_ref_errors:
                logging.warning(f"Found no corresponding xml:id for {label_text}. Ignoring it for now.")
                eoa_reference.set("target", "??")
            else:
                logging.error("There seems to be no corresponding xml:id for %s. Exiting." % label_text)
                sys.exit(1)
        elif len(corresponding_eoa_id_element) > 1:
            if ignore_ref_errors:
                pass
            else:
                logging.error("The xml:id %s has been assigned more than once. This is not allowed. Exiting." % corresponding_eoa_id_element[0].attrib["{http://www.w3.org/XML/1998/namespace}id"])
                sys.exit(1)
        else:
            eoa_id_element = corresponding_eoa_id_element[0]
            if eoa_id_element.get("id"):
                eoa_id = eoa_id_element.get("id")
            elif eoa_id_element.xpath("@xml:id", namespaces=NS_MAP):
                eoa_id = eoa_id_element.xpath("@xml:id", namespaces=NS_MAP)[0]
            else:
                logging.warning(f"{eoa_id_element.tag} has no id")
                # fix: previously fell through to the set() below with
                # eoa_id unbound (NameError) or stale from an earlier
                # iteration -- skip this reference instead
                continue
            eoa_reference.set("target", eoa_id)
    return xml_tree
# def update_ids ends here
def add_bibliography_monograph(xml_tree, refs_for_bib_chapter):
    """Add another chapter containing the bibliography.

    Locate every <?eoa printbibliography?> processing instruction inside
    a div1 and insert the formatted reference entries there, each entry
    re-tagged as a paragraph of class "bibliography".
    """
    for eoa_pi in xml_tree.xpath("//div1//processing-instruction('eoa')"):
        if eoa_pi.text != "printbibliography":
            continue
        parent = eoa_pi.getparent()
        if parent.tag == "div1":
            bibliography_chapter = parent
        else:
            # dissolve the wrapper and attach to the enclosing chapter
            parent.tag = "tagtobestripped"
            bibliography_chapter = eoa_pi.xpath("ancestor-or-self::div1[1]")[0]
        outer_div = etree.SubElement(bibliography_chapter, "div")
        inner_div = etree.SubElement(outer_div, "div")
        for entry in refs_for_bib_chapter.findall(".//div"):
            entry.set("class", "bibliography")
            etree.strip_tags(entry, "p")
            entry.tag = "p"
            for markup in entry.findall(".//em"):
                markup.tag = "i"
            inner_div.append(entry)
    return xml_tree
# def add_bibliography_monograph ends here
def add_bibliography_anthology(xml_tree, formatted_references_dict):
    """Add per-chapter bibliographies to an anthology.

    For every chapter that has a formatted reference list, locate the
    <?eoa printbibliography?> processing instruction inside the chapter's
    div1, replace it with the fixed-up reference entries wrapped in an
    extra <div>, and drop the processing instruction itself.

    Parameters:
        xml_tree: lxml tree of the book body containing div1 chapters.
        formatted_references_dict: maps "dict_<chapter id>" keys to the
            formatted reference snippets for that chapter.

    Returns:
        The modified xml_tree.
    """
    for dict_key, references in formatted_references_dict.items():
        chapter_id = dict_key.replace("dict_", "")
        pi_xpath = "//div1[@xml:id='%s']//processing-instruction('eoa')" % chapter_id
        for eoa_pi in xml_tree.xpath(pi_xpath):
            if eoa_pi.text != "printbibliography":
                continue
            # assuming there's only one printbibliography PI per chapter
            bibliography_parent = eoa_pi.getparent()
            fixed_references = fix_bib_entries(references)
            # Resulting structure, e.g.:
            # <div1 rend="nonumber" language="english"><head>Bibliography</head>
            # <div><div><p id="ref-..." class="bibliography">...</p></div></div>
            extra_div = etree.SubElement(bibliography_parent, "div")
            extra_div.insert(1, fixed_references)
            bibliography_parent.remove(eoa_pi)
    return xml_tree
# def add_bibliography_anthology ends here
def fix_bib_entries(div_snippet):
    """Modify the html code returned by pandoc-citeproc.

    Each <div> entry is flattened into a single <p class="bibliography">
    (nested <p> tags are stripped, the div itself is retagged), and
    <em> emphasis is converted to the <i> tag used downstream.
    The snippet is modified in place.

    Parameters:
        div_snippet: element containing one <div> per reference entry.

    Returns:
        The (modified) div_snippet.
    """
    for entry in div_snippet.findall(".//div"):
        # (the original also read entry.get("id") into an unused local;
        # the id attribute itself is left untouched)
        entry.set("class", "bibliography")
        etree.strip_tags(entry, "p")
        entry.tag = "p"
        for markup in entry.findall(".//em"):
            markup.tag = "i"
    return div_snippet
# def fix_bib_entries ends here
def detect_hyperimage(tei_body):
    """Return True if the TEI body contains hyperimage figures.

    A figure counts as hyperimage when its type attribute is either
    'hitrue' or 'hionly'.
    """
    matches = tei_body.xpath(
        ".//t:figure[@type='hitrue' or @type='hionly']",
        namespaces=NS_MAP,
    )
    return len(matches) > 0
# def detect_hyperimage ends here
def main():
    """Main function.

    Drives the whole conversion: parses command-line arguments, loads
    the configuration and the pickled data left behind by fix_tei.py,
    builds the formatted bibliography (monograph or anthology),
    transforms the TEI body, assigns ids, and finally writes
    publication.cfg and IntermediateXMLFile.xml to the output directory.
    """
    # parse args:
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        "-c", "--config",
        dest="CONFIG_FILE",
        default = BASE_DIR / "config" / "eoaconvert.cfg",
        help="Name of configuration file",
        metavar="CONFIGURATION"
    )
    parser.add_argument(
        "--log-level",
        default = "INFO",
        help="log level: choose between DEBUG, INFO, WARNING, ERROR, CRITICAL"
    )
    parser.add_argument(
        "-f", "--filename",
        default = Path("*.xml"),
        type = Path,
        help = "xml file inside PUBLICATION_DIR, or absolute path. Patterns like '*.xml' are also acceptable"
    )
    parser.add_argument(
        "-o", "--output-dir",
        help = f"output directory. default: {DEFAULT_OUTPUT_DIR}/PUBLICATION_NAME/imxml",
        type = Path,
    )
    parser.add_argument(
        "-i", "--ignore-ref-errors",
        action="store_true",
        help="Ignore warnings of missing or duplicate ids."
    )
    parser.add_argument(
        "-s", "--suppress-chapter-number",
        action="store_true",
        help="In floats, only print number of figure, not chapter before."
    )
    parser.add_argument(
        "-d", "--pickleddata",
        help = f"directory containing pickled data file to be used. default {DEFAULT_OUTPUT_DIR}/PUBLICATION_NAME/pickle",
        type = Path,
    )
    parser.add_argument(
        "-n", "--no-bib4ht",
        action="store_true",
        help="Skip creation of bibliography, rely on already present HTML files."
    )
    parser.add_argument(
        "-classic", "--eoa-classic",
        action="store_true",
        help="Embed webdesign of EOA1.0 into XML"
    )
    parser.add_argument(
        "PUBLICATION_DIR",
        help = "directory containing the publication (including resources like pictures, etc.)",
        type = Path,
    )
    args = parser.parse_args()
    # Derive all working paths.  A relative --filename is used as a glob
    # pattern inside PUBLICATION_DIR; only the first match is taken.
    INPUT_DIR = args.PUBLICATION_DIR
    INPUT_PATH = \
        args.filename if args.filename . is_absolute() else list( INPUT_DIR . glob( str(args.filename) ))[0]
    OUTPUT_DIR = \
        args.output_dir if args.output_dir is not None else (DEFAULT_OUTPUT_DIR / INPUT_DIR.resolve().stem) / "imxml"
    PICKLE_DIR = \
        args.pickleddata if args.pickleddata is not None else (DEFAULT_OUTPUT_DIR / INPUT_DIR.resolve().stem) / "pickle"
    PICKLE_FILE = PICKLE_DIR / "data.pickle"
    LOG_DIR = OUTPUT_DIR / "log"
    TEMP_DIR = OUTPUT_DIR / "tmp_files"
    DEBUG_DIR = OUTPUT_DIR / "debug"
    # where to output the xml file:
    XML_FILE = (OUTPUT_DIR / INPUT_PATH.name) .with_suffix( ".xml" )
    config_file = args.CONFIG_FILE
    print("The config file is ", config_file)
    CONFIG = load_config(
        config_file,
        args.log_level,
        (LOG_DIR / SCRIPT_NAME) . with_suffix( ".log" ),
    )
    logging.info( "checking executables 'utils.bib2html' needs...:" )
    bib2html.check_executables()
    OUTPUT_DIR.mkdir(
        parents = True,
        exist_ok = True
    )
    TEMP_DIR.mkdir(
        parents = True,
        exist_ok = True
    )
    DEBUG_DIR.mkdir(
        parents = True,
        exist_ok = True
    )
    # fix_tei.py must have been run first: it produces data.pickle.
    try:
        with open(PICKLE_FILE, 'rb') as f:
            data = pickle.load(f)
    except FileNotFoundError:
        logging.error("File 'data.pickle' not found. You should run 'fix_tei.py' first. Exiting.")
        sys.exit(1)
    TRANSLATION_FILE = BASE_DIR / CONFIG['Auxiliaries']['TRANSLATIONS']
    TEMPLATE_PATH = BASE_DIR / CONFIG['Auxiliaries']['template_path']
    xml_tree = etree.parse(str(INPUT_PATH))
    # Publication language and cover image come from the TEI header/front.
    publication_language = xml_tree.xpath("//t:teiHeader/t:profileDesc/t:langUsage/t:language/@ident", namespaces=NS_MAP)[0]
    cover_image_path = xml_tree.xpath("//t:text/t:front/t:figure[@type='cover']/t:graphic/@url", namespaces=NS_MAP)[0]
    shutil.copy( INPUT_DIR / cover_image_path,
                 OUTPUT_DIR / "Cover.jpg"
    )
    bib_data = check_bibliography(xml_tree)
    logging.debug("Creating bibliographies.")
    cited_dict = {}
    if bib_data["type"] == "monograph":
        # Monograph: one bibliography for the whole book, rendered from
        # the pre-existing bibliography/bibliography_all.tei file.
        bibl_info = bib2html.get_bibl_info( xml_tree )
        logging.debug( f"citekeys: {bibl_info['citekeys']}" )
        citations_filename_tei = \
            (INPUT_DIR / "bibliography/bibliography_all") . with_suffix(".tei")
        citations_filename_html = (TEMP_DIR / "formatted_citations_monograph") . with_suffix(".html")
        bib2html.teibib_to_eoa1(
            citations_filename_tei,
            output_file = citations_filename_html
        )
        logging.info("Formatting citations now.")
        # citekey -> (authoryear, year, title)
        cited_dict = format_citations(
            bibl_info['citekeys'],
            citations_filename_html
        )
        refs_for_bib_chapter = format_reference_list(bibl_info['citekeys'], citations_filename_html)
    elif bib_data["type"] == "anthology":
        # Anthology: one bibliography per chapter, keyed in
        # formatted_references_dict as "dict_<chapter id>".
        bibl_info = bib2html.get_bibl_info( xml_tree )
        citations_per_chapter = bib2html.get_citations_per_chapter(xml_tree)
        formatted_references_dict = {}
        all_chapter_ids = xml_tree.xpath("//t:div[@type='chapter']/@xml:id", namespaces=NS_MAP)
        for chapter_id in all_chapter_ids:
            used_citekeys_per_chapter = citations_per_chapter[chapter_id]
            logging.debug(f"{len(used_citekeys_per_chapter)} citations in this chapter")
            if not used_citekeys_per_chapter:
                logging.debug("No citations found, advancing to next chapter.")
                continue
            else:
                citations_filename_root = Path(TEMP_DIR, f"formatted_citations_{chapter_id}")
                if args.no_bib4ht:
                    # citations_filename_html_per_chapter = citations_filename_root.with_suffix(".html")
                    logging.info("Skipping creation of HTML bibliography files. Using the existing ones.")
                else:
                    # The .tei intermediate is only regenerated when absent.
                    citations_filename_tei_per_chapter = citations_filename_root . with_suffix(".tei")
                    if not citations_filename_tei_per_chapter.is_file():
                        translations = {"de" : "german", "en" : "english", "it" : "italian", "fr" : "french"}
                        bib2html.bib2tei(
                            bib_file = INPUT_DIR / bib_data["source"],
                            citekeys = used_citekeys_per_chapter,
                            language = translations[publication_language],
                            temp_dir = TEMP_DIR,
                            output_file = citations_filename_tei_per_chapter,
                            log_dir = LOG_DIR,
                            keywords = [""]
                        )
                    citations_filename_html_per_chapter = citations_filename_root . with_suffix(".html")
                    bib2html.teibib_to_eoa1(
                        citations_filename_tei_per_chapter,
                        output_file = citations_filename_html_per_chapter
                    )
                logging.info("Formatting citations now.")
                # citekey -> (authoryear, year, title)
                # NOTE(review): under --no-bib4ht only the commented-out line
                # above would assign citations_filename_html_per_chapter, so
                # the next call looks like it would raise NameError in that
                # mode -- confirm intended behaviour.
                cited_dict_per_chapter = format_citations(used_citekeys_per_chapter, citations_filename_html_per_chapter)
                # Merge dictionaries
                cited_dict = {**cited_dict, **cited_dict_per_chapter}
                refs_for_bib_chapter = format_reference_list(used_citekeys_per_chapter, citations_filename_html_per_chapter)
                tmp_dict_key = "dict_" + chapter_id
                # create a dictionary entry containing the formatted references
                formatted_references_dict[tmp_dict_key] = refs_for_bib_chapter
                logging.debug(f"cited_dict now has {libeoaconvert.plural(len(cited_dict), 'entry', plural='entries')}.")
    else:
        raise( Exception("unknown publication type!"))
    logging.debug( cited_dict )
    tei_body = xml_tree.xpath("//t:body", namespaces=NS_MAP)[0]
    # Hyperimage publications ship an extra figure map which is copied
    # next to the output and handed on to transform_body.
    hyperimage_present = detect_hyperimage(tei_body)
    if hyperimage_present:
        logging.info("Transforming body with Hyperimage support")
        shutil.copy( INPUT_DIR / "hyperimage/hi_figures.xml",
                     OUTPUT_DIR / "hi_figures.xml")
        HI_XML_FILE = OUTPUT_DIR / "hi_figures.xml"
    else:
        HI_XML_FILE = None
    body_transformed_tmp = transform_body(
        tei_body,
        cited_dict,
        TRANSLATION_FILE,
        TEMPLATE_PATH,
        HI_XML_FILE,
        args.eoa_classic,
        publang=publication_language,
        temp_dir = TEMP_DIR,
        output_dir = OUTPUT_DIR,
        hyperimage=hyperimage_present
    )
    libeoaconvert.debug_xml_here(body_transformed_tmp, "body_transformed", DEBUG_DIR)
    body_transformed = etree.ElementTree(body_transformed_tmp)
    # Attach the formatted bibliography produced above.
    if bib_data["type"] == "monograph":
        xml_add_bib = add_bibliography_monograph(body_transformed, refs_for_bib_chapter)
    elif bib_data["type"] == "anthology":
        xml_add_bib = add_bibliography_anthology(body_transformed, formatted_references_dict)
    # Drop helper elements introduced during the transformation.
    etree.strip_tags(xml_add_bib, "tagtobestripped")
    etree.strip_elements(xml_add_bib, "elementtoberemoved")
    libeoaconvert.debug_xml_here(xml_add_bib, "xml_add_bib", DEBUG_DIR)
    # Assign consecutive uidNNNN ids, top to bottom, to all referencable
    # elements (cf. the to-do note at the top of the file).
    elements_with_ids = xml_add_bib.xpath("//div1 | //div2 | //div3 | //note | //item | //table | //EOAfigure/anchor | //EOAequation | //formula | //theorem")
    element_counter = 1
    for element in elements_with_ids:
        element.set("id", "uid" + '{:04d}'.format(element_counter))
        element_counter += 1
    assigned_ids, data_to_pickle = assign_ids(xml_add_bib, data, args.suppress_chapter_number)
    updated_xml_tree = update_ids(assigned_ids, args.ignore_ref_errors)
    # libeoaconvert.debug_xml_here(updated_xml_tree, "updated_tree")
    # nearly_final_tree = etree.ElementTree(updated_xml_tree)
    # xml_root = nearly_final_tree.getroot()
    # Ensure we have an ElementTree (update_ids may return an element).
    if isinstance(updated_xml_tree, etree._ElementTree):
        pass
    else:
        updated_xml_tree = etree.ElementTree(updated_xml_tree)
    xml_root = updated_xml_tree.getroot()
    xml_root.tag = "Book"
    final_tree = updated_xml_tree
    # objectify.deannotate(final_tree, cleanup_namespaces=True)
    # etree.cleanup_namespaces(xml_root)
    # OUTPUT_DIR / 'tmp_files' is TEMP_DIR, created above.
    with open(OUTPUT_DIR / 'tmp_files/data.pickle', 'wb') as f:
        # Pickle the 'data' dictionary using the highest protocol available.
        pickle.dump(data_to_pickle, f, pickle.HIGHEST_PROTOCOL)
    publication_info = get_publication_info(xml_tree, TRANSLATION_FILE)
    config_data = make_publication_cfg(publication_info, TRANSLATION_FILE)
    output_filename = OUTPUT_DIR / "publication.cfg"
    with open(output_filename, 'w') as configfile:
        config_data.write(configfile)
    logging.info(f"Wrote {output_filename}.")
    output_filename = str(OUTPUT_DIR / "IntermediateXMLFile.xml")
    final_tree.write(output_filename, pretty_print=True, xml_declaration=True, encoding="utf-8")
    logging.info(f"Wrote {output_filename}.")
    # Remove namespace info (brute force solution): rewrite the file with
    # the TEI default-namespace declaration string deleted.
    bad_ns_string = ' xmlns="http://www.tei-c.org/ns/1.0"'
    with open(output_filename, 'r') as textfile:
        xml_as_string = textfile.read()
    removed_namespace = xml_as_string.replace(bad_ns_string, "")
    with open(output_filename, 'w') as amended_textfile:
        amended_textfile.write(removed_namespace)
# def main ends here
# Script entry point: only run the conversion when executed directly,
# not when imported as a module.
if __name__ == '__main__':
    # run main:
    main()
# finis