Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
EOASkripts/imxml2tei.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
executable file
430 lines (335 sloc)
17.1 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# -*- coding: utf-8; mode: python -*- | |
"""Unfinished program to convert a customized DocBook XML to TEI XML. | |
This program creates a TEI XML version out of a DocBook XML file. | |
""" | |
import sys | |
import configparser | |
from lxml import etree | |
# citations need a little more work: especially citedRange | |
# so do landscape figures, no way to distinguish them! | |
# namespaces | |
# Namespace of all TEI elements, plus its "{uri}" Clark-notation prefix
# used when building element names for lxml.
TEI_NS = "http://www.tei-c.org/ns/1.0"
TEI = "{%s}" % TEI_NS

# The "xml" prefix is reserved by the XML Namespaces recommendation and is
# permanently bound to this URI; any other URI would not yield a real
# xml:id attribute.
XML_NS = "http://www.w3.org/XML/1998/namespace"

NS_MAP = {
    None: TEI_NS,
    "xml": XML_NS,
    "tmp": "tmp",
    "re": "http://exslt.org/regular-expressions",
}

# Publication metadata; ConfigParser.read() silently skips a missing file,
# so a missing publication.cfg only shows up later as a KeyError.
PUB_CONFIG = configparser.ConfigParser()
PUB_CONFIG.read("publication.cfg")

# Fully qualified name of the xml:id attribute.
id_attr = "{{{ns}}}id".format(ns=NS_MAP["xml"])
def populate_front_part(CONFIG):
    """Build the elements that go into the TEI ``front`` part.

    This part consists mainly of boilerplate text. At the moment only the
    dedication is supported.

    Args:
        CONFIG: mapping of sections (e.g. a ``ConfigParser``); the optional
            ``Dedication`` entry of the ``General`` section is read.

    Returns:
        A list of elements; empty when there is no dedication text.
    """
    list_of_elements = []
    # A missing or empty Dedication entry simply means "no dedication".
    dedication_text = CONFIG['General'].get('Dedication', '')
    if dedication_text:
        dedication = etree.Element(TEI + "div", type="dedication")
        etree.SubElement(dedication, TEI + "p").text = dedication_text
        list_of_elements.append(dedication)
    return list_of_elements
# def populate_front_part ends here
def check_index_formatting(entry, xml_element):
    """Check if an index string contains formatting directions.

    An entry of the form ``sortkey@formatted`` is split at the first ``@``:
    the part before it becomes the ``sortKey`` attribute, the remainder the
    element text. Entries without ``@``, or with an empty sort key (``@`` as
    first character), are used verbatim as the element text.

    Args:
        entry: the index string of one index level.
        xml_element: element that receives the text (and ``sortKey``).

    Returns:
        The same ``xml_element``, modified in place.
    """
    # Split only once so that a further "@" inside the formatted part no
    # longer breaks the two-value unpacking.
    sort_key, separator, formatted_string = entry.partition("@")
    if separator and sort_key:
        xml_element.set("sortKey", sort_key)
        xml_element.text = formatted_string
    else:
        xml_element.text = entry
    return xml_element
# def check_index_formatting ends here
def fix_xml_id(id_string):
    """Return ``id_string`` rewritten so that it can serve as an NCName.

    Currently the only rewrite is turning every colon into an underscore.
    """
    ncname_table = str.maketrans({":": "_"})
    return id_string.translate(ncname_table)
# def fix_xml_id ends here
def fix_equations(tex_equation):
    """Remove surrounding TeX code from formulas.

    Every occurrence of the ``equation*`` environment markers is deleted;
    the rest of the string is returned unchanged.

    Args:
        tex_equation: TeX source of one formula.

    Returns:
        The TeX source without ``\\begin{equation*}`` / ``\\end{equation*}``.
    """
    superfluous_code = (r"\end{equation*}", r"\begin{equation*}")
    for code in superfluous_code:
        tex_equation = tex_equation.replace(code, "")
    return tex_equation
# def fix_equations ends here
def _tei_child(parent, tag, text=None, **attributes):
    """Append a TEI-namespaced child ``tag`` to ``parent`` and return it."""
    child = etree.SubElement(parent, TEI + tag, **attributes)
    if text is not None:
        child.text = text
    return child


def create_tei_header(CONFIG):
    """Based on publication.cfg, create the header for a TEI file.

    Args:
        CONFIG: mapping of sections (e.g. a ``ConfigParser``) providing the
            ``Technical``, ``General`` and ``Authors`` sections.

    Returns:
        The populated ``teiHeader`` element.

    Raises:
        KeyError: if a section or entry read below is missing, or if the
            configured language is neither ``de`` nor ``en``.
    """
    technical = CONFIG['Technical']
    general = CONFIG['General']
    authors = CONFIG['Authors']

    header = etree.Element(TEI + "teiHeader")
    filedesc = _tei_child(header, "fileDesc")

    # titleStmt: series and book titles, authors, editors
    titlestmt = _tei_child(filedesc, "titleStmt")
    _tei_child(titlestmt, "title", technical['Serie'],
               level="s", n=technical['Number'])
    _tei_child(titlestmt, "title", technical['Title'],
               type="main", level="m")
    _tei_child(titlestmt, "title", technical['Subtitle'],
               type="sub", level="m")

    # Author1 .. Author5 must exist in the config; empty ones are skipped.
    for number in range(1, 6):
        author = authors["Author%s" % number]
        if author:
            _tei_child(titlestmt, "author", author)

    # NOTE(review): the publication manager is hard-coded here.
    _tei_child(titlestmt, "editor", "Lindy Divarci",
               role="publicationmanager")

    other_roles = {
        "Submitter": "submitter",
        "EditorialCoordination": "editorialcoordinator",
        "Copyediting": "copyeditor",
        "Translator": "translator",
    }
    for config_key, role in other_roles.items():
        person = general[config_key]
        if person:
            _tei_child(titlestmt, "editor", person, role=role)

    # extent: only the price is recorded for now, the page count is unknown.
    # The measure element lives in the TEI namespace like everything else
    # in the header.
    extent = _tei_child(filedesc, "extent")
    _tei_child(extent, "measure", unit="EUR", quantity=technical['Price'])

    # publicationStmt
    pub_statement = _tei_child(filedesc, "publicationStmt")
    _tei_child(pub_statement, "publisher", "Edition Open Access")
    _tei_child(pub_statement, "distributor", technical['Shoplink'])
    _tei_child(pub_statement, "date", technical['PublicationDate'],
               when=technical['PublicationDate'])
    _tei_child(pub_statement, "idno", technical['ISBN'], type="ISBN")
    _tei_child(pub_statement, "idno", technical['Shoplink'], type="shoplink")
    availability = _tei_child(pub_statement, "availability")
    licence = _tei_child(
        availability, "licence",
        target="https://creativecommons.org/licenses/"
        + technical['License'] + "/3.0/de/deed.en")
    # following string should not be hardcoded
    _tei_child(licence, "p", "Distributed under the Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Germany License.")

    # sourceDesc
    sourcedesc = _tei_child(filedesc, "sourceDesc")
    _tei_child(sourcedesc, "p", "This is a born digital document from Edition Open Access.")

    # profileDesc: abstracts, keywords, language
    profiledesc = _tei_child(header, "profileDesc")
    shortabstract = _tei_child(profiledesc, "abstract", n="brief")
    _tei_child(shortabstract, "p", general['BriefDescription'])
    longabstract = _tei_child(profiledesc, "abstract", n="detail")
    _tei_child(longabstract, "p", general['DetailedDescription'])
    if general['AdditionalInformation']:
        additionalabstract = _tei_child(profiledesc, "abstract",
                                        n="additional")
        _tei_child(additionalabstract, "p", general['AdditionalInformation'])

    textclass = _tei_child(profiledesc, "textClass")
    keywords = _tei_child(textclass, "keywords")
    keyword_list = _tei_child(keywords, "list")
    # Keyword1 .. Keyword6 must exist in the config; empty ones are skipped.
    for number in range(1, 7):
        keyword_text = general["Keyword%s" % number]
        if keyword_text:
            _tei_child(keyword_list, "item", keyword_text)

    language_dictionary = {"de": "Deutsch", "en": "Englisch"}
    language_usage = _tei_child(profiledesc, "langUsage")
    _tei_child(language_usage, "language",
               language_dictionary[technical['Language']],
               ident=technical['Language'])

    # encodingDesc: CSS renditions, each identified by an xml:id
    encodingdesc = _tei_child(header, "encodingDesc")
    tag_declaration = _tei_child(encodingdesc, "tagsDecl")
    renditions = (
        ("struck", "text-decoration: line-through;"),
        ("spaced", "letter-spacing:0.3em"),
        ("smallcaps", "font-variant:small-caps"),
    )
    for rendition_id, css in renditions:
        rendition = _tei_child(tag_declaration, "rendition", css,
                               scheme="css")
        rendition.set(id_attr, rendition_id)

    return header
# def create_tei_header ends here
def _restructure_root(xml_tree):
    """Drop bookkeeping nodes, rename the root to ``body``, type the divisions."""
    # The /Book/p paragraphs hold the EOAbibliographytype and
    # EOAbibliographydatabase entries, which this conversion does not use.
    for paragraph in xml_tree.xpath("/Book/p"):
        paragraph.getparent().remove(paragraph)
    etree.strip_elements(xml_tree, "tableofcontents")

    root_element = xml_tree.getroot()
    root_element.tag = "body"
    etree.strip_attributes(root_element, "A", "B", "part", "id-text", "id")
    etree.strip_tags(root_element, "allowbreak")

    divdict = {
        "div1": "chapter",
        "div2": "section",
        "div3": "subsection",
        "div4": "subsubsection",
    }
    # No trailing "|" here: a dangling union operator is an XPath syntax error.
    for div in xml_tree.xpath("//div1|//div2|//div3|//div4"):
        div.set("type", divdict[div.tag])
        div.tag = "div"


def _convert_inline_markup(xml_tree):
    """Convert highlighting, inline images, footnotes and URL references."""
    for element in xml_tree.xpath("//hi[@rend='it']"):
        element.set("rend", "italic")

    for element in xml_tree.xpath("//EOAup|//EOAdown"):
        if element.tag == "EOAup":
            element.set("rend", "superscript")
        else:
            element.set("rend", "subscript")
        element.tag = "hi"

    for element in xml_tree.xpath("//EOAinline"):
        element.tag = "graphic"
        # the image location moves from the element text to the url attribute
        element.set("url", element.text)
        element.text = ""

    for element in xml_tree.xpath("//note[@place='Inline']"):
        element.set("place", "bottom")

    for element in xml_tree.xpath("//xref"):
        element.tag = "ref"
        element.set("type", "url")
        element.set("target", element.get("url"))
        etree.strip_attributes(element, "url")


def _convert_citations(xml_tree):
    """Turn citation spans and numeric citations into ``bibl`` elements."""
    for element in xml_tree.xpath("//span[@class='citation']|//EOAcitenumeric"):
        # how to determine the cite pages?
        citedrange = ""
        if element.tag == "span":
            citekey = element.get("citekey")
        elif element.tag == "EOAcitenumeric":
            citekey = element.find("citekey").text
            citetext = element.find("citetext").text
            element.set("rend", "numeric")
            if citetext is not None:
                citedrange = citetext
        element.tag = "bibl"
        etree.SubElement(element, "ref", target="#" + citekey)
        if citedrange:
            # leaving unit attribute out for now and assume that page is
            # default range
            cited_range = etree.SubElement(element, "citedRange")
            cited_range.text = citedrange
        # NOTE(review): the source indentation was lost; the text is assumed
        # to be cleared for every citation, not only those with a range.
        element.text = ""
        etree.strip_attributes(element, "rel", "class", "citekey", "data-toggle", "html", "data-placement", "data-content", "data-title")
        etree.strip_elements(element, "citekey", "citetext")


def _convert_references(xml_tree):
    """Turn page and plain cross references into ``ref`` elements."""
    for element in xml_tree.xpath("//EOApageref|//EOAref"):
        if element.tag == "EOApageref":
            element.set("type", "page")
        element.tag = "ref"
        label = element.find("Label")
        element.set("target", "#" + label.text)
        etree.strip_elements(element, "ref", "Label")


def _convert_quotes_and_lists(xml_tree):
    """Convert quoted paragraphs to ``quote`` and normalise list types."""
    for element in xml_tree.xpath("//p[@rend='quoted']"):
        element.tag = "quote"
        etree.strip_attributes(element, "rend")

    # Description lists are handled together with the other list types.
    renamed_types = {"simple": "bulleted", "description": "gloss"}
    for element in xml_tree.xpath("//list[@type='ordered' or @type='simple' or @type='description']"):
        list_type = element.get("type")
        if list_type in renamed_types:
            element.set("type", renamed_types[list_type])
        # list(element) replaces the deprecated getchildren()
        for item in list(element):
            etree.strip_attributes(item, "label")
            etree.strip_tags(item, "p")


def _convert_index_entries(xml_tree):
    """Convert index markers into (nested) ``index``/``term`` elements."""
    index_names = {
        "EOAindex": "keyword",
        "EOAindexperson": "Person",
        "EOAindexlocation": "Location",
    }
    for entry in xml_tree.xpath("//EOAindex|//EOAindexperson|//EOAindexlocation"):
        index_type = entry.tag
        # an empty marker yields a single empty term instead of a crash
        index_text = entry.text or ""
        entry.tag = "index"
        entry.text = ""
        entry.set("indexName", index_names[index_type])

        # "!" separates the levels; split() keeps them in source order
        subentries = index_text.split("!")
        if len(subentries) > 3:
            print("Error: more than two levels of subentries are disallowed!\n", index_text)
            # non-zero status so that callers can detect the failure
            sys.exit(1)

        # the first term sits in the entry itself, every further level in
        # an index element nested inside the previous level
        parent = entry
        for level, subentry in enumerate(subentries):
            if level > 0:
                parent = etree.SubElement(parent, "index")
            term_element = etree.SubElement(parent, "term")
            check_index_formatting(subentry, term_element)


def _convert_tables(xml_tree):
    """Flatten EOAtable wrappers into ``table`` elements with a ``head``."""
    for table in xml_tree.xpath("//EOAtable"):
        tab_label = table.find("EOAtablelabel")
        tab_caption = table.find("EOAtablecaption")
        realtable = table.find("table")
        rows = realtable.findall("row")
        # debatable, as there might be colspan!
        columns = rows[0].findall("cell")

        table.tag = "table"
        table.set("rows", str(len(rows)))
        table.set("cols", str(len(columns)))
        table.set(id_attr, fix_xml_id(tab_label.text))

        table_header = etree.Element("head")
        table_header.text = tab_caption.text
        table.insert(0, table_header)

        for row in rows:
            header = row.xpath(".//tableheader")
            if header:
                header[0].text = ""
                row.set("role", "label")
            else:
                row.set("role", "data")
            # NOTE(review): the source indentation was lost; the cells of
            # every row (label rows included) are assumed to get role=data.
            for cell in row.findall("cell"):
                cell.set("role", "data")

        etree.strip_elements(table, "EOAtablelabel", "EOAtablecaption", "EOAtablecolumns")
        # the inner table and the tableheader markers dissolve into the
        # renamed outer element
        etree.strip_tags(table, "table", "tableheader")


def _convert_figures(xml_tree):
    """Convert numbered and unnumbered figures into ``figure`` elements."""
    for figure in xml_tree.xpath("//EOAfigure|//EOAfigurenonumber"):
        if figure.tag == "EOAfigurenonumber":
            figure.set("rend", "nonumber")
        else:
            image_caption = figure.find(".//caption")
            image_caption.tag = "head"
            figure.append(image_caption)
        figure.tag = "figure"

        # record the width as an XML comment inside the figure; a bare
        # etree.Comment(...) call without append would simply be discarded
        figure_size = figure.find(".//width")
        figure.append(etree.Comment("width is " + figure_size.text))

        image_path = figure.find(".//file").text
        etree.SubElement(figure, "graphic", url=image_path)
        etree.strip_elements(figure, "anchor", "p")


def _convert_equations(xml_tree):
    """Convert inline and block equations into TeX ``formula`` elements."""
    for element in xml_tree.xpath("//EOAineq|//EOAequation|//EOAequationnonumber"):
        if element.tag == "EOAineq":
            element.set("rend", "inline")
        elif element.tag == "EOAequationnonumber":
            element.set("rend", "block nonumber")
        else:
            element.set("rend", "block")
        element.set("notation", "TeX")
        element.text = fix_equations(element.get("TeX"))
        element.tag = "formula"
        etree.strip_attributes(element, "TeX", "src", "filename", "number", "uid")
        etree.strip_elements(element, "math", "formula", "anchor")


def transform_intermediate_xml(xml_tree):
    """Perform a transformation of existing XML structure.

    The tree is modified in place: the ``Book`` root becomes ``body`` and
    the EOA-specific elements are rewritten to TEI element names. The
    steps run in a fixed order because later ones strip elements that
    earlier ones still read.

    Args:
        xml_tree: parsed intermediate XML (an lxml ``ElementTree``).

    Returns:
        The same ``xml_tree`` after transformation.

    Raises:
        SystemExit: with status 1 if an index entry has more than two
            levels of subentries.
    """
    _restructure_root(xml_tree)
    _convert_inline_markup(xml_tree)
    _convert_citations(xml_tree)
    _convert_references(xml_tree)
    _convert_quotes_and_lists(xml_tree)
    _convert_index_entries(xml_tree)
    _convert_tables(xml_tree)
    _convert_figures(xml_tree)
    _convert_equations(xml_tree)
    return xml_tree
# def transform_intermediate_xml ends here
def main():
    """Build the TEI document and write it to ``CONVERT/TEI.xml``.

    The header comes from the publication config, the body from
    ``IntermediateXMLFile.xml`` in the current directory. The ``CONVERT``
    directory must already exist.
    """
    # CONFIG['Authors']['Zusatz']:
    # create document structure
    tei_root = etree.Element(TEI + "TEI", nsmap=NS_MAP)
    tei_root.append(create_tei_header(PUB_CONFIG))
    tei_text = etree.SubElement(tei_root, "text")

    intermediate_xml_tree = etree.parse("IntermediateXMLFile.xml")
    tei_body_xml = transform_intermediate_xml(intermediate_xml_tree)

    front_part = etree.SubElement(tei_text, "front")
    for part in populate_front_part(PUB_CONFIG):
        front_part.append(part)
    etree.SubElement(tei_text, "back")
    # position 1 places the body between front and back
    tei_text.insert(1, tei_body_xml.getroot())

    outfile = 'CONVERT/TEI.xml'
    # the doctype argument is used to emit the processing instructions
    # ahead of the root element
    output_string = etree.tostring(tei_root, xml_declaration=True, pretty_print=True, encoding="UTF-8", doctype='<?xml-model href="eoa_tei.rnc" type="application/relax-ng-compact-syntax"?>\n<?xml-stylesheet type="text/css" href="tei.css" ?>')
    # Write with an explicit encoding so the bytes on disk match the UTF-8
    # XML declaration regardless of the locale's default encoding.
    with open(outfile, 'w', encoding="utf-8") as output_file:
        output_file.write(output_string.decode("utf-8"))
# def main ends here


if __name__ == '__main__':
    main()
# finis |