Permalink
Switch branches/tags
Nothing to show
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
executable file 431 lines (335 sloc) 17.1 KB
#!/usr/bin/env python3
# -*- coding: utf-8; mode: python -*-
"""Unfinished program to convert a customized DocBook XML to TEI XML.
This program creates a TEI XML version out of a DocBook XML file.
"""
import sys
import configparser
from lxml import etree
# citations need a little more work: especially citedRange
# so do landscape figures, no way to distinguish them!
# namespaces
# Namespace URI of TEI P5 and its Clark-notation prefix ("{uri}") for lxml tags.
TEI_NS = "http://www.tei-c.org/ns/1.0"
TEI = "{%s}" % TEI_NS
NS_MAP = {
    None: TEI_NS,
    # The "xml" prefix is reserved: it may only ever be bound to this exact
    # URI.  Binding it to anything else produces a namespace-invalid document
    # and puts xml:id attributes into the wrong namespace.
    "xml": "http://www.w3.org/XML/1998/namespace",
    "tmp": "tmp",
    "re": "http://exslt.org/regular-expressions"}
# Publication metadata; ConfigParser.read() silently skips a missing file.
PUB_CONFIG = configparser.ConfigParser()
PUB_CONFIG.read("publication.cfg")
# Clark-notation name of the xml:id attribute.
id_attr = "{{{ns}}}id".format(ns=NS_MAP["xml"])
def populate_front_part(CONFIG):
    """Collect the boilerplate elements for the front matter.

    At present only the ``Dedication`` entry of the ``General`` section is
    considered: a non-empty value becomes a TEI ``div`` of type
    ``dedication`` holding a single ``p``.  Returns the list of created
    elements, which is empty when there is no dedication.
    """
    if len(CONFIG['General']['Dedication']) == 0:
        return []

    dedication_div = etree.Element(TEI + "div", type="dedication")
    paragraph = etree.SubElement(dedication_div, TEI + "p")
    paragraph.text = CONFIG['General']['Dedication']
    return [dedication_div]
# def populate_front_part ends here
def check_index_formatting(entry, xml_element):
    """Check if index string contains formatting directions

    An entry of the form ``sortkey@displayed text`` stores ``sortkey`` in the
    ``sortKey`` attribute of *xml_element* and uses the remainder as its
    text.  Any other entry (no ``@``, or ``@`` as the very first character)
    becomes the element text unchanged.

    Only the first ``@`` separates key and text, so a displayed text that
    itself contains ``@`` no longer raises ``ValueError``.

    Args:
        entry: the raw index string.
        xml_element: element offering ``set()`` and a ``text`` attribute.

    Returns:
        The same *xml_element*, modified in place.
    """
    sort_key, separator, formatted_string = entry.partition("@")
    # An empty sort key (entry starting with "@") is treated as plain text.
    if separator and sort_key:
        xml_element.set("sortKey", sort_key)
        xml_element.text = formatted_string
    else:
        xml_element.text = entry
    return xml_element
# def check_index_formatting ends here
def fix_xml_id(id_string):
    """Return *id_string* with characters that an NCName forbids replaced.

    So far only the colon is handled; it is turned into an underscore.
    """
    # (forbidden character, substitute) pairs, applied one after another
    substitutions = ((":", "_"),)
    for forbidden, substitute in substitutions:
        id_string = id_string.replace(forbidden, substitute)
    return id_string
# def fix_xml_id ends here
def fix_equations(tex_equation):
    """Remove surrounding TeX code from formulas.

    Every occurrence of ``\\begin{equation*}`` and ``\\end{equation*}`` is
    deleted from *tex_equation*; the rest of the string is returned
    untouched.  Nothing is written to stdout (a leftover debug ``print`` was
    removed).
    """
    superfluous_code = [r"\end{equation*}", r"\begin{equation*}"]
    for code in superfluous_code:
        tex_equation = tex_equation.replace(code, "")
    return tex_equation
# def fix_equations ends here
def create_tei_header(CONFIG):
    """Based on publication.cfg, create the header for a TEI file.

    Builds a ``teiHeader`` element with ``fileDesc``, ``profileDesc`` and
    ``encodingDesc`` children.  All elements, including ``measure``, are
    created in the TEI namespace.

    Args:
        CONFIG: mapping of sections (``Technical``, ``General``,
            ``Authors``) to option mappings, e.g. a ``ConfigParser``.
            Options ``Author1``..``Author5`` and ``Keyword1``..``Keyword6``
            must exist; empty values are skipped.

    Returns:
        The ``teiHeader`` element.

    Raises:
        KeyError: if a required section or option is missing, or if
            ``Technical/Language`` is neither ``de`` nor ``en``.
    """
    def add(parent, tag, text=None, **attributes):
        """Append a TEI-namespaced child to *parent*, optionally with text."""
        child = etree.SubElement(parent, TEI + tag, **attributes)
        child.text = text
        return child

    general = CONFIG['General']
    technical = CONFIG['Technical']

    header = etree.Element(TEI + "teiHeader")

    # --- fileDesc: titles, authors and editors --------------------------
    filedesc = add(header, "fileDesc")
    titlestmt = add(filedesc, "titleStmt")
    add(titlestmt, "title", technical['Serie'], level="s", n=technical['Number'])
    add(titlestmt, "title", technical['Title'], type="main", level="m")
    add(titlestmt, "title", technical['Subtitle'], type="sub", level="m")
    for number in range(1, 6):
        author_name = CONFIG['Authors']["Author%s" % number]
        if len(author_name) > 0:
            add(titlestmt, "author", author_name)
    # the publication manager is hardcoded
    add(titlestmt, "editor", "Lindy Divarci", role="publicationmanager")
    # option name in the General section -> value of the role attribute
    other_roles = {
        "Submitter": "submitter",
        "EditorialCoordination": "editorialcoordinator",
        "Copyediting": "copyeditor",
        "Translator": "translator",
    }
    for option, role in other_roles.items():
        person = general[option]
        if len(person) > 0:
            add(titlestmt, "editor", person, role=role)

    # --- fileDesc: extent -----------------------------------------------
    extent = add(filedesc, "extent")
    # numpages = etree.SubElement(extent, "measure", unit="pages", quantity=)
    add(extent, "measure", unit="EUR", quantity=technical['Price'])

    # --- fileDesc: publication statement --------------------------------
    pub_statement = add(filedesc, "publicationStmt")
    add(pub_statement, "publisher", "Edition Open Access")
    add(pub_statement, "distributor", technical['Shoplink'])
    add(pub_statement, "date", technical['PublicationDate'],
        when=technical['PublicationDate'])
    add(pub_statement, "idno", technical['ISBN'], type="ISBN")
    add(pub_statement, "idno", technical['Shoplink'], type="shoplink")
    availability = add(pub_statement, "availability")
    licence = add(
        availability, "licence",
        target="https://creativecommons.org/licenses/" + technical['License'] + "/3.0/de/deed.en")
    # following string should not be hardcoded
    add(licence, "p", "Distributed under the Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Germany License.")

    # --- fileDesc: source description -----------------------------------
    sourcedesc = add(filedesc, "sourceDesc")
    add(sourcedesc, "p", "This is a born digital document from Edition Open Access.")

    # --- profileDesc: abstracts, keywords, language ---------------------
    profiledesc = add(header, "profileDesc")
    shortabstract = add(profiledesc, "abstract", n="brief")
    add(shortabstract, "p", general['BriefDescription'])
    longabstract = add(profiledesc, "abstract", n="detail")
    add(longabstract, "p", general['DetailedDescription'])
    if len(general['AdditionalInformation']) > 0:
        additionalabstract = add(profiledesc, "abstract", n="additional")
        add(additionalabstract, "p", general['AdditionalInformation'])

    textclass = add(profiledesc, "textClass")
    keywords = add(textclass, "keywords")
    keyword_list = add(keywords, "list")
    for number in range(1, 7):
        keyword_text = general["Keyword%s" % number]
        if len(keyword_text) > 0:
            add(keyword_list, "item", keyword_text)

    language_dictionary = {"de" : "Deutsch", "en" : "Englisch"}
    language_usage = add(profiledesc, "langUsage")
    add(language_usage, "language", language_dictionary[technical['Language']],
        ident=technical['Language'])

    # --- encodingDesc: CSS renditions referenced from the text ----------
    encodingdesc = add(header, "encodingDesc")
    tag_declaration = add(encodingdesc, "tagsDecl")
    renditions = (
        ("struck", "text-decoration: line-through;"),
        ("spaced", "letter-spacing:0.3em"),
        ("smallcaps", "font-variant:small-caps"),
    )
    for rendition_id, css_rule in renditions:
        rendition = add(tag_declaration, "rendition", css_rule, scheme="css")
        rendition.set(id_attr, rendition_id)

    return header
# def create_tei_header ends here
def transform_intermediate_xml(xml_tree):
    """Perform a transformation of existing XML structure

    Rewrites the given tree in place so that the EOA-specific elements of
    the intermediate XML carry TEI element and attribute names, and returns
    that same tree.  The root element is renamed to ``body``.  None of the
    elements touched here is put into the TEI namespace.

    Args:
        xml_tree: lxml element tree of the intermediate XML; the absolute
            XPath expressions below expect a ``Book`` root element.

    Returns:
        The modified *xml_tree*.

    Raises:
        IndexError: if ``/Book/p/EOAbibliographytype`` or
            ``/Book/p/EOAbibliographydatabase`` yields no text node.
        SystemExit: (status 1) if an index entry has more than two levels
            of subentries.
    """
    # Not used further down yet.  The [0] lookups also make the conversion
    # fail early when the info paragraphs are missing.
    bibliography_type = xml_tree.xpath("/Book/p/EOAbibliographytype/text()")[0]
    bibliographydatabase = xml_tree.xpath("/Book/p/EOAbibliographydatabase/text()")[0]

    # --- clean up: info paragraphs, table of contents, root element -----
    for paragraph in xml_tree.xpath("/Book/p"):
        paragraph.getparent().remove(paragraph)
    etree.strip_elements(xml_tree, "tableofcontents")
    root_element = xml_tree.getroot()
    root_element.tag = "body"
    etree.strip_attributes(root_element, "A", "B", "part", "id-text", "id")
    etree.strip_tags(root_element, "allowbreak")

    # --- structural divisions -------------------------------------------
    # (the expression must not end in "|", that is an XPath syntax error)
    divdict = {"div1" : "chapter", "div2" : "section", "div3" : "subsection", "div4" : "subsubsection"}
    for div in xml_tree.xpath("//div1|//div2|//div3|//div4"):
        div.set("type", divdict[div.tag])
        div.tag = "div"

    # --- inline highlighting --------------------------------------------
    for element in xml_tree.xpath("//hi[@rend='it']"):
        element.set("rend", "italic")

    for element in xml_tree.xpath("//EOAup|//EOAdown"):
        if element.tag == "EOAup":
            element.set("rend", "superscript")
        else:
            element.set("rend", "subscript")
        element.tag = "hi"

    # --- inline images: the text content becomes the url attribute ------
    for element in xml_tree.xpath("//EOAinline"):
        element.tag = "graphic"
        element.set("url", element.text)
        # is there a better way?
        element.text = ""

    # --- footnotes ------------------------------------------------------
    for element in xml_tree.xpath("//note[@place='Inline']"):
        element.set("place", "bottom")

    # --- external links -------------------------------------------------
    for element in xml_tree.xpath("//xref"):
        element.tag = "ref"
        element.set("type", "url")
        element.set("target", element.get("url"))
        etree.strip_attributes(element, "url")

    # --- citations ------------------------------------------------------
    for element in xml_tree.xpath("//span[@class='citation']|//EOAcitenumeric"):
        # how to determine the cite pages?
        citedrange = ""
        if element.tag == "span":
            citekey = element.get("citekey")
        else:
            # EOAcitenumeric keeps key and cited range in child elements
            citekey = element.find("citekey").text
            citetext = element.find("citetext").text
            element.set("rend", "numeric")
            citedrange = citetext or ""
        element.tag = "bibl"
        etree.SubElement(element, "ref", target="#" + citekey)
        if len(citedrange) > 0:
            # leaving unit attribute out for now and assume that page is default range
            cited_range = etree.SubElement(element, "citedRange")#, unit="page")
            cited_range.text = citedrange
        element.text = ""
        etree.strip_attributes(element, "rel", "class", "citekey", "data-toggle", "html", "data-placement", "data-content", "data-title")
        etree.strip_elements(element, "citekey", "citetext")

    # --- internal cross references --------------------------------------
    for element in xml_tree.xpath("//EOApageref|//EOAref"):
        if element.tag == "EOApageref":
            element.set("type", "page")
        element.tag = "ref"
        label = element.find("Label")
        element.set("target", "#" + label.text)
        etree.strip_elements(element, "ref", "Label")

    # --- quotations -----------------------------------------------------
    for element in xml_tree.xpath("//p[@rend='quoted']"):
        element.tag = "quote"
        etree.strip_attributes(element, "rend")

    # --- lists ----------------------------------------------------------
    for element in xml_tree.xpath("//list[@type='ordered' or @type='simple' or @type='description']"):
        if element.get("type") == "simple":
            element.set("type", "bulleted")
        elif element.get("type") == "description":
            element.set("type", "gloss")
        for item in element.getchildren():
            etree.strip_attributes(item, "label")
            etree.strip_tags(item, "p")

    # code if we need to treat the description list differently for any reason
    # description_lists = xml_tree.xpath("//list[@type='description']")
    # for element in description_lists:
    #     element.set("type", "gloss")
    #     labels = element.findall("label")
    #     for label in labels:
    #         corresponding_item = label.getnext()
    #         etree.strip_tags(corresponding_item, "p")

    # --- index entries --------------------------------------------------
    index_names = {"EOAindex" : "keyword", "EOAindexperson" : "Person", "EOAindexlocation" : "Location"}
    for entry in xml_tree.xpath("//EOAindex|//EOAindexperson|//EOAindexlocation"):
        index_type = entry.tag
        index_text = entry.text
        entry.tag = "index"
        entry.text = ""
        entry.set("indexName", index_names[index_type])
        # "!" separates the levels; split() keeps them in document order
        subentries = index_text.split("!")
        if len(subentries) >= 4:
            print("Error: more than two levels of subentries are disallowed!\n", index_text)
            # non-zero status so that calling scripts notice the failure
            sys.exit(1)
        # Level one puts its term directly into the entry; every further
        # level gets its own nested index element holding the term.
        parent = entry
        for level, subentry in enumerate(subentries):
            if level > 0:
                parent = etree.SubElement(parent, "index")
            check_index_formatting(subentry, etree.SubElement(parent, "term"))

    # --- tables ---------------------------------------------------------
    for table in xml_tree.xpath("//EOAtable"):
        tab_label = table.find("EOAtablelabel")
        tab_caption = table.find("EOAtablecaption")
        rows = table.find("table").findall("row")
        # debatable, as there might be colspan!
        columns = rows[0].findall("cell")
        table.tag = "table"
        table.set("rows", str(len(rows)))
        table.set("cols", str(len(columns)))
        table.set(id_attr, fix_xml_id(tab_label.text))
        table_header = etree.Element("head")
        table_header.text = tab_caption.text
        table.insert(0, table_header)
        for row in rows:
            header = row.xpath(".//tableheader")
            if len(header) > 0:
                header[0].text = ""
                row.set("role", "label")
            else:
                row.set("role", "data")
            # NOTE(review): every cell is tagged "data", also in label rows
            # -- confirm that this is intended.
            for cell in row.findall("cell"):
                cell.set("role", "data")
        etree.strip_elements(table, "EOAtablelabel", "EOAtablecaption", "EOAtablecolumns")
        # unwrap the inner table so that the rows sit directly in the outer one
        etree.strip_tags(table, "table", "tableheader")

    # --- figures --------------------------------------------------------
    for figure in xml_tree.xpath("//EOAfigure|//EOAfigurenonumber"):
        if figure.tag == "EOAfigurenonumber":
            figure.set("rend", "nonumber")
        else:
            # the caption becomes the head and is moved to the end of the figure
            image_caption = figure.find(".//caption")
            image_caption.tag = "head"
            figure.append(image_caption)
        figure.tag = "figure"
        # keep the original width as an XML comment inside the figure
        figure_size = figure.find(".//width")
        figure.append(etree.Comment("width is " + figure_size.text))
        image_path = figure.find(".//file").text
        etree.SubElement(figure, "graphic", url=image_path)
        etree.strip_elements(figure, "anchor", "p")

    # --- equations ------------------------------------------------------
    for element in xml_tree.xpath("//EOAineq|//EOAequation|//EOAequationnonumber"):
        if element.tag == "EOAineq":
            element.set("rend", "inline")
        elif element.tag == "EOAequationnonumber":
            element.set("rend", "block nonumber")
        else:
            element.set("rend", "block")
        element.set("notation", "TeX")
        # the TeX source is taken from the attribute and becomes the text
        element.text = fix_equations(element.get("TeX"))
        element.tag = "formula"
        etree.strip_attributes(element, "TeX", "src", "filename", "number", "uid")
        etree.strip_elements(element, "math", "formula", "anchor")

    return xml_tree
# def transform_intermediate_xml ends here
def main():
    """The main bit

    Reads ``tmp_files/IntermediateXMLFile.xml``, builds a TEI document with
    header, front, body and back parts, and writes it to
    ``CONVERT/TEI.xml``.  The file is always written as UTF-8 so that its
    bytes match the encoding named in the XML declaration, regardless of
    the platform's default encoding.
    """
    # CONFIG['Authors']['Zusatz']:
    # create document structure
    tei_root = etree.Element(TEI + "TEI", nsmap=NS_MAP)
    tei_root.append(create_tei_header(PUB_CONFIG))
    tei_body = etree.SubElement(tei_root, "text")

    intermediate_xml_tree = etree.parse("tmp_files/IntermediateXMLFile.xml")
    tei_body_xml = transform_intermediate_xml(intermediate_xml_tree)

    front_part = etree.SubElement(tei_body, "front")
    for part in populate_front_part(PUB_CONFIG):
        front_part.append(part)
    etree.SubElement(tei_body, "back")
    # the body goes between front (index 0) and back
    tei_body.insert(1, tei_body_xml.getroot())

    outfile = 'CONVERT/TEI.xml'
    # the doctype argument is used to emit two processing instructions
    # between the XML declaration and the root element
    output_string = etree.tostring(tei_root, xml_declaration=True, pretty_print=True, encoding="UTF-8", doctype= '<?xml-model href="eoa_tei.rnc" type="application/relax-ng-compact-syntax"?>\n<?xml-stylesheet type="text/css" href="tei.css" ?>')
    with open(outfile, 'w', encoding="utf-8") as output_file:
        output_file.write(output_string.decode("utf-8"))
# def main ends here
# Run the conversion only when this file is executed as a script.
if __name__ == '__main__':
    main()
# finis