Skip to content
Permalink
0c7f9d5809
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
executable file 430 lines (335 sloc) 17.1 KB
#!/usr/bin/env python3
# -*- coding: utf-8; mode: python -*-
"""Unfinished program to convert a customized DocBook XML to TEI XML.
This program creates a TEI XML version out of a DocBook XML file.
"""
import sys
import configparser
from lxml import etree
# citations need a little more work: especially citedRange
# so do landscape figures, no way to distinguish them!
# namespaces
TEI_NS = "http://www.tei-c.org/ns/1.0"
# Clark-notation prefix, e.g. TEI + "div" == "{http://www.tei-c.org/ns/1.0}div"
TEI = "{%s}" % TEI_NS
NS_MAP = {
    None: TEI_NS,
    # The "xml" prefix is reserved: it is permanently bound to this URI and
    # may not be bound to any other one, otherwise xml:id is not xml:id.
    "xml": "http://www.w3.org/XML/1998/namespace",
    "tmp": "tmp",
    "re": "http://exslt.org/regular-expressions"}

# Publication metadata, read once at import time from the working directory.
PUB_CONFIG = configparser.ConfigParser()
PUB_CONFIG.read("publication.cfg")

# Fully qualified name of the xml:id attribute
id_attr = "{{{ns}}}id".format(ns=NS_MAP["xml"])
def populate_front_part(CONFIG):
    """Collect the elements that make up the front matter.

    At present the only element produced is a dedication: a TEI ``div`` of
    type ``dedication`` holding one ``p`` with the text of the
    ``Dedication`` entry in the ``General`` section of CONFIG.

    Returns a list of elements; the list is empty when the dedication
    entry is empty.
    """
    front_elements = []

    dedication_string = CONFIG['General']['Dedication']
    if len(dedication_string) == 0:
        # nothing to dedicate, nothing to add
        return front_elements

    wrapper = etree.Element(TEI + "div", type="dedication")
    paragraph = etree.SubElement(wrapper, TEI + "p")
    paragraph.text = dedication_string
    front_elements.append(wrapper)

    return front_elements
# def populate_front_part ends here
def check_index_formatting(entry, xml_element):
    """Check if index string contains formatting directions.

    An entry of the form ``sortkey@formatted text`` is split at the first
    ``@``: the part before it is stored in the ``sortKey`` attribute of
    xml_element and the rest becomes the element's text.  Any further ``@``
    characters are kept as part of the text.  An entry without ``@``, or one
    that starts with ``@`` (empty sort key), is used as text unchanged.

    Returns xml_element, which is modified in place.
    """
    sort_key, separator, formatted_string = entry.partition("@")
    if separator and sort_key:
        xml_element.set("sortKey", sort_key)
        xml_element.text = formatted_string
    else:
        xml_element.text = entry
    return xml_element
# def check_index_formatting ends here
def fix_xml_id(id_string):
    """Turn a label into a string usable as an XML id.

    Every character listed in the substitution table is replaced by its
    counterpart; at present only the colon is replaced (by an underscore).
    Returns the resulting string.
    """
    substitutions = (
        (":", "_"),
    )
    for unwanted, substitute in substitutions:
        id_string = id_string.replace(unwanted, substitute)
    return id_string
# def fix_xml_id ends here
def fix_equations(tex_equation):
    """Remove surrounding TeX code from formulas.

    Deletes every occurrence of the ``equation*`` environment delimiters
    from tex_equation and returns what is left.  Nothing else (including
    whitespace) is altered.
    """
    superfluous_code = [r"\end{equation*}", r"\begin{equation*}"]
    for code in superfluous_code:
        tex_equation = tex_equation.replace(code, "")
    return tex_equation
# def fix_equations ends here
def create_tei_header(CONFIG):
    """Based on publication.cfg, create the header for a TEI file.

    CONFIG is a mapping of sections as produced by configparser; the
    sections ``Technical``, ``General`` and ``Authors`` are read.

    The returned ``teiHeader`` element contains
      * ``fileDesc`` with title statement (series, title, subtitle, authors,
        editors), extent (price), publication statement and source
        description,
      * ``profileDesc`` with abstracts, keywords and language,
      * ``encodingDesc`` with the CSS renditions ``struck``, ``spaced`` and
        ``smallcaps``.

    Raises KeyError when a section or entry is missing from CONFIG, or when
    the configured language is neither ``de`` nor ``en``.
    """
    header = etree.Element(TEI + "teiHeader")

    # --- fileDesc / titleStmt ---------------------------------------------
    filedesc = etree.SubElement(header, TEI + "fileDesc")
    titlestmt = etree.SubElement(filedesc, TEI + "titleStmt")
    etree.SubElement(titlestmt, TEI + "title", level="s", n=CONFIG['Technical']['Number']).text = CONFIG['Technical']['Serie']
    etree.SubElement(titlestmt, TEI + "title", type="main", level="m").text = CONFIG['Technical']['Title']
    etree.SubElement(titlestmt, TEI + "title", type="sub", level="m").text = CONFIG['Technical']['Subtitle']

    # Author1 ... Author5; empty entries are skipped
    for person in range(1, 6):
        author_name = CONFIG['Authors']["Author%s" % person]
        if author_name:
            etree.SubElement(titlestmt, TEI + "author").text = author_name

    etree.SubElement(titlestmt, TEI + "editor", role="publicationmanager").text = "Lindy Divarci"

    # config entry -> value of the role attribute
    other_roles = {
        "Submitter": "submitter",
        "EditorialCoordination": "editorialcoordinator",
        "Copyediting": "copyeditor",
        "Translator": "translator",
    }
    for config_key, role_name in other_roles.items():
        role_holder = CONFIG['General'][config_key]
        if role_holder:
            etree.SubElement(titlestmt, TEI + "editor", role=role_name).text = role_holder

    # --- fileDesc / extent ------------------------------------------------
    extent = etree.SubElement(filedesc, TEI + "extent")
    # TODO: add a <measure unit="pages"> once the page count is available
    # Created in the TEI namespace like every other element of the header.
    etree.SubElement(extent, TEI + "measure", unit="EUR", quantity=CONFIG['Technical']['Price'])

    # --- fileDesc / publicationStmt ---------------------------------------
    pub_statement = etree.SubElement(filedesc, TEI + "publicationStmt")
    etree.SubElement(pub_statement, TEI + "publisher").text = "Edition Open Access"
    etree.SubElement(pub_statement, TEI + "distributor").text = CONFIG['Technical']['Shoplink']
    etree.SubElement(pub_statement, TEI + "date", when=CONFIG['Technical']['PublicationDate']).text = CONFIG['Technical']['PublicationDate']
    etree.SubElement(pub_statement, TEI + "idno", type="ISBN").text = CONFIG['Technical']['ISBN']
    etree.SubElement(pub_statement, TEI + "idno", type="shoplink").text = CONFIG['Technical']['Shoplink']
    availability = etree.SubElement(pub_statement, TEI + "availability")
    licence = etree.SubElement(availability, TEI + "licence", target="https://creativecommons.org/licenses/" + CONFIG['Technical']['License'] + "/3.0/de/deed.en")
    # following string should not be hardcoded
    etree.SubElement(licence, TEI + "p").text = "Distributed under the Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Germany License."

    # --- fileDesc / sourceDesc --------------------------------------------
    sourcedesc = etree.SubElement(filedesc, TEI + "sourceDesc")
    etree.SubElement(sourcedesc, TEI + "p").text = "This is a born digital document from Edition Open Access."

    # --- profileDesc ------------------------------------------------------
    profiledesc = etree.SubElement(header, TEI + "profileDesc")
    shortabstract = etree.SubElement(profiledesc, TEI + "abstract", n="brief")
    etree.SubElement(shortabstract, TEI + "p").text = CONFIG['General']['BriefDescription']
    longabstract = etree.SubElement(profiledesc, TEI + "abstract", n="detail")
    etree.SubElement(longabstract, TEI + "p").text = CONFIG['General']['DetailedDescription']
    if CONFIG['General']['AdditionalInformation']:
        additionalabstract = etree.SubElement(profiledesc, TEI + "abstract", n="additional")
        etree.SubElement(additionalabstract, TEI + "p").text = CONFIG['General']['AdditionalInformation']

    textclass = etree.SubElement(profiledesc, TEI + "textClass")
    keywords = etree.SubElement(textclass, TEI + "keywords")
    keyword_list = etree.SubElement(keywords, TEI + "list")
    # Keyword1 ... Keyword6; empty entries are skipped
    for keyword_number in range(1, 7):
        keyword_text = CONFIG['General']["Keyword%s" % keyword_number]
        if keyword_text:
            etree.SubElement(keyword_list, TEI + "item").text = keyword_text

    language_dictionary = {"de": "Deutsch", "en": "Englisch"}
    language_usage = etree.SubElement(profiledesc, TEI + "langUsage")
    etree.SubElement(language_usage, TEI + "language", ident=CONFIG['Technical']['Language']).text = language_dictionary[CONFIG['Technical']['Language']]

    # --- encodingDesc: renditions that can be referred to by their id ------
    encodingdesc = etree.SubElement(header, TEI + "encodingDesc")
    tag_declaration = etree.SubElement(encodingdesc, TEI + "tagsDecl")
    struck_rendition = etree.SubElement(tag_declaration, TEI + "rendition", scheme="css")
    struck_rendition.text = "text-decoration: line-through;"
    struck_rendition.set(id_attr, "struck")
    spaced_rendition = etree.SubElement(tag_declaration, TEI + "rendition", scheme="css")
    spaced_rendition.text = "letter-spacing:0.3em"
    spaced_rendition.set(id_attr, "spaced")
    smallcaps_rendition = etree.SubElement(tag_declaration, TEI + "rendition", scheme="css")
    smallcaps_rendition.text = "font-variant:small-caps"
    smallcaps_rendition.set(id_attr, "smallcaps")

    return header
# def create_tei_header ends here
def transform_intermediate_xml(xml_tree):
    """Perform a transformation of existing XML structure.

    xml_tree is the parsed intermediate XML (an lxml ElementTree whose root
    is ``Book``).  The tree is rewritten in place, element by element, into
    TEI-style markup: the root becomes ``body``, ``div1`` to ``div4`` become
    typed ``div`` elements, and the EOA-specific elements (highlighting,
    inline images, notes, links, citations, references, quotes, lists, index
    entries, tables, figures, equations) are renamed and re-attributed.

    Returns the same xml_tree object.

    Raises IndexError when the bibliography information paragraphs are
    missing; exits the program with status 1 when an index entry has more
    than three levels.
    """
    # --- document level ---------------------------------------------------
    # Neither value is used further down, but the lookups fail loudly
    # (IndexError) when the intermediate file lacks this information.
    bibliography_type = xml_tree.xpath("/Book/p/EOAbibliographytype/text()")[0]
    bibliographydatabase = xml_tree.xpath("/Book/p/EOAbibliographydatabase/text()")[0]

    # paragraphs directly below the root are dropped
    for paragraph in xml_tree.xpath("/Book/p"):
        paragraph.getparent().remove(paragraph)

    etree.strip_elements(xml_tree, "tableofcontents")

    root_element = xml_tree.getroot()
    root_element.tag = "body"
    etree.strip_attributes(root_element, "A", "B", "part", "id-text", "id")
    etree.strip_tags(root_element, "allowbreak")

    # --- structural divisions ---------------------------------------------
    divdict = {"div1": "chapter", "div2": "section", "div3": "subsection", "div4": "subsubsection"}
    for div in xml_tree.xpath("//div1|//div2|//div3|//div4"):
        div.set("type", divdict[div.tag])
        div.tag = "div"

    # --- highlighting -----------------------------------------------------
    for element in xml_tree.xpath("//hi[@rend='it']"):
        element.set("rend", "italic")

    for element in xml_tree.xpath("//EOAup|//EOAdown"):
        if element.tag == "EOAup":
            element.set("rend", "superscript")
        else:
            element.set("rend", "subscript")
        element.tag = "hi"

    # --- inline images: the element text holds the path -------------------
    for element in xml_tree.xpath("//EOAinline"):
        element.tag = "graphic"
        element.set("url", element.text)
        # is there a better way?
        element.text = ""

    # --- footnotes --------------------------------------------------------
    for element in xml_tree.xpath("//note[@place='Inline']"):
        element.set("place", "bottom")

    # --- external links ---------------------------------------------------
    for element in xml_tree.xpath("//xref"):
        element.tag = "ref"
        element.set("type", "url")
        element.set("target", element.get("url"))
        etree.strip_attributes(element, "url")

    # --- citations --------------------------------------------------------
    for element in xml_tree.xpath("//span[@class='citation']|//EOAcitenumeric"):
        # how to determine the cite pages?
        citedrange = ""
        if element.tag == "span":
            citekey = element.get("citekey")
        elif element.tag == "EOAcitenumeric":
            citekey = element.find("citekey").text
            citetext = element.find("citetext").text
            element.set("rend", "numeric")
            # an empty citetext element has no text at all
            citedrange = citetext if citetext is not None else ""
        element.tag = "bibl"
        etree.SubElement(element, "ref", target="#" + citekey)
        if citedrange:
            # leaving unit attribute out for now and assume that page is default range
            cited_range = etree.SubElement(element, "citedRange")#, unit="page")
            cited_range.text = citedrange
        element.text = ""
        etree.strip_attributes(element, "rel", "class", "citekey", "data-toggle", "html", "data-placement", "data-content", "data-title")
        etree.strip_elements(element, "citekey", "citetext")

    # --- internal references ----------------------------------------------
    for element in xml_tree.xpath("//EOApageref|//EOAref"):
        if element.tag == "EOApageref":
            element.set("type", "page")
        element.tag = "ref"
        label = element.find("Label")
        element.set("target", "#" + label.text)
        etree.strip_elements(element, "ref", "Label")

    # --- quotations -------------------------------------------------------
    for element in xml_tree.xpath("//p[@rend='quoted']"):
        element.tag = "quote"
        etree.strip_attributes(element, "rend")

    # --- lists ------------------------------------------------------------
    for element in xml_tree.xpath("//list[@type='ordered' or @type='simple' or @type='description']"):
        if element.get("type") == "simple":
            element.set("type", "bulleted")
        elif element.get("type") == "description":
            element.set("type", "gloss")
        # snapshot of the direct children (getchildren() is deprecated)
        for item in list(element):
            etree.strip_attributes(item, "label")
            etree.strip_tags(item, "p")

    # --- index entries ----------------------------------------------------
    index_names = {"EOAindex": "keyword", "EOAindexperson": "Person", "EOAindexlocation": "Location"}
    for entry in xml_tree.xpath("//EOAindex|//EOAindexperson|//EOAindexlocation"):
        index_type = entry.tag
        index_text = entry.text
        entry.tag = "index"
        entry.text = ""
        entry.set("indexName", index_names[index_type])
        # "main!sub!subsub": split() keeps the pieces in their written order
        subentries = index_text.split("!")
        if len(subentries) >= 4:
            print("Error: more than two levels of subentries are disallowed!\n", index_text)
            # this is an error, so do not exit with a success status
            sys.exit(1)
        # Each further level is an <index> nested in the previous level,
        # and every level carries exactly one <term>.
        parent = entry
        for level, subentry in enumerate(subentries):
            if level > 0:
                parent = etree.SubElement(parent, "index")
            check_index_formatting(subentry, etree.SubElement(parent, "term"))

    # --- tables -----------------------------------------------------------
    for table in xml_tree.xpath("//EOAtable"):
        tab_label = table.find("EOAtablelabel")
        tab_caption = table.find("EOAtablecaption")
        realtable = table.find("table")
        rows = realtable.findall("row")
        # debatable, as there might be colspan!
        columns = rows[0].findall("cell")
        table.tag = "table"
        table.set("rows", str(len(rows)))
        table.set("cols", str(len(columns)))
        table.set(id_attr, fix_xml_id(tab_label.text))
        table_header = etree.Element("head")
        table_header.text = tab_caption.text
        table.insert(0, table_header)
        for row in rows:
            # a row containing a tableheader element is a label row
            header = row.xpath(".//tableheader")
            if header:
                header[0].text = ""
                row.set("role", "label")
            else:
                row.set("role", "data")
            for cell in row.findall("cell"):
                cell.set("role", "data")
        etree.strip_elements(table, "EOAtablelabel", "EOAtablecaption", "EOAtablecolumns")
        # unwraps the inner table and the tableheader markers, keeping content
        etree.strip_tags(table, "table", "tableheader")

    # --- figures ----------------------------------------------------------
    for figure in xml_tree.xpath("//EOAfigure|//EOAfigurenonumber"):
        if figure.tag == "EOAfigurenonumber":
            figure.set("rend", "nonumber")
        else:
            # move the caption to the end of the figure as its head
            image_caption = figure.find(".//caption")
            image_caption.tag = "head"
            figure.append(image_caption)
        figure.tag = "figure"
        figure_size = figure.find(".//width")
        # NOTE(review): this comment node is created but never attached to
        # the tree, so the width is not recorded in the output - confirm
        # whether it should be appended to the figure.
        etree.Comment("width is " + figure_size.text)
        image_path = figure.find(".//file").text
        etree.SubElement(figure, "graphic", url=image_path)
        etree.strip_elements(figure, "anchor", "p")

    # --- equations: the TeX source is taken from the TeX attribute --------
    for element in xml_tree.xpath("//EOAineq|//EOAequation|//EOAequationnonumber"):
        if element.tag == "EOAineq":
            element.set("rend", "inline")
        elif element.tag == "EOAequationnonumber":
            element.set("rend", "block nonumber")
        else:
            element.set("rend", "block")
        element.set("notation", "TeX")
        element.text = fix_equations(element.get("TeX"))
        element.tag = "formula"
        etree.strip_attributes(element, "TeX", "src", "filename", "number", "uid")
        etree.strip_elements(element, "math", "formula", "anchor")

    return xml_tree
# def transform_intermediate_xml ends here
def main():
    """The main bit.

    Builds the TEI document (header from publication.cfg, front matter,
    body converted from ``tmp_files/IntermediateXMLFile.xml``, empty back
    matter) and writes it to ``CONVERT/TEI.xml``.
    """
    # CONFIG['Authors']['Zusatz']:

    # create document structure
    tei_root = etree.Element(TEI + "TEI", nsmap=NS_MAP)
    tei_header = create_tei_header(PUB_CONFIG)
    tei_root.append(tei_header)
    tei_body = etree.SubElement(tei_root, "text")

    intermediate_xml_tree = etree.parse("tmp_files/IntermediateXMLFile.xml")
    tei_body_xml = transform_intermediate_xml(intermediate_xml_tree)

    front_part = etree.SubElement(tei_body, "front")
    for part in populate_front_part(PUB_CONFIG):
        front_part.append(part)

    etree.SubElement(tei_body, "back")

    # front and back exist already, so index 1 puts the body between them
    tei_body.insert(1, tei_body_xml.getroot())

    outfile = 'CONVERT/TEI.xml'
    # the doctype argument is used to place the two processing instructions
    # between the XML declaration and the root element
    output_string = etree.tostring(tei_root, xml_declaration=True, pretty_print=True, encoding="UTF-8", doctype= '<?xml-model href="eoa_tei.rnc" type="application/relax-ng-compact-syntax"?>\n<?xml-stylesheet type="text/css" href="tei.css" ?>')

    # tostring() returned UTF-8 encoded bytes; write them unchanged so the
    # file always matches the encoding named in its XML declaration,
    # whatever the platform's default text encoding is.
    with open(outfile, 'wb') as output_file:
        output_file.write(output_string)
# def main ends here


if __name__ == '__main__':
    main()
# finis