imxml2tei.py

#!/usr/bin/env python3
# -*- coding: utf-8; mode: python -*-

"""Unfinished program to convert a customized DocBook XML to TEI XML.

This program creates a TEI XML version out of a DocBook XML file.

"""

import sys
import configparser
from lxml import etree

# citations need a little more work: especially citedRange
# so do landscape figures, no way to distinguish them!

# namespaces
TEI_NS = "http://www.tei-c.org/ns/1.0"
TEI = "{%s}" % TEI_NS

# The "xml" prefix is reserved: it is bound by definition to the namespace
# name used below and must not be bound to any other URI. id_attr is derived
# from this entry, so a different URI here would make id_attr name an
# attribute that is not xml:id.
NS_MAP = {
    None: TEI_NS,
    "xml": "http://www.w3.org/XML/1998/namespace",
    "tmp": "tmp",
    "re": "http://exslt.org/regular-expressions"}

# Publication metadata. ConfigParser.read() silently skips files it cannot
# open, so a missing publication.cfg only shows up later as a KeyError.
PUB_CONFIG = configparser.ConfigParser()
PUB_CONFIG.read("publication.cfg")

# Name of the xml:id attribute in Clark notation ({namespace}localname)
id_attr = "{{{ns}}}id".format(ns=NS_MAP["xml"])

def populate_front_part(CONFIG):
    """Build the elements that go into the front matter.

    The front matter consists mainly of boilerplate text. At present only a
    dedication is produced: when the ``Dedication`` entry in the ``General``
    section of CONFIG is non-empty, a TEI ``div`` of type ``dedication``
    holding one paragraph with that text is created.

    Returns a list of elements, which is empty when there is no dedication.
    """

    dedication_string = CONFIG['General']['Dedication']

    # nothing to do without a dedication
    if len(dedication_string) == 0:
        return []

    dedication_div = etree.Element(TEI + "div", type="dedication")
    paragraph = etree.SubElement(dedication_div, TEI + "p")
    paragraph.text = dedication_string

    return [dedication_div]
# def populate_front_part ends here

def check_index_formatting(entry, xml_element):
    """Store an index string in xml_element, honouring a sort key.

    An entry of the form ``sortkey@displayed text`` is split at the first
    ``@``: the part before it is written to the ``sortKey`` attribute, the
    rest becomes the element text. Any further ``@`` stays in the displayed
    text instead of aborting the conversion. An entry without ``@``, or
    one that starts with ``@`` (empty sort key), is used unchanged as the
    element text.

    Returns xml_element, which is modified in place.
    """

    sort_key, separator, formatted_string = entry.partition("@")

    if separator and sort_key:
        xml_element.set("sortKey", sort_key)
        xml_element.text = formatted_string
    else:
        xml_element.text = entry

    return xml_element
# def check_index_formatting ends here

def fix_xml_id(id_string):
    """Return id_string with characters not allowed in an NCName replaced.

    Only the colon is handled so far; it is turned into an underscore.
    """

    # (illegal character, substitute) pairs
    substitutions = (
        (":", "_"),
    )

    cleaned = id_string
    for illegal, substitute in substitutions:
        cleaned = cleaned.replace(illegal, substitute)

    return cleaned
# def fix_xml_id ends here

def fix_equations(tex_equation):
    """Remove surrounding TeX code from formulas.

    Every occurrence of ``\\begin{equation*}`` and ``\\end{equation*}`` is
    deleted from tex_equation; the rest of the string, including
    whitespace, is returned untouched.
    """

    superfluous_code = (r"\end{equation*}", r"\begin{equation*}")

    for code in superfluous_code:
        tex_equation = tex_equation.replace(code, "")

    return tex_equation
# def fix_equations ends here

def _add_text_element(parent, tag, text, **attributes):
    """Append a TEI-namespaced child named tag to parent and set its text."""
    element = etree.SubElement(parent, TEI + tag, **attributes)
    element.text = text
    return element


def create_tei_header(CONFIG):
    """Based on publication.cfg, create the header for a TEI file.

    CONFIG must provide the sections ``Technical``, ``General`` and
    ``Authors`` with the entries read below (``Author1`` to ``Author5``
    and ``Keyword1`` to ``Keyword6`` have to exist, but may be empty).
    A missing section or entry raises KeyError, as does a ``Language``
    other than ``de`` or ``en``.

    Returns the ``teiHeader`` element; all elements inside it are in the
    TEI namespace.
    """

    technical = CONFIG['Technical']
    general = CONFIG['General']
    authors = CONFIG['Authors']

    header = etree.Element(TEI + "teiHeader")
    filedesc = etree.SubElement(header, TEI + "fileDesc")

    # titles, authors and editors
    titlestmt = etree.SubElement(filedesc, TEI + "titleStmt")
    _add_text_element(titlestmt, "title", technical['Serie'], level="s", n=technical['Number'])
    _add_text_element(titlestmt, "title", technical['Title'], type="main", level="m")
    _add_text_element(titlestmt, "title", technical['Subtitle'], type="sub", level="m")

    for person in range(1, 6):
        author_name = authors["""Author%s""" % person]
        if len(author_name) > 0:
            _add_text_element(titlestmt, "author", author_name)

    _add_text_element(titlestmt, "editor", "Lindy Divarci", role="publicationmanager")

    # configuration entry -> value of the role attribute
    other_roles = {
        "Submitter": "submitter",
        "EditorialCoordination": "editorialcoordinator",
        "Copyediting": "copyeditor",
        "Translator": "translator",
    }

    for config_key, role_name in other_roles.items():
        role_holder = general[config_key]
        if len(role_holder) > 0:
            _add_text_element(titlestmt, "editor", role_holder, role=role_name)

    # extent: only the price is recorded
    # TODO: also record the number of pages (measure with unit="pages")
    extent = etree.SubElement(filedesc, TEI + "extent")
    etree.SubElement(extent, TEI + "measure", unit="EUR", quantity=technical['Price'])

    # publication statement
    pub_statement = etree.SubElement(filedesc, TEI + "publicationStmt")
    _add_text_element(pub_statement, "publisher", "Edition Open Access")
    _add_text_element(pub_statement, "distributor", technical['Shoplink'])
    _add_text_element(pub_statement, "date", technical['PublicationDate'], when=technical['PublicationDate'])
    _add_text_element(pub_statement, "idno", technical['ISBN'], type="ISBN")
    _add_text_element(pub_statement, "idno", technical['Shoplink'], type="shoplink")

    availability = etree.SubElement(pub_statement, TEI + "availability")
    licence = etree.SubElement(availability, TEI + "licence", target="https://creativecommons.org/licenses/" + technical['License'] + "/3.0/de/deed.en")
    # following string should not be hardcoded
    _add_text_element(licence, "p", "Distributed under the Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Germany License.")

    sourcedesc = etree.SubElement(filedesc, TEI + "sourceDesc")
    _add_text_element(sourcedesc, "p", "This is a born digital document from Edition Open Access.")

    # abstracts
    profiledesc = etree.SubElement(header, TEI + "profileDesc")
    shortabstract = etree.SubElement(profiledesc, TEI + "abstract", n="brief")
    _add_text_element(shortabstract, "p", general['BriefDescription'])
    longabstract = etree.SubElement(profiledesc, TEI + "abstract", n="detail")
    _add_text_element(longabstract, "p", general['DetailedDescription'])

    if len(general['AdditionalInformation']) > 0:
        additionalabstract = etree.SubElement(profiledesc, TEI + "abstract", n="additional")
        _add_text_element(additionalabstract, "p", general['AdditionalInformation'])

    # keywords
    textclass = etree.SubElement(profiledesc, TEI + "textClass")
    keywords = etree.SubElement(textclass, TEI + "keywords")
    keyword_list = etree.SubElement(keywords, TEI + "list")

    for keyword_number in range(1, 7):
        keyword_text = general["""Keyword%s""" % keyword_number]
        if len(keyword_text) > 0:
            _add_text_element(keyword_list, "item", keyword_text)

    # language
    language_dictionary = {"de": "Deutsch", "en": "Englisch"}

    language_usage = etree.SubElement(profiledesc, TEI + "langUsage")
    _add_text_element(language_usage, "language", language_dictionary[technical['Language']], ident=technical['Language'])

    # renditions that the text can refer to by their xml:id
    encodingdesc = etree.SubElement(header, TEI + "encodingDesc")
    tag_declaration = etree.SubElement(encodingdesc, TEI + "tagsDecl")

    renditions = (
        ("struck", "text-decoration: line-through;"),
        ("spaced", "letter-spacing:0.3em"),
        ("smallcaps", "font-variant:small-caps"),
    )

    for rendition_id, css_rule in renditions:
        rendition = _add_text_element(tag_declaration, "rendition", css_rule, scheme="css")
        rendition.set(id_attr, rendition_id)

    return header
# def create_tei_header ends here

def _convert_divisions(xml_tree):
    """Turn div1 to div4 into div elements with a type attribute."""
    divdict = {"div1": "chapter", "div2": "section", "div3": "subsection", "div4": "subsubsection"}

    for div in xml_tree.xpath("//div1|//div2|//div3|//div4"):
        div.set("type", divdict[div.tag])
        div.tag = "div"


def _convert_inline_markup(xml_tree):
    """Rewrite highlighting, inline images, footnotes and URL references."""
    for element in xml_tree.xpath("//hi[@rend='it']"):
        element.set("rend", "italic")

    for element in xml_tree.xpath("//EOAup|//EOAdown"):
        if element.tag == "EOAup":
            element.set("rend", "superscript")
        else:
            element.set("rend", "subscript")
        element.tag = "hi"

    for element in xml_tree.xpath("//EOAinline"):
        element.tag = "graphic"
        # the image path is the element text; it moves into the url attribute
        element.set("url", element.text)
        element.text = ""

    for element in xml_tree.xpath("//note[@place='Inline']"):
        element.set("place", "bottom")

    for element in xml_tree.xpath("//xref"):
        element.tag = "ref"
        element.set("type", "url")
        element.set("target", element.get("url"))
        etree.strip_attributes(element, "url")


def _convert_citations(xml_tree):
    """Turn citation spans and EOAcitenumeric into bibl elements."""
    for element in xml_tree.xpath("//span[@class='citation']|//EOAcitenumeric"):
        # how to determine the cite pages? Only numeric citations carry
        # a text that can be used as cited range.
        citedrange = ""

        if element.tag == "span":
            citekey = element.get("citekey")
        elif element.tag == "EOAcitenumeric":
            citekey = element.find("citekey").text
            citetext = element.find("citetext").text
            element.set("rend", "numeric")

            if citetext is not None:
                citedrange = citetext

        element.tag = "bibl"
        etree.SubElement(element, "ref", target="#" + citekey)
        if len(citedrange) > 0:
            # leaving unit attribute out for now and assume that page is default range
            cited_range = etree.SubElement(element, "citedRange")
            cited_range.text = citedrange

        element.text = ""
        etree.strip_attributes(element, "rel", "class", "citekey", "data-toggle", "html", "data-placement", "data-content", "data-title")
        etree.strip_elements(element, "citekey", "citetext")


def _convert_references(xml_tree):
    """Turn EOAref and EOApageref into ref elements pointing at a label."""
    for element in xml_tree.xpath("//EOApageref|//EOAref"):
        if element.tag == "EOApageref":
            element.set("type", "page")
        element.tag = "ref"
        label = element.find("Label")
        element.set("target", "#" + label.text)
        etree.strip_elements(element, "ref", "Label")


def _convert_quotes_and_lists(xml_tree):
    """Turn quoted paragraphs into quote elements and normalize lists."""
    for element in xml_tree.xpath("//p[@rend='quoted']"):
        element.tag = "quote"
        etree.strip_attributes(element, "rend")

    # Description lists are treated like the other list types for now.
    for element in xml_tree.xpath("//list[@type='ordered' or @type='simple' or @type='description']"):
        if element.get("type") == "simple":
            element.set("type", "bulleted")
        elif element.get("type") == "description":
            element.set("type", "gloss")

        for item in list(element):
            etree.strip_attributes(item, "label")
            etree.strip_tags(item, "p")


def _convert_index_entries(xml_tree):
    """Turn the EOAindex* elements into (nested) index/term elements."""
    index_names = {"EOAindex": "keyword", "EOAindexperson": "Person", "EOAindexlocation": "Location"}

    for entry in xml_tree.xpath("//EOAindex|//EOAindexperson|//EOAindexlocation"):
        index_type = entry.tag
        index_text = entry.text

        # "!" separates the levels: main entry, subentry, subsubentry
        subentries = index_text.split("!")
        if len(subentries) >= 4:
            print("Error: more than two levels of subentries are disallowed!\n", index_text, file=sys.stderr)
            sys.exit(1)

        entry.tag = "index"
        entry.text = ""
        entry.set("indexName", index_names[index_type])

        # The first term sits directly in the entry; every further level
        # gets an index element nested inside the previous level.
        parent = entry
        for level, subentry in enumerate(subentries):
            if level > 0:
                parent = etree.SubElement(parent, "index")
            check_index_formatting(subentry, etree.SubElement(parent, "term"))


def _convert_tables(xml_tree):
    """Turn EOAtable elements into table elements with head, rows and cells."""
    for table in xml_tree.xpath("//EOAtable"):
        tab_label = table.find("EOAtablelabel")
        tab_caption = table.find("EOAtablecaption")
        realtable = table.find("table")

        rows = realtable.findall("row")
        # debatable, as there might be colspan!
        columns = rows[0].findall("cell")

        table.tag = "table"
        table.set("rows", str(len(rows)))
        table.set("cols", str(len(columns)))
        table.set(id_attr, fix_xml_id(tab_label.text))

        table_header = etree.Element("head")
        table_header.text = tab_caption.text
        table.insert(0, table_header)

        for row in rows:
            header = row.xpath(".//tableheader")
            if len(header) > 0:
                header[0].text = ""
                row.set("role", "label")
            else:
                row.set("role", "data")

            for cell in row.findall("cell"):
                cell.set("role", "data")

        etree.strip_elements(table, "EOAtablelabel", "EOAtablecaption", "EOAtablecolumns")
        # unwraps the inner table; the outer element itself is left alone
        etree.strip_tags(table, "table", "tableheader")


def _convert_figures(xml_tree):
    """Turn EOAfigure and EOAfigurenonumber into figure elements."""
    for figure in xml_tree.xpath("//EOAfigure|//EOAfigurenonumber"):
        if figure.tag == "EOAfigurenonumber":
            figure.set("rend", "nonumber")
        else:
            # move the caption directly below the figure so that it is not
            # lost when the p elements are removed further down
            image_caption = figure.find(".//caption")
            image_caption.tag = "head"
            figure.append(image_caption)

        figure.tag = "figure"
        figure_size = figure.find(".//width")

        # keep the width as a comment inside the figure
        figure.append(etree.Comment("width is " + figure_size.text))

        image_path = figure.find(".//file").text

        etree.SubElement(figure, "graphic", url=image_path)
        etree.strip_elements(figure, "anchor", "p")


def _convert_equations(xml_tree):
    """Turn the EOA equation elements into formula elements with TeX notation."""
    for element in xml_tree.xpath("//EOAineq|//EOAequation|//EOAequationnonumber"):
        if element.tag == "EOAineq":
            element.set("rend", "inline")
        elif element.tag == "EOAequationnonumber":
            element.set("rend", "block nonumber")
        else:
            element.set("rend", "block")

        element.set("notation", "TeX")
        element.text = fix_equations(element.get("TeX"))
        element.tag = "formula"
        etree.strip_attributes(element, "TeX", "src", "filename", "number", "uid")
        etree.strip_elements(element, "math", "formula", "anchor")


def transform_intermediate_xml(xml_tree):
    """Perform a transformation of existing XML structure

    xml_tree is the parsed intermediate XML with a ``Book`` root element.
    The tree is modified in place: the root becomes ``body`` and the
    project-specific elements are rewritten step by step into TEI-like
    ones. The created elements carry no namespace.

    Returns the same tree object. Raises IndexError when the bibliography
    declarations are missing from the input and exits the program when an
    index entry has more than two levels of subentries.
    """

    # Not used yet. The lookups are kept because they make an input without
    # these declarations fail right away.
    bibliography_type = xml_tree.xpath("/Book/p/EOAbibliographytype/text()")[0]
    bibliographydatabase = xml_tree.xpath("/Book/p/EOAbibliographydatabase/text()")[0]

    # the paragraphs directly below the root are removed
    for paragraph in xml_tree.xpath("/Book/p"):
        paragraph.getparent().remove(paragraph)

    etree.strip_elements(xml_tree, "tableofcontents")

    root_element = xml_tree.getroot()
    root_element.tag = "body"

    etree.strip_attributes(root_element, "A", "B", "part", "id-text", "id")
    etree.strip_tags(root_element, "allowbreak")

    # the order of the steps is the order of the original implementation
    _convert_divisions(xml_tree)
    _convert_inline_markup(xml_tree)
    _convert_citations(xml_tree)
    _convert_references(xml_tree)
    _convert_quotes_and_lists(xml_tree)
    _convert_index_entries(xml_tree)
    _convert_tables(xml_tree)
    _convert_figures(xml_tree)
    _convert_equations(xml_tree)

    return xml_tree
# def transform_intermediate_xml ends here

def main():
    """Convert the intermediate XML file into a TEI document.

    Reads ``tmp_files/IntermediateXMLFile.xml``, wraps the transformed
    content in a TEI skeleton (header, front, body, back) and writes the
    result to ``CONVERT/TEI.xml``. Both paths are relative to the current
    working directory.
    """

    # TODO: CONFIG['Authors']['Zusatz'] is not evaluated yet

    # create document structure
    tei_root = etree.Element(TEI + "TEI", nsmap=NS_MAP)
    tei_header = create_tei_header(PUB_CONFIG)
    tei_root.append(tei_header)

    tei_body = etree.SubElement(tei_root, "text")
    intermediate_xml_tree = etree.parse("tmp_files/IntermediateXMLFile.xml")
    tei_body_xml = transform_intermediate_xml(intermediate_xml_tree)
    front_part = etree.SubElement(tei_body, "front")
    front_contents_list = populate_front_part(PUB_CONFIG)

    for part in front_contents_list:
        front_part.append(part)

    # the back matter stays empty for now
    etree.SubElement(tei_body, "back")
    # the transformed body goes between front and back
    tei_body.insert(1, tei_body_xml.getroot())

    outfile = 'CONVERT/TEI.xml'
    # the doctype argument is used to place the two processing instructions
    # between the XML declaration and the root element
    output_string = etree.tostring(tei_root, xml_declaration=True, pretty_print=True, encoding="UTF-8", doctype= '<?xml-model href="eoa_tei.rnc" type="application/relax-ng-compact-syntax"?>\n<?xml-stylesheet type="text/css" href="tei.css" ?>')

    # The XML declaration announces UTF-8, so the file has to be written as
    # UTF-8 regardless of the locale's default encoding.
    with open(outfile, 'w', encoding="utf-8") as output_file:
        output_file.write(output_string.decode("utf-8"))
# def main ends here

if __name__ == '__main__':
    main()

# finis