# imxml2tei.py

#!/usr/bin/env python3
# -*- coding: utf-8; mode: python -*-

"""Unfinished program to convert a customized DocBook XML to TEI XML.

This program creates a TEI XML version out of a DocBook XML file.

"""

import sys
import configparser
from lxml import etree

# citations need a little more work: especially citedRange
# so do landscape figures, no way to distinguish them!

# namespaces
TEI_NS = "http://www.tei-c.org/ns/1.0"
TEI = "{%s}" % TEI_NS

# The "xml" prefix is reserved by the XML Namespaces recommendation and may
# only be bound to http://www.w3.org/XML/1998/namespace; with any other URI
# the xml:id attributes would end up in a bogus namespace.
NS_MAP = {
    None: TEI_NS,
    "xml": "http://www.w3.org/XML/1998/namespace",
    "tmp": "tmp",
    "re": "http://exslt.org/regular-expressions"}

# publication metadata, read from the current working directory
PUB_CONFIG = configparser.ConfigParser()
PUB_CONFIG.read("publication.cfg")

# name of the xml:id attribute in Clark notation ({namespace}localname)
id_attr = "{{{ns}}}id".format(ns=NS_MAP["xml"])

def populate_front_part(CONFIG):
    """Build the elements that go into the TEI ``front`` part.

    Only a dedication is produced so far: when the ``Dedication`` option of
    the ``General`` section is non-empty, a ``div`` of type ``dedication``
    containing a single ``p`` with that text is created.

    :param CONFIG: mapping of sections to options (e.g. a ConfigParser)
    :returns: list of elements for the front part, possibly empty
    :raises KeyError: if the ``General`` section is missing
    """

    list_of_elements = []

    # a missing option is treated like an empty one
    dedication_string = CONFIG['General'].get('Dedication', '')

    if dedication_string:
        dedication = etree.Element(TEI + "div", type="dedication")
        etree.SubElement(dedication, TEI + "p").text = dedication_string
        list_of_elements.append(dedication)

    return list_of_elements
# def populate_front_part ends here

def check_index_formatting(entry, xml_element):
    """Store an index string in an element, honouring ``sortkey@text``.

    If ``entry`` contains an ``@`` that is preceded by at least one
    character, the part before the first ``@`` becomes the ``sortKey``
    attribute and everything after it becomes the element text.  In all
    other cases the complete string is used as text.

    :param entry: raw index string
    :param xml_element: element that is modified in place
    :returns: the same ``xml_element``
    """

    # partition() cuts at the first "@" only, so additional "@" characters
    # in the displayed text cannot break the unpacking
    sort_key, separator, formatted_string = entry.partition("@")

    if separator and sort_key:
        xml_element.set("sortKey", sort_key)
        xml_element.text = formatted_string
    else:
        xml_element.text = entry

    return xml_element
# def check_index_formatting ends here

def fix_xml_id(id_string):
    """Return ``id_string`` in a form usable as an NCName.

    Currently the only substitution is colon -> underscore.
    """

    # single-character substitutions, applied in one pass
    translation_table = str.maketrans({":": "_"})

    return id_string.translate(translation_table)
# def fix_xml_id ends here

def fix_equations(tex_equation):
    r"""Remove surrounding TeX code from formulas.

    Every occurrence of ``\begin{equation*}`` and ``\end{equation*}`` is
    deleted; the rest of the string is returned unchanged.

    :param tex_equation: TeX source of a formula
    :returns: the formula without the environment markers
    """

    superfluous_code = (r"\end{equation*}", r"\begin{equation*}")

    # no diagnostic output here: this runs once per formula
    for code in superfluous_code:
        tex_equation = tex_equation.replace(code, "")

    return tex_equation
# def fix_equations ends here

def create_tei_header(CONFIG):
    """Based on publication.cfg, create the header for a TEI file.

    Reads the ``Technical``, ``General`` and ``Authors`` sections and
    assembles a ``teiHeader`` with ``fileDesc``, ``profileDesc`` and
    ``encodingDesc``.  Optional entries (authors, additional editors,
    additional abstract, keywords) are skipped when empty or absent.

    :param CONFIG: mapping of sections to options (e.g. a ConfigParser)
    :returns: the ``teiHeader`` element
    :raises KeyError: if a section or a mandatory option is missing
    """

    technical = CONFIG['Technical']
    general = CONFIG['General']
    authors = CONFIG['Authors']

    header = etree.Element(TEI + "teiHeader")
    filedesc = etree.SubElement(header, TEI + "fileDesc")

    # titleStmt: series title, book title, subtitle, authors, editors
    titlestmt = etree.SubElement(filedesc, TEI + "titleStmt")
    etree.SubElement(titlestmt, TEI + "title", level="s", n=technical['Number']).text = technical['Serie']
    etree.SubElement(titlestmt, TEI + "title", type="main", level="m").text = technical['Title']
    etree.SubElement(titlestmt, TEI + "title", type="sub", level="m").text = technical['Subtitle']

    # options Author1 .. Author5
    for person in range(1, 6):
        author_name = authors.get("Author%s" % person, '')
        if author_name:
            etree.SubElement(titlestmt, TEI + "author").text = author_name

    etree.SubElement(titlestmt, TEI + "editor", role="publicationmanager").text = "Lindy Divarci"

    # option name in the General section -> value of the role attribute
    other_roles = {
        "Submitter": "submitter",
        "EditorialCoordination": "editorialcoordinator",
        "Copyediting": "copyeditor",
        "Translator": "translator"}

    for option, role in other_roles.items():
        person_name = general.get(option, '')
        if person_name:
            etree.SubElement(titlestmt, TEI + "editor", role=role).text = person_name

    # extent: only the price is recorded
    # TODO: add the number of pages as <measure unit="pages" quantity="...">
    extent = etree.SubElement(filedesc, TEI + "extent")
    # namespaced like every other header element
    etree.SubElement(extent, TEI + "measure", unit="EUR", quantity=technical['Price'])

    # publicationStmt
    pub_statement = etree.SubElement(filedesc, TEI + "publicationStmt")
    etree.SubElement(pub_statement, TEI + "publisher").text = "Edition Open Access"
    etree.SubElement(pub_statement, TEI + "distributor").text = technical['Shoplink']
    etree.SubElement(pub_statement, TEI + "date", when=technical['PublicationDate']).text = technical['PublicationDate']
    etree.SubElement(pub_statement, TEI + "idno", type="ISBN").text = technical['ISBN']
    etree.SubElement(pub_statement, TEI + "idno", type="shoplink").text = technical['Shoplink']

    availability = etree.SubElement(pub_statement, TEI + "availability")
    licence = etree.SubElement(availability, TEI + "licence", target="https://creativecommons.org/licenses/" + technical['License'] + "/3.0/de/deed.en")
    # following string should not be hardcoded
    etree.SubElement(licence, TEI + "p").text = "Distributed under the Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Germany License."

    sourcedesc = etree.SubElement(filedesc, TEI + "sourceDesc")
    etree.SubElement(sourcedesc, TEI + "p").text = "This is a born digital document from Edition Open Access."

    # profileDesc: abstracts, keywords, language
    profiledesc = etree.SubElement(header, TEI + "profileDesc")
    shortabstract = etree.SubElement(profiledesc, TEI + "abstract", n="brief")
    etree.SubElement(shortabstract, TEI + "p").text = general['BriefDescription']
    longabstract = etree.SubElement(profiledesc, TEI + "abstract", n="detail")
    etree.SubElement(longabstract, TEI + "p").text = general['DetailedDescription']

    additional_information = general.get('AdditionalInformation', '')
    if additional_information:
        additionalabstract = etree.SubElement(profiledesc, TEI + "abstract", n="additional")
        etree.SubElement(additionalabstract, TEI + "p").text = additional_information

    textclass = etree.SubElement(profiledesc, TEI + "textClass")
    keywords = etree.SubElement(textclass, TEI + "keywords")
    keyword_list = etree.SubElement(keywords, TEI + "list")

    # options Keyword1 .. Keyword6
    for number in range(1, 7):
        keyword_text = general.get("Keyword%s" % number, '')
        if keyword_text:
            etree.SubElement(keyword_list, TEI + "item").text = keyword_text

    language_dictionary = {"de": "Deutsch", "en": "Englisch"}
    language_code = technical['Language']

    language_usage = etree.SubElement(profiledesc, TEI + "langUsage")
    # languages without a display name fall back to their code
    etree.SubElement(language_usage, TEI + "language", ident=language_code).text = language_dictionary.get(language_code, language_code)

    # encodingDesc: CSS renditions, each addressable through its xml:id
    encodingdesc = etree.SubElement(header, TEI + "encodingDesc")
    tag_declaration = etree.SubElement(encodingdesc, TEI + "tagsDecl")

    renditions = (
        ("struck", "text-decoration: line-through;"),
        ("spaced", "letter-spacing:0.3em"),
        ("smallcaps", "font-variant:small-caps"),
    )

    for rendition_id, css_rule in renditions:
        rendition = etree.SubElement(tag_declaration, TEI + "rendition", scheme="css")
        rendition.text = css_rule
        rendition.set(id_attr, rendition_id)

    return header
# def create_tei_header ends here

def transform_intermediate_xml(xml_tree):
    """Perform a transformation of existing XML structure.

    The tree is modified in place, step by step: info paragraphs and the
    table of contents are removed, the root becomes ``body``, and the EOA
    specific elements (divisions, highlighting, inline images, footnotes,
    links, citations, cross references, quotes, lists, index entries,
    tables, figures, equations) are rewritten to TEI-style elements.
    The order of the steps matters, because later steps look for tags
    produced or left over by earlier ones.

    :param xml_tree: parsed lxml tree; the XPath expressions used here
        address a root element named ``Book``
    :returns: the same ``xml_tree`` object
    :raises SystemExit: if an index entry has more than three levels
    """

    # The paragraphs directly below the root only carry bookkeeping info
    # (e.g. EOAbibliographytype, EOAbibliographydatabase), which the
    # conversion does not use yet.
    for paragraph in xml_tree.xpath("/Book/p"):
        paragraph.getparent().remove(paragraph)

    etree.strip_elements(xml_tree, "tableofcontents")

    root_element = xml_tree.getroot()
    root_element.tag = "body"

    etree.strip_attributes(root_element, "A", "B", "part", "id-text", "id")
    etree.strip_tags(root_element, "allowbreak")

    # divisions: div1..div4 become <div type="...">
    divdict = {"div1" : "chapter", "div2" : "section", "div3" : "subsection", "div4" : "subsubsection"}
    for div in xml_tree.xpath("//div1|//div2|//div3|//div4"):
        div.set("type", divdict[div.tag])
        div.tag = "div"

    # highlighting
    for element in xml_tree.xpath("//hi[@rend='it']"):
        element.set("rend", "italic")

    for element in xml_tree.xpath("//EOAup|//EOAdown"):
        if element.tag == "EOAup":
            element.set("rend", "superscript")
        else:
            element.set("rend", "subscript")
        element.tag = "hi"

    # inline images: the element text holds the path
    for element in xml_tree.xpath("//EOAinline"):
        element.tag = "graphic"
        element.set("url", element.text)
        # is there a better way?
        element.text = ""

    # footnotes
    for element in xml_tree.xpath("//note[@place='Inline']"):
        element.set("place", "bottom")

    # hyperlinks
    for element in xml_tree.xpath("//xref"):
        element.tag = "ref"
        element.set("type", "url")
        element.set("target", element.get("url"))
        etree.strip_attributes(element, "url")

    # citations
    for element in xml_tree.xpath("//span[@class='citation']|//EOAcitenumeric"):
        # how to determine the cite pages?
        citedrange = ""

        if element.tag == "span":
            citekey = element.get("citekey")
        else:
            # EOAcitenumeric keeps key and text in child elements
            citekey = element.find("citekey").text
            citedrange = element.find("citetext").text or ""
            element.set("rend", "numeric")

        element.tag = "bibl"
        etree.SubElement(element, "ref", target="#" + citekey)
        if citedrange:
            # leaving unit attribute out for now and assume that page is default range
            cited_range = etree.SubElement(element, "citedRange")#, unit="page")
            cited_range.text = citedrange

        element.text = ""
        etree.strip_attributes(element, "rel", "class", "citekey", "data-toggle", "html", "data-placement", "data-content", "data-title")
        etree.strip_elements(element, "citekey", "citetext")

    # cross references: the target is taken from the Label child
    for element in xml_tree.xpath("//EOApageref|//EOAref"):
        if element.tag == "EOApageref":
            element.set("type", "page")
        element.tag = "ref"
        label = element.find("Label")
        element.set("target", "#" + label.text)
        etree.strip_elements(element, "ref", "Label")

    # quotations
    for element in xml_tree.xpath("//p[@rend='quoted']"):
        element.tag = "quote"
        etree.strip_attributes(element, "rend")

    # lists: "ordered" keeps its type, the other two are renamed
    for element in xml_tree.xpath("//list[@type='ordered' or @type='simple' or @type='description']"):
        if element.get("type") == "simple":
            element.set("type", "bulleted")
        elif element.get("type") == "description":
            element.set("type", "gloss")
        for item in list(element):
            etree.strip_attributes(item, "label")
            etree.strip_tags(item, "p")

    # index entries: "!" separates the levels, each level may use key@text
    index_names = {"EOAindex" : "keyword", "EOAindexperson" : "Person", "EOAindexlocation" : "Location"}
    for entry in xml_tree.xpath("//EOAindex|//EOAindexperson|//EOAindexlocation"):
        index_type = entry.tag
        # an empty element has text None
        index_text = entry.text or ""
        entry.tag = "index"
        entry.text = ""
        entry.set("indexName", index_names[index_type])

        subentries = index_text.split("!")

        if len(subentries) >= 4:
            print("Error: more than two levels of subentries are disallowed!\n", index_text, file=sys.stderr)
            # non-zero status, so callers can detect the failure
            sys.exit(1)

        # every level after the first is wrapped in an <index> nested in
        # the previous level
        parent = entry
        for level, subentry in enumerate(subentries):
            if level > 0:
                parent = etree.SubElement(parent, "index")
            check_index_formatting(subentry, etree.SubElement(parent, "term"))

    # tables
    for table in xml_tree.xpath("//EOAtable"):
        tab_label = table.find("EOAtablelabel")
        tab_caption = table.find("EOAtablecaption")
        realtable = table.find("table")

        rows = realtable.findall("row")
        # debatable, as there might be colspan!
        number_of_columns = len(rows[0].findall("cell")) if rows else 0

        table.tag = "table"
        table.set("rows", str(len(rows)))
        table.set("cols", str(number_of_columns))
        table.set(id_attr, fix_xml_id(tab_label.text))

        table_header = etree.Element("head")
        table_header.text = tab_caption.text
        table.insert(0, table_header)

        for row in rows:
            # a row containing a tableheader element is a label row
            header = row.xpath(".//tableheader")
            if len(header) > 0:
                header[0].text = ""
                row.set("role", "label")
            else:
                row.set("role", "data")
            for cell in row.findall("cell"):
                cell.set("role", "data")

        etree.strip_elements(table, "EOAtablelabel", "EOAtablecaption", "EOAtablecolumns")
        etree.strip_tags(table, "table", "tableheader")

    # figures
    for figure in xml_tree.xpath("//EOAfigure|//EOAfigurenonumber"):
        if figure.tag == "EOAfigurenonumber":
            figure.set("rend", "nonumber")
        else:
            image_caption = figure.find(".//caption")
            image_caption.tag = "head"
            figure.append(image_caption)

        figure.tag = "figure"
        figure_size = figure.find(".//width")
        image_path = figure.find(".//file").text

        # the width has no attribute on the graphic; keep it as a comment
        figure.append(etree.Comment("width is " + figure_size.text))
        etree.SubElement(figure, "graphic", url=image_path)
        etree.strip_elements(figure, "anchor", "p")

    # equations: the TeX source is stored in the TeX attribute
    for element in xml_tree.xpath("//EOAineq|//EOAequation|//EOAequationnonumber"):
        if element.tag == "EOAineq":
            element.set("rend", "inline")
        elif element.tag == "EOAequationnonumber":
            element.set("rend", "block nonumber")
        else:
            element.set("rend", "block")

        element.set("notation", "TeX")
        element.text = fix_equations(element.get("TeX"))
        element.tag = "formula"
        etree.strip_attributes(element, "TeX", "src", "filename", "number", "uid")
        etree.strip_elements(element, "math", "formula", "anchor")

    return xml_tree
# def transform_intermediate_xml ends here

def main():
    """Build the TEI document and write it to ``CONVERT/TEI.xml``.

    The header comes from ``PUB_CONFIG``, the body from
    ``IntermediateXMLFile.xml`` in the current directory.  The ``text``
    element receives ``front``, the transformed body and ``back``, in
    that order.
    """

    # not evaluated so far: CONFIG['Authors']['Zusatz']

    # create document structure
    tei_root = etree.Element(TEI + "TEI", nsmap=NS_MAP)
    tei_root.append(create_tei_header(PUB_CONFIG))

    tei_text = etree.SubElement(tei_root, "text")
    intermediate_xml_tree = etree.parse("IntermediateXMLFile.xml")
    tei_body_xml = transform_intermediate_xml(intermediate_xml_tree)

    front_part = etree.SubElement(tei_text, "front")
    for part in populate_front_part(PUB_CONFIG):
        front_part.append(part)

    etree.SubElement(tei_text, "back")
    # index 1 places the body between front and back
    tei_text.insert(1, tei_body_xml.getroot())

    outfile = 'CONVERT/TEI.xml'
    # the doctype argument is used to emit the two processing instructions
    # ahead of the root element
    output_string = etree.tostring(tei_root, xml_declaration=True, pretty_print=True, encoding="UTF-8", doctype= '<?xml-model href="eoa_tei.rnc" type="application/relax-ng-compact-syntax"?>\n<?xml-stylesheet type="text/css" href="tei.css" ?>')

    # tostring() returned UTF-8 bytes matching the XML declaration; writing
    # them in binary mode keeps the file independent of the locale encoding
    with open(outfile, 'wb') as output_file:
        output_file.write(output_string)
# def main ends here

# run the conversion only when executed as a script, not when imported
if __name__ == '__main__':
    main()

# finis
	#!/usr/bin/env python3
	# -*- coding: utf-8; mode: python -*-

	"""Unfinished program to convert a customized DocBook XML to TEI XML.

	This program creates a TEI XML version out of a DocBook XML file.

	"""

	import sys
	import configparser
	from lxml import etree

	# citations need a little more work: especially citedRange
	# so do landscape figures, no way to distinguish them!

	# namespaces
	TEI_NS = "http://www.tei-c.org/ns/1.0"
	TEI = "{%s}" % TEI_NS

	# The "xml" prefix is reserved by the XML Namespaces recommendation and may
	# only be bound to http://www.w3.org/XML/1998/namespace; with any other URI
	# the xml:id attributes would end up in a bogus namespace.
	NS_MAP = {
	    None: TEI_NS,
	    "xml": "http://www.w3.org/XML/1998/namespace",
	    "tmp": "tmp",
	    "re": "http://exslt.org/regular-expressions"}

	# publication metadata, read from the current working directory
	PUB_CONFIG = configparser.ConfigParser()
	PUB_CONFIG.read("publication.cfg")

	# name of the xml:id attribute in Clark notation ({namespace}localname)
	id_attr = "{{{ns}}}id".format(ns=NS_MAP["xml"])

	def populate_front_part(CONFIG):
	    """Build the elements that go into the TEI ``front`` part.

	    Only a dedication is produced so far: when the ``Dedication`` option of
	    the ``General`` section is non-empty, a ``div`` of type ``dedication``
	    containing a single ``p`` with that text is created.

	    :param CONFIG: mapping of sections to options (e.g. a ConfigParser)
	    :returns: list of elements for the front part, possibly empty
	    :raises KeyError: if the ``General`` section is missing
	    """

	    list_of_elements = []

	    # a missing option is treated like an empty one
	    dedication_string = CONFIG['General'].get('Dedication', '')

	    if dedication_string:
	        dedication = etree.Element(TEI + "div", type="dedication")
	        etree.SubElement(dedication, TEI + "p").text = dedication_string
	        list_of_elements.append(dedication)

	    return list_of_elements
	# def populate_front_part ends here

	def check_index_formatting(entry, xml_element):
	    """Store an index string in an element, honouring ``sortkey@text``.

	    If ``entry`` contains an ``@`` that is preceded by at least one
	    character, the part before the first ``@`` becomes the ``sortKey``
	    attribute and everything after it becomes the element text.  In all
	    other cases the complete string is used as text.

	    :param entry: raw index string
	    :param xml_element: element that is modified in place
	    :returns: the same ``xml_element``
	    """

	    # partition() cuts at the first "@" only, so additional "@" characters
	    # in the displayed text cannot break the unpacking
	    sort_key, separator, formatted_string = entry.partition("@")

	    if separator and sort_key:
	        xml_element.set("sortKey", sort_key)
	        xml_element.text = formatted_string
	    else:
	        xml_element.text = entry

	    return xml_element
	# def check_index_formatting ends here

	def fix_xml_id(id_string):
	    """Make a string NCName conform.

	    Currently the only substitution is colon -> underscore.

	    :param id_string: identifier as found in the source document
	    :returns: the identifier with every replacement applied
	    """

	    replacements = {
	        ":": "_"
	    }

	    for character, substitute in replacements.items():
	        id_string = id_string.replace(character, substitute)

	    return id_string
	# def fix_xml_id ends here

	def fix_equations(tex_equation):
	    r"""Remove surrounding TeX code from formulas.

	    Every occurrence of ``\begin{equation*}`` and ``\end{equation*}`` is
	    deleted; the rest of the string is returned unchanged.

	    :param tex_equation: TeX source of a formula
	    :returns: the formula without the environment markers
	    """

	    # starred environment names, as in the copy of this function at the
	    # top of the file
	    superfluous_code = (r"\end{equation*}", r"\begin{equation*}")

	    # no diagnostic output here: this runs once per formula
	    for code in superfluous_code:
	        tex_equation = tex_equation.replace(code, "")

	    return tex_equation
	# def fix_equations ends here

	def create_tei_header(CONFIG):
	    """Based on publication.cfg, create the header for a TEI file.

	    Reads the ``Technical``, ``General`` and ``Authors`` sections and
	    assembles a ``teiHeader`` with ``fileDesc``, ``profileDesc`` and
	    ``encodingDesc``.  Optional entries (authors, additional editors,
	    additional abstract, keywords) are skipped when empty or absent.

	    :param CONFIG: mapping of sections to options (e.g. a ConfigParser)
	    :returns: the ``teiHeader`` element
	    :raises KeyError: if a section or a mandatory option is missing
	    """

	    technical = CONFIG['Technical']
	    general = CONFIG['General']
	    authors = CONFIG['Authors']

	    header = etree.Element(TEI + "teiHeader")
	    filedesc = etree.SubElement(header, TEI + "fileDesc")

	    # titleStmt: series title, book title, subtitle, authors, editors
	    titlestmt = etree.SubElement(filedesc, TEI + "titleStmt")
	    etree.SubElement(titlestmt, TEI + "title", level="s", n=technical['Number']).text = technical['Serie']
	    etree.SubElement(titlestmt, TEI + "title", type="main", level="m").text = technical['Title']
	    etree.SubElement(titlestmt, TEI + "title", type="sub", level="m").text = technical['Subtitle']

	    # options Author1 .. Author5
	    for person in range(1, 6):
	        author_name = authors.get("Author%s" % person, '')
	        if author_name:
	            etree.SubElement(titlestmt, TEI + "author").text = author_name

	    etree.SubElement(titlestmt, TEI + "editor", role="publicationmanager").text = "Lindy Divarci"

	    # option name in the General section -> value of the role attribute
	    other_roles = {
	        "Submitter": "submitter",
	        "EditorialCoordination": "editorialcoordinator",
	        "Copyediting": "copyeditor",
	        "Translator": "translator"}

	    for option, role in other_roles.items():
	        person_name = general.get(option, '')
	        if person_name:
	            etree.SubElement(titlestmt, TEI + "editor", role=role).text = person_name

	    # extent: only the price is recorded
	    # TODO: add the number of pages as <measure unit="pages" quantity="...">
	    extent = etree.SubElement(filedesc, TEI + "extent")
	    # namespaced like every other header element
	    etree.SubElement(extent, TEI + "measure", unit="EUR", quantity=technical['Price'])

	    # publicationStmt
	    pub_statement = etree.SubElement(filedesc, TEI + "publicationStmt")
	    etree.SubElement(pub_statement, TEI + "publisher").text = "Edition Open Access"
	    etree.SubElement(pub_statement, TEI + "distributor").text = technical['Shoplink']
	    etree.SubElement(pub_statement, TEI + "date", when=technical['PublicationDate']).text = technical['PublicationDate']
	    etree.SubElement(pub_statement, TEI + "idno", type="ISBN").text = technical['ISBN']
	    etree.SubElement(pub_statement, TEI + "idno", type="shoplink").text = technical['Shoplink']

	    availability = etree.SubElement(pub_statement, TEI + "availability")
	    licence = etree.SubElement(availability, TEI + "licence", target="https://creativecommons.org/licenses/" + technical['License'] + "/3.0/de/deed.en")
	    # following string should not be hardcoded
	    etree.SubElement(licence, TEI + "p").text = "Distributed under the Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Germany License."

	    sourcedesc = etree.SubElement(filedesc, TEI + "sourceDesc")
	    etree.SubElement(sourcedesc, TEI + "p").text = "This is a born digital document from Edition Open Access."

	    # profileDesc: abstracts, keywords, language
	    profiledesc = etree.SubElement(header, TEI + "profileDesc")
	    shortabstract = etree.SubElement(profiledesc, TEI + "abstract", n="brief")
	    etree.SubElement(shortabstract, TEI + "p").text = general['BriefDescription']
	    longabstract = etree.SubElement(profiledesc, TEI + "abstract", n="detail")
	    etree.SubElement(longabstract, TEI + "p").text = general['DetailedDescription']

	    additional_information = general.get('AdditionalInformation', '')
	    if additional_information:
	        additionalabstract = etree.SubElement(profiledesc, TEI + "abstract", n="additional")
	        etree.SubElement(additionalabstract, TEI + "p").text = additional_information

	    textclass = etree.SubElement(profiledesc, TEI + "textClass")
	    keywords = etree.SubElement(textclass, TEI + "keywords")
	    keyword_list = etree.SubElement(keywords, TEI + "list")

	    # options Keyword1 .. Keyword6
	    for number in range(1, 7):
	        keyword_text = general.get("Keyword%s" % number, '')
	        if keyword_text:
	            etree.SubElement(keyword_list, TEI + "item").text = keyword_text

	    language_dictionary = {"de": "Deutsch", "en": "Englisch"}
	    language_code = technical['Language']

	    language_usage = etree.SubElement(profiledesc, TEI + "langUsage")
	    # languages without a display name fall back to their code
	    etree.SubElement(language_usage, TEI + "language", ident=language_code).text = language_dictionary.get(language_code, language_code)

	    # encodingDesc: CSS renditions, each addressable through its xml:id
	    encodingdesc = etree.SubElement(header, TEI + "encodingDesc")
	    tag_declaration = etree.SubElement(encodingdesc, TEI + "tagsDecl")

	    renditions = (
	        ("struck", "text-decoration: line-through;"),
	        ("spaced", "letter-spacing:0.3em"),
	        ("smallcaps", "font-variant:small-caps"),
	    )

	    for rendition_id, css_rule in renditions:
	        rendition = etree.SubElement(tag_declaration, TEI + "rendition", scheme="css")
	        rendition.text = css_rule
	        rendition.set(id_attr, rendition_id)

	    return header
	# def create_tei_header ends here

	def transform_intermediate_xml(xml_tree):
	    """Perform a transformation of existing XML structure.

	    The tree is modified in place, step by step: info paragraphs and the
	    table of contents are removed, the root becomes ``body``, and the EOA
	    specific elements (divisions, highlighting, inline images, footnotes,
	    links, citations, cross references, quotes, lists, index entries,
	    tables, figures, equations) are rewritten to TEI-style elements.
	    The order of the steps matters, because later steps look for tags
	    produced or left over by earlier ones.

	    :param xml_tree: parsed lxml tree; the XPath expressions used here
	        address a root element named ``Book``
	    :returns: the same ``xml_tree`` object
	    :raises SystemExit: if an index entry has more than three levels
	    """

	    # The paragraphs directly below the root only carry bookkeeping info
	    # (e.g. EOAbibliographytype, EOAbibliographydatabase), which the
	    # conversion does not use yet.
	    for paragraph in xml_tree.xpath("/Book/p"):
	        paragraph.getparent().remove(paragraph)

	    etree.strip_elements(xml_tree, "tableofcontents")

	    root_element = xml_tree.getroot()
	    root_element.tag = "body"

	    etree.strip_attributes(root_element, "A", "B", "part", "id-text", "id")
	    etree.strip_tags(root_element, "allowbreak")

	    # divisions: div1..div4 become <div type="...">
	    divdict = {"div1" : "chapter", "div2" : "section", "div3" : "subsection", "div4" : "subsubsection"}
	    for div in xml_tree.xpath("//div1|//div2|//div3|//div4"):
	        div.set("type", divdict[div.tag])
	        div.tag = "div"

	    # highlighting
	    for element in xml_tree.xpath("//hi[@rend='it']"):
	        element.set("rend", "italic")

	    for element in xml_tree.xpath("//EOAup|//EOAdown"):
	        if element.tag == "EOAup":
	            element.set("rend", "superscript")
	        else:
	            element.set("rend", "subscript")
	        element.tag = "hi"

	    # inline images: the element text holds the path
	    for element in xml_tree.xpath("//EOAinline"):
	        element.tag = "graphic"
	        element.set("url", element.text)
	        # is there a better way?
	        element.text = ""

	    # footnotes
	    for element in xml_tree.xpath("//note[@place='Inline']"):
	        element.set("place", "bottom")

	    # hyperlinks
	    for element in xml_tree.xpath("//xref"):
	        element.tag = "ref"
	        element.set("type", "url")
	        element.set("target", element.get("url"))
	        etree.strip_attributes(element, "url")

	    # citations
	    for element in xml_tree.xpath("//span[@class='citation']|//EOAcitenumeric"):
	        # how to determine the cite pages?
	        citedrange = ""

	        if element.tag == "span":
	            citekey = element.get("citekey")
	        else:
	            # EOAcitenumeric keeps key and text in child elements
	            citekey = element.find("citekey").text
	            citedrange = element.find("citetext").text or ""
	            element.set("rend", "numeric")

	        element.tag = "bibl"
	        etree.SubElement(element, "ref", target="#" + citekey)
	        if citedrange:
	            # leaving unit attribute out for now and assume that page is default range
	            cited_range = etree.SubElement(element, "citedRange")#, unit="page")
	            cited_range.text = citedrange

	        element.text = ""
	        etree.strip_attributes(element, "rel", "class", "citekey", "data-toggle", "html", "data-placement", "data-content", "data-title")
	        etree.strip_elements(element, "citekey", "citetext")

	    # cross references: the target is taken from the Label child
	    for element in xml_tree.xpath("//EOApageref|//EOAref"):
	        if element.tag == "EOApageref":
	            element.set("type", "page")
	        element.tag = "ref"
	        label = element.find("Label")
	        element.set("target", "#" + label.text)
	        etree.strip_elements(element, "ref", "Label")

	    # quotations
	    for element in xml_tree.xpath("//p[@rend='quoted']"):
	        element.tag = "quote"
	        etree.strip_attributes(element, "rend")

	    # lists: "ordered" keeps its type, the other two are renamed
	    for element in xml_tree.xpath("//list[@type='ordered' or @type='simple' or @type='description']"):
	        if element.get("type") == "simple":
	            element.set("type", "bulleted")
	        elif element.get("type") == "description":
	            element.set("type", "gloss")
	        for item in list(element):
	            etree.strip_attributes(item, "label")
	            etree.strip_tags(item, "p")

	    # index entries: "!" separates the levels, each level may use key@text
	    index_names = {"EOAindex" : "keyword", "EOAindexperson" : "Person", "EOAindexlocation" : "Location"}
	    for entry in xml_tree.xpath("//EOAindex|//EOAindexperson|//EOAindexlocation"):
	        index_type = entry.tag
	        # an empty element has text None
	        index_text = entry.text or ""
	        entry.tag = "index"
	        entry.text = ""
	        entry.set("indexName", index_names[index_type])

	        subentries = index_text.split("!")

	        if len(subentries) >= 4:
	            print("Error: more than two levels of subentries are disallowed!\n", index_text, file=sys.stderr)
	            # non-zero status, so callers can detect the failure
	            sys.exit(1)

	        # every level after the first is wrapped in an <index> nested in
	        # the previous level
	        parent = entry
	        for level, subentry in enumerate(subentries):
	            if level > 0:
	                parent = etree.SubElement(parent, "index")
	            check_index_formatting(subentry, etree.SubElement(parent, "term"))

	    # tables
	    for table in xml_tree.xpath("//EOAtable"):
	        tab_label = table.find("EOAtablelabel")
	        tab_caption = table.find("EOAtablecaption")
	        realtable = table.find("table")

	        rows = realtable.findall("row")
	        # debatable, as there might be colspan!
	        number_of_columns = len(rows[0].findall("cell")) if rows else 0

	        table.tag = "table"
	        table.set("rows", str(len(rows)))
	        table.set("cols", str(number_of_columns))
	        table.set(id_attr, fix_xml_id(tab_label.text))

	        table_header = etree.Element("head")
	        table_header.text = tab_caption.text
	        table.insert(0, table_header)

	        for row in rows:
	            # a row containing a tableheader element is a label row
	            header = row.xpath(".//tableheader")
	            if len(header) > 0:
	                header[0].text = ""
	                row.set("role", "label")
	            else:
	                row.set("role", "data")
	            for cell in row.findall("cell"):
	                cell.set("role", "data")

	        etree.strip_elements(table, "EOAtablelabel", "EOAtablecaption", "EOAtablecolumns")
	        etree.strip_tags(table, "table", "tableheader")

	    # figures
	    for figure in xml_tree.xpath("//EOAfigure|//EOAfigurenonumber"):
	        if figure.tag == "EOAfigurenonumber":
	            figure.set("rend", "nonumber")
	        else:
	            image_caption = figure.find(".//caption")
	            image_caption.tag = "head"
	            figure.append(image_caption)

	        figure.tag = "figure"
	        figure_size = figure.find(".//width")
	        image_path = figure.find(".//file").text

	        # the width has no attribute on the graphic; keep it as a comment
	        figure.append(etree.Comment("width is " + figure_size.text))
	        etree.SubElement(figure, "graphic", url=image_path)
	        etree.strip_elements(figure, "anchor", "p")

	    # equations: the TeX source is stored in the TeX attribute
	    for element in xml_tree.xpath("//EOAineq|//EOAequation|//EOAequationnonumber"):
	        if element.tag == "EOAineq":
	            element.set("rend", "inline")
	        elif element.tag == "EOAequationnonumber":
	            element.set("rend", "block nonumber")
	        else:
	            element.set("rend", "block")

	        element.set("notation", "TeX")
	        element.text = fix_equations(element.get("TeX"))
	        element.tag = "formula"
	        etree.strip_attributes(element, "TeX", "src", "filename", "number", "uid")
	        etree.strip_elements(element, "math", "formula", "anchor")

	    return xml_tree
	# def transform_intermediate_xml ends here

	def main():
	    """Build the TEI document and write it to ``CONVERT/TEI.xml``.

	    The header comes from ``PUB_CONFIG``, the body from
	    ``IntermediateXMLFile.xml`` in the current directory.  The ``text``
	    element receives ``front``, the transformed body and ``back``, in
	    that order.
	    """

	    # not evaluated so far: CONFIG['Authors']['Zusatz']

	    # create document structure
	    tei_root = etree.Element(TEI + "TEI", nsmap=NS_MAP)
	    tei_root.append(create_tei_header(PUB_CONFIG))

	    tei_text = etree.SubElement(tei_root, "text")
	    intermediate_xml_tree = etree.parse("IntermediateXMLFile.xml")
	    tei_body_xml = transform_intermediate_xml(intermediate_xml_tree)

	    front_part = etree.SubElement(tei_text, "front")
	    for part in populate_front_part(PUB_CONFIG):
	        front_part.append(part)

	    etree.SubElement(tei_text, "back")
	    # index 1 places the body between front and back
	    tei_text.insert(1, tei_body_xml.getroot())

	    outfile = 'CONVERT/TEI.xml'
	    # the doctype argument is used to emit the two processing instructions
	    # ahead of the root element
	    output_string = etree.tostring(tei_root, xml_declaration=True, pretty_print=True, encoding="UTF-8", doctype= '<?xml-model href="eoa_tei.rnc" type="application/relax-ng-compact-syntax"?>\n<?xml-stylesheet type="text/css" href="tei.css" ?>')

	    # tostring() returned UTF-8 bytes matching the XML declaration; writing
	    # them in binary mode keeps the file independent of the locale encoding
	    with open(outfile, 'wb') as output_file:
	        output_file.write(output_string)
	# def main ends here

	# run the conversion only when executed as a script, not when imported
	if __name__ == '__main__':
	    main()

	# finis