fix_tei.py

#!/usr/bin/env python3
# -*- coding: utf-8; mode: python -*-

"""
This program is a processing step after the conversion from docx to TEI.

The EOA guidelines for working with docx files encourage the use of
shorthands for citations, cross references and figures. After the
conversion to TEI through oxgarage, these shorthands are converted
into proper TEI-XML markup.

Additionally, external dependencies (image files for figures as well
as bibliography files) are checked, and a report is written so that
these issues can be fixed there.

If there is no bibtex file at all, a temporary one can be created
using the tool create_tmpbib.

Some data from the publication is also gathered in Python dictionaries
which will be re-used by the tei2imxml program.

 """

__version__ = "1.0"
__date__ = "20180109"
__author__ = "kthoden@mpiwg-berlin.mpg.de"

from utils.load_config import load_config
import utils.libeoaconvert as libeoaconvert

import sys
import os
import re
import json
import logging
import shlex
import pickle
import subprocess
from lxml import etree
import datetime
import bibtexparser
import argparse
import traceback
import configparser

# Configure the root logger once for the whole script: INFO and above,
# prefixed with timestamp and level.
logging.basicConfig(level=logging.INFO, format=' %(asctime)s - %(levelname)s - %(message)s')

# TEI namespace and the prefix map ("t") used by every XPath query below.
ns_tei = "http://www.tei-c.org/ns/1.0"
NS_MAP = {"t" : ns_tei}

# Output locations, given relative to the current working directory.
# expanduser() only has an effect on paths starting with "~", so these
# values are returned unchanged.
PICKLE_DIR = os.path.expanduser("output/pickle_from_tei")
OUTPUT_DIR = os.path.expanduser("output/imxml_from_tei")

# The boilerplate configuration is looked up next to this script, not in
# the working directory.
RUNNING_DIRECTORY = os.path.dirname(os.path.realpath(__file__))
# With the INFO level configured above, this debug message is not shown.
logging.debug("The script is run from {}".format(RUNNING_DIRECTORY))
TEI_BOILERPLATE = os.path.sep.join([RUNNING_DIRECTORY, "data", "tei_boilerplate.cfg"])

# Abort at import time if the boilerplate file is missing.
# NOTE(review): sys.exit() without an argument terminates with status 0,
# so calling scripts cannot detect this failure - confirm whether a
# non-zero status is wanted here.
if not os.path.exists(TEI_BOILERPLATE):
    logging.error("Could not find TEI boilerplate config. Exiting.")
    sys.exit()

# Parsed boilerplate texts; fix_tei_header reads its "Header" section.
BOILERPLATES = configparser.ConfigParser()
BOILERPLATES.read(TEI_BOILERPLATE)

def parse_bibtex(bibfile):
    """Load a BibTeX database and hand back its entries.

    bibfile is the path of the BibTeX file. The return value is the
    ``entries_dict`` of the database produced by bibtexparser, which
    validate_citations queries by citekey.
    """

    with open(bibfile) as bibtex_handle:
        database = bibtexparser.load(bibtex_handle)
        entries_by_key = database.entries_dict

    return entries_by_key
# def parse_bibtex ends here


def unescape(text):
    """Remove HTML or XML character references and entities from a text
    string. Return a Unicode string.

    Numeric references (decimal ``&#167;`` and hexadecimal ``&#xA7;`` or
    ``&#XA7;``) and named entities (``&amp;``) are replaced by the
    character they stand for. Anything that cannot be resolved - an
    unknown entity name, a malformed number or a code point outside the
    Unicode range - is left in the text unchanged.

    With thanks to http://effbot.org/zone/re-sub.htm#unescape-html.
    Modified to work with Python3.
    """
    import re
    import html.entities

    def fixup(m):
        entity = m.group(0)
        if entity[:2] == "&#":
            # character reference
            try:
                if entity[2:3] in ("x", "X"):
                    return chr(int(entity[3:-1], 16))
                return chr(int(entity[2:-1]))
            except (ValueError, OverflowError):
                # ValueError: not a number, or a code point beyond
                # U+10FFFF. OverflowError: number too large for chr().
                pass
        else:
            # named entity
            try:
                return chr(html.entities.name2codepoint[entity[1:-1]])
            except KeyError:
                pass
        # leave as is
        return entity
    return re.sub(r"&#?\w+;", fixup, text)
# def unescape ends here

def convert_references(string):
    """Find reference markers (#) in the text

    Every ``#label#`` shorthand is replaced by
    ``<ref><![CDATA[label]]></ref>``. The modified string is returned.

    All occurrences are replaced in a single pass. The text written into
    the result is the label between the markers, not the marker itself,
    so the output never contains a new '#' that a further pass could
    mistake for a shorthand.
    """

    references_pattern = re.compile(r"(#)(?P<reference>.+?)(#)")
    string, number_of_references = references_pattern.subn(r"<ref><![CDATA[\g<reference>]]></ref>", string)
    logging.info("Found %s references" % number_of_references)

    return string
# def convert_references ends here

def convert_citations(string, citedrangetext):
    """Turn citation shorthands into <bibl> elements.

    Two shorthands are recognised, the section sign being written either
    literally or as a character reference: a doubled sign on both sides
    marks a year citation, a single sign an author/year citation. An
    optional page range follows the citekey after an exclamation mark.

    With citedrangetext set, the page range becomes the text content of
    <citedRange>; otherwise it is stored in its ``from`` attribute:

    <bibl>
      <ref type='year' target='#Kaulbach_1960'/>
      <citedRange from='320-322'/>
    </bibl>

    Return a tuple of the modified string and the list of citekeys that
    were found (year citations first, then author/year citations).
    """
    # BUG: a dot as in 197f. is not recognized!
    year_citations_pattern = re.compile(r"(§|&#xA7;|&#167;)(§|&#xA7;|&#167;)(?P<citekey>.+?)(\!(?P<pages>.*?))?(§|&#xA7;|&#167;)(§|&#xA7;|&#167;)")
    authoryear_citation_pattern = re.compile(r"(§|&#xA7;|&#167;)(?P<citekey>.+?)(\!(?P<pages>.*?))?(§|&#xA7;|&#167;)")

    if citedrangetext:
        year_template = r"<bibl><ref type='year' target='#\g<citekey>'/><citedRange>\g<pages></citedRange></bibl>"
        authoryear_template = r"<bibl><ref type='authoryear' target='#\g<citekey>'/><citedRange>\g<pages></citedRange></bibl>"
    else:
        year_template = r"<bibl><ref type='year' target='#\g<citekey>'/><citedRange from='\g<pages>'/></bibl>"
        authoryear_template = r"<bibl><ref type='authoryear' target='#\g<citekey>'/><citedRange from='\g<pages>'/></bibl>"

    # The doubled form has to be handled first, otherwise the single-sign
    # pattern would consume parts of it.
    year_hits = year_citations_pattern.findall(string)
    logging.info("Found %s year citations." % len(year_hits))
    string = year_citations_pattern.sub(year_template, string)

    authoryear_hits = authoryear_citation_pattern.findall(string)
    logging.info("Found %s author/year citations." % len(authoryear_hits))
    string = authoryear_citation_pattern.sub(authoryear_template, string)

    # findall returns one tuple per hit; the citekey group sits at index 2
    # in the year pattern and at index 1 in the author/year pattern.
    citations = [hit[2] for hit in year_hits]
    citations.extend(hit[1] for hit in authoryear_hits)

    return (string, citations)
# def convert_citations ends here


def convert_math(string):
    """Wrap inline math shorthands in <formula> elements.

    Text enclosed in dollar signs (written literally or as a character
    reference) is turned into

    <formula notation="tex" rend="inline">2^2 = 4</formula>

    The modified string is returned.
    """
    math_pattern = re.compile(r"(\$|&#x24;|&#36;)(?P<contents>.+?)(\$|&#x24;|&#36;)")

    number_of_formulae = len(math_pattern.findall(string))
    logging.info(f"Found {number_of_formulae} formulae.")

    return math_pattern.sub(r'<formula notation="tex" rend="inline">\g<contents></formula>', string)
# def convert_math ends here


def parse_cited_range(list_of_xml_elements):
    """Split the page range stored in citedRange/@from into from and to.

    For every element of list_of_xml_elements the first citedRange
    descendant is inspected:

    - no ``from`` attribute: left untouched
    - empty ``from``: the attribute is dropped and the element is renamed
      to "tagtobestripped" so that a later strip_tags call removes it
    - one token: stored in ``from``
    - two or three tokens: first token in ``from``, last token in ``to``
    - anything else: the original value becomes the element text and is
      reported back

    Return the list of values that could not be split.
    """

    unsplittable_pageref = []

    for bibl in list_of_xml_elements:
        range_element = bibl.find(".//t:citedRange", namespaces=NS_MAP)
        raw_range = range_element.get("from")

        if raw_range is None:
            continue

        # tokens are runs of word characters and apostrophes
        tokens = re.findall(r"[\w']+", raw_range)

        if not raw_range:
            range_element.tag = "tagtobestripped"
            range_element.attrib.pop("from")
        elif len(tokens) in (1, 2, 3):
            range_element.set("from", tokens[0])
            if len(tokens) > 1:
                # with three tokens the middle one is discarded
                range_element.set("to", tokens[-1])
        else:
            logging.info("Splitting the page range produced unexpected result. Tried to split %s. Wrote to text field." % raw_range)
            range_element.text = raw_range
            range_element.attrib.pop("from")
            unsplittable_pageref.append(raw_range)

    return unsplittable_pageref
# def parse_cited_range ends here

def validate_citations(used_citekeys, bibdata):
    """Report the citekeys that are missing from the bibliography.

    used_citekeys is the list of keys found in the text, bibdata the
    mapping of the bibliographic database. Each missing key is logged.

    Return the missing keys in the order they were used (duplicates are
    kept).
    """

    known_citekeys = bibdata.keys()

    missing = [key for key in used_citekeys if key not in known_citekeys]

    for key in missing:
        logging.info("%s is not in the bibliographic database" % key)

    return missing
# def validate_citations ends here

def insert_bib_section(xml_tree):
    """Insert a section containing a PI for bibliography

    A ``<div type="section" n="nonumber">`` with the heading
    "References" and the processing instruction
    ``<?eoa printbibliography?>`` is added as the last child of the
    element xml_tree. The element is modified in place; nothing is
    returned.

    The section is appended directly instead of being attached behind
    the current last child. The result is the same, but this also works
    for an element without children and avoids the deprecated
    getchildren() call.
    """

    bib_div = etree.Element("div", type="section", n="nonumber")
    etree.SubElement(bib_div, "head").text = "References"
    bib_div.append(etree.ProcessingInstruction("eoa", "printbibliography"))

    xml_tree.append(bib_div)

    return
# def insert_bib_section ends here

def convert_figures(string):
    """Wrap figure shorthands in <graphic> elements.

    A figure shorthand is text enclosed in plus signs, for example

    +Fig.1CarteDuCielPotsdam!Glass photographic plate from the Carte
    du Ciel survey, Potsdam Observatory, Plate 5, taken January 11,
    1894. The plate is approximately 16 cm x 16 cm; each plate
    covered two square degrees of the sky. Courtesy of the
    Leibniz-Institut f&#xFC;r Astrophysik, Potsdam+

    The enclosed text is put into a CDATA section inside <graphic>. The
    modified string is returned.
    """

    # negative lookbehind assertion. Real + characters must be escaped by \
    graphic_pattern = re.compile(r"(?<!\\)\+(.*?)\+")

    number_of_figures = len(graphic_pattern.findall(string))
    logging.info("Found %s figures" % number_of_figures)

    if number_of_figures:
        # one pass replaces every shorthand
        string = graphic_pattern.sub(r"<graphic><![CDATA[\g<1>]]></graphic>", string)

    return string
# def convert_figures ends here

def make_figure_elements(list_of_figures, figure_directory):
    """Turn the raw <graphic> shorthands into figure markup.

    list_of_figures holds the graphic elements whose text is the figure
    shorthand ``image!caption`` or ``image!caption!hyperimage``;
    figure_directory is the directory containing the image files.

    The parent of each graphic is renamed to "figure". For a shorthand
    with two or three parts the graphic is linked if the image name
    matches a file in the directory (with or without extension), the
    caption is inserted as <head> and the optional third part is kept as
    a processing instruction. Any other shorthand gets a graphic without
    caption and the figure is marked with n="nonumber".

    Return the shorthands whose image could not be found.
    """

    bad_images = []
    directory_listing = os.listdir(figure_directory)
    # file names without their extension
    basenames = [os.path.splitext(entry)[0] for entry in directory_listing]

    for graphic in list_of_figures:
        figure = graphic.getparent()
        figure.tag = "figure"

        # keep the shorthand before clear() wipes text and attributes
        shorthand = graphic.text
        graphic.clear()

        parts = shorthand.split("!")

        if 2 <= len(parts) <= 3:
            image_name = parts[0]
            if image_name in basenames or image_name in directory_listing:
                logging.info("Found %s in the text. Selected %s as corresponding image." % (image_name, image_name))
                graphic.set("scale", "50")
                graphic.set("url", figure_directory + os.path.sep + image_name)
            else:
                bad_images.append(shorthand)

            # the caption is parsed as XML, so markup inside it survives
            head_element = etree.fromstring("<head>" + parts[1] + "</head>")
            figure.insert(1, head_element)

            if len(parts) == 3:
                logging.info("This figure contains hyperimage directions")
                hyperimage_pi = etree.ProcessingInstruction("hyperimage", "Hyperimage direction: %s" % parts[2])
                figure.append(hyperimage_pi)
        else:
            logging.info("The figure string could not be split by '!'. Adding graphic without caption.")
            graphic.set("scale", "50")
            graphic.set("url", figure_directory + os.path.sep + parts[0])
            figure.set("n", "nonumber")

    return bad_images
# def make_figure_elements ends here

def cleanup_xml(xml_tree):
    """Perform some cleaning on XML

    Also, delete elements and attributes inserted by metypeset and
    rename elements according to our schema

    In detail, working on the parsed document xml_tree in place:

    - remove @meTypesetSize everywhere, @rend on <hi> when it mentions
      colour or background, @style on <hi> when it sets a font size, and
      every xml:space attribute
    - unwrap <hi> elements directly inside <formula>
    - remove empty @rend from <hi> and unwrap <hi> without attributes
    - unwrap all <seg> elements
    - change footnote placement from "foot" to "bottom"
    - remove @rend from paragraphs whose @rend mentions Text or Footnote
    - set @rend="quote" on paragraphs whose @rend mentions Quotation

    Unwrapping is done by renaming the element to "tagtobestripped" and
    stripping that tag at the end, which keeps text and children.

    Return the same tree object.
    """

    metypeset_attrib = xml_tree.findall("//t:*[@meTypesetSize]", namespaces=NS_MAP)
    color_attrib = xml_tree.xpath("//t:hi[contains(@rend, 'color') or contains(@rend, 'background')]", namespaces=NS_MAP)
    hi_style_attrib = xml_tree.xpath("//t:hi[contains(@style,'font-size')]", namespaces=NS_MAP)
    xml_preserve = xml_tree.xpath("//*[@xml:space]")

    logging.info("Found %s metypesets." % len(metypeset_attrib))
    logging.info("Found %s colour attributes." % len(color_attrib))
    logging.info(f"Found {len(hi_style_attrib)} hi style attributes.")
    logging.info(f"Found {len(xml_preserve)} xml:space attributes.")

    for element in metypeset_attrib:
        logging.info("Number of attributes: %s" % len(element.attrib))
        element.attrib.pop("meTypesetSize")

    for element in color_attrib:
        element.attrib.pop("rend")

    for element in hi_style_attrib:
        element.attrib.pop("style")

    for element in xml_preserve:
        element.attrib.pop("{http://www.w3.org/XML/1998/namespace}space")

    formulae = xml_tree.xpath("//t:formula", namespaces=NS_MAP)
    logging.info(f"Found {len(formulae)} formulae.")

    for formula in formulae:
        hi_in_formula = formula.xpath("t:hi", namespaces=NS_MAP)
        for hi in hi_in_formula:
            logging.debug(f"Found something in the formula: {etree.tostring(hi)}")
            # @rend may be absent, e.g. because the colour clean-up above
            # has already removed it, so it must not be accessed by index.
            rend = hi.get("rend")
            if rend == "italic":
                hi.attrib.pop("rend")
            elif rend is not None:
                logging.warning(f"Found another rend attribute: {rend}")
            hi.tag = "tagtobestripped"

    empty_rend = xml_tree.xpath("//t:hi[(@rend='')]", namespaces=NS_MAP)
    logging.info(f"Found {len(empty_rend)} empty rend attributes.")

    for element in empty_rend:
        element.attrib.pop("rend")

    # <hi> elements that carry no attribute any more have no function
    for hi in xml_tree.findall("//t:hi", namespaces=NS_MAP):
        if len(hi.attrib) == 0:
            hi.tag = "tagtobestripped"

    for seg in xml_tree.findall("//t:seg", namespaces=NS_MAP):
        seg.tag = "tagtobestripped"

    footnotes = xml_tree.xpath("//t:note[@place='foot']", namespaces=NS_MAP)
    for footnote in footnotes:
        footnote.set("place", "bottom")

    paragraphs = xml_tree.xpath("//t:p[contains(@rend, 'Text') or contains(@rend, 'Footnote')]", namespaces=NS_MAP)
    for paragraph in paragraphs:
        paragraph.attrib.pop("rend")

    wrongblockquotes = xml_tree.xpath("//t:p[contains(@rend, 'Quotation')]", namespaces=NS_MAP)
    for blockquote in wrongblockquotes:
        # mark the quotation paragraph itself, not the last paragraph of
        # the previous loop
        blockquote.set("rend", "quote")

    etree.strip_tags(xml_tree, "tagtobestripped")

    return xml_tree
# def cleanup_xml ends here

def fix_document_structure(xml_tree, highest_level):
    """Label nested <div> elements with their structural type.

    The divs directly below <body> receive the type named by
    highest_level ("chapter" or "part"); each further nesting level gets
    the next smaller unit, down to "subsubsection". Any other value of
    highest_level leaves the tree untouched. The tree is modified in
    place.
    """

    # Unsure here, but maybe have a rule that one file is one chapter,
    # so the highest level would be sections

    # div types from the outermost to the innermost nesting level
    hierarchies = {
        "chapter": ("chapter", "section", "subsection", "subsubsection"),
        "part": ("part", "chapter", "section", "subsection", "subsubsection"),
    }

    for depth, div_type in enumerate(hierarchies.get(highest_level, ()), start=1):
        # depth 1 -> //t:body/t:div, depth 2 -> //t:body/t:div/t:div, ...
        query = "//t:body" + "/t:div" * depth
        for div in xml_tree.xpath(query, namespaces=NS_MAP):
            div.set("type", div_type)
# def fix_document_structure ends here

def fix_tei_header(xml_tree, bibfile_string, bibtype, organisation):
    """Populate TEI header with mandatory data

    xml_tree is the header element to be completed (main passes the
    teiHeader element). bibfile_string and bibtype are written into the
    reference to the bibliography database. organisation selects the
    publisher specific boilerplate: "mprl" or anything else, which is
    treated as eos.

    Placeholder values are inserted for everything that has to be edited
    by hand later; fixed texts are read from the "Header" section of
    BOILERPLATES. Elements that are to be removed are renamed to
    "tagtobestripped"; they are not stripped here, the caller does that.

    The header is modified in place and also returned.

    Several statements below have the form
    ``name = etree.SubElement(...).text = "value"``. This creates the
    element, sets its text, and binds ``name`` to the string, not to the
    element. Those names are not used afterwards.
    """

    # --- title statement: title, editor placeholder, responsibility ---
    title_statement = xml_tree.xpath("//t:titleStmt", namespaces=NS_MAP)[0]
    title_element = title_statement.find("t:title", namespaces=NS_MAP)

    title_element.set("type", "main")
    title_element.set("level", "m")
    if title_element.text is None:
        title_element.text = "Insert title of publication here"
    else:
        pass

    # the author delivered by the conversion is turned into a placeholder
    # editor entry pointing to the respStmt created below
    author_element = title_statement.find("t:author", namespaces=NS_MAP)
    author_element.tag = "editor"
    author_element.set("n", "1")
    author_element.set("ref", "#nn")
    author_element.set("role", "volumeeditor")
    author_element.set("key", "eoa_00")
    author_element.text = "N.N."

    example_resp = etree.Element("respStmt")
    example_resp.attrib["{http://www.w3.org/XML/1998/namespace}id"] = "nn"
    respresp = etree.SubElement(example_resp, "resp")
    respresp.text = "chapter author"
    respname = etree.SubElement(example_resp, "persName")
    surname = etree.SubElement(respname, "surname").text = "N"
    forename = etree.SubElement(respname, "forename").text = "N"
    author_element.addnext(example_resp)

    # --- edition statement: replace the date by a generated sentence ---
    edition = xml_tree.xpath("//t:editionStmt/t:edition", namespaces=NS_MAP)[0]
    edition_date = edition.find("t:date", namespaces=NS_MAP)
    edition_date.clear()
    edition_date.tag = "tagtobestripped"
    # projecting release date half a year from begin of production
    publication_date = datetime.datetime.now() + datetime.timedelta(days=180)
    edition.text = "First published {} by {}".format(publication_date.strftime("%Y"), BOILERPLATES.get("Header","eoa_name"))

    # --- publication statement ---
    publication_statement = xml_tree.xpath("//t:publicationStmt", namespaces=NS_MAP)[0]
    # drop the paragraph only if it holds the converter's "unknown" text
    unknown_paragraph = publication_statement.find("t:p", namespaces=NS_MAP)
    if unknown_paragraph.text == "unknown":
        unknown_paragraph.clear()
        unknown_paragraph.tag = "tagtobestripped"

    # extent (pages and price, both with quantity 0 as placeholder) is
    # placed in front of the publication statement
    extent_element = etree.Element("extent")
    pages = etree.SubElement(extent_element, "measure", commodity="pages", quantity="0")
    price = etree.SubElement(extent_element, "measure", type="price", unit="EUR", quantity="0")

    publication_statement.addprevious(extent_element)

    # publisher: the overall organisation plus the publishing press
    publisher_element = etree.SubElement(publication_statement, "publisher")
    overall_org = etree.SubElement(publisher_element, "orgName", n="EOA", ref=BOILERPLATES.get("Header","eoa_url"))
    overall_org.text = BOILERPLATES.get("Header","eoa_name")
    if organisation == "mprl":
        publishing_org = etree.SubElement(publisher_element, "orgName", n="Press", ref=BOILERPLATES.get("Header","mprl_url"))
        publishing_org.text = BOILERPLATES.get("Header","mprl_name")
    else:
        publishing_org = etree.SubElement(publisher_element, "orgName", n="Press", ref=BOILERPLATES.get("Header","eos_url"))
        publishing_org.text = BOILERPLATES.get("Header","eos_name")


    # projected publication date and licence
    pub_date = etree.SubElement(publication_statement, "date", when=publication_date.strftime("%Y-%m-%d"))
    availability = etree.SubElement(publication_statement, "availability")
    licence = etree.SubElement(availability, "licence", target=BOILERPLATES.get("Header","licence_url"))
    licence.text = BOILERPLATES.get("Header","licence_text")

    # series statement
    series_stmt = etree.Element("seriesStmt")
    # from here on title_element refers to the series title, no longer to
    # the publication title handled above
    title_element = etree.SubElement(series_stmt, "title")
    resp_stmt = etree.SubElement(series_stmt, "respStmt")
    resp_title = etree.SubElement(resp_stmt, "resp").text = "Series Editors"
    resp_names = etree.SubElement(resp_stmt, "name", type="serieseditors")
    resp_names.text = BOILERPLATES.get("Header","eoa_series_editors")
    series_number = etree.SubElement(series_stmt, "idno", type="number").text = "number"
    if organisation == "mprl":
        title_element.text = "Series_title"
    else:
        title_element.text = "Sources"

    publication_statement.addnext(series_stmt)

    # --- source description: suggested citation and bibliography file ---
    source_desc = xml_tree.xpath("//t:sourceDesc", namespaces=NS_MAP)[0]
    suggested_citation = etree.SubElement(source_desc, "ab", type="suggestedcitation").text = "Suggested Citation"
    bibfile = etree.SubElement(source_desc, "ab", type="bibdatabase")
    etree.SubElement(bibfile, "ref", type=bibtype, target=bibfile_string)

    # profile description
    profile_desc = etree.Element("profileDesc")
    brief_abstract = etree.SubElement(profile_desc, "abstract", n="brief")
    brief_abstract_p = etree.SubElement(brief_abstract, "p").text = "Short abstract"
    detailed_abstract = etree.SubElement(profile_desc, "abstract", n="detailed")
    detailed_abstract_p = etree.SubElement(detailed_abstract, "p").text = "Long abstract"
    additional_text = etree.SubElement(profile_desc, "abstract", n="additional")
    additional_text_p = etree.SubElement(additional_text, "p").text = "Additional text"
    textclass = etree.SubElement(profile_desc, "textClass")
    keywords = etree.SubElement(textclass, "keywords")
    list_keywords = etree.SubElement(keywords, "list")
    keyword_item = etree.SubElement(list_keywords, "item").text = BOILERPLATES.get("Header","eoa_name")
    langusage = etree.SubElement(profile_desc, "langUsage")
    language = etree.SubElement(langusage, "language", ident="principal_language")
    # inserted as second child of the header
    xml_tree.insert(1, profile_desc)

    # --- encoding description: project description and application info ---
    encoding_desc = xml_tree.xpath("//t:encodingDesc", namespaces=NS_MAP)[0]

    project_desc = etree.Element("projectDesc")
    eoainfo_p1 = etree.SubElement(project_desc, "p", n="eoainfo").text = BOILERPLATES.get("Header","eoainfo_p1")
    eoainfo_p2 = etree.SubElement(project_desc, "p", n="eoainfo").text = BOILERPLATES.get("Header","eoainfo_p2")
    pressinformation = etree.SubElement(project_desc, "p", n="pressinformation")
    scientificboard = etree.SubElement(project_desc, "p", n="scientificboard")
    if organisation == "mprl":
        pressinformation.text = BOILERPLATES.get("Header","pressinformation_mprl")
        scientificboard.text = BOILERPLATES.get("Header","scientificboard_mprl")
    else:
        pressinformation.text = BOILERPLATES.get("Header","pressinformation_eos")
        scientificboard.text = BOILERPLATES.get("Header","scientificboard_eos")

    eoadevteam = etree.SubElement(project_desc, "p", n="eoadevteam").text = BOILERPLATES.get("Header","eoadevteam")
    encoding_desc.insert(0, project_desc)

    # register this program as first entry of the application list; the
    # entry itself is built by libeoaconvert.write_appinfo
    appinfo = xml_tree.xpath("//t:encodingDesc/t:appInfo", namespaces=NS_MAP)[0]
    fix_tei_info = libeoaconvert.write_appinfo("fix_tei", __version__, "fixtei", "Fix TEI for EOA", datetime.datetime.now())
    appinfo.insert(0, fix_tei_info)

    # --- revision description: discard older changes, record this run ---
    revision_desc = xml_tree.xpath("//t:revisionDesc", namespaces=NS_MAP)[0]
    olderchanges = revision_desc.find("t:listChange", namespaces=NS_MAP)
    olderchanges.clear()
    olderchanges.tag = "tagtobestripped"

    first_change = etree.SubElement(revision_desc, "change", when=datetime.datetime.now().strftime("%Y-%m-%d"), who="#fixtei")
    first_change.text = "Fixed TEI created by docxtotei"

    return xml_tree
# def fix_tei_header ends here

def add_tei_frontpart():
    """Build a minimal <front> element with placeholder content.

    The element holds a cover figure (image "images/Cover.jpg" plus a
    caption) and a dedication division. It is returned without being
    attached to any document.
    """

    front = etree.Element("front")

    # cover figure: graphic first, then its caption
    cover = etree.SubElement(front, "figure", type="cover")
    etree.SubElement(cover, "graphic", url="images/Cover.jpg")
    cover_head = etree.SubElement(cover, "head")
    cover_head.text = "Cover caption"

    # dedication
    dedication_div = etree.SubElement(front, "div", type="dedication")
    dedication_block = etree.SubElement(dedication_div, "ab")
    dedication_block.text = "Dedication text"

    return front
# def add_tei_frontpart ends here

def evaluate_report(report, printlog, filename):
    """Compile the conversion report and print or store it.

    report is a dictionary with the lists "bad_figures",
    "citekeys_not_in_bib" and "bad_pageref"; "len_citekeys" is needed
    when citekeys are missing. For each list the report either names the
    problematic items or states that everything went fine.

    With printlog set the report goes to stdout. Otherwise it is written
    to the name obtained by replacing ".xml" with ".log" in filename.
    """

    indent = ' '*4
    chunks = ["="*60 + "\n",
              indent + "Conversion report\n",
              "-"*60 + "\n"]

    # figures
    bad_figures = report["bad_figures"]
    if len(bad_figures) > 0:
        chunks.append("{} {} could not be linked to a file in the image directory:\n".format(len(bad_figures), libeoaconvert.plural(len(bad_figures), "figure")))
        for item in bad_figures:
            chunks.append(indent + item + "\n")
    else:
        chunks.append("All figures were linked.\n")

    # citations
    missing_citekeys = report["citekeys_not_in_bib"]
    if len(missing_citekeys) > 0:
        chunks.append("{} of {} {} could not be found in the bibliography database:\n".format(len(missing_citekeys), report["len_citekeys"], libeoaconvert.plural(len(missing_citekeys), "citation")))
        for item in missing_citekeys:
            chunks.append(indent + item + "\n")
        chunks.append("\nThe missing citations were also stored in the pickle file and can be re-used by the create_tmpbib tool.\n")
    else:
        chunks.append("All citekeys were found in the bibliography database.\n")

    # page references
    bad_pagerefs = report["bad_pageref"]
    if len(bad_pagerefs) > 0:
        chunks.append("{} page {} could not be parsed into start and end value:\n".format(len(bad_pagerefs), libeoaconvert.plural(len(bad_pagerefs), "reference")))
        for item in bad_pagerefs:
            chunks.append(indent + item + "\n")
    else:
        chunks.append("All page references could be parsed into discrete values.\n")

    chunks.append("="*60 + "\n")
    report_string = "".join(chunks)

    if printlog:
        print(report_string)
        return

    with open(filename.replace(".xml", ".log"), "w") as logfile:
        logfile.write(report_string)
    return
# def evaluate_report ends here


def pickle_data(citekeys_not_in_bib, used_citekeys, picklefile):
    """Write the data gathered during conversion to a pickle file.

    The pickled dictionary holds the two citekey lists passed in and, for
    every other key, an empty dictionary. It is written to the path
    picklefile using the highest pickle protocol.
    """

    # keys whose value is an empty dictionary at this stage
    empty_registries = ('chapterdict',
                        'eqdict',
                        'listdict',
                        'theoremdict',
                        'figdict',
                        'secdict',
                        'fndict',
                        'tabdict',
                        'pagelabeldict')

    data_to_pickle = {'citekey_not_in_bib' : citekeys_not_in_bib,
                      'citekeys' : used_citekeys}
    for registry in empty_registries:
        data_to_pickle[registry] = {}

    with open(picklefile, 'wb') as pickle_handle:
        pickle.dump(data_to_pickle, pickle_handle, pickle.HIGHEST_PROTOCOL)

    logging.info(f"Wrote {picklefile}.")
# def pickle_data ends here


def main():
    """Run the complete TEI fixing pipeline for one file.

    Steps, controlled by the command line arguments:

    1. parse the TEI file and the bibliography database
    2. clean the tree (cleanup_xml) and write it to OUTPUT_DIR
    3. on the serialised string, convert citation, figure and math
       shorthands; write the result to OUTPUT_DIR and parse it again
    4. unless --finalize is given: build figure elements, split cited
       ranges, and either extract the body as a single chapter
       (--chapter) or complete the header and add a front part
    5. pickle the citation data, label the div hierarchy and write the
       result next to the input as "<name>-out.xml"
    6. unless --finalize is given: print or store the conversion report

    The process ends with exit status 1 if the highest level is invalid
    or if the modified document is not well-formed.
    """

    parser = argparse.ArgumentParser()
    parser.add_argument("-d", "--dochighestorder", default='chapter', help="Specify which divider is at the highest level, possible values: part, chapter. Default is chapter.")
    parser.add_argument("-p", "--bibtexparserlog", help="Display logging output of bibtexparser", action="store_true")
    parser.add_argument("-f", "--finalize", help="Finalize a publication.", action="store_true")
    parser.add_argument("-t", "--citedrangetext", help="Do not try to parse cited range values.", action="store_true")
    parser.add_argument("-b", "--bibtype", help="Specify the type of bibliography, possible values: anthology, monograph.", default="monograph")
    parser.add_argument("-c", "--chapter", help="Treat the TEI as one chapter, discards header.", action="store_true")
    parser.add_argument("-a", "--addbibliography", help="Add a section with bibliography PI.", action="store_true")
    parser.add_argument("-l", "--printlog", help="Write logfile to stdout instead of writing to file.", action="store_true")
    parser.add_argument("-o", "--organisation", help="Specify which to publisher this publication belongs.", choices=["eos", "mprl"], default="mprl")
    parser.add_argument("teifile", help="Output from oxgarage/metypeset, an TEI XML file.")
    parser.add_argument("bibfile", help="The bibliography database of the publication.")
    parser.add_argument("figdir", help="The directory that contains the figures belonging to the publication.")
    args = parser.parse_args()

    highest_level = args.dochighestorder
    if highest_level not in ["chapter", "part"]:
        sys.stderr.write("Specify either 'chapter' or 'part' as highest level. Exiting")
        # non-zero status so that callers can detect the failure
        sys.exit(1)

    if not os.path.exists(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)
    if not os.path.exists(PICKLE_DIR):
        os.makedirs(PICKLE_DIR)

    with open(args.teifile, 'r') as xmlfile:
        xml_tree = etree.parse(xmlfile)

    report = {}

    ################
    # bibliography #
    ################

    if not args.bibtexparserlog:
        logging.debug("Suppressing debug output of bibtexparser")
        logging.getLogger("bibtexparser").propagate = False

    # bibtexparser
    bibdata = parse_bibtex(args.bibfile)

    xml_cleaned = cleanup_xml(xml_tree)
    cleaned_path = OUTPUT_DIR + os.path.sep + args.teifile.replace(".xml", "-cleaned.xml")
    xml_cleaned.write(cleaned_path, pretty_print=True, xml_declaration=True, encoding="utf-8")
    logging.info("Wrote %s." % cleaned_path)

    # first some modifications on a string object
    xml_string = etree.tostring(xml_cleaned).decode('utf-8')

    # the '#' sign is a bad choice!
    # mod_string = convert_references(xml_string)

    mod_string2, cited = convert_citations(xml_string, args.citedrangetext)

    # citekeys come from serialised XML and may contain character
    # references, the bibliography keys do not
    used_citekeys = [unescape(c) for c in cited]
    citekeys_not_in_bib = validate_citations(used_citekeys, bibdata)

    report["len_citekeys"] = len(used_citekeys)
    report["citekeys_not_in_bib"] = citekeys_not_in_bib

    mod_string3 = convert_figures(mod_string2)

    math_string = convert_math(mod_string3)

    # the modified string is always dumped, which helps to locate the
    # problem if it turns out not to be well-formed
    debug_output = OUTPUT_DIR + os.path.sep + args.teifile.replace(".xml", "-modified.xml")
    with open(debug_output, "w") as debugfile:
        debugfile.write(math_string)
    logging.info("Wrote %s." % debug_output)

    # check for wellformedness, read again as xml
    try:
        xml_tree2 = etree.fromstring(math_string)
    except etree.XMLSyntaxError:
        logging.error("\nXML syntax error when trying to parse modified tree. Dumped it to %s." % debug_output)
        print("-"*60)
        traceback.print_exc(file=sys.stdout)
        print("-"*60)
        # sys.exit instead of the interactive helper exit(), and with a
        # non-zero status
        sys.exit(1)

    if not args.finalize:
        all_figures = xml_tree2.xpath("//t:body//t:graphic", namespaces=NS_MAP)
        bad_figures = make_figure_elements(all_figures, args.figdir)

        report["bad_figures"] = bad_figures

    # convert_citations writes <citedRange> as a sibling of <ref> inside
    # <bibl>, so the citedRange has to be looked for directly below bibl.
    references_with_citedrange = xml_tree2.xpath("//t:bibl[t:citedRange]", namespaces=NS_MAP)
    if not args.finalize:
        bad_pageref = parse_cited_range(references_with_citedrange)
        report["bad_pageref"] = bad_pageref

        if args.chapter:
            # the body becomes the chapter div, everything else (including
            # the header) is dropped from the output
            body_element = xml_tree2.xpath("//t:body", namespaces=NS_MAP)[0]
            body_element.tag = "div"
            body_element.set("type", "chapter")

            etree.strip_tags(xml_tree2, "tagtobestripped")

            if args.addbibliography:
                logging.debug("Adding bib")
                insert_bib_section(body_element)
            else:
                logging.debug("Not adding bib")

            output = args.teifile.replace(".xml", "-out.xml")
            tree = etree.ElementTree(body_element)
            tree.write(output, pretty_print=True, xml_declaration=True,encoding="utf-8")
            logging.info("Wrote %s." % output)

        else:
            tei_header = xml_tree2.xpath("//t:teiHeader", namespaces=NS_MAP)
            fix_tei_header(tei_header[0], str(args.bibfile), str(args.bibtype), args.organisation)

            tei_text = xml_tree2.xpath("/t:TEI/t:text", namespaces=NS_MAP)[0]
            # this xml:id is for xmlmind functionality
            libeoaconvert.assign_xml_id(tei_text, "text")
            tei_front_part = add_tei_frontpart()
            tei_text.insert(0, tei_front_part)

            tei_root = xml_tree2.xpath("/t:TEI", namespaces=NS_MAP)[0]
            tei_root.set("change", "metopes_publication#eoa")

            etree.strip_tags(xml_tree2, "tagtobestripped")

    picklefile = "output/pickle_from_tei/data.pickle"
    pickle_data(citekeys_not_in_bib, used_citekeys, picklefile)

    fix_document_structure(xml_tree2, highest_level)
    # output (in chapter mode the file has already been written above)
    if not args.chapter:
        output = args.teifile.replace(".xml", "-out.xml")
        tree = etree.ElementTree(xml_tree2)
        tree.write(output, pretty_print=True, xml_declaration=True,encoding="utf-8")
        logging.info("Wrote %s." % output)

    if not args.finalize:
        evaluate_report(report, args.printlog, args.teifile)
# def main ends here

# Run the conversion only when the file is executed as a script, not
# when it is imported as a module.
if __name__ == '__main__':
    main()
# finis