src/utils/libeoaconvert.py

#!/usr/bin/env python3
# -*- coding: utf-8; mode: python -*-

"""A collection of functions for the different conversion steps"""

from utils.load_config import exec_command, ToLog, ToFile

import os
import sys
import subprocess
import shlex
import logging
import configparser
from datetime import datetime
from lxml import etree
from lxml.html import soupparser
from pathlib import Path


# Directory that contains this module (symlinks resolved).
BASE_DIR = Path(os.path.realpath(__file__)).parent

# Module-level global: the list-style types of the new-style footnotes that
# use LaTeX bigfoot, in the order in which they show up.
footnote_groups = [
    "decimal",
    "lower-latin",
]

#########################
# Bibliography settings #
#########################
allowed_bibentry_types = [
    "book",
    "booklet",
    "report",
    "thesis",
    "misc",
    "incollection",
    "inproceedings",
    "article",
    "newspaper",
]

def enable_preamble(
        input_file,
        output_file,
        pdf_or_xml
):
    """Copy a TeX file and put the input line for a preamble in front of it.

    The first line written to output_file pulls in ``preambel/pre_eoa``
    when pdf_or_xml is "pdf"; every other value selects
    ``preambel/pre_xml``. The unchanged content of input_file follows.

    Args:
        input_file: path of the TeX file to read.
        output_file: path of the file to create (overwritten if present).
        pdf_or_xml: "pdf" for the PDF preamble, anything else for XML.
    """
    logging.debug(f"Enabling preamble {pdf_or_xml}. Input file is {input_file}, outputting to {output_file}")

    # Raw strings on purpose: "\i" is not a valid escape sequence, so the
    # plain literal triggers an invalid-escape warning on current Pythons.
    if pdf_or_xml == "pdf":
        preamble_line = r"\input{preambel/pre_eoa}" + "\n"
    else:
        preamble_line = r"\input{preambel/pre_xml}" + "\n"

    with open(input_file, "r") as source, open(output_file, "w") as target:
        target.write(preamble_line)
        target.write(source.read())

def get_bigfoot_data(chapter):
    """Group the bigfoot footnotes of one chapter by their list style.

    Footnotes are per-chapter and their numbers reset for each chapter.
    All ``EOAbigfoot`` descendants of chapter are collected and then
    sorted into the groups named in the module-level footnote_groups.

    Returns an association list: one ``(group, notes)`` tuple per entry
    of footnote_groups, in that order, where notes holds the footnotes
    whose ``list-style-type`` attribute equals the group, in the order
    in which they appear in the chapter.
    """
    bigfoot_notes = list(chapter.findall(".//EOAbigfoot"))

    grouped = []
    for style in footnote_groups:  # the types we support
        matching = []
        for candidate in bigfoot_notes:
            if style == candidate.get("list-style-type"):
                matching.append(candidate)
        grouped.append((style, matching))

    return grouped
# def get_bigfoot_data ends here

def sanitizeImage(
        strImagepath,
        tmp_dir,
        GM_PATH,
        PDFCROP_EXEC,
        # TL_PATH
):
    """Shrink an image for EPUB use and turn PDF images into PNG files.

    The image file is modified in place by external commands:

    1. Width and height are queried with ``GM_PATH identify``; an image
       wider than 700 or taller than 1000 is resized with
       ``GM_PATH convert`` (colorspace RGB), overwriting the file.
    2. The format reported by ``identify`` decides what happens next:
       JPEG and PNG are left alone; a PDF is cropped with PDFCROP_EXEC,
       converted to PNG at density 400, and both the cropped file and
       the original PDF are deleted; any other format logs an error and
       terminates the program with exit status 1.

    Args:
        strImagepath: path of the image file.
        tmp_dir: directory in which a ``tmp_images`` folder is created
            if it does not exist yet.
        GM_PATH: command used for ``identify`` and ``convert``
            (presumably the GraphicsMagick executable - TODO confirm).
        PDFCROP_EXEC: command used to crop PDF files.

    Returns:
        The path of the resulting image: a ``Path`` for JPEG/PNG input,
        but a ``str`` ending in ".png" for PDF input.
    """

    tmp_dir = Path( tmp_dir )
    strImagepath = Path( strImagepath )
    if not (tmp_dir / "tmp_images").exists():
        os.makedirs(tmp_dir / "tmp_images/")

    # NOTE(review): not used by any active code; only the commented-out
    # ``wd`` argument further down refers to it.
    tmp_image_dir = tmp_dir / "tmp_images"

    logging.debug(strImagepath)
    # NOTE(review): the path is interpolated unquoted and then split with
    # shlex, so a path containing whitespace yields several arguments.
    intImageWidth = int(subprocess.check_output(
        shlex.split( f"{GM_PATH} identify -format \"%w\" {strImagepath}" ),
        universal_newlines=True
    ))
    if intImageWidth > 700:
        # Input and output path are identical: the file is overwritten.
        exec_command(
            f"{GM_PATH} convert {strImagepath} -colorspace RGB -resize 700x\\> {strImagepath}"
        )
    # The height is read after the width step, i.e. from the possibly
    # already resized file.
    intImageHeight = int( subprocess.check_output(
        shlex.split( f"{GM_PATH} identify -format \"%h\" {strImagepath}" ),
        universal_newlines=True
    ))
    if intImageHeight > 1000:
        exec_command(
            f"{GM_PATH} convert {strImagepath} -colorspace RGB -resize x1000\\> {strImagepath}"
        )
    # No universal_newlines here, so the output is bytes and is decoded below.
    strFileFormat_bytes =  subprocess.check_output(
        shlex.split( f"{GM_PATH} identify -format \"%m\" {strImagepath}" )
    )
    strFileFormat = strFileFormat_bytes.decode("utf-8").strip()
    logging.debug(f"Image has been recognized as having format {strFileFormat} by {GM_PATH}.")
    if strFileFormat == "JPEG":
        # Nothing to do; the former re-encoding step is disabled.
        pass
        # print("looking at jpeg file")
        # strNewImagepath = os.path.splitext(strImagepath)[0]
        # strCommand = GM_PATH + " convert " + strImagepath + " " + strNewImagepath + ".jpg"
        # listArguments = shlex.split(strCommand)
        # subprocess.call(listArguments)
        # os.remove(strImagepath)
        # strImagepath = strNewImagepath + ".jpg"
    elif strFileFormat == "PNG":
        # Nothing to do; the former re-encoding step is disabled.
        pass
        # print("looking at png file")
        # strNewImagepath = os.path.splitext(strImagepath)[0]
        # strCommand = GM_PATH + " convert " + strImagepath + " " + strNewImagepath + ".png"
        # listArguments = shlex.split(strCommand)
        # subprocess.call(listArguments)
        # os.remove(strImagepath)
        # strImagepath = strNewImagepath + ".png"
    elif strFileFormat == "PDF":
        strNewImagepath = os.path.splitext(str(strImagepath))[0]
        # NOTE(review): replace() acts on every ".pdf" in the path, not only
        # on the file extension, and leaves the name unchanged when the
        # extension is spelled differently (e.g. ".PDF").
        clipped_file = str(strImagepath).replace(".pdf", "-clipped.pdf")

        exec_command(
            f"{PDFCROP_EXEC} --margins 10 --clip --hires {strImagepath} {clipped_file}",
            # wd = tmp_image_dir
        )
        exec_command(
            f"{GM_PATH} convert -density 400 {clipped_file} {strNewImagepath}.png"
        )
        logging.debug("Removing two files: %s and %s " % (clipped_file, strImagepath))
        os.remove(clipped_file)
        os.remove(strImagepath)
        # From here on the path is a plain str, no longer a Path.
        strImagepath = strNewImagepath + ".png"
    else:
        logging.error("Image format not recognized. Exiting.")
        sys.exit( 1 )

    # print ("Here is a path to an image:")
    # print (strImagepath)

    return strImagepath
# def sanitizeImage ends here

def gettext(xmlElement):
    """Return the text of an element and all its descendants, without markup.

    The element's own text comes first; for every child its recursively
    collected text and then its tail follow, in document order. The tail
    of xmlElement itself is not included.
    """

    fragments = [xmlElement.text or ""]
    for child in xmlElement:
        fragments.append(gettext(child))
        if child.tail:
            fragments.append(child.tail)
    return "".join(fragments)
# def gettext ends here

def deb_var(obj):
    """Print the name of a module-level variable together with its value.

    The name is the first key of this module's globals() whose value is
    the very same object as obj; an IndexError is raised when there is
    none. Idea taken from
    https://stackoverflow.com/questions/592746/how-can-you-print-a-variable-name-in-python
    """
    candidates = []
    for label, value in globals().items():
        if value is obj:
            candidates.append(label)
    found_name = candidates[0]
    print("DEBUG: %s: %s" % (found_name, obj))
# def deb_var ends here

def two_letter_language(language_string):
    """Map a language name or code to its two letter code.

    English, German, French and Italian are known. Any other input
    yields None.
    """

    known_languages = (
        ("en", ["english", "en"]),
        ("de", ["german", "deutsch", "de"]),
        ("fr", ["french", "fr"]),
        ("it", ["italian", "it"]),
    )
    for code, aliases in known_languages:
        if language_string in aliases:
            return code
    return None
# two_letter_language ends here

def plural(num, noun, plural=None):
    """Build a counted phrase such as "no cats", "1 cat" or "3 cats".

    The plural form is the noun with an appended "s" unless a different
    form is passed as the plural argument (an empty value counts as not
    given).

    """

    many = plural if plural else f"{noun}s"

    if num == 0:
        return f"no {many}"
    if num == 1:
        return f"1 {noun}"
    return f"{num} {many}"
# def plural ends here


def format_citations_tex4ht(
        used_citekeys,
        bibdata,
        language,
        tmp_filename,
        tmp_dir
):
    """Placeholder for a tex4ht based citation formatter.

    The function has no implementation: the former body only built two
    unused paths and then returned a name that was never defined, so
    every call ended in a NameError. That failure is now explicit.

    Raises:
        NotImplementedError: always.
    """

    raise NotImplementedError(
        "format_citations_tex4ht is not implemented; use format_citations instead."
    )
# def format_citations_tex4ht ends here

def format_citations(
        used_citekeys,
        bibdata,
        language,
        tmp_filename,
        csl_file,
        log_to = ToLog
):
    """Let pandoc format the used citations and return the reference list.

    A markdown file is written next to tmp_filename (suffix ".md"). It
    holds one "[@key]" line per cite key under "# citeauthoryear", one
    "[-@key]" line per cite key under "# citeyear", and an empty
    "# References" section. pandoc with the pandoc-citeproc filter turns
    it into an HTML file (suffix ".html"), which is parsed again.

    Args:
        used_citekeys: the cite keys; iterated twice.
        bibdata: value for pandoc's --bibliography option.
        language: language name or code, written to the "lang" header
            after conversion by two_letter_language.
        tmp_filename: path object; only its with_suffix method is used.
        csl_file: value for pandoc's --csl option.
        log_to: handed to exec_command as output_to.

    Returns:
        The result of the XPath query for div elements with class
        "references" in the generated HTML.
    """

    markdown_path = tmp_filename.with_suffix(".md")
    html_path = tmp_filename.with_suffix(".html")

    md_file_header = "---\nlang: %s\ntitle: Citations\n...\n\n" % two_letter_language(language)

    with open(markdown_path, "w") as markdown_out:
        markdown_out.write(md_file_header)
        markdown_out.write("# citeauthoryear\n")
        markdown_out.writelines("[@%s]\n" % citekey for citekey in used_citekeys)
        markdown_out.write("\n# citeyear\n")
        markdown_out.writelines("[-@%s]\n" % citekey for citekey in used_citekeys)
        markdown_out.write("\n# References\n")

    pandoc_command = f"pandoc -o {html_path} -t html --filter=pandoc-citeproc --bibliography={bibdata} --csl={csl_file} {markdown_path}"
    exec_command(
        pandoc_command,
        output_to = log_to
    )

    with open(html_path, "r") as html_in:
        parsed_html = soupparser.fromstring(html_in, features="html.parser")

    return parsed_html.xpath("//div[@class='references']")
# def format_citations ends here

def fix_bib_entries(div_snippet):
    """Rework the bibliography markup returned by pandoc-citeproc in place.

    Every div below div_snippet gets the class "bibliography", loses its
    p tags (their content is kept) and is itself turned into a p
    element; em elements inside it become i elements.

    Returns the modified div_snippet.
    """

    # findall hands back a list, so retagging inside the loop is safe.
    for bib_entry in div_snippet.findall(".//div"):
        bib_entry.set("class", "bibliography")
        etree.strip_tags(bib_entry, "p")
        bib_entry.tag = "p"
        for emphasis in bib_entry.findall(".//em"):
            emphasis.tag = "i"

    return div_snippet
# def fix_bib_entries ends here

def debug_xml_here(
        xml_tree,
        xml_filename,
        output_dir
):
    """Write the current state of an XML tree to a file for inspection.

    The file is created in output_dir under the name xml_filename with
    the suffix ".xml". xml_tree may be an element tree or a single
    element; an element is wrapped into a tree before writing. The
    output is pretty-printed UTF-8 with an XML declaration.
    """

    dump_path = (Path(output_dir) / xml_filename).with_suffix( ".xml")

    if isinstance(xml_tree, etree._ElementTree):
        tree_to_dump = xml_tree
    else:
        tree_to_dump = etree.ElementTree(xml_tree)

    tree_to_dump.write(
        str(dump_path),
        pretty_print=True,
        xml_declaration=True,
        encoding="utf-8",
    )
    logging.info(f"Wrote XML file for debugging purposes: {dump_path}.")
# def debug_xml_here ends here

def wrap_into_element(wrapper, wrappee):
    """Wrap an existing element into a new one, in place.

    wrapper is inserted into the tree directly in front of wrappee,
    wrappee then becomes the first child of wrapper, and the text that
    followed wrappee (its tail) ends up behind wrapper instead.

    Args:
        wrapper: the new enclosing element.
        wrappee: the element to enclose; it needs a parent, because
            wrapper is added as its preceding sibling.
    """

    # Remember the tail and blank it before moving anything, so that the
    # text is not carried along into the wrapper together with wrappee.
    old_tail = wrappee.tail
    wrappee.tail = ""
    wrappee.addprevious(wrapper)
    # Inserting into wrapper moves wrappee out of its former position.
    wrapper.insert(0, wrappee)
    wrapper.tail = old_tail

    return
# def wrap_into_element ends here

def remove_wrapping_element(wrapper):
    """Replace an element by its children.

    The children of wrapper are moved, in order, to the position that
    wrapper holds in its parent; afterwards wrapper is emptied and
    removed. Text and tail of wrapper are not carried over; a warning
    is logged for each of them that is more than whitespace.
    """

    children = list(wrapper)
    parent = wrapper.getparent()

    text_checks = (
        ("Wrapping element contains text: %s", wrapper.text),
        ("Wrapping element contains has tail: %s", wrapper.tail),
    )
    for message, content in text_checks:
        if content is None:
            continue
        stripped = content.strip()
        if stripped:
            logging.warning(message, stripped)

    # Each child goes in front of the wrapper, one slot further each time.
    first_slot = parent.index(wrapper)
    for offset, child in enumerate(children):
        parent.insert(first_slot + offset, child)

    wrapper.clear()
    parent.remove(wrapper)
# def remove_wrapping_element ends here

def change_attribute_name(element, attribute, newname, add_hash=False):
    """Rename an XML attribute and keep its value.

    Nothing happens when element has no such attribute. When add_hash
    is exactly True, the value is prefixed with "#".
    """

    current_value = element.get(attribute)
    if current_value is None:
        return

    new_value = "#" + current_value if add_hash is True else current_value
    # Set first, delete second - the order of the original code.
    element.set(newname, new_value)
    del element.attrib[attribute]
# def change_attribute_name ends here

def transfer_xml_attributes(old_element_attributes, new_element):
    """Copy attributes onto another element.

    old_element_attributes is a mapping of attribute names to values;
    every pair is set on new_element, overwriting attributes of the
    same name."""

    for name in old_element_attributes:
        new_element.set(name, old_element_attributes[name])
# def transfer_xml_attributes ends here

def split_with_milestone_element(element, milestone, splitter):
    """Split the text of an element by inserting milestone tags.

    After every occurrence of splitter in the element's text an empty
    element named milestone is inserted; the splitter itself stays in
    the text. "a|b|c" split at "|" becomes ``a|<m/>b|<m/>c``.

    Only the text is touched: attributes, tail and existing children of
    the element are kept (the milestones are placed in front of the
    children, because the text precedes them). The earlier version
    emptied the element with clear() and thereby lost all three - the
    attributes too, since ``element.attrib`` is a live view that was
    already empty when it was copied back.

    An element without text, or whose text does not contain splitter,
    is left unchanged.

    Args:
        element: the element whose text is split, modified in place.
        milestone: tag name of the milestone elements.
        splitter: the separator string; must not be empty.
    """

    element_text = element.text
    if not element_text or splitter not in element_text:
        # Nothing to split. Going on would duplicate the text behind a
        # spurious milestone.
        return

    first_part, *remaining_parts = element_text.split(splitter)
    element.text = first_part + splitter

    last_index = len(remaining_parts) - 1
    for position, part in enumerate(remaining_parts):
        lb_element = etree.Element(milestone)
        # Every part but the last one was followed by a splitter.
        lb_element.tail = part if position == last_index else part + splitter
        element.insert(position, lb_element)

    return
# def split_with_milestone_element ends here

def get_place_in_xml_tree(element, tree):
    """Return the index of element among the direct children of tree.

    Useful, for instance, to insert another element right after a
    specific one. A ValueError is raised when element is not a direct
    child of tree.
    """

    return list(tree).index(element)
# def get_place_in_xml_tree ends here

def assign_xml_id(element, identifier):
    """Set the xml:id attribute of element to identifier."""

    # Attribute name in Clark notation: {namespace URI}local name.
    xml_id_attribute = "{http://www.w3.org/XML/1998/namespace}id"
    element.set(xml_id_attribute, identifier)
# def assign_xml_id ends here

def get_appinfo(ident, version, xmlid, text, date):
    """Build an application element that records a change of a TEI document.

    The element carries the attributes ident, version, when (from date)
    and xml:id (from xmlid), and has one label child containing text.

    Returns the new application element.
    """

    logging.info("Writing appinfo")
    application = etree.Element("application", ident=ident, version=version, when=date)
    application.set("{http://www.w3.org/XML/1998/namespace}id", xmlid)

    label = etree.SubElement(application, "label")
    label.text = text

    return application
# def get_appinfo ends here

def translate(term, publang, translation_file):
    """Look up the translation of a term in an XML translation file.

    The file is parsed on every call. The entry element whose name
    attribute equals term is searched, and the value of its attribute
    named publang is returned.

    If no entry can be obtained, or the entry has no (non-empty) value
    for publang, an error is logged and the program terminates with
    exit status 1.
    """

    translation_xml = etree.parse( str( translation_file ) )
    try:
        entry_attributes = translation_xml.find(f"//entry[@name='{term}']").attrib
    except Exception:
        logging.error(f"Term {term} not found in translation file. Please add it to {translation_file}. Exiting.")
        sys.exit(1)

    translated_term = entry_attributes.get(publang)
    if translated_term:
        return translated_term

    logging.error(f"Translation for term '{term}' in language with code {publang} is missing. Please add it to {translation_file}. Exiting.")
    sys.exit(1)
# def translate ends here


def restore_xml_tags(text):
    """Turn XML entities back into the characters they stand for.

    &lt; => <, &gt; => >, &apos; => ', &amp; => &

    The ampersand is handled last, so a doubly escaped entity is only
    unescaped by one level.
    """

    entity_pairs = (
        ("&lt;", "<"),
        ("&gt;", ">"),
        ("&apos;", "'"),
        ("&amp;", "&"),
    )

    for entity, character in entity_pairs:
        text = text.replace(entity, character)

    return text
# def restore_xml_tags ends here


def escape_xml(raw_text, decode=True):
    """Replace XML markup characters by entities.

    With decode set (the default), raw_text is taken as UTF-8 encoded
    bytes and decoded first; otherwise it is used as it is. The
    characters &, <, >, ' and " are replaced.
    """

    text = raw_text.decode("utf-8") if decode else raw_text

    # The ampersand comes first, so the ampersands of the entities that
    # are inserted afterwards are not escaped again.
    character_pairs = (
        ("&", "&amp;"),
        ("<", "&lt;"),
        (">", "&gt;"),
        ("'", "&apos;"),
        ('"', "&quot;"),
    )

    for character, entity in character_pairs:
        text = text.replace(character, entity)

    return text
# def escape_xml ends here


def format_hyperlinks_django_epub(xmlHyperlink, strLanguage):
    """Convert an IMXML hyperlink element into an HTML link, in place.

    The element is renamed to "a", its url attribute becomes href
    ("http://" is put in front of an address that starts with neither
    "http://" nor "https://"), allowbreak children are stripped, and
    the address is used as link text.

    If the element has a date child, the localized accessed date built
    from its when attribute is put in front of the text that follows
    the link. The date child is not removed but renamed to
    "elementtoberemoved" (presumably for a later clean-up step -
    verify against the callers).

    Args:
        xmlHyperlink: the hyperlink element; must have a url attribute.
        strLanguage: language name or code for the date format.
    """

    strURL = xmlHyperlink.get('url')
    if not strURL.startswith(("http://", "https://")):
        strURL = "http://" + strURL
    xmlHyperlink.tag = "a"
    del xmlHyperlink.attrib["url"]
    xmlHyperlink.set("href", strURL)
    etree.strip_elements(xmlHyperlink, "allowbreak", with_tail=True)
    accessed_date_element = xmlHyperlink.find("./date")
    if accessed_date_element is not None:
        accessed_date = accessed_date_element.get("when")
        formatted_date = format_date(accessed_date, two_letter_language(strLanguage))
        accessed_date_element.tag = "elementtoberemoved"
        accessed_date_element.tail = ""
        # A link that is the last thing in its parent has no tail. Fall
        # back to "" so that the text "None" does not end up in the output.
        url_tail = xmlHyperlink.tail or ""
        xmlHyperlink.tail = f", {formatted_date}{url_tail}"
    else:
        # The tail stays as it is; formatting a missing tail into a
        # string used to write the text "None" behind the link.
        logging.warning(f"Found no accessed date at url {strURL}. Proceeding without accessed date.")
    xmlHyperlink.text = strURL
    return
# def format_hyperlinks_django_epub ends here


def format_date(accessed_date, language):
    """Turn an ISO date (YYYY-MM-DD) into an "accessed on" phrase.

    English ("en") yields e.g. "accessed March 5, 2020", German ("de")
    yields e.g. "besucht am 05.03.2020". For any other language code an
    error is logged and the program terminates with exit status 1.
    """

    parsed_date = datetime.strptime(accessed_date, "%Y-%m-%d")

    if language == "en":
        return f"accessed {parsed_date:%B} {parsed_date.day}, {parsed_date:%Y}"
    if language == "de":
        return f"besucht am {parsed_date:%d}.{parsed_date:%m}.{parsed_date:%Y}"

    logging.error("Got an unrecognized language: %s. Exiting.", language)
    sys.exit(1)
# def format_date ends here


def has_text_or_children(cr):
    """Tell whether an element has (non-empty) text or any child elements."""
    return bool(cr.text) or bool(list(cr))
# def has_text_or_children ends here


def progress(count, total, status=''):
    """Draw a progress bar on the command line.

    A bar of 60 characters, the percentage of count in total and the
    status text are written to standard output, followed by a carriage
    return so that the next call overwrites the line. Taken from
    https://gist.github.com/vladignatyev/06860ec2040cb497f0f3"""

    bar_len = 60
    denominator = float(total)

    filled_len = int(round(bar_len * count / denominator))
    percents = round(100.0 * count / denominator, 1)

    done_part = '#' * filled_len
    open_part = '-' * (bar_len - filled_len)
    bar = done_part + open_part

    sys.stdout.write('[%s] %s%s ... %s\r' % (bar, percents, '%', status))
    sys.stdout.flush()
# def progress ends here


def pdf_burst(input_file, tmpDir):
    """Split a PDF file into one file per page.

    Page n (counted from 1) of the input is written to
    ``EOAformulas_<n>.pdf`` in tmpDir.

    Args:
        input_file: name of the PDF file, relative to tmpDir.
        tmpDir: directory that holds the input and receives the output;
            a Path or a string.
    """
    # NOTE(review): PdfFileReader/PdfFileWriter are the legacy PyPDF2
    # names; confirm the installed PyPDF2 version still provides them.
    from PyPDF2 import PdfFileWriter, PdfFileReader

    tmpDir = Path(tmpDir)

    # The input has to stay open while pages are copied, and is closed
    # afterwards - it used to be left open.
    with open(tmpDir / input_file, "rb") as input_stream:
        input1 = PdfFileReader(input_stream)
        page_count = input1.getNumPages()
        logging.debug("Input is %s and has %d pages." % (input_file, page_count))

        for pageno in range(page_count):
            output = PdfFileWriter()
            output.addPage(input1.getPage(pageno))

            output_filename = tmpDir / ("EOAformulas_%d.pdf" % (pageno + 1))
            # Closed even if writing fails.
            with open(output_filename, 'wb') as output_stream:
                output.write(output_stream)
            logging.debug("Wrote %s." % output_filename)
# def pdf_burst ends here