libeoaconvert.py

#!/usr/bin/env python3
# -*- coding: utf-8; mode: python -*-

"""A collection of functions for the different conversion steps"""

import os
import sys
import subprocess
import shlex
import logging
import configparser
from lxml import etree
from lxml.html import soupparser

##################################
# Reading the configuration file #
##################################
# Path of config/eoaconvert.cfg, resolved relative to the directory that
# contains this module (via realpath of __file__), not relative to the
# current working directory.
CONFIG_FILE = os.path.dirname(os.path.realpath(__file__)) + os.path.sep + "config" + os.path.sep + "eoaconvert.cfg"
# CONFIG_FILE = os.path.abspath(os.path.dirname(sys.argv[0])) + "/config/
CONFIG = configparser.ConfigParser()
# NOTE: ConfigParser.read() silently skips files it cannot open, so a missing
# configuration file only shows up as a KeyError in the lookups below.
CONFIG.read(CONFIG_FILE)

######################
# Setting up logging #
######################
# LOGFILE is read here but is not passed to logging.basicConfig() below.
LOGFILE = CONFIG['General']['logfile']
LOGLEVEL = CONFIG['General']['loglevel']

# Auxiliary files: CSL_FILE is handed to pandoc in format_citations(),
# TRANSLATION_FILE is parsed further down to fill the localization mappings.
CSL_FILE = CONFIG['Auxiliaries']['CSL_FILE']
TRANSLATION_FILE = CONFIG['Auxiliaries']['TRANSLATIONS']

# Only level and message format are configured; no filename or handler is
# given, so the default basicConfig() destination is used.
logging.basicConfig(level=LOGLEVEL, format='%(asctime)s - %(levelname)s - %(message)s')

# Scratch directory for images. The path is relative, so it is created below
# whatever the current working directory is at import time.
if not os.path.exists("tmp_files/tmp_images/"):
    os.makedirs(os.path.expanduser("tmp_files/tmp_images/"))

# Absolute form of the scratch directory; used as cwd for the pdfcrop call in
# sanitizeImage().
tmp_image_dir = os.getcwd() + "/tmp_files/tmp_images/"
# Log file opened (and truncated) at import time; sanitizeImage() passes it as
# stdout to pdfcrop. It is not closed anywhere in the visible part of the file.
Datei = open('tmp_files/xelatex-run-images.log', 'w')

# Setup of various dictionaries for localization of various elements
# (former hard-coded versions, kept for reference):
# dictLangFootnotes = {"it" : "Note a piè pagina", "fr" : "notes en bas de page", "de" : "Fußnoten", "en" : "Footnotes"}
# dict_and = {"en" : "and", "de" : "und", "fr" : "et", "it" : "e"}
# dict_ed = {"en" : "ed.", "de" : "Hrsg."}
# dict_eds = {"en" : "eds.", "de" : "Hrsg."}

# use the translation file that is used also for XSL
# Each mapping below is the attribute mapping of the matching <entry> element;
# presumably keyed by language code like the dictionaries above -- verify
# against the translation file. find() returns None when no entry matches, in
# which case the .attrib access raises AttributeError at import time.
translation_xml = etree.parse(TRANSLATION_FILE)
dictLangFootnotes = translation_xml.find("//entry[@name='footnotes']").attrib
dict_and = translation_xml.find("//entry[@name='and']").attrib
dict_ed = translation_xml.find("//entry[@name='editor-abbr']").attrib
dict_eds = translation_xml.find("//entry[@name='editors-abbr']").attrib

# the new-style footnotes that use LaTeX bigfoot show up in the following order:
# (get_bigfoot_data() compares these values with the "list-style-type"
# attribute of EOAbigfoot elements)
footnote_groups = ["decimal", "lower-latin"]

#########################
# Bibliography settings #
#########################
# Bibliography entry types; not referenced in the visible part of this module.
allowed_bibentry_types = ["book", "booklet", "report", "thesis", "misc", "incollection", "inproceedings", "article", "newspaper"]

def get_bigfoot_data(chapter):
    """Collect the new-style (LaTeX bigfoot) footnotes of one chapter.

    Footnotes are per-chapter and their numbering restarts in every chapter,
    so this helper works on a single chapter element.

    The result is an association list: one ``(style, notes)`` tuple for each
    entry of the module-level ``footnote_groups``, in that order. ``notes``
    holds the ``EOAbigfoot`` descendants of ``chapter`` whose
    ``list-style-type`` attribute equals ``style``, in the order in which
    they appear in the chapter. A style without footnotes yields an empty
    list.
    """
    bigfoot_notes = list(chapter.findall(".//EOAbigfoot"))

    grouped_notes = []
    for style in footnote_groups:
        # keep only the notes of the current style, preserving chapter order
        notes_of_style = [
            note for note in bigfoot_notes
            if style == note.get("list-style-type")
        ]
        grouped_notes.append((style, notes_of_style))
    return grouped_notes
# def get_bigfoot_data ends here

def sanitizeImage(strImagepath, GM_PATH, TL_PATH):
    """Adjust and convert image for epub standard.

    The image is inspected with ``identify`` and, where necessary, rewritten
    in place with ``convert`` (both subcommands of the program given by
    ``GM_PATH``):

    * if the reported width exceeds 700, it is resized with geometry ``700x>``
    * if the reported height then exceeds 1000, it is resized with ``x1000>``
    * if the reported format is ``PDF``, the file is cropped with pdfcrop,
      rendered to a PNG next to the original (density 400), and both the
      cropped PDF and the original PDF are deleted

    Images reported as any other format (e.g. JPEG, PNG) are not converted.

    Parameters:
        strImagepath: path of the image file. It is handed to the external
            programs as one single argument, so it may contain spaces.
        GM_PATH: command used to run the image tool; split shell-style, so it
            may carry an executable plus leading arguments.
        TL_PATH: prefix that is concatenated directly with
            ``texmf-dist/scripts/pdfcrop/pdfcrop.pl`` (so it has to end with
            a path separator).

    Returns:
        The path of the resulting image: ``strImagepath`` itself, or the path
        of the newly created ``.png`` file when the input was a PDF.

    Raises:
        subprocess.CalledProcessError: if an ``identify`` call or one of the
            resize calls exits with a non-zero status.
        ValueError: if the reported width or height is not an integer.
        OSError: if one of the PDF files cannot be removed after conversion.
    """

    logging.debug(strImagepath)

    # Only the configured command prefix is split shell-style. File names are
    # appended as separate list items so that whitespace, quotes or
    # backslashes in a path are not reinterpreted.
    gm_command = shlex.split(GM_PATH)

    def identify(format_spec):
        """Return what ``identify -format <format_spec>`` prints for the image."""
        return subprocess.check_output(
            gm_command + ["identify", "-format", format_spec, strImagepath],
            shell=False,
            universal_newlines=True,
        )

    def shrink(geometry):
        """Resize the image in place using the given geometry."""
        subprocess.check_output(
            gm_command + ["convert", strImagepath, "-resize", geometry, strImagepath],
            shell=False,
        )

    # The height is queried only after a possible width reduction, because
    # that resize changes the height as well.
    if int(identify("%w")) > 700:
        shrink("700x>")
    if int(identify("%h")) > 1000:
        shrink("x1000>")

    strFileFormat = identify("%m").strip()
    if strFileFormat != "PDF":
        # JPEG, PNG and everything else is used as it is
        return strImagepath

    # Derive the helper file names from the real extension; a plain
    # str.replace(".pdf", ...) would miss ".PDF" and would also touch
    # directory names that contain ".pdf".
    strNewImagepath, extension = os.path.splitext(strImagepath)
    clipped_file = strNewImagepath + "-clipped" + extension
    png_file = strNewImagepath + ".png"

    pdfcrop_command = shlex.split(TL_PATH + "texmf-dist/scripts/pdfcrop/pdfcrop.pl")
    pdfcrop_command += ["--margins", "10", "--clip", "--hires", strImagepath, clipped_file]
    logging.debug(pdfcrop_command)

    # NOTE(review): pdfcrop runs with cwd=tmp_image_dir while the other
    # commands run in the current directory, so a relative strImagepath would
    # be resolved differently here -- presumably callers pass absolute paths.
    # The exit status of the next two calls is not checked (as before).
    subprocess.call(pdfcrop_command, cwd=tmp_image_dir, stdout=Datei)
    subprocess.call(gm_command + ["convert", "-density", "400", clipped_file, png_file])

    logging.debug("Removing two files: %s and %s " % (clipped_file, strImagepath))
    os.remove(clipped_file)
    os.remove(strImagepath)

    return png_file
# def sanitizeImage ends here

def gettext(xmlElement):
    """Return the plain text of an element, including all nested elements.

    The element's own text comes first, followed for every child (in
    document order) by the child's text content and the child's tail. The
    tail of ``xmlElement`` itself is not included. Missing text or tail
    counts as an empty string.
    """

    pieces = [xmlElement.text or ""]
    for child in xmlElement:
        pieces.append(gettext(child))
        pieces.append(child.tail or "")
    return "".join(pieces)
# def gettext ends here

def deb_var(obj):
    """Print the name and value of a module-level variable for debugging.

    Looks through this module's globals for names bound to ``obj`` (compared
    by identity) and prints the first one together with the value. Raises
    IndexError when no global is bound to ``obj``.

    Idea taken from
    https://stackoverflow.com/questions/592746/how-can-you-print-a-variable-name-in-python
    """
    matching_names = []
    for candidate, value in globals().items():
        if value is obj:
            matching_names.append(candidate)

    name = matching_names[0]
    print("DEBUG: %s: %s" % (name, obj))
# def deb_var ends here

def two_letter_language(language_string):
    """Return a two letter code for a language.

    Accepts the names and codes listed in the table below (exact match) and
    returns the corresponding code. Any other value yields None.
    """

    known_languages = (
        ("en", ["english", "en"]),
        ("de", ["german", "deutsch", "de"]),
        ("fr", ["french", "fr"]),
        ("it", ["italian", "it"]),
    )
    for code, accepted_names in known_languages:
        if language_string in accepted_names:
            return code
    return None
# two_letter_language ends here

def plural(num, noun):
    """Return noun unchanged if num equals 1, otherwise noun with 's' appended.

    Only suitable for nouns that form their plural by adding an 's'.
    """

    return noun if num == 1 else noun + "s"
# def plural ends here

def format_citations(used_citekeys, bibdata, language, tmp_filename):
    """Return the formatted references for the used citations.

    A temporary Markdown file ``tmp_files/<tmp_filename>.md`` is written that
    cites every key once as ``[@key]`` and once as ``[-@key]``. pandoc (with
    the pandoc-citeproc filter, the bibliography ``bibdata`` and the
    module-level ``CSL_FILE``) converts it to
    ``tmp_files/<tmp_filename>.html``, which is then parsed.

    Parameters:
        used_citekeys: iterable of citation keys; iterated twice.
        bibdata: path of the bibliography file handed to pandoc.
        language: language name or code understood by two_letter_language();
            for other values the Markdown header contains ``lang: None``.
        tmp_filename: base name (without extension) of the temporary files.

    Returns:
        The result of the XPath query ``//div[@class='references']`` on the
        parsed HTML, i.e. a list of matching elements (possibly empty).

    The exit status of pandoc is not checked; if no HTML file was produced,
    opening it raises an OSError.
    """

    tmp_path_md = "tmp_files" + os.path.sep + tmp_filename + ".md"
    tmp_path_html = "tmp_files" + os.path.sep + tmp_filename + ".html"

    md_file_header = "---\nlang: %s\ntitle: Citations\n...\n\n" % two_letter_language(language)

    # pandoc expects UTF-8 input, so do not depend on the locale encoding
    with open(tmp_path_md, "w", encoding="utf-8") as citation_formatter:
        citation_formatter.write(md_file_header)
        citation_formatter.write("# citeauthoryear\n")
        for entry in used_citekeys:
            citation_formatter.write("[@%s]\n" % entry)
        citation_formatter.write("\n# citeyear\n")
        for entry in used_citekeys:
            citation_formatter.write("[-@%s]\n" % entry)
        citation_formatter.write("\n# References\n")

    # Build the argument list directly instead of splitting a command string:
    # paths containing spaces stay intact and backslash path separators are
    # not swallowed by shell-style splitting.
    arguments = [
        "pandoc",
        "-o", tmp_path_html,
        "-t", "html",
        "--filter=pandoc-citeproc",
        "--bibliography=%s" % bibdata,
        "--csl=%s" % CSL_FILE,
        tmp_path_md,
    ]
    command = " ".join(shlex.quote(argument) for argument in arguments)
    logging.info("Using external command pandoc with command %s" % command)
    subprocess.call(arguments)

    # pandoc writes UTF-8 as well
    with open(tmp_path_html, "r", encoding="utf-8") as ding:
        dd = soupparser.fromstring(ding, features="html.parser")

    references = dd.xpath("//div[@class='references']")
    return references
# def format_citations ends here

def fix_bib_entries(div_snippet):
    """Modify the html code returned by pandoc-citeproc.

    Every ``div`` below ``div_snippet`` is turned into a paragraph: it gets
    the class ``bibliography``, ``p`` tags inside it are stripped (their
    content is kept), and the element itself is renamed to ``p``. ``em``
    elements inside it are renamed to ``i``.

    The tree is modified in place and ``div_snippet`` is returned.
    """

    for bib_entry in div_snippet.findall(".//div"):
        bib_entry.set("class", "bibliography")
        # unwrap inner paragraphs before the entry itself becomes a <p>
        etree.strip_tags(bib_entry, "p")
        bib_entry.tag = "p"
        for emphasis in bib_entry.findall(".//em"):
            emphasis.tag = "i"

    return div_snippet
# def fix_bib_entries ends here

def debug_xml_here(xml_tree, xml_filename):
    """Dump current state of an XML tree into a file for inspection.

    The file is written to ``debug/debug_<xml_filename>.xml`` below the
    current working directory; the ``debug`` directory is created if it does
    not exist. ``xml_tree`` may be an element tree or a single element, which
    is wrapped into a tree before writing.
    """

    debug_dir_missing = not os.path.exists("debug")
    if debug_dir_missing:
        os.makedirs(os.path.expanduser("debug"))

    xml_path = "%s/debug/debug_%s.xml" % (os.getcwd(), xml_filename)

    # plain elements have to be wrapped before they can be written
    if not isinstance(xml_tree, etree._ElementTree):
        xml_tree = etree.ElementTree(xml_tree)

    xml_tree.write(
        xml_path,
        pretty_print=True,
        xml_declaration=True,
        encoding="utf-8",
    )
    logging.info("Wrote %s." % xml_path)
# def debug_xml_here ends here