# libeoaconvert.py

#!/usr/bin/env python3
# -*- coding: utf-8; mode: python -*-

import os
import sys
import subprocess
import shlex
import logging
import configparser
from lxml import etree
from lxml.html import soupparser

##################################
# Reading the configuration file #
##################################
# os.path.join instead of string concatenation: when the script is started
# without a directory part, dirname() is "" and "" + "/config/..." would point
# at the filesystem root instead of the directory next to the script.
CONFIG_FILE = os.path.join(os.path.dirname(sys.argv[0]), "config", "eoaconvert.cfg")
CONFIG = configparser.ConfigParser()
# NOTE: ConfigParser.read() silently skips files it cannot open, so a missing
# configuration file shows up as a KeyError in the lookups below.
CONFIG.read(CONFIG_FILE)

######################
# Setting up logging #
######################
# NOTE(review): LOGFILE is read here but not handed to basicConfig below.
LOGFILE = CONFIG['General']['logfile']
LOGLEVEL = CONFIG['General']['loglevel']

# Citation style file passed to pandoc via --csl in format_citations
CSL_FILE = CONFIG['Auxiliaries']['CSL_FILE']

logging.basicConfig(level=LOGLEVEL, format='%(asctime)s - %(levelname)s - %(message)s')


# Working directory for images, relative to the current working directory.
# exist_ok avoids the check-then-create race of a separate exists() test.
os.makedirs("tmp_files/tmp_images/", exist_ok=True)

tmp_image_dir = os.getcwd() + "/tmp_files/tmp_images/"
# Receives the stdout of pdfcrop (see sanitizeImage); deliberately kept open
# for the lifetime of the module.
Datei = open('tmp_files/xelatex-run-images.log', 'w')

# Localized strings, keyed by two-letter language code
dictLangFootnotes = {
    "it": "Note a piè pagina",
    "fr": "notes en bas de page",
    "de": "Fußnoten",
    "en": "Footnotes",
}
dict_and = {
    "en": "and",
    "de": "und",
    "fr": "et",
    "it": "e",
}
dict_ed = {
    "en": "ed.",
    "de": "Hrsg.",
}
dict_eds = {
    "en": "eds.",
    "de": "Hrsg.",
}

# Order in which the new-style (LaTeX bigfoot) footnote groups show up
footnote_groups = [
    "decimal",
    "lower-latin",
]

#########################
# Bibliography settings #
#########################
allowed_bibentry_types = [
    "book",
    "booklet",
    "report",
    "thesis",
    "misc",
    "incollection",
    "inproceedings",
    "article",
    "newspaper",
]

def get_bigfoot_data(chapter):
    """Group the bigfoot footnotes of one chapter by their list style.

    Footnotes are per-chapter and their numbers reset for each chapter, so
    the chapter element is the unit of work.

    Returns an association list: one ``(style, notes)`` tuple for every entry
    of ``footnote_groups``, in that order. ``notes`` holds the ``EOAbigfoot``
    descendants of ``chapter`` whose ``list-style-type`` attribute equals
    ``style``, in the order in which they appear in the chapter.
    """
    all_notes = list(chapter.findall(".//EOAbigfoot"))
    grouped = []
    for style in footnote_groups:  # only the styles we support
        matching = []
        for candidate in all_notes:
            if style == candidate.get("list-style-type"):
                matching.append(candidate)
        grouped.append((style, matching))
    return grouped
# def get_bigfoot_data ends here

def _gm_identify(gm_command, format_spec, image_path):
    """Return the stripped output of ``identify -format format_spec image_path``.

    ``gm_command`` is the already tokenized GraphicsMagick invocation.
    Raises subprocess.CalledProcessError when the command exits non-zero.
    """
    return subprocess.check_output(
        gm_command + ["identify", "-format", format_spec, image_path],
        shell=False,
        universal_newlines=True,
    ).strip()


def sanitizeImage(strImagepath, GM_PATH, TL_PATH):
    """Adjust and convert image for epub standard.

    The image is shrunk in place when ``identify`` reports a width above 700
    or a height above 1000. A PDF is additionally cropped with pdfcrop,
    rendered to a PNG next to it, and the PDF files are removed.

    Args:
        strImagepath: path of the image file. It is handed to the external
            tools as a single argument, so it may contain spaces.
        GM_PATH: command that invokes GraphicsMagick; tokenized with
            shlex.split, so it may carry additional arguments.
        TL_PATH: prefix prepended to
            "texmf-dist/scripts/pdfcrop/pdfcrop.pl".

    Returns:
        The path of the resulting image: ``strImagepath`` itself, or the
        path of the new ``.png`` file when the input was a PDF.

    Raises:
        subprocess.CalledProcessError: an identify or resize call failed.
        ValueError: identify did not report an integer width or height.
    """

    logging.debug(strImagepath)
    gm_command = shlex.split(GM_PATH)

    # Height is queried only after the width pass, because resizing by width
    # also changes the height.
    if int(_gm_identify(gm_command, "%w", strImagepath)) > 700:
        subprocess.check_output(
            gm_command + ["convert", strImagepath, "-resize", "700x>", strImagepath],
            shell=False)
    if int(_gm_identify(gm_command, "%h", strImagepath)) > 1000:
        subprocess.check_output(
            gm_command + ["convert", strImagepath, "-resize", "x1000>", strImagepath],
            shell=False)

    strFileFormat = _gm_identify(gm_command, "%m", strImagepath)
    if strFileFormat != "PDF":
        # JPEG, PNG and every other format are left as they are
        return strImagepath

    strNewImagepath = os.path.splitext(strImagepath)[0]
    # Derived from the extension-less path: str.replace(".pdf", ...) left the
    # name unchanged for ".PDF" files and rewrote directory names that
    # happen to contain ".pdf".
    clipped_file = strNewImagepath + "-clipped.pdf"
    png_file = strNewImagepath + ".png"

    pdfcrop_command = shlex.split(TL_PATH + "texmf-dist/scripts/pdfcrop/pdfcrop.pl")
    pdfcrop_command += ["--margins", "10", "--clip", "--hires", strImagepath, clipped_file]
    logging.debug(" ".join(shlex.quote(part) for part in pdfcrop_command))
    if subprocess.call(pdfcrop_command, cwd=tmp_image_dir, stdout=Datei) != 0:
        logging.warning("pdfcrop exited with an error for %s", strImagepath)

    convert_command = gm_command + ["convert", "-density", "400", clipped_file, png_file]
    if subprocess.call(convert_command) != 0:
        logging.warning("Conversion of %s to PNG exited with an error", clipped_file)

    logging.debug("Removing two files: %s and %s " % (clipped_file, strImagepath))
    os.remove(clipped_file)
    os.remove(strImagepath)

    return png_file
# def sanitizeImage ends here

def gettext(xmlElement):
    """Flatten an element into the plain text of itself and its descendants.

    The element's own text comes first, then for every child (in document
    order) the child's flattened text followed by the child's tail.
    """

    fragments = [xmlElement.text or ""]
    for child in xmlElement:
        fragments.append(gettext(child))
        fragments.append(child.tail or "")
    return "".join(fragments)
# def gettext ends here

def deb_var(obj):
    """Print a debug line with the module-level name bound to obj.

    The name is found by identity search through this module's globals, see
    https://stackoverflow.com/questions/592746/how-can-you-print-a-variable-name-in-python
    The first matching name is used. Objects that no global refers to (local
    variables, temporaries) are reported as "<unknown>" instead of raising
    IndexError.
    """
    candidates = [name for name, value in globals().items() if value is obj]
    label = candidates[0] if candidates else "<unknown>"
    print("DEBUG: %s: %s" % (label, obj))
# def deb_var ends here

def two_letter_language(language_string):
    """Map a language name or code to its two-letter code.

    Recognized spellings are listed in the table below; the comparison is
    exact. Anything else yields None.
    """

    known_languages = (
        ("en", ["english", "en"]),
        ("de", ["german", "deutsch", "de"]),
        ("fr", ["french", "fr"]),
        ("it", ["italian", "it"]),
    )
    for code, spellings in known_languages:
        if language_string in spellings:
            return code
    return None
# two_letter_language ends here

def plural(num, noun):
    """Pick the singular or plural form of noun according to num.

    Only nouns whose plural is formed by appending 's' are handled: noun is
    returned unchanged when num equals 1, otherwise with an 's' appended.
    """

    suffix = "" if num == 1 else "s"
    return noun + suffix
# def plural ends here

def format_citations(used_citekeys, bibdata, language, tmp_filename):
    """Let pandoc format the used citations and return the reference list.

    A temporary markdown file citing every key (once as ``[@key]``, once as
    ``[-@key]``) is written to ``tmp_files/<tmp_filename>.md`` and converted
    by pandoc with the pandoc-citeproc filter into
    ``tmp_files/<tmp_filename>.html``, which is then parsed.

    Args:
        used_citekeys: citation keys; iterated twice, so pass a list or
            another re-iterable collection.
        bibdata: path of the bibliography file handed to pandoc.
        language: language name or code, mapped by two_letter_language.
        tmp_filename: base name (without extension) of the temporary files.

    Returns:
        The list of ``div`` elements with class ``references`` found in the
        HTML produced by pandoc.
    """

    tmp_path_md = os.path.join("tmp_files", tmp_filename + ".md")
    tmp_path_html = os.path.join("tmp_files", tmp_filename + ".html")

    md_file_header = "---\nlang: %s\ntitle: Citations\n...\n\n" % two_letter_language(language)

    # pandoc expects UTF-8 input, so do not depend on the locale's encoding
    with open(tmp_path_md, "w", encoding="utf-8") as citation_formatter:
        citation_formatter.write(md_file_header)
        citation_formatter.write("# citeauthoryear\n")
        for entry in used_citekeys:
            citation_formatter.write("[@%s]\n" % entry)
        citation_formatter.write("\n# citeyear\n")
        for entry in used_citekeys:
            citation_formatter.write("[-@%s]\n" % entry)
        citation_formatter.write("\n# References\n")

    # Argument list instead of a split command string: paths that contain
    # spaces stay intact.
    arguments = [
        "pandoc",
        "-o", tmp_path_html,
        "-t", "html",
        "--filter=pandoc-citeproc",
        "--bibliography=%s" % bibdata,
        "--csl=%s" % CSL_FILE,
        tmp_path_md,
    ]
    command = " ".join(arguments)
    logging.info("Using external command pandoc with command %s" % command)
    exit_code = subprocess.call(arguments)
    if exit_code != 0:
        logging.error("pandoc exited with status %s", exit_code)

    # pandoc writes UTF-8 output
    with open(tmp_path_html, "r", encoding="utf-8") as ding:
        dd = soupparser.fromstring(ding, features="html.parser")

    references = dd.xpath("//div[@class='references']")
    return references
# def format_citations ends here

def fix_bib_entries(div_snippet):
    """Modify the html code returned by pandoc-citeproc.

    Every ``div`` below ``div_snippet`` is turned into a ``p`` element with
    class "bibliography": nested ``p`` tags are stripped (their content is
    kept) and ``em`` elements are renamed to ``i``.

    The tree is changed in place; ``div_snippet`` is returned for
    convenience.
    """

    # findall returns a list, so renaming tags inside the loop is safe
    for entry in div_snippet.findall(".//div"):
        entry.set("class", "bibliography")
        etree.strip_tags(entry, "p")
        entry.tag = "p"
        for markup in entry.findall(".//em"):
            markup.tag = "i"

    return div_snippet
# def fix_bib_entries ends here

def debug_xml_here(xml_tree, xml_filename):
    """Dump current state of an XML tree into a file for inspection.

    The file is written to ``<cwd>/debug/debug_<xml_filename>.xml``; the
    ``debug`` directory is created when it does not exist yet.

    Args:
        xml_tree: an lxml ElementTree, or an element that is wrapped into
            one before writing.
        xml_filename: label that becomes part of the output file name.
    """

    os.makedirs(os.path.join(os.getcwd(), "debug"), exist_ok=True)
    xml_path = "%s/debug/debug_%s.xml" % (os.getcwd(), xml_filename)

    # write() lives on ElementTree objects, so wrap bare elements first
    if not isinstance(xml_tree, etree._ElementTree):
        xml_tree = etree.ElementTree(xml_tree)

    xml_tree.write(xml_path, pretty_print=True, xml_declaration=True, encoding="utf-8")
    logging.info("Wrote %s." % xml_path)
# def debug_xml_here ends here
	#!/usr/bin/env python3
	# -- coding: utf-8; mode: python --

	import os
	import sys
	import subprocess
	import shlex
	import logging
	import configparser
	from lxml import etree
	from lxml.html import soupparser

	##################################
	# Reading the configuration file #
	##################################
	# os.path.join instead of string concatenation: when the script is started
	# without a directory part, dirname() is "" and "" + "/config/..." would point
	# at the filesystem root instead of the directory next to the script.
	CONFIG_FILE = os.path.join(os.path.dirname(sys.argv[0]), "config", "eoaconvert.cfg")
	CONFIG = configparser.ConfigParser()
	# NOTE: ConfigParser.read() silently skips files it cannot open, so a missing
	# configuration file shows up as a KeyError in the lookups below.
	CONFIG.read(CONFIG_FILE)

	######################
	# Setting up logging #
	######################
	# NOTE(review): LOGFILE is read here but not handed to basicConfig below.
	LOGFILE = CONFIG['General']['logfile']
	LOGLEVEL = CONFIG['General']['loglevel']

	# Citation style file passed to pandoc via --csl in format_citations
	CSL_FILE = CONFIG['Auxiliaries']['CSL_FILE']

	logging.basicConfig(level=LOGLEVEL, format='%(asctime)s - %(levelname)s - %(message)s')


	# Working directory for images, relative to the current working directory.
	# A single makedirs call replaces the former if statement, whose body had
	# lost its indentation; exist_ok also avoids the check-then-create race.
	os.makedirs("tmp_files/tmp_images/", exist_ok=True)

	tmp_image_dir = os.getcwd() + "/tmp_files/tmp_images/"
	# Receives the stdout of pdfcrop (see sanitizeImage); deliberately kept open
	# for the lifetime of the module.
	Datei = open('tmp_files/xelatex-run-images.log', 'w')

	# Localized strings, keyed by two-letter language code
	dictLangFootnotes = {
	    "it": "Note a piè pagina",
	    "fr": "notes en bas de page",
	    "de": "Fußnoten",
	    "en": "Footnotes",
	}
	dict_and = {
	    "en": "and",
	    "de": "und",
	    "fr": "et",
	    "it": "e",
	}
	dict_ed = {
	    "en": "ed.",
	    "de": "Hrsg.",
	}
	dict_eds = {
	    "en": "eds.",
	    "de": "Hrsg.",
	}

	# Order in which the new-style (LaTeX bigfoot) footnote groups show up
	footnote_groups = [
	    "decimal",
	    "lower-latin",
	]

	#########################
	# Bibliography settings #
	#########################
	allowed_bibentry_types = [
	    "book",
	    "booklet",
	    "report",
	    "thesis",
	    "misc",
	    "incollection",
	    "inproceedings",
	    "article",
	    "newspaper",
	]

	def get_bigfoot_data(chapter):
	    """Group the bigfoot footnotes of one chapter by their list style.

	    Footnotes are per-chapter and their numbers reset for each chapter, so
	    the chapter element is the unit of work.

	    Returns an association list: one ``(style, notes)`` tuple for every entry
	    of ``footnote_groups``, in that order. ``notes`` holds the ``EOAbigfoot``
	    descendants of ``chapter`` whose ``list-style-type`` attribute equals
	    ``style``, in the order in which they appear in the chapter.
	    """
	    all_notes = list(chapter.findall(".//EOAbigfoot"))
	    grouped = []
	    for style in footnote_groups:  # only the styles we support
	        matching = [
	            note
	            for note in all_notes
	            if style == note.get("list-style-type")
	        ]
	        grouped.append((style, matching))
	    return grouped
	# def get_bigfoot_data ends here

	def _gm_identify(gm_command, format_spec, image_path):
	    """Return the stripped output of ``identify -format format_spec image_path``.

	    ``gm_command`` is the already tokenized GraphicsMagick invocation.
	    Raises subprocess.CalledProcessError when the command exits non-zero.
	    """
	    return subprocess.check_output(
	        gm_command + ["identify", "-format", format_spec, image_path],
	        shell=False,
	        universal_newlines=True,
	    ).strip()


	def sanitizeImage(strImagepath, GM_PATH, TL_PATH):
	    """Adjust and convert image for epub standard.

	    The image is shrunk in place when ``identify`` reports a width above 700
	    or a height above 1000. A PDF is additionally cropped with pdfcrop,
	    rendered to a PNG next to it, and the PDF files are removed.

	    Args:
	        strImagepath: path of the image file. It is handed to the external
	            tools as a single argument, so it may contain spaces.
	        GM_PATH: command that invokes GraphicsMagick; tokenized with
	            shlex.split, so it may carry additional arguments.
	        TL_PATH: prefix prepended to
	            "texmf-dist/scripts/pdfcrop/pdfcrop.pl".

	    Returns:
	        The path of the resulting image: ``strImagepath`` itself, or the
	        path of the new ``.png`` file when the input was a PDF.

	    Raises:
	        subprocess.CalledProcessError: an identify or resize call failed.
	        ValueError: identify did not report an integer width or height.
	    """

	    logging.debug(strImagepath)
	    gm_command = shlex.split(GM_PATH)

	    # Height is queried only after the width pass, because resizing by width
	    # also changes the height.
	    if int(_gm_identify(gm_command, "%w", strImagepath)) > 700:
	        subprocess.check_output(
	            gm_command + ["convert", strImagepath, "-resize", "700x>", strImagepath],
	            shell=False)
	    if int(_gm_identify(gm_command, "%h", strImagepath)) > 1000:
	        subprocess.check_output(
	            gm_command + ["convert", strImagepath, "-resize", "x1000>", strImagepath],
	            shell=False)

	    strFileFormat = _gm_identify(gm_command, "%m", strImagepath)
	    if strFileFormat != "PDF":
	        # JPEG, PNG and every other format are left as they are
	        return strImagepath

	    strNewImagepath = os.path.splitext(strImagepath)[0]
	    # Derived from the extension-less path: str.replace(".pdf", ...) left the
	    # name unchanged for ".PDF" files and rewrote directory names that
	    # happen to contain ".pdf".
	    clipped_file = strNewImagepath + "-clipped.pdf"
	    png_file = strNewImagepath + ".png"

	    pdfcrop_command = shlex.split(TL_PATH + "texmf-dist/scripts/pdfcrop/pdfcrop.pl")
	    pdfcrop_command += ["--margins", "10", "--clip", "--hires", strImagepath, clipped_file]
	    logging.debug(" ".join(shlex.quote(part) for part in pdfcrop_command))
	    if subprocess.call(pdfcrop_command, cwd=tmp_image_dir, stdout=Datei) != 0:
	        logging.warning("pdfcrop exited with an error for %s", strImagepath)

	    convert_command = gm_command + ["convert", "-density", "400", clipped_file, png_file]
	    if subprocess.call(convert_command) != 0:
	        logging.warning("Conversion of %s to PNG exited with an error", clipped_file)

	    logging.debug("Removing two files: %s and %s " % (clipped_file, strImagepath))
	    os.remove(clipped_file)
	    os.remove(strImagepath)

	    return png_file
	# def sanitizeImage ends here

	def gettext(xmlElement):
	    """Flatten an element into the plain text of itself and its descendants.

	    The element's own text comes first, then for every child (in document
	    order) the child's flattened text followed by the child's tail.
	    """

	    fragments = [xmlElement.text or ""]
	    for child in xmlElement:
	        fragments.append(gettext(child))
	        fragments.append(child.tail or "")
	    return "".join(fragments)
	# def gettext ends here

	def deb_var(obj):
	    """Print a debug line with the module-level name bound to obj.

	    The name is found by identity search through this module's globals, see
	    https://stackoverflow.com/questions/592746/how-can-you-print-a-variable-name-in-python
	    The first matching name is used. Objects that no global refers to (local
	    variables, temporaries) are reported as "<unknown>" instead of raising
	    IndexError.
	    """
	    candidates = [name for name, value in globals().items() if value is obj]
	    label = candidates[0] if candidates else "<unknown>"
	    print("DEBUG: %s: %s" % (label, obj))
	# def deb_var ends here

	def two_letter_language(language_string):
	    """Map a language name or code to its two-letter code.

	    Recognized spellings are listed in the table below; the comparison is
	    exact. Anything else yields None.
	    """

	    known_languages = (
	        ("en", ["english", "en"]),
	        ("de", ["german", "deutsch", "de"]),
	        ("fr", ["french", "fr"]),
	        ("it", ["italian", "it"]),
	    )
	    for code, spellings in known_languages:
	        if language_string in spellings:
	            return code
	    return None
	# two_letter_language ends here

	def plural(num, noun):
	    """Pick the singular or plural form of noun according to num.

	    Only nouns whose plural is formed by appending 's' are handled: noun is
	    returned unchanged when num equals 1, otherwise with an 's' appended.
	    """

	    suffix = "" if num == 1 else "s"
	    return noun + suffix
	# def plural ends here

	def format_citations(used_citekeys, bibdata, language, tmp_filename):
	    """Let pandoc format the used citations and return the reference list.

	    A temporary markdown file citing every key (once as ``[@key]``, once as
	    ``[-@key]``) is written to ``tmp_files/<tmp_filename>.md`` and converted
	    by pandoc with the pandoc-citeproc filter into
	    ``tmp_files/<tmp_filename>.html``, which is then parsed.

	    Args:
	        used_citekeys: citation keys; iterated twice, so pass a list or
	            another re-iterable collection.
	        bibdata: path of the bibliography file handed to pandoc.
	        language: language name or code, mapped by two_letter_language.
	        tmp_filename: base name (without extension) of the temporary files.

	    Returns:
	        The list of ``div`` elements with class ``references`` found in the
	        HTML produced by pandoc.
	    """

	    tmp_path_md = os.path.join("tmp_files", tmp_filename + ".md")
	    tmp_path_html = os.path.join("tmp_files", tmp_filename + ".html")

	    md_file_header = "---\nlang: %s\ntitle: Citations\n...\n\n" % two_letter_language(language)

	    # pandoc expects UTF-8 input, so do not depend on the locale's encoding
	    with open(tmp_path_md, "w", encoding="utf-8") as citation_formatter:
	        citation_formatter.write(md_file_header)
	        citation_formatter.write("# citeauthoryear\n")
	        for entry in used_citekeys:
	            citation_formatter.write("[@%s]\n" % entry)
	        citation_formatter.write("\n# citeyear\n")
	        for entry in used_citekeys:
	            citation_formatter.write("[-@%s]\n" % entry)
	        citation_formatter.write("\n# References\n")

	    # Argument list instead of a split command string: paths that contain
	    # spaces stay intact.
	    arguments = [
	        "pandoc",
	        "-o", tmp_path_html,
	        "-t", "html",
	        "--filter=pandoc-citeproc",
	        "--bibliography=%s" % bibdata,
	        "--csl=%s" % CSL_FILE,
	        tmp_path_md,
	    ]
	    command = " ".join(arguments)
	    logging.info("Using external command pandoc with command %s" % command)
	    exit_code = subprocess.call(arguments)
	    if exit_code != 0:
	        logging.error("pandoc exited with status %s", exit_code)

	    # pandoc writes UTF-8 output
	    with open(tmp_path_html, "r", encoding="utf-8") as ding:
	        dd = soupparser.fromstring(ding, features="html.parser")

	    references = dd.xpath("//div[@class='references']")
	    return references
	# def format_citations ends here

	def fix_bib_entries(div_snippet):
	    """Modify the html code returned by pandoc-citeproc.

	    Every ``div`` below ``div_snippet`` is turned into a ``p`` element with
	    class "bibliography": nested ``p`` tags are stripped (their content is
	    kept) and ``em`` elements are renamed to ``i``.

	    The tree is changed in place; ``div_snippet`` is returned for
	    convenience.
	    """

	    # findall returns a list, so renaming tags inside the loop is safe
	    for entry in div_snippet.findall(".//div"):
	        entry.set("class", "bibliography")
	        etree.strip_tags(entry, "p")
	        entry.tag = "p"
	        for markup in entry.findall(".//em"):
	            markup.tag = "i"

	    return div_snippet
	# def fix_bib_entries ends here

	def debug_xml_here(xml_tree, xml_filename):
	    """Dump current state of an XML tree into a file for inspection.

	    The file is written to ``<cwd>/debug/debug_<xml_filename>.xml``; the
	    ``debug`` directory is created when it does not exist yet.

	    Args:
	        xml_tree: an lxml ElementTree, or an element that is wrapped into
	            one before writing.
	        xml_filename: label that becomes part of the output file name.
	    """

	    os.makedirs(os.path.join(os.getcwd(), "debug"), exist_ok=True)
	    xml_path = "%s/debug/debug_%s.xml" % (os.getcwd(), xml_filename)

	    # write() lives on ElementTree objects, so wrap bare elements first
	    if not isinstance(xml_tree, etree._ElementTree):
	        xml_tree = etree.ElementTree(xml_tree)

	    xml_tree.write(xml_path, pretty_print=True, xml_declaration=True, encoding="utf-8")
	    logging.info("Wrote %s." % xml_path)
	# def debug_xml_here ends here