libeoaconvert.py

#!/usr/bin/env python3
# -*- coding: utf-8; mode: python -*-

import os
import sys
import subprocess
import shlex
import logging
import configparser
from lxml.html import soupparser

##################################
# Reading the configuration file #
##################################
# The configuration lives in a "config" directory next to the entry script.
# os.path.join keeps the path relative when the script is started without a
# directory component (dirname is then ""), instead of pointing at
# "/config/eoaconvert.cfg" in the filesystem root.
CONFIG_FILE = os.path.join(os.path.dirname(sys.argv[0]), "config", "eoaconvert.cfg")
CONFIG = configparser.ConfigParser()
# ConfigParser.read() skips files it cannot open, so a missing configuration
# shows up as a KeyError on the first section lookup below.
CONFIG.read(CONFIG_FILE)

######################
# Setting up logging #
######################
LOGFILE = CONFIG['General']['logfile']
LOGLEVEL = CONFIG['General']['loglevel']

CSL_FILE = CONFIG['Auxiliaries']['CSL_FILE']

# NOTE(review): LOGFILE is read but not passed to basicConfig, so nothing is
# written to that file from here - confirm this is intended.
logging.basicConfig(level=LOGLEVEL, format='%(asctime)s - %(levelname)s - %(message)s')


# Scratch directory for converted images, relative to the current working
# directory. exist_ok avoids the race between checking and creating.
os.makedirs("tmp_files/tmp_images/", exist_ok=True)

tmp_image_dir = os.getcwd() + "/tmp_files/tmp_images/"
# Receives the output of external tools (used as stdout in sanitizeImage).
# It is opened once at import time and stays open for the life of the module.
Datei = open('tmp_files/xelatex-run-images.log', 'w')

# Setup of various dictionaries for localization of various elements
dictLangFootnotes = {"it" : "Note a piè pagina", "fr" : "notes en bas de page", "de" : "Fußnoten", "en" : "Footnotes"}

# the new-style footnotes that use LaTeX bigfoot show up in the following order:
footnote_groups = ["decimal", "lower-latin"]

#########################
# Bibliography settings #
#########################
allowed_bibentry_types = ["book", "booklet", "report", "thesis", "misc", "incollection", "inproceedings", "article", "newspaper"]

def get_bigfoot_data(chapter):
    """
    collect the new-style footnotes (the ones that use LaTeX bigfoot) of one chapter
    footnotes and their numbering are per-chapter, hence the chapter argument
    the result is an association list with one (style, notes) pair per entry of footnote_groups, in that order
    notes holds the EOAbigfoot elements below the chapter whose "list-style-type" attribute equals style,
    in the order in which findall reports them
    a note with any other (or no) "list-style-type" ends up in no group
    """
    bigfoot_notes = list(chapter.findall(".//EOAbigfoot"))
    grouped = []
    for style in footnote_groups: # the types we support
        members = []
        for note in bigfoot_notes:
            if note.get("list-style-type") == style:
                members.append(note)
        grouped.append((style, members))
    return grouped
# def get_bigfoot_data ends here

def sanitizeImage(strImagepath, GM_PATH, TL_PATH):
    """Adjust and convert image for epub standard.

    The image is first scaled down in place: to 1500 pixels width if it is
    wider, then to 2000 pixels height if it is (still) higher. Afterwards a
    PNG is converted to JPEG, and a PDF is cropped with pdfcrop and rendered
    to PNG. In both cases the source file is deleted once the conversion
    has succeeded. Any other format is left untouched.

    Args:
        strImagepath: Path of the image file. It is passed to the tools as
            one argument, so it may contain blanks.
        GM_PATH: Command providing the "identify" and "convert" subcommands.
            It is split with shlex and may therefore carry extra tokens.
        TL_PATH: Prefix put directly in front of
            "texmf-dist/scripts/pdfcrop/pdfcrop.pl".

    Returns:
        The path of the resulting image: the ".jpg" sibling for PNG input,
        the ".png" sibling for PDF input, otherwise strImagepath unchanged.

    Raises:
        subprocess.CalledProcessError: If an external command exits with a
            non-zero status. The source file is not deleted in that case.
        ValueError: If the reported width or height is not an integer.
    """
    logging.debug(strImagepath)
    # Only the configured command prefixes go through shlex. File names are
    # handed over as single list items, so they need no quoting.
    gm_command = shlex.split(GM_PATH)

    def identify(format_spec):
        """Return the output of "identify -format <format_spec>" for the image."""
        return subprocess.check_output(
            gm_command + ["identify", "-format", format_spec, strImagepath],
            shell=False, universal_newlines=True)

    def resize_in_place(geometry):
        """Run "convert -resize <geometry>" writing back to the same file."""
        subprocess.check_output(
            gm_command + ["convert", strImagepath, "-resize", geometry, strImagepath],
            shell=False)

    # The geometry strings are the literal values the tool receives. They
    # were formerly written with a backslash only to survive shlex.split.
    if int(identify("%w")) > 1500:
        resize_in_place("1500x>")
    # The height is queried after the width step because that step rewrites the file.
    if int(identify("%h")) > 2000:
        resize_in_place("x2000>")

    strFileFormat = identify("%m").strip()
    strNewImagepath = os.path.splitext(strImagepath)[0]

    if strFileFormat == "PNG":
        jpg_path = strNewImagepath + ".jpg"
        # check_call: never delete the source after a failed conversion.
        subprocess.check_call(gm_command + ["convert", strImagepath, jpg_path])
        # A PNG that already carries the target name was converted in place.
        if jpg_path != strImagepath:
            os.remove(strImagepath)
        strImagepath = jpg_path
    elif strFileFormat == "PDF":
        # Derived from the extension-less path so that only the suffix is
        # affected and upper-case ".PDF" files get a distinct clipped file too.
        clipped_file = strNewImagepath + "-clipped.pdf"
        png_path = strNewImagepath + ".png"

        Argumente = shlex.split(
            TL_PATH + "texmf-dist/scripts/pdfcrop/pdfcrop.pl --margins 10 --clip --hires"
        ) + [strImagepath, clipped_file]
        logging.debug("%s", " ".join(Argumente))

        # NOTE(review): pdfcrop runs inside tmp_image_dir while the convert
        # calls run in the current directory - relative paths resolve
        # differently for the two; confirm callers pass suitable paths.
        subprocess.check_call(Argumente, cwd=tmp_image_dir, stdout=Datei)

        subprocess.check_call(
            gm_command + ["convert", "-density", "400", clipped_file, png_path])
        logging.debug("Removing two files: %s and %s ", clipped_file, strImagepath)
        os.remove(clipped_file)
        if png_path != strImagepath:
            os.remove(strImagepath)
        strImagepath = png_path

    return strImagepath
# def sanitizeImage ends here

def gettext(xmlElement):
    """Return the plain text of an element including all nested elements.

    The element's own text comes first. For every child, the child's
    text (collected the same way) follows, then the text trailing that child.
    Missing text and tail values count as empty strings.
    """
    pieces = [xmlElement.text or ""]
    for child in xmlElement:
        pieces.append(gettext(child))
        pieces.append(child.tail or "")
    return "".join(pieces)
# def gettext ends here

def deb_var(obj):
    """Print a debug line with the name and value of a module-level variable.

    The name is looked up by identity: it is the first name in this module's
    globals() that is bound to exactly obj. Approach taken from
    https://stackoverflow.com/questions/592746/how-can-you-print-a-variable-name-in-python

    Args:
        obj: The object to report. If several globals refer to the same
            object, the first one found is used. If none does (for example
            a local variable or a variable of another module), the name is
            printed as "<unknown>" instead of failing with an IndexError.
    """
    name = next(
        (key for key, value in list(globals().items()) if value is obj),
        "<unknown>",
    )
    print("DEBUG: %s: %s" % (name, obj))
# def deb_var ends here

def two_letter_language(language_string):
    """Map a language name or code to its two letter code.

    Known spellings are english/en, german/deutsch/de, french/fr and
    italian/it, matched exactly (case-sensitive). Anything else yields None.
    """
    known_languages = (
        ("en", ["english", "en"]),
        ("de", ["german", "deutsch", "de"]),
        ("fr", ["french", "fr"]),
        ("it", ["italian", "it"]),
    )
    for code, spellings in known_languages:
        if language_string in spellings:
            return code
    return None
# two_letter_language ends here

# next function adapted from TEI2EOADjango
def format_citations(used_citekeys, bibdata, language):
    """Format the used citations with pandoc and return the reference blocks.

    Writes tmp_files/used_citations.md containing, for every cite key, one
    "[@key]" citation (section "citeauthoryear") and one "[-@key]" citation
    (section "citeyear"), followed by an empty "References" section. Pandoc
    with the pandoc-citeproc filter renders that file to
    tmp_files/formatted_citations.html, which is then parsed.

    Args:
        used_citekeys: Iterable of cite keys. It is copied once, so a
            one-shot iterator works as well.
        bibdata: Path of the bibliography, passed to pandoc as --bibliography.
        language: Language name or code, translated by two_letter_language
            for the "lang" field of the markdown header.

    Returns:
        The list of elements matching //div[@class='references'] in the
        generated HTML (an empty list if there is no such element).
    """
    md_file_header = "---\nlang: %s\ntitle: Citations\n...\n\n" % two_letter_language(language)
    citekeys = list(used_citekeys)

    # pandoc reads and writes UTF-8 regardless of the locale, so both files
    # are opened with an explicit encoding.
    with open("tmp_files/used_citations.md", "w", encoding="utf-8") as citation_formatter:
        citation_formatter.write(md_file_header)
        citation_formatter.write("# citeauthoryear\n")
        citation_formatter.writelines("[@%s]\n" % entry for entry in citekeys)
        citation_formatter.write("\n# citeyear\n")
        citation_formatter.writelines("[-@%s]\n" % entry for entry in citekeys)
        # a third section "yearparen" (plain "@key" citations) is currently disabled
        citation_formatter.write("\n# References\n")

    # Built as a list so that paths containing blanks stay intact. The
    # bibliography is bibdata and the citation style is CSL_FILE (the two
    # used to be passed the wrong way round).
    arguments = [
        "pandoc",
        "-o", "tmp_files/formatted_citations.html",
        "-t", "html",
        "--filter=pandoc-citeproc",
        "--bibliography=%s" % bibdata,
        "--csl=%s" % CSL_FILE,
        "tmp_files/used_citations.md",
    ]
    logging.info("Using external command pandoc with command %s", " ".join(arguments))
    returncode = subprocess.call(arguments)
    if returncode != 0:
        # Processing continues as before, but a stale or missing HTML file
        # is no longer a silent failure.
        logging.error("pandoc exited with status %s, formatted citations may be outdated or missing", returncode)

    with open("tmp_files/formatted_citations.html", "r", encoding="utf-8") as ding:
        dd = soupparser.fromstring(ding, features="html.parser")

    references = dd.xpath("//div[@class='references']")
    return references
# def format_citations ends here
	#!/usr/bin/env python3
	# -*- coding: utf-8; mode: python -*-

	import os
	import sys
	import subprocess
	import shlex
	import logging
	import configparser
	from lxml.html import soupparser

	##################################
	# Reading the configuration file #
	##################################
	# The configuration lives in a "config" directory next to the entry script.
	# os.path.join keeps the path relative when the script is started without a
	# directory component (dirname is then ""), instead of pointing at
	# "/config/eoaconvert.cfg" in the filesystem root.
	CONFIG_FILE = os.path.join(os.path.dirname(sys.argv[0]), "config", "eoaconvert.cfg")
	CONFIG = configparser.ConfigParser()
	# ConfigParser.read() skips files it cannot open, so a missing configuration
	# shows up as a KeyError on the first section lookup below.
	CONFIG.read(CONFIG_FILE)

	######################
	# Setting up logging #
	######################
	LOGFILE = CONFIG['General']['logfile']
	LOGLEVEL = CONFIG['General']['loglevel']

	CSL_FILE = CONFIG['Auxiliaries']['CSL_FILE']

	# NOTE(review): LOGFILE is read but not passed to basicConfig, so nothing is
	# written to that file from here - confirm this is intended.
	logging.basicConfig(level=LOGLEVEL, format='%(asctime)s - %(levelname)s - %(message)s')


	# Scratch directory for converted images, relative to the current working
	# directory. exist_ok avoids the race between checking and creating.
	os.makedirs("tmp_files/tmp_images/", exist_ok=True)

	tmp_image_dir = os.getcwd() + "/tmp_files/tmp_images/"
	# Receives the output of external tools (used as stdout in sanitizeImage).
	# It is opened once at import time and stays open for the life of the module.
	Datei = open('tmp_files/xelatex-run-images.log', 'w')

	# Setup of various dictionaries for localization of various elements
	dictLangFootnotes = {"it" : "Note a piè pagina", "fr" : "notes en bas de page", "de" : "Fußnoten", "en" : "Footnotes"}

	# the new-style footnotes that use LaTeX bigfoot show up in the following order:
	footnote_groups = ["decimal", "lower-latin"]

	#########################
	# Bibliography settings #
	#########################
	allowed_bibentry_types = ["book", "booklet", "report", "thesis", "misc", "incollection", "inproceedings", "article", "newspaper"]

	def get_bigfoot_data(chapter):
	    """
	    collect the new-style footnotes (the ones that use LaTeX bigfoot) of one chapter
	    footnotes and their numbering are per-chapter, hence the chapter argument
	    the result is an association list with one (style, notes) pair per entry of footnote_groups, in that order
	    notes holds the EOAbigfoot elements below the chapter whose "list-style-type" attribute equals style,
	    in the order in which findall reports them
	    a note with any other (or no) "list-style-type" ends up in no group
	    """
	    xmlBigfootNotes = list(chapter.findall(".//EOAbigfoot"))
	    return [ # a list
	        ( # of tuples
	            grouping, # the key
	            [ # the value: a filter of the above list
	                note
	                for note
	                in xmlBigfootNotes
	                if grouping == note.get("list-style-type")
	            ],
	        )
	        for grouping
	        in footnote_groups # the types we support
	    ]
	# def get_bigfoot_data ends here

	def sanitizeImage(strImagepath, GM_PATH, TL_PATH):
	    """Adjust and convert image for epub standard.

	    The image is first scaled down in place: to 1500 pixels width if it is
	    wider, then to 2000 pixels height if it is (still) higher. Afterwards a
	    PNG is converted to JPEG, and a PDF is cropped with pdfcrop and rendered
	    to PNG. In both cases the source file is deleted once the conversion
	    has succeeded. Any other format is left untouched.

	    Args:
	        strImagepath: Path of the image file. It is passed to the tools as
	            one argument, so it may contain blanks.
	        GM_PATH: Command providing the "identify" and "convert" subcommands.
	            It is split with shlex and may therefore carry extra tokens.
	        TL_PATH: Prefix put directly in front of
	            "texmf-dist/scripts/pdfcrop/pdfcrop.pl".

	    Returns:
	        The path of the resulting image: the ".jpg" sibling for PNG input,
	        the ".png" sibling for PDF input, otherwise strImagepath unchanged.

	    Raises:
	        subprocess.CalledProcessError: If an external command exits with a
	            non-zero status. The source file is not deleted in that case.
	        ValueError: If the reported width or height is not an integer.
	    """
	    logging.debug(strImagepath)
	    # Only the configured command prefixes go through shlex. File names are
	    # handed over as single list items, so they need no quoting.
	    gm_command = shlex.split(GM_PATH)

	    def identify(format_spec):
	        """Return the output of "identify -format <format_spec>" for the image."""
	        return subprocess.check_output(
	            gm_command + ["identify", "-format", format_spec, strImagepath],
	            shell=False, universal_newlines=True)

	    def resize_in_place(geometry):
	        """Run "convert -resize <geometry>" writing back to the same file."""
	        subprocess.check_output(
	            gm_command + ["convert", strImagepath, "-resize", geometry, strImagepath],
	            shell=False)

	    # The geometry strings are the literal values the tool receives. They
	    # were formerly written with a backslash only to survive shlex.split.
	    if int(identify("%w")) > 1500:
	        resize_in_place("1500x>")
	    # The height is queried after the width step because that step rewrites the file.
	    if int(identify("%h")) > 2000:
	        resize_in_place("x2000>")

	    strFileFormat = identify("%m").strip()
	    strNewImagepath = os.path.splitext(strImagepath)[0]

	    if strFileFormat == "PNG":
	        jpg_path = strNewImagepath + ".jpg"
	        # check_call: never delete the source after a failed conversion.
	        subprocess.check_call(gm_command + ["convert", strImagepath, jpg_path])
	        # A PNG that already carries the target name was converted in place.
	        if jpg_path != strImagepath:
	            os.remove(strImagepath)
	        strImagepath = jpg_path
	    elif strFileFormat == "PDF":
	        # Derived from the extension-less path so that only the suffix is
	        # affected and upper-case ".PDF" files get a distinct clipped file too.
	        clipped_file = strNewImagepath + "-clipped.pdf"
	        png_path = strNewImagepath + ".png"

	        Argumente = shlex.split(
	            TL_PATH + "texmf-dist/scripts/pdfcrop/pdfcrop.pl --margins 10 --clip --hires"
	        ) + [strImagepath, clipped_file]
	        logging.debug("%s", " ".join(Argumente))

	        # NOTE(review): pdfcrop runs inside tmp_image_dir while the convert
	        # calls run in the current directory - relative paths resolve
	        # differently for the two; confirm callers pass suitable paths.
	        subprocess.check_call(Argumente, cwd=tmp_image_dir, stdout=Datei)

	        subprocess.check_call(
	            gm_command + ["convert", "-density", "400", clipped_file, png_path])
	        logging.debug("Removing two files: %s and %s ", clipped_file, strImagepath)
	        os.remove(clipped_file)
	        if png_path != strImagepath:
	            os.remove(strImagepath)
	        strImagepath = png_path

	    return strImagepath
	# def sanitizeImage ends here

	def gettext(xmlElement):
	    """Return plain text out of nested elements.

	    The element's own text comes first. For every child, the child's
	    text (collected the same way) follows, then the text trailing that child.
	    Missing text and tail values count as empty strings.
	    """
	    xmlText = xmlElement.text or ""
	    for xmlChild in xmlElement:
	        xmlText += gettext(xmlChild)
	        if xmlChild.tail:
	            xmlText += xmlChild.tail
	    return xmlText
	# def gettext ends here

	def deb_var(obj):
	    """Print a debug line with the name and value of a module-level variable.

	    The name is looked up by identity: it is the first name in this module's
	    globals() that is bound to exactly obj. Approach taken from
	    https://stackoverflow.com/questions/592746/how-can-you-print-a-variable-name-in-python

	    Args:
	        obj: The object to report. If several globals refer to the same
	            object, the first one found is used. If none does (for example
	            a local variable or a variable of another module), the name is
	            printed as "<unknown>" instead of failing with an IndexError.
	    """
	    name = next(
	        (key for key, value in list(globals().items()) if value is obj),
	        "<unknown>",
	    )
	    print("DEBUG: %s: %s" % (name, obj))
	# def deb_var ends here

	def two_letter_language(language_string):
	    """Return a two letter code for a language.

	    Known spellings are english/en, german/deutsch/de, french/fr and
	    italian/it, matched exactly (case-sensitive). Anything else yields None.
	    """

	    if language_string in ["english", "en"]:
	        return "en"
	    elif language_string in ["german", "deutsch", "de"]:
	        return "de"
	    elif language_string in ["french", "fr"]:
	        return "fr"
	    elif language_string in ["italian", "it"]:
	        return "it"
	    return None
	# two_letter_language ends here

	# next function adapted from TEI2EOADjango
	def format_citations(used_citekeys, bibdata, language):
	    """Format the used citations with pandoc and return the reference blocks.

	    Writes tmp_files/used_citations.md containing, for every cite key, one
	    "[@key]" citation (section "citeauthoryear") and one "[-@key]" citation
	    (section "citeyear"), followed by an empty "References" section. Pandoc
	    with the pandoc-citeproc filter renders that file to
	    tmp_files/formatted_citations.html, which is then parsed.

	    Args:
	        used_citekeys: Iterable of cite keys. It is copied once, so a
	            one-shot iterator works as well.
	        bibdata: Path of the bibliography, passed to pandoc as --bibliography.
	        language: Language name or code, translated by two_letter_language
	            for the "lang" field of the markdown header.

	    Returns:
	        The list of elements matching //div[@class='references'] in the
	        generated HTML (an empty list if there is no such element).
	    """
	    md_file_header = "---\nlang: %s\ntitle: Citations\n...\n\n" % two_letter_language(language)
	    citekeys = list(used_citekeys)

	    # pandoc reads and writes UTF-8 regardless of the locale, so both files
	    # are opened with an explicit encoding.
	    with open("tmp_files/used_citations.md", "w", encoding="utf-8") as citation_formatter:
	        citation_formatter.write(md_file_header)
	        citation_formatter.write("# citeauthoryear\n")
	        citation_formatter.writelines("[@%s]\n" % entry for entry in citekeys)
	        citation_formatter.write("\n# citeyear\n")
	        citation_formatter.writelines("[-@%s]\n" % entry for entry in citekeys)
	        # a third section "yearparen" (plain "@key" citations) is currently disabled
	        citation_formatter.write("\n# References\n")

	    # Built as a list so that paths containing blanks stay intact. The
	    # bibliography is bibdata and the citation style is CSL_FILE (the two
	    # used to be passed the wrong way round).
	    arguments = [
	        "pandoc",
	        "-o", "tmp_files/formatted_citations.html",
	        "-t", "html",
	        "--filter=pandoc-citeproc",
	        "--bibliography=%s" % bibdata,
	        "--csl=%s" % CSL_FILE,
	        "tmp_files/used_citations.md",
	    ]
	    logging.info("Using external command pandoc with command %s", " ".join(arguments))
	    returncode = subprocess.call(arguments)
	    if returncode != 0:
	        # Processing continues as before, but a stale or missing HTML file
	        # is no longer a silent failure.
	        logging.error("pandoc exited with status %s, formatted citations may be outdated or missing", returncode)

	    with open("tmp_files/formatted_citations.html", "r", encoding="utf-8") as ding:
	        dd = soupparser.fromstring(ding, features="html.parser")

	    references = dd.xpath("//div[@class='references']")
	    return references
	# def format_citations ends here