Permalink
Switch branches/tags
Nothing to show
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
259 lines (214 sloc) 9.77 KB
#!/usr/bin/env python3
# -*- coding: utf-8; mode: python -*-
"""A collection of functions for the different conversion steps"""
import os
import sys
import subprocess
import shlex
import logging
import configparser
from lxml import etree
from lxml.html import soupparser
##################################
# Reading the configuration file #
##################################
# The configuration file is looked up in the "config" directory that sits next
# to this module; the module location is taken from the resolved path of
# __file__, not from the current working directory.
# (A previous, now disabled, variant started from
# os.path.abspath(os.path.dirname(sys.argv[0])) instead.)
CONFIG_FILE = os.path.sep.join(
    (
        os.path.dirname(os.path.realpath(__file__)),
        "config",
        "eoaconvert.cfg",
    )
)

# Parse the file once at import time; the parsed values are read right below.
CONFIG = configparser.ConfigParser()
CONFIG.read(CONFIG_FILE)
######################
# Setting up logging #
######################
# Values taken from the parsed configuration file.
# NOTE: LOGFILE is read here but is not passed to logging.basicConfig below.
LOGFILE = CONFIG['General']['logfile']
LOGLEVEL = CONFIG['General']['loglevel']
CSL_FILE = CONFIG['Auxiliaries']['CSL_FILE']
TRANSLATION_FILE = CONFIG['Auxiliaries']['TRANSLATIONS']

logging.basicConfig(level=LOGLEVEL, format='%(asctime)s - %(levelname)s - %(message)s')

# Working directory for temporary images, relative to the current working
# directory. exist_ok=True makes the creation idempotent and avoids the race
# between a separate existence check and the directory creation.
os.makedirs("tmp_files/tmp_images/", exist_ok=True)
tmp_image_dir = os.getcwd() + "/tmp_files/tmp_images/"

# Log file that receives the output of the pdfcrop run in sanitizeImage.
# It is deliberately kept open for the lifetime of the module because
# sanitizeImage hands it to subprocess as stdout.
Datei = open('tmp_files/xelatex-run-images.log', 'w')
# Localization tables for various elements.
# They used to be hard-coded dictionaries keyed by language code:
# dictLangFootnotes = {"it" : "Note a piè pagina", "fr" : "notes en bas de page", "de" : "Fußnoten", "en" : "Footnotes"}
# dict_and = {"en" : "and", "de" : "und", "fr" : "et", "it" : "e"}
# dict_ed = {"en" : "ed.", "de" : "Hrsg."}
# dict_eds = {"en" : "eds.", "de" : "Hrsg."}
# Now the translation file that is also used for XSL is the single source:
# each table is the attribute mapping of the matching <entry> element.
translation_xml = etree.parse(TRANSLATION_FILE)
dictLangFootnotes, dict_and, dict_ed, dict_eds = (
    translation_xml.find(entry_path).attrib
    for entry_path in (
        "//entry[@name='footnotes']",
        "//entry[@name='and']",
        "//entry[@name='editor-abbr']",
        "//entry[@name='editors-abbr']",
    )
)

# Order in which the new-style (LaTeX bigfoot) footnote groups show up.
footnote_groups = [
    "decimal",
    "lower-latin",
]

#########################
# Bibliography settings #
#########################
allowed_bibentry_types = [
    "book",
    "booklet",
    "report",
    "thesis",
    "misc",
    "incollection",
    "inproceedings",
    "article",
    "newspaper",
]
def get_bigfoot_data(chapter):
    """Collect the new-style (LaTeX bigfoot) footnotes of one chapter.

    Footnotes are handled per chapter, and their numbering restarts in
    every chapter.

    The return value is an association list with one ``(group, notes)``
    tuple per entry of the module-level ``footnote_groups``, in that order.
    ``notes`` holds every ``EOAbigfoot`` descendant of ``chapter`` whose
    ``list-style-type`` attribute equals ``group``, in the order returned by
    ``chapter.findall``.
    """
    all_notes = chapter.findall(".//EOAbigfoot")

    grouped = []
    for group_name in footnote_groups:  # the types we support
        members = []
        for candidate in all_notes:
            if candidate.get("list-style-type") == group_name:
                members.append(candidate)
        grouped.append((group_name, members))
    return grouped
# def get_bigfoot_data ends here
def _gm_identify(gm_command, format_spec, image_path):
    """Run ``<gm> identify -format <format_spec> <image_path>`` and return its stripped output."""
    raw_output = subprocess.check_output(
        gm_command + ["identify", "-format", format_spec, image_path],
        shell=False,
        universal_newlines=True,
    )
    return raw_output.strip()


def _gm_resize(gm_command, geometry, image_path):
    """Run ``<gm> convert`` to resize image_path in place to the given geometry."""
    subprocess.check_output(
        gm_command + ["convert", image_path, "-resize", geometry, image_path],
        shell=False,
    )


def sanitizeImage(strImagepath, GM_PATH, TL_PATH):
    """Adjust and convert image for epub standard.

    The image is shrunk in place when it is wider than 700 or higher than
    1000 (as reported by ``identify``). Files identified as PDF are cropped
    with pdfcrop, converted to PNG and the PDF files are removed; every other
    format is left as it is.

    Parameters:
        strImagepath: path of the image file. It is handed to the external
            tools as one single argument, so it may contain spaces or quotes.
        GM_PATH: command used for ``identify``/``convert`` (presumably
            GraphicsMagick's ``gm`` -- confirm). It is split shell-style, so
            it may carry several words.
        TL_PATH: prefix that is put in front of
            ``texmf-dist/scripts/pdfcrop/pdfcrop.pl``.

    Returns:
        The path of the resulting image: the new ``.png`` path for PDF input,
        otherwise ``strImagepath`` unchanged.

    Raises:
        subprocess.CalledProcessError: if an identify or resize call fails.
        ValueError: if the reported width or height is not an integer.
    """
    logging.debug(strImagepath)
    gm_command = shlex.split(GM_PATH)

    # Width first, then height: the height is measured after a possible
    # width-driven resize. The ">" suffix asks the tool to only ever shrink.
    if int(_gm_identify(gm_command, "%w", strImagepath)) > 700:
        _gm_resize(gm_command, "700x>", strImagepath)
    if int(_gm_identify(gm_command, "%h", strImagepath)) > 1000:
        _gm_resize(gm_command, "x1000>", strImagepath)

    strFileFormat = _gm_identify(gm_command, "%m", strImagepath)
    if strFileFormat != "PDF":
        # JPEG, PNG and all other formats are returned untouched.
        return strImagepath

    strNewImagepath = os.path.splitext(strImagepath)[0]
    # Derive the name from the extension-less root instead of replacing the
    # text ".pdf": that also works for ".PDF", for files without extension and
    # for directory names that happen to contain ".pdf".
    clipped_file = strNewImagepath + "-clipped.pdf"

    pdfcrop_arguments = shlex.split(TL_PATH + "texmf-dist/scripts/pdfcrop/pdfcrop.pl") + [
        "--margins", "10", "--clip", "--hires", strImagepath, clipped_file,
    ]
    logging.debug(" ".join(pdfcrop_arguments))
    # NOTE(review): pdfcrop runs with cwd=tmp_image_dir while the other tools
    # run in the current directory, so a relative strImagepath is resolved
    # differently -- confirm that callers pass absolute paths.
    subprocess.call(pdfcrop_arguments, cwd=tmp_image_dir, stdout=Datei)

    strPngPath = strNewImagepath + ".png"
    subprocess.call(gm_command + ["convert", "-density", "400", clipped_file, strPngPath])

    logging.debug("Removing two files: %s and %s ", clipped_file, strImagepath)
    os.remove(clipped_file)
    os.remove(strImagepath)
    return strPngPath
# def sanitizeImage ends here
def gettext(xmlElement):
    """Return the plain text of an element, including all nested elements.

    The result is the element's own text, followed for every child (in
    document order) by the child's plain text and the tail text after it.
    Missing text or tail counts as the empty string.
    """
    fragments = [xmlElement.text or ""]
    for child in xmlElement:
        fragments.append(gettext(child))
        fragments.append(child.tail or "")
    return "".join(fragments)
# def gettext ends here
def deb_var(obj):
    """Print a debug line with the name and the value of obj.

    The name is the first name in this module's globals that is bound to
    the very same object (identity, not equality). If no global refers to
    obj, the placeholder ``<unknown>`` is printed instead of failing with an
    IndexError.

    Idea taken from
    https://stackoverflow.com/questions/592746/how-can-you-print-a-variable-name-in-python
    """
    name = next(
        (
            candidate
            for candidate, value in list(globals().items())
            if value is obj
        ),
        "<unknown>",
    )
    print("DEBUG: %s: %s" % (name, obj))
# def deb_var ends here
def two_letter_language(language_string):
    """Return a two letter code for a language.

    Accepts the English language name, the two letter code itself and, for
    German, also "deutsch". The comparison is exact (case-sensitive); any
    other value yields None.
    """
    known_languages = (
        ("en", ["english", "en"]),
        ("de", ["german", "deutsch", "de"]),
        ("fr", ["french", "fr"]),
        ("it", ["italian", "it"]),
    )
    for code, spellings in known_languages:
        if language_string in spellings:
            return code
    return None
# two_letter_language ends here
def plural(num, noun):
    """Return noun unchanged when num equals 1, otherwise noun with an appended "s".

    Only suitable for nouns that form their plural by adding "s".
    """
    return noun if num == 1 else noun + "s"
# def plural ends here
def format_citations(used_citekeys, bibdata, language, tmp_filename):
    """Format the used citations with pandoc and return the references markup.

    A temporary Markdown file is written to ``tmp_files/<tmp_filename>.md``.
    It contains every cite key once in author-year form (``[@key]``) and once
    in year-only form (``[-@key]``), followed by a "References" heading.
    pandoc (with the pandoc-citeproc filter and the configured CSL file)
    renders it to ``tmp_files/<tmp_filename>.html``, which is then parsed.

    Parameters:
        used_citekeys: iterable of cite keys; it is iterated twice.
        bibdata: path of the bibliography file handed to pandoc.
        language: language name or code, mapped by two_letter_language.
        tmp_filename: base name (without extension) of the temporary files.

    Returns:
        The result of the XPath query ``//div[@class='references']`` on the
        parsed HTML.
    """
    tmp_path_md = "tmp_files" + os.path.sep + tmp_filename + ".md"
    tmp_path_html = "tmp_files" + os.path.sep + tmp_filename + ".html"

    md_file_header = "---\nlang: %s\ntitle: Citations\n...\n\n" % two_letter_language(language)

    # pandoc reads and writes UTF-8, so do not depend on the locale encoding.
    with open(tmp_path_md, "w", encoding="utf-8") as citation_formatter:
        citation_formatter.write(md_file_header)
        citation_formatter.write("# citeauthoryear\n")
        citation_formatter.writelines("[@%s]\n" % entry for entry in used_citekeys)
        citation_formatter.write("\n# citeyear\n")
        citation_formatter.writelines("[-@%s]\n" % entry for entry in used_citekeys)
        # A third section "yearparen" ("@key" per entry) is currently disabled.
        citation_formatter.write("\n# References\n")

    # Pass the arguments as a list so that paths containing spaces or quotes
    # reach pandoc unchanged.
    arguments = [
        "pandoc",
        "-o", tmp_path_html,
        "-t", "html",
        "--filter=pandoc-citeproc",
        "--bibliography=%s" % bibdata,
        "--csl=%s" % CSL_FILE,
        tmp_path_md,
    ]
    logging.info("Using external command pandoc with command %s", " ".join(arguments))
    exit_code = subprocess.call(arguments)
    if exit_code != 0:
        # Keep going as before, but leave a trace of the failure.
        logging.error("pandoc exited with status %s", exit_code)

    with open(tmp_path_html, "r", encoding="utf-8") as html_file:
        parsed_html = soupparser.fromstring(html_file, features="html.parser")

    return parsed_html.xpath("//div[@class='references']")
# def format_citations ends here
def fix_bib_entries(div_snippet):
    """Modify the html code returned by pandoc-citeproc.

    Every ``div`` below ``div_snippet`` is turned into a ``p`` element with
    the class "bibliography": its inner ``p`` tags are stripped (their content
    is kept) and every ``em`` inside it is renamed to ``i``.

    The tree is modified in place; ``div_snippet`` itself is returned for
    convenience.
    """
    for entry in div_snippet.findall(".//div"):
        entry.set("class", "bibliography")
        # Remove the wrapping <p> tags first, then reuse the tag name for the
        # entry itself.
        etree.strip_tags(entry, "p")
        entry.tag = "p"

        for markup in entry.findall(".//em"):
            markup.tag = "i"

    return div_snippet
# def fix_bib_entries ends here
def debug_xml_here(xml_tree, xml_filename):
    """Dump current state of an XML tree into a file for inspection.

    The file is written to ``debug/debug_<xml_filename>.xml`` below the
    current working directory; the directory is created when missing.

    Parameters:
        xml_tree: an element tree, or an element that is wrapped into a tree
            before writing.
        xml_filename: distinguishing part of the output file name.
    """
    # exist_ok=True avoids the race between checking for the directory and
    # creating it.
    os.makedirs("debug", exist_ok=True)

    xml_path = "%s/debug/debug_%s.xml" % (os.getcwd(), xml_filename)

    # Only trees offer write(); wrap anything else.
    if not isinstance(xml_tree, etree._ElementTree):
        xml_tree = etree.ElementTree(xml_tree)

    xml_tree.write(xml_path, pretty_print=True, xml_declaration=True, encoding="utf-8")
    logging.info("Wrote %s.", xml_path)
# def debug_xml_here ends here