Skip to content
Permalink
0db1ccae5d
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
258 lines (214 sloc) 9.77 KB
#!/usr/bin/env python3
# -*- coding: utf-8; mode: python -*-
"""A collection of functions for the different conversion steps"""
import os
import sys
import subprocess
import shlex
import logging
import configparser
from lxml import etree
from lxml.html import soupparser
##################################
# Reading the configuration file #
##################################
# The configuration lives in the "config" directory next to this module.
CONFIG_FILE = os.path.join(
    os.path.dirname(os.path.realpath(__file__)), "config", "eoaconvert.cfg"
)
CONFIG = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed.  Fail with a clear message here instead of with
# an opaque KeyError on the first section lookup below.
if not CONFIG.read(CONFIG_FILE):
    raise FileNotFoundError("Could not read configuration file %s" % CONFIG_FILE)

######################
# Setting up logging #
######################
# NOTE(review): LOGFILE is read but not handed to basicConfig in this block;
# confirm whether it is consumed elsewhere.
LOGFILE = CONFIG['General']['logfile']
LOGLEVEL = CONFIG['General']['loglevel']
CSL_FILE = CONFIG['Auxiliaries']['CSL_FILE']
TRANSLATION_FILE = CONFIG['Auxiliaries']['TRANSLATIONS']
logging.basicConfig(level=LOGLEVEL, format='%(asctime)s - %(levelname)s - %(message)s')

# Working directory for image conversion, relative to the current directory.
# exist_ok avoids the check-then-create race of a separate os.path.exists().
os.makedirs("tmp_files/tmp_images/", exist_ok=True)
tmp_image_dir = os.getcwd() + "/tmp_files/tmp_images/"
# Log file handle that stays open for the lifetime of the module;
# sanitizeImage passes it as stdout to the pdfcrop subprocess.
Datei = open('tmp_files/xelatex-run-images.log', 'w')
# Localization of various elements.
# The strings come from the translation file that is also used for XSL, so
# they are no longer hard-coded here.  Each name below is bound to the
# attribute mapping of one <entry> element (presumably keyed by language code,
# like the hard-coded dictionaries they replaced).
translation_xml = etree.parse(TRANSLATION_FILE)
dictLangFootnotes, dict_and, dict_ed, dict_eds = (
    translation_xml.find(entry_path).attrib
    for entry_path in (
        "//entry[@name='footnotes']",
        "//entry[@name='and']",
        "//entry[@name='editor-abbr']",
        "//entry[@name='editors-abbr']",
    )
)

# The new-style footnotes that use LaTeX bigfoot show up in the following order:
footnote_groups = [
    "decimal",
    "lower-latin",
]

#########################
# Bibliography settings #
#########################
allowed_bibentry_types = [
    "book",
    "booklet",
    "report",
    "thesis",
    "misc",
    "incollection",
    "inproceedings",
    "article",
    "newspaper",
]
def get_bigfoot_data(chapter):
    """
    Collect the new-style (LaTeX bigfoot) footnotes of one chapter.

    Footnotes are per-chapter and their numbers reset for each chapter.
    The result is an association list: one (group, notes) tuple for every
    entry of footnote_groups, in that order.  ``notes`` holds the chapter's
    EOAbigfoot elements whose "list-style-type" attribute equals the group,
    in the order in which they appear in the chapter.
    """
    bigfoot_notes = list(chapter.findall(".//EOAbigfoot"))
    notes_by_group = []
    for group in footnote_groups:  # the types we support
        members = []
        for candidate in bigfoot_notes:
            if group == candidate.get("list-style-type"):
                members.append(candidate)
        notes_by_group.append((group, members))
    return notes_by_group
# def get_bigfoot_data ends here
def sanitizeImage(strImagepath, GM_PATH, TL_PATH):
    """Adjust and convert image for epub standard.

    The image is resized in place when identify reports a width above 700
    or a height above 1000.  A file whose format is reported as PDF is
    additionally cropped with pdfcrop and converted to a PNG stored next to
    the original; the cropped PDF and the original PDF are then removed.
    Other formats (JPEG, PNG, ...) are left as they are.

    Args:
        strImagepath: path of the image file to process.
        GM_PATH: command providing the ``identify`` and ``convert``
            subcommands (presumably GraphicsMagick's ``gm``).  It is split
            with shell rules, so it may consist of several words.
        TL_PATH: prefix that is prepended to
            ``texmf-dist/scripts/pdfcrop/pdfcrop.pl`` (presumably the TeX
            Live root including a trailing path separator).

    Returns:
        The path of the resulting image: ``strImagepath`` itself, or the
        path of the newly created PNG when the input was a PDF.

    Raises:
        subprocess.CalledProcessError: if an identify or resize call fails.
        ValueError: if identify does not print an integer width/height.
    """
    logging.debug(strImagepath)
    # Only the configured tool prefix is shell-split.  File names are handed
    # over as single argv entries so that paths containing spaces or quotes
    # are not torn apart.
    gm_prefix = shlex.split(GM_PATH)

    def identify(format_spec):
        """Return the raw identify output for one format specifier."""
        return subprocess.check_output(
            gm_prefix + ["identify", "-format", format_spec, strImagepath],
            shell=False,
            universal_newlines=True,
        )

    def resize(geometry):
        """Resize the image in place using the given geometry string."""
        subprocess.check_output(
            gm_prefix + ["convert", strImagepath, "-resize", geometry, strImagepath],
            shell=False,
        )

    # The trailing ">" in the geometry is passed through unchanged from the
    # original command line.
    if int(identify("%w")) > 700:
        resize("700x>")
    if int(identify("%h")) > 1000:
        resize("x1000>")

    strFileFormat = identify("%m").strip()
    if strFileFormat == "PDF":
        strNewImagepath = os.path.splitext(strImagepath)[0]
        # Derive the name from the stem instead of replacing ".pdf" anywhere
        # in the path: that also works for "x.PDF" and for directories whose
        # name contains ".pdf".
        clipped_file = strNewImagepath + "-clipped.pdf"
        pdfcrop_arguments = shlex.split(
            TL_PATH + "texmf-dist/scripts/pdfcrop/pdfcrop.pl"
        ) + ["--margins", "10", "--clip", "--hires", strImagepath, clipped_file]
        logging.debug(" ".join(pdfcrop_arguments))
        # pdfcrop runs inside the temporary image directory and writes its
        # output to the module-level log file.
        subprocess.call(pdfcrop_arguments, cwd=tmp_image_dir, stdout=Datei)
        subprocess.call(
            gm_prefix
            + ["convert", "-density", "400", clipped_file, strNewImagepath + ".png"]
        )
        logging.debug("Removing two files: %s and %s ", clipped_file, strImagepath)
        os.remove(clipped_file)
        os.remove(strImagepath)
        strImagepath = strNewImagepath + ".png"
    return strImagepath
# def sanitizeImage ends here
def gettext(xmlElement):
    """Return plain text out of nested elements.

    The element's own text, the text of every descendant and the tail text
    following each child are concatenated in document order.  Missing text
    or tail counts as the empty string.
    """
    fragments = [xmlElement.text or ""]
    for child in xmlElement:
        fragments.append(gettext(child))
        fragments.append(child.tail or "")
    return "".join(fragments)
# def gettext ends here
def deb_var(obj):
    """Print a module-level variable's name together with its value.

    The name is found by scanning this module's globals for the first entry
    that is the very same object as ``obj``.  If no global refers to the
    object, "<unknown>" is printed as the name, so the debugging aid itself
    never raises.

    https://stackoverflow.com/questions/592746/how-can-you-print-a-variable-name-in-python
    """
    # list() takes a snapshot so the scan is independent of later changes.
    name = next(
        (key for key, value in list(globals().items()) if value is obj),
        "<unknown>",
    )
    print("DEBUG: %s: %s" % (name, obj))
# def deb_var ends here
def two_letter_language(language_string):
    """Return a two letter code for a language.

    Recognised spellings are listed in the table below.  Any other value
    yields None.
    """
    known_languages = (
        ("en", ["english", "en"]),
        ("de", ["german", "deutsch", "de"]),
        ("fr", ["french", "fr"]),
        ("it", ["italian", "it"]),
    )
    for code, spellings in known_languages:
        if language_string in spellings:
            return code
    return None
# two_letter_language ends here
def plural(num, noun):
    """Return noun unchanged when num is 1, otherwise noun with an appended 's'.

    Only suitable for nouns whose plural is formed with 's'.
    """
    return noun if num == 1 else noun + "s"
# def plural ends here
def format_citations(used_citekeys, bibdata, language, tmp_filename):
    """Return a formatted xmlstring of the used citations.

    A temporary markdown file is written that cites every key once in
    author-year form and once in year-only form.  pandoc (with the
    pandoc-citeproc filter) renders it to HTML, and the HTML is parsed again.

    Args:
        used_citekeys: the citation keys to format; iterated twice.
        bibdata: path of the bibliography file handed to pandoc.
        language: language name or code, passed through two_letter_language
            for the ``lang`` field of the markdown header.
        tmp_filename: base name (without extension) of the temporary
            ``.md`` and ``.html`` files inside ``tmp_files``.

    Returns:
        The list of ``div`` elements with class ``references`` found in the
        HTML produced by pandoc.
    """
    tmp_path_md = os.path.join("tmp_files", tmp_filename + ".md")
    tmp_path_html = os.path.join("tmp_files", tmp_filename + ".html")
    md_file_header = "---\nlang: %s\ntitle: Citations\n...\n\n" % two_letter_language(language)

    # pandoc expects UTF-8 input, so do not rely on the locale's encoding.
    with open(tmp_path_md, "w", encoding="utf-8") as citation_formatter:
        citation_formatter.write(md_file_header)
        citation_formatter.write("# citeauthoryear\n")
        for entry in used_citekeys:
            citation_formatter.write("[@%s]\n" % entry)
        citation_formatter.write("\n# citeyear\n")
        for entry in used_citekeys:
            citation_formatter.write("[-@%s]\n" % entry)
        citation_formatter.write("\n# References\n")

    # Build the argument vector directly so that paths containing spaces
    # reach pandoc as single arguments.
    arguments = [
        "pandoc",
        "-o", tmp_path_html,
        "-t", "html",
        "--filter=pandoc-citeproc",
        "--bibliography=%s" % bibdata,
        "--csl=%s" % CSL_FILE,
        tmp_path_md,
    ]
    logging.info("Using external command pandoc with command %s", " ".join(arguments))
    returncode = subprocess.call(arguments)
    if returncode != 0:
        # Keep the best-effort behaviour, but leave a trace of the failure.
        logging.error("pandoc exited with status %s", returncode)

    # pandoc writes UTF-8 regardless of the locale.
    with open(tmp_path_html, "r", encoding="utf-8") as ding:
        dd = soupparser.fromstring(ding, features="html.parser")
    references = dd.xpath("//div[@class='references']")
    return references
# def format_citations ends here
def fix_bib_entries(div_snippet):
    """Modify the html code returned by pandoc-citeproc.

    Every ``div`` below ``div_snippet`` is changed in place: its class is
    set to "bibliography", inner ``p`` tags are stripped (their content is
    kept), the ``div`` itself becomes a ``p``, and ``em`` elements inside it
    become ``i``.

    Returns:
        The same ``div_snippet`` element, after modification.
    """
    for entry in div_snippet.findall(".//div"):
        entry.set("class", "bibliography")
        # Drop the inner <p> wrappers first, then turn the entry itself
        # into the paragraph.
        etree.strip_tags(entry, "p")
        entry.tag = "p"
        for markup in entry.findall(".//em"):
            markup.tag = "i"
    return div_snippet
# def fix_bib_entries ends here
def debug_xml_here(xml_tree, xml_filename):
    """Dump current state of an XML tree into a file for inspection.

    The tree is written to ``debug/debug_<xml_filename>.xml`` below the
    current working directory.  The ``debug`` directory is created if needed.

    Args:
        xml_tree: an lxml ElementTree, or an element that is wrapped into
            one before writing.
        xml_filename: label used in the name of the output file.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs("debug", exist_ok=True)
    xml_path = "%s/debug/debug_%s.xml" % (os.getcwd(), xml_filename)
    # Only an ElementTree offers write(); wrap anything else.
    if not isinstance(xml_tree, etree._ElementTree):
        xml_tree = etree.ElementTree(xml_tree)
    xml_tree.write(xml_path, pretty_print=True, xml_declaration=True, encoding="utf-8")
    logging.info("Wrote %s.", xml_path)
# def debug_xml_here ends here