Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
EOASkripts/libeoaconvert.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
386 lines (309 sloc)
13.7 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# -*- coding: utf-8; mode: python -*- | |
"""A collection of functions for the different conversion steps""" | |
import os | |
import sys | |
import subprocess | |
import shlex | |
import logging | |
import configparser | |
from lxml import etree | |
from lxml.html import soupparser | |
##################################
# Reading the configuration file #
##################################
# The configuration is read from config/eoaconvert.cfg next to this module.
CONFIG_FILE = os.path.join(os.path.dirname(os.path.realpath(__file__)), "config", "eoaconvert.cfg")
CONFIG = configparser.ConfigParser()
# ConfigParser.read() skips files it cannot open, so a missing configuration
# file only shows up as a KeyError in the lookups below.
CONFIG.read(CONFIG_FILE)

######################
# Setting up logging #
######################
LOGFILE = CONFIG['General']['logfile']
LOGLEVEL = CONFIG['General']['loglevel']
CSL_FILE = CONFIG['Auxiliaries']['CSL_FILE']
TRANSLATION_FILE = CONFIG['Auxiliaries']['TRANSLATIONS']
# NOTE: LOGFILE is read above but is not passed to basicConfig here.
logging.basicConfig(level=LOGLEVEL, format='%(asctime)s - %(levelname)s - %(message)s')

# Setup of various dictionaries for localization of various elements
# dictLangFootnotes = {"it" : "Note a piè pagina", "fr" : "notes en bas de page", "de" : "Fußnoten", "en" : "Footnotes"}
# dict_and = {"en" : "and", "de" : "und", "fr" : "et", "it" : "e"}
# dict_ed = {"en" : "ed.", "de" : "Hrsg."}
# dict_eds = {"en" : "eds.", "de" : "Hrsg."}
# use the translation file that is used also for XSL
translation_xml = etree.parse(TRANSLATION_FILE)
# The paths are written as ".//entry" because lxml emits a FutureWarning for
# "//entry" on an ElementTree and rewrites it to this form anyway.
# Each lookup yields the attribute mapping of the matching <entry> element.
dictLangFootnotes = translation_xml.find(".//entry[@name='footnotes']").attrib
dict_and = translation_xml.find(".//entry[@name='and']").attrib
dict_ed = translation_xml.find(".//entry[@name='editor-abbr']").attrib
dict_eds = translation_xml.find(".//entry[@name='editors-abbr']").attrib

# the new-style footnotes that use LaTeX bigfoot show up in the following order:
footnote_groups = ["decimal", "lower-latin"]

#########################
# Bibliography settings #
#########################
allowed_bibentry_types = ["book", "booklet", "report", "thesis", "misc", "incollection", "inproceedings", "article", "newspaper"]
def get_bigfoot_data(chapter):
    """
    Collect the new-style (LaTeX bigfoot) footnotes of one chapter.

    Footnotes are numbered per chapter, so a single chapter is passed in.
    The result is an association list: one ``(group, notes)`` tuple for each
    entry of ``footnote_groups``, in that order. ``notes`` holds the chapter's
    ``EOAbigfoot`` elements whose ``list-style-type`` attribute equals the
    group, in the order in which they appear in the chapter.
    """
    all_notes = list(chapter.findall(".//EOAbigfoot"))
    grouped = []
    for group in footnote_groups:  # the types we support
        members = [note for note in all_notes if group == note.get("list-style-type")]
        grouped.append((group, members))
    return grouped
# def get_bigfoot_data ends here
def _gm_identify(gm_command, format_spec, image_path):
    """Return the raw output of ``gm identify -format <format_spec> <image_path>``."""
    return subprocess.check_output(
        gm_command + ["identify", "-format", format_spec, image_path],
        shell=False,
        universal_newlines=True,
    )


def _gm_resize_in_place(gm_command, geometry, image_path):
    """Overwrite the image with a version resized to the given geometry."""
    subprocess.check_output(
        gm_command + ["convert", image_path, "-resize", geometry, image_path],
        shell=False,
    )


def _pdf_to_png(pdf_path, gm_command, TL_PATH, working_dir, logfile):
    """Crop a PDF with pdfcrop, render it to PNG, delete both PDFs, return the PNG path."""
    base_path = os.path.splitext(pdf_path)[0]
    # Derive the name from the extension-less path so that only the suffix is
    # touched and an upper-case ".PDF" does not map onto the source file.
    clipped_file = base_path + "-clipped.pdf"
    png_path = base_path + ".png"
    pdfcrop_command = shlex.split(TL_PATH + "texmf-dist/scripts/pdfcrop/pdfcrop.pl") + [
        "--margins", "10", "--clip", "--hires", pdf_path, clipped_file,
    ]
    logging.debug(" ".join(shlex.quote(argument) for argument in pdfcrop_command))
    # pdfcrop runs inside the temporary image directory; its output goes to the log file.
    subprocess.call(pdfcrop_command, cwd=working_dir, stdout=logfile)
    subprocess.call(gm_command + ["convert", "-density", "400", clipped_file, png_path])
    logging.debug("Removing two files: %s and %s " % (clipped_file, pdf_path))
    os.remove(clipped_file)
    os.remove(pdf_path)
    return png_path


def sanitizeImage(strImagepath, GM_PATH, TL_PATH):
    """Adjust and convert an image for the epub standard.

    The image is shrunk in place to at most 700 pixels width and 1000 pixels
    height. JPEG and PNG files are otherwise left alone; a PDF is cropped with
    pdfcrop, rendered to PNG and then deleted.

    Arguments are handed to the external programs as lists, so image paths
    containing spaces or shell metacharacters are passed through intact.

    :param strImagepath: path of the image file
    :param GM_PATH: command used to invoke GraphicsMagick (split with shlex,
        so it may carry leading arguments)
    :param TL_PATH: prefix to which ``texmf-dist/scripts/pdfcrop/pdfcrop.pl``
        is appended
    :return: path of the resulting image (differs from the input only for PDFs)
    :raises subprocess.CalledProcessError: if a ``gm identify`` or resize call fails
    """
    os.makedirs("tmp_files/tmp_images/", exist_ok=True)
    tmp_image_dir = os.getcwd() + "/tmp_files/tmp_images/"
    gm_command = shlex.split(GM_PATH)
    # The log file is (re)created on every call; the with-block guarantees it
    # is closed again even when one of the external commands fails.
    with open('tmp_files/xelatex-run-images.log', 'w') as xelatex_sanitizeimage_logfile:
        logging.debug(strImagepath)
        if int(_gm_identify(gm_command, "%w", strImagepath)) > 700:
            # the trailing ">" tells gm to shrink only, never to enlarge
            _gm_resize_in_place(gm_command, "700x>", strImagepath)
        if int(_gm_identify(gm_command, "%h", strImagepath)) > 1000:
            _gm_resize_in_place(gm_command, "x1000>", strImagepath)
        strFileFormat = _gm_identify(gm_command, "%m", strImagepath).strip()
        if strFileFormat == "PDF":
            strImagepath = _pdf_to_png(
                strImagepath, gm_command, TL_PATH, tmp_image_dir, xelatex_sanitizeimage_logfile
            )
        # JPEG, PNG and every other format are returned without conversion.
    return strImagepath
# def sanitizeImage ends here
def gettext(xmlElement):
    """Return the plain text of an element, including all nested elements.

    The element's own text comes first; for every child, the child's
    (recursively collected) text is followed by the child's tail.
    """
    pieces = [xmlElement.text or ""]
    for child in xmlElement:
        pieces.append(gettext(child))
        pieces.append(child.tail or "")
    return "".join(pieces)
# def gettext ends here
def deb_var(obj):
    """Print a debug line with the name and value of a module-level variable.

    The name is found by identity among this module's globals (idea from
    https://stackoverflow.com/questions/592746/how-can-you-print-a-variable-name-in-python),
    so only objects bound to a global name of this module can be named; the
    first matching name wins. An object without such a binding is printed
    under the placeholder ``<unknown>`` instead of raising ``IndexError``.
    """
    # Iterate over a snapshot so the lookup cannot trip over a changing dict.
    name = next(
        (key for key, value in list(globals().items()) if value is obj),
        "<unknown>",
    )
    print("DEBUG: %s: %s" % (name, obj))
# def deb_var ends here
def two_letter_language(language_string):
    """Map a language name or code to its two-letter code.

    English, German, French and Italian are recognised; any other input
    yields None.
    """
    known_languages = (
        ("en", ["english", "en"]),
        ("de", ["german", "deutsch", "de"]),
        ("fr", ["french", "fr"]),
        ("it", ["italian", "it"]),
    )
    for code, spellings in known_languages:
        if language_string in spellings:
            return code
    return None
# two_letter_language ends here
def plural(num, noun):
    """Pick the singular or plural form of a noun according to num.

    Only suitable for nouns whose plural is formed by appending 's'.
    """
    return noun if num == 1 else noun + "s"
# def plural ends here
def format_citations(used_citekeys, bibdata, language, tmp_filename):
    """Return the formatted citations of the used citekeys.

    A Markdown file listing every citekey twice (author-year form ``[@key]``
    and year-only form ``[-@key]``) is written to ``tmp_files`` and converted
    to HTML by pandoc with the pandoc-citeproc filter and the configured CSL
    file. The resulting HTML is parsed and the ``div`` elements of class
    ``references`` are returned.

    :param used_citekeys: iterable of citation keys; it is traversed twice
    :param bibdata: path of the bibliography file handed to pandoc
    :param language: language name or code, converted with two_letter_language
    :param tmp_filename: base name (without extension) of the temporary files
    :return: list of matching elements as returned by ``xpath``
    """
    tmp_path_md = "tmp_files" + os.path.sep + tmp_filename + ".md"
    tmp_path_html = "tmp_files" + os.path.sep + tmp_filename + ".html"
    md_file_header = "---\nlang: %s\ntitle: Citations\n...\n\n" % two_letter_language(language)

    # pandoc reads and writes UTF-8, so do not depend on the locale encoding.
    with open(tmp_path_md, "w", encoding="utf-8") as citation_formatter:
        citation_formatter.write(md_file_header)
        citation_formatter.write("# citeauthoryear\n")
        citation_formatter.writelines("[@%s]\n" % entry for entry in used_citekeys)
        citation_formatter.write("\n# citeyear\n")
        citation_formatter.writelines("[-@%s]\n" % entry for entry in used_citekeys)
        citation_formatter.write("\n# References\n")

    # An argument list keeps paths with spaces intact (no shell-style splitting).
    arguments = [
        "pandoc",
        "-o", tmp_path_html,
        "-t", "html",
        "--filter=pandoc-citeproc",
        "--bibliography=%s" % bibdata,
        "--csl=%s" % CSL_FILE,
        tmp_path_md,
    ]
    command = " ".join(shlex.quote(argument) for argument in arguments)
    logging.info("Using external command pandoc with command %s" % command)
    exit_status = subprocess.call(arguments)
    if exit_status != 0:
        # Stay best-effort, but make a failed conversion visible: the HTML
        # file read below may be missing or left over from an earlier run.
        logging.error("pandoc exited with status %s", exit_status)

    with open(tmp_path_html, "r", encoding="utf-8") as html_file:
        parsed_html = soupparser.fromstring(html_file, features="html.parser")
    references = parsed_html.xpath("//div[@class='references']")
    return references
# def format_citations ends here
def fix_bib_entries(div_snippet):
    """Rework the HTML returned by pandoc-citeproc.

    Every ``div`` below the snippet gets the class ``bibliography`` and is
    turned into a ``p``; ``p`` tags inside it are stripped (their content is
    kept) and ``em`` elements are renamed to ``i``. The snippet is modified in
    place and returned.
    """
    for bib_div in div_snippet.findall(".//div"):
        bib_div.set("class", "bibliography")
        # Dissolve inner <p> wrappers before the div itself becomes a <p>.
        etree.strip_tags(bib_div, "p")
        bib_div.tag = "p"
        for emphasis in bib_div.findall(".//em"):
            emphasis.tag = "i"
    return div_snippet
# def fix_bib_entries ends here
def debug_xml_here(xml_tree, xml_filename):
    """Dump the current state of an XML tree to a file for inspection.

    The file is written to ``debug/debug_<xml_filename>.xml`` below the
    current working directory; the ``debug`` directory is created if needed.
    Anything that is not already an ElementTree is wrapped into one first.
    """
    if not os.path.exists("debug"):
        os.makedirs("debug")
    target_path = "%s/debug/debug_%s.xml" % (os.getcwd(), xml_filename)
    if not isinstance(xml_tree, etree._ElementTree):
        xml_tree = etree.ElementTree(xml_tree)
    xml_tree.write(target_path, pretty_print=True, xml_declaration=True, encoding="utf-8")
    logging.info("Wrote %s.", target_path)
# def debug_xml_here ends here
def wrap_into_element(wrapper, wrappee):
    """Enclose an existing element in a new one.

    ``wrapper`` takes the place of ``wrappee`` in the tree and ``wrappee``
    becomes its first child. The tail text of ``wrappee`` is handed over to
    ``wrapper``.
    """
    trailing_text, wrappee.tail = wrappee.tail, ""
    # Place the wrapper in front of the wrappee first, then move the wrappee into it.
    wrappee.addprevious(wrapper)
    wrapper.insert(0, wrappee)
    wrapper.tail = trailing_text
# def wrap_into_element ends here
def remove_wrapping_element(wrapper):
    """Move the child elements one level up and delete the surrounding element.

    The children are inserted into the wrapper's parent at the wrapper's
    position, in their original order. Text directly inside the wrapper and
    the wrapper's tail are not carried over; a warning is logged when either
    contains more than whitespace.

    :param wrapper: element to dissolve; it needs a parent element
        (``getparent()`` is used without a None check)
    """
    wrapper_parent = wrapper.getparent()
    # list(element) replaces the deprecated getchildren() and gives a stable
    # snapshot while the children are being moved.
    wrappees = list(wrapper)

    wrapper_text = (wrapper.text or "").strip()
    if wrapper_text:
        logging.warning("Wrapping element contains text: %s", wrapper_text)
    wrapper_tail = (wrapper.tail or "").strip()
    if wrapper_tail:
        logging.warning("Wrapping element has tail: %s", wrapper_tail)

    insert_position = wrapper_parent.index(wrapper)
    for offset, child in enumerate(wrappees):
        wrapper_parent.insert(insert_position + offset, child)
    wrapper.clear()
    wrapper_parent.remove(wrapper)
    return
# def remove_wrapping_element ends here
def change_attribute_name(element, attribute, newname, add_hash=False):
    """Change the name of an XML attribute, but retain its value.

    Nothing happens when the element does not carry ``attribute``.

    :param element: element whose attribute is renamed (modified in place)
    :param attribute: current attribute name
    :param newname: new attribute name
    :param add_hash: if true, the value is prefixed with ``#``
    """
    attribute_value = element.get(attribute)
    if attribute_value is None:
        return
    if add_hash:
        attribute_value = "#" + attribute_value
    # Delete before setting: the other way round would remove the attribute
    # altogether when the new name equals the old one.
    del element.attrib[attribute]
    element.set(newname, attribute_value)
    return
# def change_attribute_name ends here
def transfer_xml_attributes(old_element_attributes, new_element):
    """Copy attributes onto another element.

    The source attributes are expected in dictionary form; attributes that
    already exist on the target are overwritten.
    """
    target_attributes = new_element.attrib
    for name, value in old_element_attributes.items():
        target_attributes[name] = value
# def transfer_xml_attributes ends here
def split_with_milestone_element(element, milestone, splitter):
    """Split the text of an element by inserting milestone tags.

    An empty ``milestone`` element is inserted after every occurrence of
    ``splitter`` in the element's text; the splitter itself stays in the text.
    The element's attributes and tail are preserved. Existing child elements
    are discarded, since only the element's own text is processed.

    The element is left untouched when it has no text or when the text does
    not contain the splitter.

    :param element: element to modify in place
    :param milestone: tag name of the milestone elements to insert
    :param splitter: string after which the milestones are placed
    """
    element_text = element.text
    if element_text is None or splitter not in element_text:
        return
    textparts = element_text.split(splitter)

    # clear() wipes attributes and tail as well, and element.attrib is a live
    # view of the element, so take real copies before clearing.
    element_attributes = dict(element.attrib)
    element_tail = element.tail
    element.clear()

    element.text = textparts[0] + splitter
    for part in textparts[1:-1]:
        lb_element = etree.Element(milestone)
        lb_element.tail = part + splitter
        element.append(lb_element)
    # The last part is not followed by a splitter in the original text.
    lb_element = etree.Element(milestone)
    lb_element.tail = textparts[-1]
    element.append(lb_element)

    transfer_xml_attributes(element_attributes, element)
    element.tail = element_tail
    return
# def split_with_milestone_element ends here
def get_place_in_xml_tree(element, tree):
    """Find out the position of an element among the children of ``tree``.

    Useful, for example, for inserting a new element right after a specific one.

    :param element: a direct child of ``tree``
    :param tree: the parent element
    :return: zero-based index of ``element`` in ``tree``
    :raises ValueError: if ``element`` is not a direct child of ``tree``
    """
    # list(tree) replaces getchildren(), which is deprecated in lxml and no
    # longer exists in the standard library's ElementTree.
    return list(tree).index(element)
# def get_place_in_xml_tree ends here
def assign_xml_id(element, identifier):
    """Set the ``xml:id`` attribute of an element to the given identifier."""
    xml_id_attribute = "{http://www.w3.org/XML/1998/namespace}id"
    element.set(xml_id_attribute, identifier)
# def assign_xml_id ends here
def write_appinfo(xml_tree, ident, version, xmlid, text, date):
    """Log the change of a TEI document in the appInfo element.

    A new ``application`` element with a ``label`` child is inserted as the
    first child of ``encodingDesc/appInfo``.

    :param xml_tree: TEI document (searched with ``find``)
    :param ident: value of the ``ident`` attribute
    :param version: value of the ``version`` attribute
    :param xmlid: value of the ``xml:id`` attribute
    :param text: text of the ``label`` child
    :param date: value of the ``when`` attribute
    :raises AttributeError: if the document has no ``encodingDesc/appInfo``
        (``find`` returns None in that case)
    """
    # ".//" instead of "//": lxml emits a FutureWarning for absolute paths on
    # an ElementTree and rewrites them to this form anyway.
    appinfo = xml_tree.find(
        ".//t:encodingDesc/t:appInfo",
        namespaces={"t": "http://www.tei-c.org/ns/1.0"},
    )
    # NOTE(review): the new elements are created without a namespace although
    # appInfo is looked up in the TEI namespace - confirm this is intended.
    fix_tei_info = etree.Element("application", ident=ident, version=version, when=date)
    fix_tei_info.attrib["{http://www.w3.org/XML/1998/namespace}id"] = xmlid
    etree.SubElement(fix_tei_info, "label").text = text
    appinfo.insert(0, fix_tei_info)
    return
# def write_appinfo ends here