Skip to content
Permalink
10458fa200
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
583 lines (465 sloc) 17.9 KB
#!/usr/bin/env python3
# -*- coding: utf-8; mode: python -*-
"""A collection of functions for the different conversion steps"""
from utils.load_config import exec_command, ToLog, ToFile
import os
import sys
import subprocess
import shlex
import logging
import configparser
from datetime import datetime
from lxml import etree
from lxml.html import soupparser
from pathlib import Path
# Directory that contains this module file (os.path.realpath resolves symlinks first).
BASE_DIR = Path( os.path.realpath(__file__) ).parent
# global variables
# The new-style footnotes that use LaTeX bigfoot show up in the following order.
# get_bigfoot_data() compares these values with the "list-style-type" attribute
# of the EOAbigfoot elements and returns one group per entry, in this order.
footnote_groups = ["decimal", "lower-latin"]
#########################
# Bibliography settings #
#########################
# Presumably the bibliography entry types the converter accepts; the list is
# not referenced by any function in this module - TODO confirm against callers.
allowed_bibentry_types = ["book", "booklet", "report", "thesis", "misc", "incollection", "inproceedings", "article", "newspaper"]
def enable_preamble(
        input_file,
        output_file,
        pdf_or_xml
):
    """Copy a TeX file and put the matching preamble include in front of it.

    The first line of ``output_file`` becomes ``\\input{preambel/pre_eoa}``
    when ``pdf_or_xml`` equals ``"pdf"`` and ``\\input{preambel/pre_xml}``
    for every other value. The unchanged content of ``input_file`` follows.

    The input is read completely before the output is opened, so passing the
    same path for both arguments no longer truncates the data away.
    """
    logging.debug(f"Enabling preamble {pdf_or_xml}. Input file is {input_file}, outputting to {output_file}")
    # The backslash is escaped explicitly: "\i" is not a valid escape sequence
    # and only produced a backslash by accident (newer Pythons warn about it).
    if pdf_or_xml == "pdf":
        preamble_line = "\\input{preambel/pre_eoa}\n"
    else:
        preamble_line = "\\input{preambel/pre_xml}\n"
    with open(input_file, "r") as source:
        document = source.read()
    with open(output_file, "w") as target:
        target.write(preamble_line)
        target.write(document)
def get_bigfoot_data(chapter):
    """Group the bigfoot footnotes of one chapter by their numbering style.

    Footnotes are handled per chapter because their numbers reset with each
    chapter. All ``EOAbigfoot`` descendants of ``chapter`` are collected and
    sorted into the groups named in the module-level ``footnote_groups``.

    The result is an association list: one ``(group, notes)`` tuple per
    supported group, in the order of ``footnote_groups``. ``notes`` holds the
    footnotes whose ``list-style-type`` attribute equals the group, in the
    order in which ``findall`` returned them.
    """
    bigfoot_notes = list(chapter.findall(".//EOAbigfoot"))
    notes_by_group = []
    for group in footnote_groups:
        members = []
        for footnote in bigfoot_notes:
            if group == footnote.get("list-style-type"):
                members.append(footnote)
        notes_by_group.append((group, members))
    return notes_by_group
# def get_bigfoot_data ends here
def _gm_identify(GM_PATH, format_spec, image_path):
    """Return the stripped output of ``<GM_PATH> identify -format <format_spec> <image_path>``."""
    # The argument list is built directly so that an image path containing
    # spaces is passed as one argument. GM_PATH itself is still split, which
    # keeps a value such as "gm" or "/usr/bin/gm" working as before.
    return subprocess.check_output(
        shlex.split(str(GM_PATH)) + ["identify", "-format", format_spec, str(image_path)],
        universal_newlines=True
    ).strip()


def sanitizeImage(
        strImagepath,
        tmp_dir,
        GM_PATH,
        PDFCROP_EXEC,
        # TL_PATH
):
    """Adjust and convert image for epub standard.

    Steps, in this order:

    1. make sure ``<tmp_dir>/tmp_images`` exists,
    2. if the image is wider than 700 px, resize it in place to a width of 700,
    3. if the image is then higher than 1000 px, resize it in place to a
       height of 1000,
    4. look at the format reported by ``identify``: JPEG and PNG files are
       left alone; a PDF is cropped with ``PDFCROP_EXEC``, rendered to a PNG
       next to the original, and both the cropped file and the original PDF
       are removed.

    Parameters:
        strImagepath: path of the image (str or Path).
        tmp_dir: working directory in which ``tmp_images`` is created.
        GM_PATH: command used to call GraphicsMagick (``identify``/``convert``).
        PDFCROP_EXEC: command used to call pdfcrop.

    Returns:
        The path of the resulting image. As before, this is a ``Path`` for
        JPEG/PNG input and a ``str`` for the PNG generated from a PDF.

    Exits the program with status 1 when the format is none of JPEG, PNG, PDF.
    """
    tmp_dir = Path(tmp_dir)
    strImagepath = Path(strImagepath)
    tmp_image_dir = tmp_dir / "tmp_images"
    if not tmp_image_dir.exists():
        os.makedirs(tmp_image_dir)
    logging.debug(strImagepath)
    # Quoting is a no-op for ordinary paths and keeps paths with spaces or
    # shell metacharacters in one piece (assumes exec_command parses the
    # string shell-style, which the escaped ">" below already relies on).
    quoted_image = shlex.quote(str(strImagepath))

    intImageWidth = int(_gm_identify(GM_PATH, "%w", strImagepath))
    if intImageWidth > 700:
        exec_command(
            f"{GM_PATH} convert {quoted_image} -colorspace RGB -resize 700x\\> {quoted_image}"
        )
    # The height is measured only after a possible width resize.
    intImageHeight = int(_gm_identify(GM_PATH, "%h", strImagepath))
    if intImageHeight > 1000:
        exec_command(
            f"{GM_PATH} convert {quoted_image} -colorspace RGB -resize x1000\\> {quoted_image}"
        )

    strFileFormat = _gm_identify(GM_PATH, "%m", strImagepath)
    logging.debug(f"Image has been recognized as having format {strFileFormat} by {GM_PATH}.")

    if strFileFormat in ("JPEG", "PNG"):
        # Already in a usable format, nothing to convert.
        return strImagepath

    if strFileFormat != "PDF":
        logging.error("Image format not recognized. Exiting.")
        sys.exit( 1 )

    # Both names are derived from the file stem. The former
    # str.replace(".pdf", ...) was case-sensitive and matched anywhere in the
    # path, so e.g. "fig.PDF" made the clip output identical to the input.
    clipped_file = strImagepath.with_name(strImagepath.stem + "-clipped.pdf")
    png_file = strImagepath.with_suffix(".png")
    quoted_clipped = shlex.quote(str(clipped_file))
    exec_command(
        f"{PDFCROP_EXEC} --margins 10 --clip --hires {quoted_image} {quoted_clipped}",
    )
    exec_command(
        f"{GM_PATH} convert -density 400 {quoted_clipped} {shlex.quote(str(png_file))}"
    )
    logging.debug("Removing two files: %s and %s " % (clipped_file, strImagepath))
    os.remove(clipped_file)
    # Never delete the freshly rendered PNG when a PDF was mis-named "*.png".
    if png_file != strImagepath:
        os.remove(strImagepath)
    return str(png_file)
# def sanitizeImage ends here
def gettext(xmlElement):
    """Return the concatenated text of an element and all nested elements.

    The element's own text comes first; for every child its (recursively
    collected) text is followed by the child's tail. Missing text or tail
    contributes nothing.
    """
    pieces = [xmlElement.text or ""]
    for node in xmlElement:
        pieces.append(gettext(node))
        pieces.append(node.tail or "")
    return "".join(pieces)
# def gettext ends here
def deb_var(obj):
    """Print a debug line of the form ``DEBUG: <name>: <value>``.

    ``<name>`` is the first name in this module's globals that is bound to
    exactly this object (identity, not equality). Objects that are not bound
    at module level - locals, temporaries - are reported as ``<unknown>``
    instead of raising IndexError.

    Idea taken from
    https://stackoverflow.com/questions/592746/how-can-you-print-a-variable-name-in-python
    """
    # list(...) takes a snapshot so the lookup is unaffected by concurrent
    # changes to the module namespace.
    name = next(
        (candidate for candidate, value in list(globals().items()) if value is obj),
        "<unknown>",
    )
    print("DEBUG: %s: %s" % (name, obj))
# def deb_var ends here
def two_letter_language(language_string):
    """Return the two-letter code for a language name or code.

    Recognized spellings: english/en, german/deutsch/de, french/fr and
    italian/it. Any other value yields ``None``.
    """
    known_languages = (
        ("en", ["english", "en"]),
        ("de", ["german", "deutsch", "de"]),
        ("fr", ["french", "fr"]),
        ("it", ["italian", "it"]),
    )
    for code, spellings in known_languages:
        if language_string in spellings:
            return code
    return None
# two_letter_language ends here
def plural(num, noun, plural=None):
    """Return a counted phrase such as "no cats", "1 cat" or "3 cats".

    The plural form is ``noun`` plus "s" unless a non-empty ``plural`` is
    given. A count of 0 is spelled "no", a count of 1 uses the singular,
    every other count uses the plural form.
    """
    plural = plural or f"{noun}s"
    if num == 0:
        return f"no {plural}"
    if num == 1:
        return f"1 {noun}"
    return f"{num} {plural}"
# def plural ends here
def format_citations_tex4ht(
        used_citekeys,
        bibdata,
        language,
        tmp_filename,
        tmp_dir
):
    """Return a formatted xmlstring of the used citations (not implemented).

    This tex4ht variant was never completed: the previous body only computed
    the names of two intermediate files, ``<tmp_dir>/<tmp_filename>.tex`` and
    ``<tmp_dir>/<tmp_filename>.html``, and then returned a name that is not
    defined anywhere, so every call failed with NameError. The failure is now
    explicit.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError(
        "format_citations_tex4ht is an unfinished stub and cannot format citations yet."
    )
# def format_citations_tex4ht ends here
def format_citations(
        used_citekeys,
        bibdata,
        language,
        tmp_filename,
        csl_file,
        log_to = ToLog
):
    """Format the used citations with pandoc and return the reference divs.

    A temporary Markdown file is written next to ``tmp_filename`` (suffix
    ``.md``). It contains one ``[@key]`` line per cite key under the heading
    ``citeauthoryear``, one ``[-@key]`` line per cite key under ``citeyear``
    and an empty ``References`` heading. pandoc (with the pandoc-citeproc
    filter, ``bibdata`` as bibliography and ``csl_file`` as style) converts it
    to ``<tmp_filename>.html``, which is parsed again.

    Parameters:
        used_citekeys: cite keys to format; iterated twice, so pass a list.
        bibdata: bibliography file handed to pandoc.
        language: language name or code, mapped by two_letter_language().
        tmp_filename: base path of the temporary files (str or Path).
        csl_file: CSL style file handed to pandoc.
        log_to: passed on to exec_command as ``output_to``.

    Returns:
        The list of ``div`` elements with class ``references`` found in the
        generated HTML (the result of the XPath query, not a string).
    """
    tmp_filename = Path(tmp_filename)
    tmp_path_md = tmp_filename.with_suffix(".md")
    tmp_path_html = tmp_filename.with_suffix(".html")
    md_file_header = "---\nlang: %s\ntitle: Citations\n...\n\n" % two_letter_language(language)
    # pandoc reads and writes UTF-8 regardless of the locale, so both files
    # are opened with an explicit encoding instead of the platform default.
    with open(tmp_path_md, "w", encoding="utf-8") as citation_formatter:
        citation_formatter.write(md_file_header)
        citation_formatter.write("# citeauthoryear\n")
        for entry in used_citekeys:
            citation_formatter.write("[@%s]\n" % entry)
        citation_formatter.write("\n# citeyear\n")
        for entry in used_citekeys:
            citation_formatter.write("[-@%s]\n" % entry)
        citation_formatter.write("\n# References\n")
    exec_command(
        f"pandoc -o {tmp_path_html} -t html --filter=pandoc-citeproc --bibliography={bibdata} --csl={csl_file} {tmp_path_md}",
        output_to = log_to
    )
    with open(tmp_path_html, "r", encoding="utf-8") as ding:
        dd = soupparser.fromstring(ding.read(), features="html.parser")
    references = dd.xpath("//div[@class='references']")
    return references
# def format_citations ends here
def fix_bib_entries(div_snippet):
    """Rework the bibliography HTML produced by pandoc-citeproc in place.

    Every ``div`` below ``div_snippet`` gets the class ``bibliography``, has
    its ``p`` tags stripped (their content is kept) and is itself turned into
    a ``p``. ``em`` elements inside it become ``i``. The modified snippet is
    returned.
    """
    for bib_div in div_snippet.findall(".//div"):
        bib_div.set("class", "bibliography")
        etree.strip_tags(bib_div, "p")
        bib_div.tag = "p"
        for emphasis in bib_div.findall(".//em"):
            emphasis.tag = "i"
    return div_snippet
# def fix_bib_entries ends here
def debug_xml_here(
        xml_tree,
        xml_filename,
        output_dir
):
    """Write the current state of an XML tree to a file for inspection.

    The file is ``<output_dir>/<xml_filename>`` with its suffix set to
    ``.xml``. ``xml_tree`` may be an element tree or a single element; an
    element is wrapped into a tree first. Output is pretty-printed UTF-8 with
    an XML declaration.
    """
    xml_path = Path(output_dir, xml_filename).with_suffix( ".xml")
    if not isinstance(xml_tree, etree._ElementTree):
        xml_tree = etree.ElementTree(xml_tree)
    xml_tree.write(
        str(xml_path),
        pretty_print=True,
        xml_declaration=True,
        encoding="utf-8",
    )
    logging.info(f"Wrote XML file for debugging purposes: {xml_path}.")
# def debug_xml_here ends here
def wrap_into_element(wrapper, wrappee):
    """Put ``wrapper`` in the place of ``wrappee`` and move ``wrappee`` into it.

    The tail text of ``wrappee`` is handed over to ``wrapper``, so it stays
    behind the wrapped construct instead of ending up inside it.
    """
    # Detach the tail first so that it does not travel with the wrappee.
    carried_tail, wrappee.tail = wrappee.tail, ""
    wrappee.addprevious(wrapper)
    wrapper.insert(0, wrappee)
    wrapper.tail = carried_tail
# def wrap_into_element ends here
def remove_wrapping_element(wrapper):
    """Move the children of ``wrapper`` one level up and delete ``wrapper``.

    The children are inserted into the parent at the wrapper's position, in
    their original order. Non-blank text or tail of the wrapper is not
    preserved; a warning is logged for each of them.
    """
    parent = wrapper.getparent()
    children = list(wrapper)

    leading_text = wrapper.text
    if leading_text is not None:
        leading_text = leading_text.strip()
        if leading_text:
            logging.warning("Wrapping element contains text: %s", leading_text)

    trailing_text = wrapper.tail
    if trailing_text is not None:
        trailing_text = trailing_text.strip()
        if trailing_text:
            logging.warning("Wrapping element contains has tail: %s", trailing_text)

    first_slot = parent.index(wrapper)
    for offset, child in enumerate(children):
        parent.insert(first_slot + offset, child)
    wrapper.clear()
    parent.remove(wrapper)
# def remove_wrapping_element ends here
def change_attribute_name(element, attribute, newname, add_hash=False):
    """Rename an XML attribute while keeping its value.

    Nothing happens when ``element`` has no attribute called ``attribute``.
    With ``add_hash=True`` (the literal ``True``) the value is prefixed with
    "#" on the way.
    """
    value = element.get(attribute)
    if value is None:
        # Attribute not present: leave the element untouched.
        return
    if add_hash is True:
        value = "#" + value
    element.set(newname, value)
    del element.attrib[attribute]
# def change_attribute_name ends here
def transfer_xml_attributes(old_element_attributes, new_element):
    """Copy attributes onto ``new_element``.

    ``old_element_attributes`` is expected in dictionary form (attribute name
    to value). Attributes already present on ``new_element`` are overwritten
    when the names match and kept otherwise.
    """
    for name in old_element_attributes:
        new_element.set(name, old_element_attributes[name])
# def transfer_xml_attributes ends here
def split_with_milestone_element(element, milestone, splitter):
    """Split the text of an element by inserting milestone tags.

    The text is cut at every occurrence of ``splitter``. The splitter stays
    at the end of each piece and an empty ``milestone`` element is placed
    after it, e.g. ``a;b;c`` with splitter ``;`` and milestone ``lb`` becomes
    ``a;<lb/>b;<lb/>c``.

    The element's attributes and tail are preserved. Existing child elements
    are discarded when a split takes place, as before. Text that does not
    contain the splitter (including missing text) is left completely
    untouched; previously such text was duplicated behind a stray milestone.
    """
    element_text = element.text or ""
    if splitter not in element_text:
        return
    textparts = element_text.split(splitter)

    # element.attrib is a live view that clear() empties, and clear() also
    # resets the tail - take copies before clearing and restore them after.
    saved_attributes = dict(element.attrib)
    saved_tail = element.tail
    element.clear()
    element.tail = saved_tail
    for name, value in saved_attributes.items():
        element.set(name, value)

    element.text = textparts[0] + splitter
    for part in textparts[1:-1]:
        lb_element = etree.Element(milestone)
        lb_element.tail = part + splitter
        element.append(lb_element)
    # The last piece carries no splitter behind it.
    lb_element = etree.Element(milestone)
    lb_element.tail = textparts[-1]
    element.append(lb_element)
# def split_with_milestone_element ends here
def get_place_in_xml_tree(element, tree):
    """Return the index of ``element`` among the direct children of ``tree``.

    Useful, for example, to insert another element right after a specific
    one. Raises ValueError when ``element`` is not a direct child.
    """
    return list(tree).index(element)
# def get_place_in_xml_tree ends here
def assign_xml_id(element, identifier):
    """Set the ``xml:id`` attribute of ``element`` to ``identifier``."""
    # Attribute name in Clark notation: {namespace URI}local-name.
    xml_id_key = "{http://www.w3.org/XML/1998/namespace}id"
    element.set(xml_id_key, identifier)
# def assign_xml_id ends here
def get_appinfo(ident, version, xmlid, text, date):
    """Build the ``application`` element that records a change to a TEI document.

    The element carries ``ident``, ``version`` and ``when`` (= ``date``) as
    attributes plus ``xmlid`` as its ``xml:id``, and contains one ``label``
    child whose text is ``text``. The new element is returned; it is not
    attached to any document here.
    """
    logging.info("Writing appinfo")
    application = etree.Element("application", ident=ident, version=version, when=date)
    application.set("{http://www.w3.org/XML/1998/namespace}id", xmlid)
    label = etree.SubElement(application, "label")
    label.text = text
    return application
# def get_appinfo ends here
def translate(term, publang, translation_file):
    """Look up the translation of ``term`` for the language ``publang``.

    ``translation_file`` is an XML file with ``entry`` elements; the entry
    whose ``name`` attribute equals ``term`` holds the translations as
    attributes named after the language codes. The file is parsed on every
    call.

    Logs an error and exits with status 1 when the entry cannot be looked up
    or when it has no non-empty translation for ``publang``.
    """
    dictionary = etree.parse( str( translation_file ) )
    try:
        translations = dictionary.find(f"//entry[@name='{term}']").attrib
    except Exception:
        logging.error(f"Term {term} not found in translation file. Please add it to {translation_file}. Exiting.")
        sys.exit(1)
    translated_term = translations.get(publang)
    if translated_term:
        return translated_term
    logging.error(f"Translation for term '{term}' in language with code {publang} is missing. Please add it to {translation_file}. Exiting.")
    sys.exit(1)
# def translate ends here
def restore_xml_tags(text):
    """Convert XML entities back to the characters they stand for.

    Handles ``&lt;``, ``&gt;``, ``&apos;`` and ``&amp;``, e.g. ``&lt;`` => ``<``.
    The ampersand is replaced last so that a doubly escaped entity is only
    unescaped by one level.
    """
    return (
        text.replace("&lt;", "<")
        .replace("&gt;", ">")
        .replace("&apos;", "'")
        .replace("&amp;", "&")
    )
# def restore_xml_tags ends here
def escape_xml(raw_text, decode=True):
    """Replace XML markup characters with their entities.

    With ``decode`` set (the default) ``raw_text`` is expected to be bytes
    and is decoded as UTF-8 first; otherwise it is used as given. The
    characters ``&``, ``<``, ``>``, ``'`` and ``"`` are escaped.
    """
    text = raw_text.decode("utf-8") if decode else raw_text
    # A single pass over the text: entities that were just inserted are not
    # escaped a second time.
    entity_table = str.maketrans({
        "&" : "&amp;",
        "<" : "&lt;" ,
        ">" : "&gt;",
        "'" : "&apos;",
        '"' : "&quot;",
    })
    return text.translate(entity_table)
# def escape_xml ends here
def format_hyperlinks_django_epub(xmlHyperlink, strLanguage):
    """Convert IMXML element to href and append localized accessed date.

    The element is changed in place:

    * its ``url`` attribute (required) becomes ``href``; "http://" is put in
      front of URLs that start with neither "http://" nor "https://",
    * the tag becomes ``a`` and the element text becomes the URL,
    * ``allowbreak`` descendants are removed together with their tails,
    * if there is a ``date`` child, its ``when`` value is formatted for
      ``strLanguage`` and inserted, preceded by ", ", at the start of the
      element's tail; the child is renamed to ``elementtoberemoved`` for
      later removal. Without a ``date`` child a warning is logged.

    A missing tail is treated as empty text; previously the string "None"
    ended up in the document in that case.
    """
    strURL = xmlHyperlink.get('url')
    if not strURL.startswith(("http://", "https://")):
        strURL = "http://" + strURL
    xmlHyperlink.tag = "a"
    del xmlHyperlink.attrib["url"]
    xmlHyperlink.set("href", strURL)
    etree.strip_elements(xmlHyperlink, 'allowbreak', with_tail=True)
    # tail is None when nothing follows the element - never format that.
    url_tail = xmlHyperlink.tail or ""
    accessed_date_element = xmlHyperlink.find("./date")
    if accessed_date_element is not None:
        accessed_date = accessed_date_element.get("when")
        formatted_date = format_date(accessed_date, two_letter_language(strLanguage))
        # Only marked here; the element itself is not removed in this function.
        accessed_date_element.tag = "elementtoberemoved"
        accessed_date_element.tail = ""
        xmlHyperlink.tail = f", {formatted_date}{url_tail}"
    else:
        logging.warning(f"Found no accessed date at url {strURL}. Proceeding without accessed date.")
        xmlHyperlink.tail = url_tail
    xmlHyperlink.text = strURL
# def format_hyperlinks_django_epub ends here
def format_date(accessed_date, language):
    """Turn an ISO date (``YYYY-MM-DD``) into a localized "accessed" phrase.

    ``language`` must be "en" or "de"; for any other value an error is
    logged and the program exits with status 1. The date is parsed first, so
    a malformed date raises ValueError regardless of the language.
    """
    parsed_date = datetime.strptime(accessed_date, "%Y-%m-%d")
    if language not in ("en", "de"):
        logging.error("Got an unrecognized language: %s. Exiting.", language)
        sys.exit(1)
    if language == "en":
        return f"accessed {parsed_date:%B} {parsed_date.day}, {parsed_date:%Y}"
    return f"besucht am {parsed_date:%d}.{parsed_date:%m}.{parsed_date:%Y}"
# def format_date ends here
def has_text_or_children(cr):
    """Tell whether an element has non-empty text or at least one child."""
    if cr.text:
        return True
    return len(list(cr)) > 0
# def has_text_or_children ends here
def progress(count, total, status=''):
    """Draw a one-line progress bar on standard output.

    The bar is 60 characters wide; ``count`` of ``total`` steps are shown as
    "#" characters and a percentage, followed by ``status``. The line ends
    with a carriage return so the next call overwrites it.

    Taken from https://gist.github.com/vladignatyev/06860ec2040cb497f0f3
    """
    bar_len = 60
    done = int(round(bar_len * count / float(total)))
    remaining = bar_len - done
    percents = round(100.0 * count / float(total), 1)
    bar = '#' * done + '-' * remaining
    line = '[%s] %s%s ... %s\r' % (bar, percents, '%', status)
    sys.stdout.write(line)
    sys.stdout.flush()
# def progress ends here
def pdf_burst(input_file, tmpDir):
    """Split PDF file into single pages.

    ``input_file`` is looked up inside ``tmpDir``. Every page is written to
    ``<tmpDir>/EOAformulas_<n>.pdf``, where ``n`` starts at 1.

    Parameters:
        input_file: name of the PDF, relative to ``tmpDir``.
        tmpDir: directory of the input and output files (str or Path).
    """
    from PyPDF2 import PdfFileWriter, PdfFileReader
    tmpDir = Path(tmpDir)
    # The reader pulls pages from the stream on demand, so the input has to
    # stay open until the last page is written; "with" closes it afterwards,
    # also when writing fails.
    with open(tmpDir / input_file, "rb") as input_stream:
        input1 = PdfFileReader(input_stream)
        page_count = input1.getNumPages()
        logging.debug("Input is %s and has %d pages." % (input_file, page_count))
        for pageno in range(page_count):
            output = PdfFileWriter()
            output.addPage(input1.getPage(pageno))
            output_filename = tmpDir / ("EOAformulas_%d.pdf" % (pageno + 1))
            with open(output_filename, 'wb') as output_stream:
                output.write(output_stream)
            logging.debug("Wrote %s." % output_filename)
# def pdf_burst ends here