Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
EOASkripts/src/imxml2epub.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
executable file
1957 lines (1722 sloc)
81.6 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# -*- coding: utf-8; mode: python -*- | |
# Time-stamp: <2020-09-25 08:08:41 (kthoden)> | |
""" Convert a customized DocBook XML file into a set of files that | |
constitute the contents of an EPUB file. | |
Input file is a customized DocBook XML that has been generated either | |
with eoatex2imxml or tei2imxml. | |
""" | |
from utils.load_config import load_config | |
import utils.libeoaconvert as libeoaconvert | |
import os | |
import sys | |
import argparse | |
import re | |
import shutil | |
import pickle | |
import shlex | |
import string | |
import subprocess | |
import logging | |
from copy import deepcopy | |
from lxml import etree | |
from pathlib import Path | |
from PIL import ImageFont | |
import configparser | |
# Locations of this script; used to resolve bundled data and config files.
BASE_DIR = Path( __file__ ).resolve().parent
SCRIPT_PATH = Path( __file__ )
SCRIPT_NAME = SCRIPT_PATH.stem  # basename without suffix; used for the log file name
# Default working directories, overridable via the INPUT_DIR / OUTPUT_DIR
# environment variables (fall back to ./input and ./output).
DEFAULT_INPUT_DIR = \
    Path(os.environ['INPUT_DIR'] if 'INPUT_DIR' in os.environ else './input')
DEFAULT_OUTPUT_DIR = \
    Path(os.environ['OUTPUT_DIR'] if 'OUTPUT_DIR' in os.environ else './output')
#####################
# Parsing arguments #
#####################
# Command-line interface; input/output defaults are derived from
# PUBLICATION_DIR further below when not given explicitly.
parser = argparse.ArgumentParser(
    formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
parser.add_argument(
    "-c", "--config",
    dest="CONFIG_FILE",
    default = BASE_DIR / "config" / "eoaconvert.cfg",
    help="Name of configuration file",
    metavar="CONFIGURATION"
)
parser.add_argument(
    "--log-level",
    default = "INFO",
    help="log level: choose between DEBUG, INFO, WARNING, ERROR, CRITICAL"
)
parser.add_argument(
    "-i", "--input-dir",
    help = f"directory containing some intermediate xml created by previous steps. default: {DEFAULT_OUTPUT_DIR}/PUBLICATION_NAME/imxml",
    type = Path,
)
parser.add_argument(
    "-o", "--output-dir",
    help = f"output directory. default: {DEFAULT_OUTPUT_DIR}/PUBLICATION_NAME/epub",
    type = Path,
)
parser.add_argument(
    "-f", "--font",
    help="Font to be used, default is TeX Gyre Termes",
    default="termes"
)
parser.add_argument(
    "-ne", "--no-epub",
    help="Disable creation of epub file.",
    action="store_true"
)
parser.add_argument(
    "--extra-font-selector",
    help="Specify the css class selector for the extra font."
)
parser.add_argument(
    "--extra-font-files-directory",
    help="Specify the directory with files of the font (the font itself, License)",
)
# Positional argument: source directory of the publication (images etc.).
parser.add_argument(
    "PUBLICATION_DIR",
    help = "directory containing the publication (including resources like pictures, etc.)",
    type = Path,
)
parser.add_argument(
    "-him", "--hyperimage",
    help="Link hyperlink references to online version.",
    action="store_true"
)
args = parser.parse_args()
########################
# Paths to executables #
########################
EPUB_FILES = BASE_DIR / "data/epub_files/"  # bundled epub templates, fonts, css
# EPUB_FILES = os.path.dirname(sys.argv[0]) + "/data/epub_files/"
GM_PATH = "gm"  # GraphicsMagick binary, used for image sanitizing
PDFCROP_EXEC = "pdfcrop"  # used for PDF figures
############################
# Paths:
############################
# Derive input/output locations from the publication directory name
# unless they were given explicitly on the command line.
PUBLICATION_DIR = args.PUBLICATION_DIR
INPUT_DIR = \
    args.input_dir if args.input_dir is not None else DEFAULT_OUTPUT_DIR / PUBLICATION_DIR.resolve().stem / "imxml"
OUTPUT_DIR = \
    args.output_dir if args.output_dir is not None else (DEFAULT_OUTPUT_DIR / PUBLICATION_DIR.resolve().stem) / "epub"
LOG_DIR = OUTPUT_DIR / "log"
LOG_FILE = (LOG_DIR / SCRIPT_NAME) . with_suffix( ".log" )
TEMP_DIR = OUTPUT_DIR / "tmp_files"
DEBUG_DIR = OUTPUT_DIR / "debug"
##################################
# Reading the configuration file #
##################################
config_file = args.CONFIG_FILE
print(f"The config file is {config_file}")
logseparator = "-"*53 + "\n"  # visual divider prepended to section log messages
# load_config also configures logging (level and log file location).
CONFIG = load_config(
    config_file,
    args.log_level,
    LOG_FILE,
    # args.log_file,
)
############################
# Paths to auxiliary files #
############################
TRANSLATION_FILE = BASE_DIR / CONFIG['Auxiliaries']['TRANSLATIONS']
# Per-language UI strings (e.g. the caption label for figures).
translation_xml = etree.parse( str( TRANSLATION_FILE ) )
if not TEMP_DIR.exists():
    os.makedirs( TEMP_DIR )
if not DEBUG_DIR.exists():
    os.makedirs( DEBUG_DIR )
# Check for folder and necessary files
if not os.path.exists( OUTPUT_DIR / "publication.cfg"):
    logging.info(f"The publication.cfg file is missing in {OUTPUT_DIR} directory.")
    if os.path.exists(INPUT_DIR / "publication.cfg"):
        shutil.copy(INPUT_DIR / "publication.cfg", OUTPUT_DIR)
        logging.info(f"Copied from {INPUT_DIR}.")
    else:
        logging.error("Found no publication.cfg. Exiting")
        sys.exit( 1 )
if not os.path.exists( OUTPUT_DIR / "Cover.jpg"):
    # NOTE(review): the check looks for "Cover.jpg" but the copy target
    # (and the later use as OEBPS/images/cover.jpg) is lowercase
    # "cover.jpg" -- if OUTPUT_DIR contains only "Cover.jpg" the later
    # copy of "cover.jpg" may fail on case-sensitive file systems.
    # TODO confirm intended casing.
    logging.info(f"The file Cover.jpg in {OUTPUT_DIR} directory is missing.")
    if os.path.exists(INPUT_DIR / "Cover.jpg"):
        shutil.copy(INPUT_DIR / "Cover.jpg", OUTPUT_DIR / "cover.jpg")
        logging.info("Copied from current directory.")
    else:
        logging.error("No coverfile found. You can create a temporary one with the mkimage.py script")
        sys.exit( 1 )
# Datei = open( TEMP_DIR / 'intermediate.log', 'w')
# Intermediate DocBook-like XML produced by eoatex2imxml or tei2imxml.
xmlTree = etree.parse( str(INPUT_DIR / "IntermediateXMLFile.xml") )
# Numbering dictionaries pickled by the previous pipeline step.
# NOTE: pickle.load is only safe on trusted input; this file is produced
# by our own toolchain.
with open(INPUT_DIR / 'tmp_files/data.pickle', 'rb') as f:
    data = pickle.load(f)
dictSections = data["secdict"]
dictEquations = data["eqdict"]
dictLists = data["listdict"]
dictChapters = data["chapterdict"]
dictFigures = data["figdict"]
dictFootnotes = data["fndict"]
dictTheorems = data["theoremdict"]
dictTables = data["tabdict"]
dictPagelabels = data["pagelabeldict"]
if args.hyperimage:
    logging.info("Enabled Hyperimage support")
else:
    pass
def get_mimetype(filename_suffix):
    """Map an image file suffix to the internal type tag ("jpg" or "png").

    PDF figures are reported as "png" because they are rasterized to PNG
    later in the pipeline.  Exits the program on an unknown suffix.
    """
    suffix = filename_suffix.lower()
    if suffix in (".jpg", ".jpeg"):
        return "jpg"
    if suffix in (".png", ".pdf"):
        return "png"
    logging.error(f"Found an unrecognized image suffix: {filename_suffix}")
    sys.exit(1)
# def get_mimetype ends here
def remove_processinginstruction(xml_tree, pi_name):
    """Strip all processing instructions named *pi_name* from *xml_tree*.

    Uses lxml's ``strip_tags`` on each PI's parent, which drops the node
    while merging its tail text back into the surrounding content
    (approach from https://stackoverflow.com/questions/31522162/).

    :param xml_tree: lxml tree, modified in place
    :param pi_name: target name of the processing instructions to remove
    :returns: the same tree, for convenient chaining
    """
    proc_insts = xml_tree.xpath("//processing-instruction('{}')".format(pi_name))
    for instruction in proc_insts:
        # For PI nodes, .tag is the ProcessingInstruction factory, which
        # strip_tags accepts as a tag selector.
        etree.strip_tags(instruction.getparent(), instruction.tag)
    logging.debug("Removed %s processing instructions of type %s." % (len(proc_insts), pi_name))
    return xml_tree
# def remove_processinginstruction ends here
def addToContentopf(contentopf, Filename, FileID, Mediatype):
    """Add a manifest (and, for XHTML, spine) entry to content.opf.

    :param contentopf: lxml tree of the content.opf file, modified in place
    :param Filename: href of the file, relative to the OEBPS directory
    :param FileID: desired manifest id; sanitized before use
    :param Mediatype: key into the media-type table below ("xml", "jpg", ...)
    :returns: the (possibly unchanged) contentopf tree

    Already-registered ids are tracked in the module-level
    ``listContentopf`` so the same file is never added twice.

    Fixes over the previous version: raw regex strings (the old "\\_"
    style escapes raise SyntaxWarning on modern Python), the duplicated
    sanitization block is gone, and ALL leading digits are stripped
    (previously at most three, which could leave an invalid id).
    """
    global listContentopf
    # Sanitize FileID: the id attribute may not contain _ . or /
    # and may not start with a digit.
    FileID = re.sub(r"[_./]", "", FileID)
    FileID = re.sub(r"^[0-9]+", "", FileID)
    if FileID in listContentopf:
        # Already present; nothing to do.
        return contentopf
    dictMediatypes = {
        "txt" : "text/plain",
        "ttf" : "application/x-font-truetype",
        "otf" : "application/vnd.ms-opentype",
        "xml" : "application/xhtml+xml",
        "jpg" : "image/jpeg",
        "png" : "image/png",
        "bitstream" : "application/octet-stream"
    }
    contentopfns = "{http://www.idpf.org/2007/opf}"
    xmlManifest = contentopf.find(".//" + contentopfns + "manifest")
    xmlItem = etree.Element("item")
    xmlItem.set("id", FileID)
    xmlItem.set("media-type", dictMediatypes[Mediatype])
    xmlItem.set("href", str(Filename))
    xmlManifest.append(xmlItem)
    # XHTML content documents must also appear in the reading order (<spine>).
    if Mediatype == "xml":
        xmlSpine = contentopf.find(".//" + contentopfns + "spine")
        xmlItemref = etree.Element("itemref")
        xmlItemref.set("idref", FileID)
        xmlSpine.append(xmlItemref)
    listContentopf.append(FileID)
    return contentopf
# def addToContentopf ends here
def addToTocncx(tocncx, Label, intTechnicalChapterNumber):
    """Append a chapter entry (navPoint) to the epub table of contents.

    :param tocncx: parsed toc.ncx document, modified in place
    :param Label: chapter title shown in the reading system's TOC
    :param intTechnicalChapterNumber: zero-based chapter index; the
        playOrder attribute is this number plus one
    :returns: the tocncx document
    """
    tocncxns = "{http://www.daisy.org/z3986/2005/ncx/}"
    chapter_tag = "chapter" + str(intTechnicalChapterNumber)
    nav_point = etree.Element("navPoint")
    nav_point.set("playOrder", str(intTechnicalChapterNumber + 1))
    nav_point.set("id", chapter_tag)
    nav_label = etree.Element("navLabel")
    label_text = etree.Element("text")
    label_text.text = Label
    nav_label.append(label_text)
    nav_point.append(nav_label)
    content = etree.Element("content")
    content.set("src", chapter_tag + ".xhtml")
    nav_point.append(content)
    tocncx.find(".//" + tocncxns + "navMap").append(nav_point)
    return tocncx
# def addToTocncx ends here
def create_epub_container(filename, OUTPUT_DIR):
    """Zip the prepared epub directory structure into an epub container.

    Creates ``<filename lowercased>-raw.epub`` inside OUTPUT_DIR,
    containing the ``mimetype`` file first, then ``META-INF/container.xml``
    and the whole ``OEBPS`` directory tree.

    :param filename: publication name used for the epub file name
    :param OUTPUT_DIR: directory holding the prepared epub file structure

    Fixes over the previous version: the zip handle is closed via a
    context manager, and the original working directory is restored even
    if zipping fails (previously an exception left the process chdir'ed
    into OUTPUT_DIR with an open file handle).
    """
    import zipfile
    epub_filename = f"{filename.lower()}-raw.epub"
    output_dir = OUTPUT_DIR.resolve()
    cwd = Path.cwd()
    os.chdir( output_dir )
    try:
        with zipfile.ZipFile(epub_filename, "w") as epubcontainer:
            # The mimetype entry must come first; ZipFile defaults to
            # ZIP_STORED, so it is written uncompressed as the OCF
            # specification requires.
            epubcontainer.write("mimetype")
            epubcontainer.write("META-INF/container.xml")
            for dirname, subdirs, files in os.walk("OEBPS"):
                epubcontainer.write(dirname)
                for contentfile in files:
                    epubcontainer.write(os.path.join(dirname, contentfile))
        logging.info(f"Wrote {output_dir}/{epub_filename}.")
    finally:
        os.chdir( cwd )
# def create_epub_container ends here
def add_fonts_to_opf(font_files, contentopf, otf_counter, txt_counter, fontdir=""):
    """Copy font-related files into OEBPS/fonts and register them in the OPF.

    :param font_files: file names to process: .otf/.ttf fonts, .txt licence
        files, or extension-less files (registered as octet-stream)
    :param contentopf: content.opf tree, extended via addToContentopf
    :param otf_counter: running manifest-id counter for font files
    :param txt_counter: running manifest-id counter for licence/other files
    :param fontdir: source directory; defaults to the bundled EPUB_FILES
    :returns: the updated (otf_counter, txt_counter) pair

    Exits the program on an unexpected file extension.
    """
    source_dir = Path(fontdir) if fontdir else Path(EPUB_FILES)
    for fontfile in font_files:
        shutil.copy(source_dir / fontfile, OUTPUT_DIR / "OEBPS/fonts/")
        file_extension = os.path.splitext(fontfile)[1]
        # Map the extension to a manifest id and media-type key.
        if file_extension == ".otf":
            file_id, mediatype = "fontfile" + str(otf_counter), "otf"
        elif file_extension == ".ttf":
            file_id, mediatype = "fontfile" + str(otf_counter), "ttf"
        elif file_extension == ".txt":
            file_id, mediatype = "font-txt" + str(txt_counter), "txt"
        elif file_extension == "":
            file_id, mediatype = "font-txt" + str(txt_counter), "bitstream"
        else:
            logging.error(f"Other file found with extension '{file_extension}'. Exiting")
            sys.exit()
        contentopf = addToContentopf(
            contentopf,
            Path("fonts") / fontfile,
            file_id,
            mediatype
        )
        if file_extension in (".otf", ".ttf"):
            otf_counter += 1
        else:
            txt_counter += 1
    return otf_counter, txt_counter
# def add_fonts_to_opf ends here
def create_extra_font_css(css_selector, extrafontfiles):
    """Build a CSS snippet that declares and applies the extra font.

    :param css_selector: class name (without leading dot) the font applies to
    :param extrafontfiles: file names from the extra font directory; exactly
        one .otf or .ttf font file is expected among them
    :returns: CSS containing a class rule plus the matching @font-face rule

    Exits if the directory does not contain exactly one font file (the
    previous version only checked for more than one and crashed with an
    IndexError when none was found).
    """
    fontfiles = [x for x in extrafontfiles if x.endswith(".otf") or x.endswith(".ttf")]
    if len(fontfiles) != 1:
        logging.error(f"Expected exactly one font file in the list, found {len(fontfiles)}. Exiting")
        sys.exit()
    fontfile = fontfiles[0]
    # Read the font family name out of the font file itself.
    font_object = ImageFont.truetype(fontfile)
    font_label = font_object.getname()[0]
    css_code = f""".{css_selector} {{
    font-family: '{font_label}';
}}

@font-face {{
    font-family: "{font_label}";
    src: url("fonts/{fontfile}");
}}
"""
    return css_code
# def create_extra_font_css ends here
def add_css_snippet(css_snippet, css_file):
    """Append a CSS snippet (preceded by a newline) to an epub CSS file.

    :param css_snippet: CSS text to append
    :param css_file: path of the CSS file, opened in append mode
    """
    with open(css_file, "a") as css_out:
        css_out.write("\n" + css_snippet)
    logging.info("Added extra code to css file")
# def add_css_snippet ends here
##############################################################
# Create .epub basic structure                               #
##############################################################
# Skeleton directories of the epub container.
if not os.path.exists( OUTPUT_DIR / "META-INF" ):
    os.mkdir( OUTPUT_DIR / "META-INF" )
if not os.path.exists( OUTPUT_DIR / "OEBPS" ):
    os.mkdir( OUTPUT_DIR / "OEBPS" )
if not os.path.exists( OUTPUT_DIR / "OEBPS" / "images" ):
    os.mkdir( OUTPUT_DIR / "OEBPS" / "images" )
if not os.path.exists( OUTPUT_DIR / "OEBPS" / "fonts" ):
    os.mkdir( OUTPUT_DIR / "OEBPS" / "fonts" )
# Copy containter.xml and mimetype
shutil.copy(EPUB_FILES / "epubcontainer.xml", OUTPUT_DIR / "META-INF/container.xml")
shutil.copy(EPUB_FILES / "epubmimetype", OUTPUT_DIR / "mimetype")
# Preparing content.opf
xmlContentopfParser = etree.XMLParser(no_network=False,load_dtd=False)
contentopf = etree.parse( str(EPUB_FILES/"epubcontentopf.xml"), xmlContentopfParser)
# This list includes all files which have already been included to avoid duplicates
listContentopf = []
#########
# Fonts #
#########
# Font sets shipped with the converter; Termes is the default.
libertine_fonts = ["GPL.txt", "LICENCE.txt", "LinLibertine_R.otf", "LinLibertine_RI.otf", "LinLibertine_RZ.otf", "LinLibertine_RZI.otf", "OFL-1.1.txt"]
termes_fonts = ["texgyretermes-bold.otf", "texgyretermes-bolditalic.otf", "texgyretermes-italic.otf", "texgyretermes-regular.otf"]
font_files = termes_fonts
mainfont_string = "TeXGyreTermes"
css_rules = EPUB_FILES / "fontfacetermes.css"
if args.font == "termes":
    logging.info("Using default font TeX Gyre Termes.")
elif args.font == "libertine":
    font_files = libertine_fonts
    mainfont_string = "Linux Libertine O"
    css_rules = EPUB_FILES / "fontfacelibertine.css"
else:
    logging.info("Font not recognized, falling back to default (TeX Gyre Termes).")
# Build the final stylesheet from the core template plus the
# font-specific @font-face rules.
with open(EPUB_FILES / "eoa-epub-core.css", "r") as template_file:
    css_template = template_file.read()
with open(css_rules, "r") as filehandler:
    fontface_string = filehandler.read()
css_template_string = string.Template(css_template)
css_replacement = css_template_string.substitute(
    MAINFONT=mainfont_string,
    FONTFACESPEC=fontface_string)
final_epub_css = OUTPUT_DIR / "OEBPS/eoa-epub.css"
with open(final_epub_css, "w") as write_css:
    write_css.write(css_replacement)
# Register the font files in the OPF manifest; the counters keep the
# generated manifest ids unique across both calls.
otf_id_counter = 1
txt_id_counter = 1
otf_id_counter, txt_id_counter = add_fonts_to_opf(font_files, contentopf, otf_id_counter, txt_id_counter)
if args.extra_font_files_directory:
    extra_fonts = os.listdir(args.extra_font_files_directory)
    otf_id_counter, txt_id_counter = add_fonts_to_opf(extra_fonts, contentopf, otf_id_counter, txt_id_counter, fontdir=args.extra_font_files_directory)
if args.extra_font_selector:
    # NOTE(review): extra_fonts is only bound when
    # --extra-font-files-directory was also given; passing
    # --extra-font-selector alone raises NameError here.
    css_selector = args.extra_font_selector
    css_snippet = create_extra_font_css(css_selector, extra_fonts)
    add_css_snippet(css_snippet, OUTPUT_DIR / "OEBPS/eoa-epub.css")
# Shortcut for namespace
htmlns = "{http://www.w3.org/1999/xhtml}"
# Load Template for Chapter HTML
xmlChapterParser = etree.XMLParser(no_network=False,load_dtd=False) #resolve_entities=False
# Preparing toc.ncx
xmlTocncxParser = etree.XMLParser(no_network=False,load_dtd=False)
tocncx = etree.parse(str(EPUB_FILES / "epubtocncx.xml"), xmlTocncxParser)
logging.info(f"{logseparator}Preparing content.opf")
xmlMetadata = contentopf.find(".//{http://www.idpf.org/2007/opf}metadata")
# Prepare Metadata based on Publication.cfg
cfgPublication = configparser.RawConfigParser()
logging.debug(f"Reading publication.cfg from {OUTPUT_DIR / 'publication.cfg'}.")
cfgPublication.read( OUTPUT_DIR / "publication.cfg")
# Frequently used publication properties.
publication_series = cfgPublication.get("Technical", "Serie")
publication_number = cfgPublication.get("Technical", "Number")
publication_license = cfgPublication.get("Technical", "License")
publication_landingpage = cfgPublication.get("Technical", "LandingPage")
# Prefer the epub-specific ISBN; fall back to the generic ISBN entry.
# The previous bare `except:` also swallowed unrelated errors such as
# KeyboardInterrupt; only the configparser lookup errors are expected.
try:
    publication_isbn = cfgPublication.get("Technical", "ISBN-epub")
except (configparser.NoOptionError, configparser.NoSectionError):
    publication_isbn = cfgPublication.get("Technical", "ISBN")
# Prepare Author String
# Join up to four authors: "A", "A and B", "A, B and C", "A, B, C and D".
strAuthorString = cfgPublication.get("Authors", "Author1")
if cfgPublication.get("Authors", "Author2") != "":
    strAuthorString = cfgPublication.get("Authors", "Author1") + " and " + cfgPublication.get("Authors", "Author2")
if cfgPublication.get("Authors", "Author3") != "":
    strAuthorString = cfgPublication.get("Authors", "Author1") + ", " + cfgPublication.get("Authors", "Author2") + " and " + cfgPublication.get("Authors", "Author3")
if cfgPublication.get("Authors", "Author4") != "":
    strAuthorString = cfgPublication.get("Authors", "Author1") + ", " + cfgPublication.get("Authors", "Author2") + ", " + cfgPublication.get("Authors", "Author3") + " and " + cfgPublication.get("Authors", "Author4")
xmlAuthor = etree.Element("{http://purl.org/dc/elements/1.1/}creator")
xmlAuthor.text = strAuthorString
xmlMetadata.append(xmlAuthor)
# Prepare Title-String
strTitleString = cfgPublication.get("Technical", "Title")
xmlTitle = etree.Element("{http://purl.org/dc/elements/1.1/}title")
xmlTitle.text = strTitleString
xmlMetadata.append(xmlTitle)
# Prepare Description via Subtitle
strSubtitleString = cfgPublication.get("Technical", "Subtitle")
if strSubtitleString != "":
    xmlSubtitle = etree.Element("{http://purl.org/dc/elements/1.1/}description")
    xmlSubtitle.text = strSubtitleString
    xmlMetadata.append(xmlSubtitle)
# Prepare Identifier (the epub ISBN doubles as the unique book id)
strIdentifier = publication_isbn
xmlIdentifier = etree.Element("{http://purl.org/dc/elements/1.1/}identifier")
xmlIdentifier.text = strIdentifier
xmlIdentifier.set("id", "BookId")
xmlMetadata.append(xmlIdentifier)
# Prepare Type
xmlType = etree.Element("{http://purl.org/dc/elements/1.1/}type")
xmlType.text = "Text"
xmlMetadata.append(xmlType)
#Prepare Date
strPublicationDate = cfgPublication.get("Technical", "PublicationDate")
xmlDate = etree.Element("{http://purl.org/dc/elements/1.1/}date")
xmlDate.text = strPublicationDate
xmlDate.set("{http://www.idpf.org/2007/opf}event", "creation")
xmlMetadata.append(xmlDate)
# Prepare Publisher
xmlPublisher = etree.Element("{http://purl.org/dc/elements/1.1/}publisher")
xmlPublisher.text = "Edition Open Access"
xmlMetadata.append(xmlPublisher)
# Prepare Rights
xmlPublisher = etree.Element("{http://purl.org/dc/elements/1.1/}rights")
xmlPublisher.text = "Published under Creative Commons by-nc-sa 3.0 Germany Licence"
xmlMetadata.append(xmlPublisher)
# Prepare Source
xmlSource = etree.Element("{http://purl.org/dc/elements/1.1/}source")
xmlSource.text = "Max Planck Research Library for the History and Development of Knowledge"
xmlMetadata.append(xmlSource)
# Prepare Subject
strSubject = cfgPublication.get("General", "Keyword1")
xmlSubject = etree.Element("{http://purl.org/dc/elements/1.1/}subject")
xmlSubject.text = strSubject
xmlMetadata.append(xmlSubject)
# Prepare Language
strLanguage = cfgPublication.get("Technical", "Language")
xmlLanguage = etree.Element("{http://purl.org/dc/elements/1.1/}language")
xmlLanguage.text = strLanguage
xmlMetadata.append(xmlLanguage)
#Prepare Cover
# The meta name="cover" entry points at the manifest id of the cover image.
xmlCover = etree.Element("meta")
xmlCover.set("content", "cover_pic")
xmlCover.set("name", "cover")
xmlMetadata.append(xmlCover)
xmlManifest = contentopf.find(".//{http://www.idpf.org/2007/opf}manifest")
xmlItem = etree.Element("item")
xmlItem.set("id", "cover_pic")
xmlItem.set("href", "images/cover.jpg")
xmlItem.set("media-type", "image/jpeg")
xmlManifest.append(xmlItem)
shutil.copy(
    OUTPUT_DIR / "cover.jpg",
    OUTPUT_DIR / "OEBPS/images/"
)
# The cover page itself is a static XHTML file shipped with the converter.
xmlItem = etree.Element("item")
xmlItem.set("id", "cover")
xmlItem.set("href", "cover.xhtml")
xmlItem.set("media-type", "application/xhtml+xml")
xmlManifest.append(xmlItem)
shutil.copy(EPUB_FILES / "epubcover.xhtml", OUTPUT_DIR / "OEBPS/cover.xhtml")
logging.info(f"{logseparator}Preparing intro.xhtml")
# The "Sources" series has its own intro template.
if publication_series == "Sources":
    tmpFilePath = EPUB_FILES / "epubintro-sources.xhtml"
else:
    tmpFilePath = EPUB_FILES / "epubintro.xhtml"
with open(tmpFilePath, "r") as tmpFile:
    strIntroHTML = tmpFile.read()
# Fill in the template placeholders.  NOTE(review): these are plain
# substring replacements, so a title that itself contains e.g. "series"
# would also be rewritten -- presumably the templates avoid collisions.
strIntroHTML = re.sub("author", strAuthorString, strIntroHTML)
strIntroHTML = re.sub("TITLE", strTitleString, strIntroHTML)
strIntroHTML = re.sub("year", cfgPublication.get("Technical", "PublicationYear"), strIntroHTML)
strIntroHTML = re.sub("series", publication_series, strIntroHTML)
strIntroHTML = re.sub("number", publication_number, strIntroHTML)
strIntroHTML = re.sub("epubisbn", publication_isbn, strIntroHTML)
if publication_license == "by-nc-sa":
    license_string = """Published under Creative Commons by-nc-sa 3.0 Germany Licence<br />
https://creativecommons.org/licenses/by-nc-sa/3.0/de/<br />"""
elif publication_license == "by-sa":
    license_string = """Published under Creative Commons Attribution-ShareAlike 4.0 International Licence<br />
https://creativecommons.org/licenses/by-sa/4.0/<br />"""
else:
    logging.error("No license found. Exiting")
    sys.exit( 1 )
strIntroHTML = re.sub("LicenseInformation", license_string, strIntroHTML)
# AdditionalInformation is optional in publication.cfg.
try:
    strIntroHTML = re.sub("AdditionalInformation", "<p>" + cfgPublication.get("General", "AdditionalInformation") + "</p>", strIntroHTML)
except configparser.NoOptionError:
    strIntroHTML = re.sub("AdditionalInformation", "", strIntroHTML)
tmpFilePath = OUTPUT_DIR / "OEBPS/intro.xhtml"
# tmpFilePath = os.getcwd() + "/CONVERT/epub/OEBPS/intro.xhtml"
# Write the finished intro page; the previous version never closed this
# file handle, relying on interpreter shutdown to flush it.
with open(tmpFilePath, "w") as tmpFile:
    tmpFile.write(strIntroHTML)
logging.info(f"{logseparator}Preparing toc.ncx")
# Fill the NCX head, title and author from the publication metadata.
xmlHead = tocncx.find("//{http://www.daisy.org/z3986/2005/ncx/}head")
xmlMeta = etree.Element("meta")
xmlMeta.set("name", "dtb:uid")
xmlMeta.set("content", publication_isbn)
xmlHead.append(xmlMeta)
xmlTitle = tocncx.find("//{http://www.daisy.org/z3986/2005/ncx/}docTitle")
xmlText = etree.Element("text")
xmlText.text = strTitleString
xmlTitle.append(xmlText)
xmlAuthor = tocncx.find("//{http://www.daisy.org/z3986/2005/ncx/}docAuthor")
xmlText = etree.Element("text")
xmlText.text = strAuthorString
xmlAuthor.append(xmlText)
##############################################################
# Convert Tralics-XML to Epub                                #
##############################################################
#xmlTree = remove_processinginstruction(xmlTree, 'hyperimage')
# Copy xmlTree to xmlEbookTree
xmlEbookTree = deepcopy(xmlTree)
# xmlChapters is a list containing all chapters
xmlChapters = xmlEbookTree.findall("//div1")
# Convert Chapters, Sections, Subsections and Subsubsections to h1, h2, h3, h4
# Insert Number from Dictionary where needed
logging.info(f"{logseparator}Convert EOAChapter to H1")
for xmlChapter in xmlChapters:
    xmlChapter.find("head").tag = "h1"
    if xmlChapter.get("rend") != "nonumber":
        idChapter = xmlChapter.get("id")
        # logging.info(idChapter + " konvertierung into h1")
        # logging.info(dictChapters[idChapter])
        # Prefix the chapter number from the pickled numbering dict.
        strHeadline = xmlChapter.find("h1").text or ""
        xmlChapter.find("h1").text = str(dictChapters[idChapter]) + ". " + strHeadline
    if xmlChapter.find(".//EOAauthor") is not None:
        # Show the chapter author in italics right after the heading.
        tmpXML = etree.Element("p")
        tmpXML.append(etree.Element("i"))
        tmpXML[0].text = xmlChapter.find(".//EOAauthor").text
        xmlChapter.insert(1, tmpXML)
        # Remove unwanted EOAauthor here
        xmlChapter.find(".//EOAauthor").text = ""
        # NOTE(review): strip_tags returns None, so this rebinds the
        # loop variable to None; harmless since it is the last use.
        xmlChapter = etree.strip_tags(xmlChapter, "EOAauthor")
# logging.info(dictSections)
logging.info(f"{logseparator}Convert EOAsection to H2")
xmlSections = xmlEbookTree.findall(".//div2")
for xmlSection in xmlSections:
    xmlSection.find("head").tag = "h2"
    idSection = xmlSection.get("id")
    strHeadline = xmlSection.find("h2").text or ""
    logging.info(strHeadline)
    if xmlSection.get("rend") != "nonumber":
        # Sections use the dotted number from dictSections, no period.
        xmlSection.find("h2").text = str(dictSections[idSection]) + " " + strHeadline
    else:
        xmlSection.find("h2").text = strHeadline
logging.info(f"{logseparator}Convert EOAsubsection to H3")
xmlSubsections = xmlEbookTree.findall(".//div3")
for xmlSubsection in xmlSubsections:
    xmlSubsection.find("head").tag = "h3"
    idSection = xmlSubsection.get("id")
    strHeadline = xmlSubsection.find("h3").text or ""
    logging.info(strHeadline)
    if xmlSubsection.get("rend") != "nonumber":
        xmlSubsection.find("h3").text = str(dictSections[idSection]) + " " + strHeadline
    else:
        xmlSubsection.find("h3").text = strHeadline
logging.info(f"{logseparator}Convert EOAsubsubsection to H4")
xmlSubsubsections = xmlEbookTree.findall(".//div4")
for xmlSubsubsection in xmlSubsubsections:
    # Sub-subsections are converted but never numbered here.
    xmlSubsubsection.find("head").tag = "h4"
    #if xmlSubsubsection.get("rend") != "nonumber":
        #idSection = xmlSubsection.get("id")
        #strHeadline = xmlSubsection.find("h4").text
        #xmlSubsection.find("h3").text = str(dictSections[idSection]) + " " + strHeadline
logging.info(f"{logseparator}Convert EOAparagraph to H5")
xmlParagraphs = xmlEbookTree.findall(".//div5")
for xmlParagraph in xmlParagraphs:
    logging.info("Found a paragraph.")
    xmlParagraph.find("head").tag = "h5"
logging.info(f"{logseparator}Dealing with dividing milestone")
# A divider paragraph is rendered as a single asterisk.
xmlParagraphs = xmlEbookTree.findall(".//p[@class='divider']")
for xmlParagraph in xmlParagraphs:
    logging.info("Found a divider.")
    xmlParagraph.text = "*"
logging.info(f"{logseparator}Preparing Figures")
# Regular figures; Hyperimage-only ("hionly") figures are handled below.
xmlFigures = xmlEbookTree.xpath(".//EOAfigure[not(contains(@type,'hionly'))]")
logging.info("Found %s figures", len(xmlFigures))
for xmlFigure in xmlFigures:
    # Copy File of the Image
    # If it's in a subfolder, name of folder and name of image will be merged
    strImageFileString = xmlFigure.find(".//file").text
    logging.debug(f"Working on image {strImageFileString}.")
    strImageFileString = strImageFileString.rstrip("\n")
    strImageFileDir = os.path.dirname(strImageFileString)
    # Remove / from path
    strImageFileDir = re.sub("/", "", strImageFileDir)
    strImageFileName = os.path.basename(strImageFileString)
    strImageFileNamewoSuffix, strImageFileName_Suffix = os.path.splitext(strImageFileName)
    if strImageFileName_Suffix == ".jpeg":
        strImageFileName = strImageFileName.replace(".jpeg", ".jpg")
    shutil.copy(
        PUBLICATION_DIR / strImageFileString,
        OUTPUT_DIR / "OEBPS/images" / (strImageFileDir + strImageFileName)
    )
    extension_and_mime = get_mimetype(strImageFileName_Suffix)
    # Normalize the copied image (external gm/pdfcrop tools).
    strImageFilepath = libeoaconvert.sanitizeImage(
        OUTPUT_DIR / "OEBPS/images" / (strImageFileDir + strImageFileName),
        TEMP_DIR,
        GM_PATH,
        PDFCROP_EXEC,
    )
    # strImageFilepath = libeoaconvert.sanitizeImage(os.getcwd() + "/CONVERT/epub/OEBPS/images/" + strImageFileDir + strImageFileName, GM_PATH, PDFCROP_EXEC)
    # Add copied file to contentopf
    content_opf_filename = Path ("images") / "{}{}.{}".format(strImageFileDir, strImageFileNamewoSuffix, extension_and_mime)
    content_opf_fileid = "{}{}{}".format(strImageFileDir, strImageFileNamewoSuffix, extension_and_mime)
    contentopf = addToContentopf(
        contentopf,
        content_opf_filename,
        content_opf_fileid,
        extension_and_mime
    )
    '''
    content_opf_filename = "images" + os.path.sep + "{}{}.{}".format(strImageFileDir, strImageFileNamewoSuffix, extension_and_mime)
    content_opf_fileid = "{}{}{}".format(strImageFileDir, strImageFileNamewoSuffix, extension_and_mime)
    contentopf = addToContentopf(contentopf, content_opf_filename, content_opf_fileid, extension_and_mime)
    '''
    idFigure = xmlFigure.find(".//anchor").get("id")
    xmlFigureCaption = xmlFigure.find(".//caption")
    intFigureNumber = dictFigures[idFigure]
    # Figure width in percent; large-scale figures always get full width.
    if xmlFigure.tag == "EOAfigure":
        strImageWidth = xmlFigure.find(".//width").text
        strImageWidth = strImageWidth.rstrip("\n")
    if xmlFigure.tag == "EOAlsfigure":
        strImageWidth = "100"
    # Replace the EOAfigure element in place by a centered <p><img/></p>.
    xmlFigure.clear()
    xmlFigure.tag = "p"
    xmlFigure.set("class", "centered_image")
    xmlFigureImage = etree.Element("img")
    xmlFigureImage.set("src", "images/" + strImageFileDir + strImageFileNamewoSuffix + "." + extension_and_mime)
    xmlFigureImage.set("alt", "")
    xmlFigureImage.set("style", "width: " + strImageWidth + "%")
    xmlFigure.append(xmlFigureImage)
    xmlFigureCaption.tag = "p"
    strFigureCaption = xmlFigureCaption.text or ""
    # FIX
    # Caption label (e.g. "Fig.") looked up in the chapter's language.
    dictLangFigures = translation_xml.find("//entry[@name='fig']").attrib
    xmlChapter = xmlFigure.xpath("./ancestor::div1")[0]
    figures_text = dictLangFigures[libeoaconvert.two_letter_language(xmlChapter.get("language"))]
    if len(strFigureCaption) == 0:
        xmlFigureCaption.text = f"{figures_text} {str(intFigureNumber)}"
    else:
        xmlFigureCaption.text = f"{figures_text} {str(intFigureNumber)}: {strFigureCaption}"
    # Put the caption paragraph right after the image paragraph.
    xmlFigure.addnext(xmlFigureCaption)
    # Change the tag of the parent <p>-Tag to <div> so that it may be removed
    #xmlFigure.getparent().tag = "div"
# Hyperimage-only figures carry no printable content in the EPUB: strip
# each one down to an empty <EOAhifigure> placeholder element.
hyperimage_figures = xmlEbookTree.xpath(".//EOAfigure[contains(@type,'hionly')]")
logging.info("Found %s hyperimage figures", len(hyperimage_figures))
for hyperimage_figure in hyperimage_figures:
    hyperimage_figure.clear()
    hyperimage_figure.tag = "EOAhifigure"
logging.info(f"{logseparator}Preparing not numbered Figures")
# Un-numbered figures (<EOAfigurenonumber>): copy the image file into the
# EPUB container, register it in content.opf and replace the element with a
# plain <p><img/></p> construct (no caption, no figure number).
xmlFigures = xmlEbookTree.findall(".//EOAfigurenonumber")
for xmlFigure in xmlFigures:
    # Copy File of the Image
    # If it's in a subfolder, name of folder and name of image will be merged
    strImageFileString = xmlFigure.find(".//file").text
    strImageFileString = strImageFileString.rstrip("\n")
    # flatten the directory part: "Images/foo.png" -> prefix "Images"
    strImageFileDir = os.path.dirname(strImageFileString)
    strImageFileDir = re.sub("/", "", strImageFileDir)
    strImageFileName = os.path.basename(strImageFileString)
    strImageFileNamewoSuffix, strImageFileName_Suffix = os.path.splitext(strImageFileName)
    if strImageFileName_Suffix == ".jpeg":
        # normalise the extension of the copied file to ".jpg"
        strImageFileName = strImageFileName.replace(".jpeg", ".jpg")
    shutil.copy(
        PUBLICATION_DIR / strImageFileString,
        OUTPUT_DIR / "OEBPS/images" / (strImageFileDir + strImageFileName)
    )
    # run the copied image through sanitizeImage (uses GraphicsMagick and
    # pdfcrop, hence the two executable paths)
    strImageFilepath = libeoaconvert.sanitizeImage(
        OUTPUT_DIR / "OEBPS/images" / (strImageFileDir + strImageFileName),
        TEMP_DIR,
        GM_PATH,
        PDFCROP_EXEC
    )
    # Add copied file to contentopf
    extension_and_mime = get_mimetype(strImageFileName_Suffix)
    # the "-nonumber-" infix keeps this manifest id distinct from the id
    # used for a numbered figure based on the same image file
    contentopf = addToContentopf(
        contentopf,
        "images/" + strImageFileDir + strImageFileNamewoSuffix + "." + extension_and_mime,
        strImageFileDir + strImageFileNamewoSuffix + "-nonumber-" + extension_and_mime,
        extension_and_mime
    )
    logging.debug("Added a nonumber figure")
    strImageWidth = xmlFigure.find(".//width").text
    strImageWidth = strImageWidth.rstrip("\n")
    # replace the whole <EOAfigurenonumber> subtree by <p><img/></p>,
    # keeping the requested width as a percentage
    xmlFigure.clear()
    xmlFigure.tag = "p"
    xmlFigureImage = etree.Element("img")
    xmlFigureImage.set("src", "images/" + strImageFileDir + strImageFileNamewoSuffix + "." + extension_and_mime)
    xmlFigureImage.set("alt", "")
    xmlFigureImage.set("style", "width: " + strImageWidth + "%")
    xmlFigure.append(xmlFigureImage)
logging.info(f"{logseparator}Preparing Footnotes")
def alph_footnote_index(fndex):
    """Return the zero-indexed lowercase-Latin label for footnote *fndex*.

    Labels run a..z, then aa, ab, ... (spreadsheet-column style), so more
    than 26 footnotes per grouping are supported.

    >>> alph_footnote_index(0)
    'a'
    >>> alph_footnote_index(1)
    'b'
    >>> alph_footnote_index(24)
    'y'
    >>> alph_footnote_index(25)
    'z'
    >>> alph_footnote_index(26)
    'aa'
    >>> alph_footnote_index(27)
    'ab'
    """
    letters = string.ascii_lowercase
    label = ""
    index = fndex
    while True:
        index, position = divmod(index, len(letters))
        label = letters[position] + label
        if not index:
            return label
        # bijective base-26: no "zero digit", so shift before recursing
        index -= 1
# def alph_footnote_index ends here
def replace_footnote_equations(footnote):
    """Replace un-numbered equations inside *footnote* by <img> references.

    Every <EOAequationnonumber> descendant of *footnote* is turned into a
    <p><img/></p> element, its pre-rendered PNG is copied into the EPUB
    container and registered in the content.opf manifest.

    Usage: contentopf = replace_footnote_equations(my_footnote)
    The function starts from the module-level `contentopf` and returns the
    updated tree; the caller must rebind the global with the return value
    (returning the result seemed like a better idea than mutating the
    global variable).
    """
    result = contentopf
    for equation in footnote.findall(".//EOAequationnonumber"):
        filename = equation.get("filename")
        equation.clear()
        equation.tag = "p"
        img = etree.Element("img", src="images/%s" % filename, alt="")
        equation.append(img)
        shutil.copy(
            INPUT_DIR / "items" / filename,
            # bug fix: this copy previously targeted "DEBPS/images", a
            # directory that no other pass creates or uses -- every other
            # image copy in this script goes to "OEBPS/images"
            OUTPUT_DIR / "OEBPS/images" / filename
        )
        result = addToContentopf(
            result,
            "images/" + filename,
            filename,
            "png"
        )
    logging.info("einmal durch replace_footnote_equations")
    return result
# def replace_footnote_equations ends here
def replace_footnote_with_sup(note):
    """Turn *note* into an empty <sup> element in place.

    Element.clear() wipes text, children, attributes *and* the tail, but
    the tail text belongs to the surrounding document flow rather than to
    the footnote itself -- so it is saved first and restored afterwards.
    """
    saved_tail = note.tail
    note.clear()
    note.tag = "sup"
    note.tail = saved_tail
# def replace_footnote_with_sup ends here
def bring_footnote_down_epub(footnote, footnote_name, destination):
    """Move the body of *footnote* into *destination* and leave a link behind.

    Captures reusable behavior from the existing code; potentially, some of
    the old code could be replaced by calls to this helper.

    The in-text <note> element is replaced by a "[<a>name</a>]" superscript
    that links down to the chapter's footnote block, and its former text and
    children are appended to *destination*. Images of un-numbered equations
    inside the footnote are registered in content.opf on the way.

    usage: contentopf = bring_footnote_down_epub(my_footnote, "1", xmlNewFootnotes)
    unfortunately, returning the result seemed like a better idea than
    mutating the global variable -- the caller must rebind `contentopf`
    with the return value.
    """
    # NB: this local `contentopf` shadows the module-level name; see usage note
    contentopf = replace_footnote_equations(footnote) # see usage note
    kids = list(footnote.getchildren())
    prefix = "[%s]" % footnote_name
    # we would like to prepend this footnote identifier to the footnote element
    if footnote.text is not None:
        # if the element starts with some text anyway, prepend it there
        # footnote.text = "%s %s" % (prefix, footnote.text)
        pass
    else:
        # if, however, the element begins with a child, prepend the text at
        # the beginning of the first child instead
        if len(kids):
            first_child = kids[0]
            # child_text = prefix
            child_text = ""
            # separate them with a space, unless the child had no text to begin with
            child_suffix = first_child.text
            if child_suffix is None:
                child_suffix = ""
            else:
                child_text += " "
            child_text += child_suffix
            first_child.text = child_text
        else:
            # a totally empty footnote is weird, but who am I to judge?
            footnote.text = prefix
    # remember the footnote's own text before the element is cleared
    footnote_text = footnote.text or ""
    # turn the in-text <note> into an empty <sup>, then build the "[n]"
    # hyperlink pointing down to the footnote block
    replace_footnote_with_sup(footnote)
    footnote.text = "["
    note_link = etree.SubElement(footnote, "a")
    note_link.set("href", "#fn" + footnote_name)
    note_link.set("id", "body_fn-ref" + footnote_name)
    note_link.text = "%s" % footnote_name
    note_link.tail = "]"
    # append any text the footnote used to have to the destination
    destkids = list(destination.getchildren())
    if len(destkids):
        # if the destination has children, append after the last one's tail
        last_kid = destkids[-1]
        prefix = last_kid.tail
        if prefix is None:
            prefix = ""
        else:
            prefix += " "
        last_kid.tail = prefix + footnote_text
    else:
        # if the destination has no children, append to its text
        prefix = destination.text
        if prefix is None:
            prefix = ""
        else:
            prefix += " "
        destination.text = prefix + footnote_text
    # finally move the footnote's former children over as well
    for kid in kids:
        destination.append(kid)
    return contentopf
# def bring_footnote_down_epub ends here
class FootnoteError(Exception):
    """Raised when a chapter mixes the two supported footnote styles.

    Only one footnote flavour may be used per chapter; old-style \\EOAfn
    notes and \\EOAfnalph-style groupings must not be combined.
    """
# class FootnoteError ends here
intTechnicalChapterNumber = 1
for xmlChapter in xmlChapters:
    # Two footnote flavours exist per chapter: "old-style" <note> elements
    # and "new-style" grouped notes delivered by get_bigfoot_data().
    groupings = libeoaconvert.get_bigfoot_data(xmlChapter)
    # notes inside <opener> are excluded from the body footnotes
    xmlFootnotes = xmlChapter.xpath(".//note[not(ancestor::opener)]")
    logging.info(f"Found {str(len(xmlFootnotes))} footnotes in this chapter.")
    has_old = 0 != len(xmlFootnotes)
    has_new = 0 != len(
        [ # flatten the association list whose values are lists, so we can take the length
            note
            for grouping, notes in groupings
            for note in notes
        ]
    )
    # the XOR case falls through, the AND is an error, and the NOR skips to the next chapter
    if has_old:
        if has_new:
            raise FootnoteError("Chapter %s contains both \\EOAfn and footnotes in the style of \\EOAfnalph" % xmlChapter.get("id-text"))
    else:
        if not has_new:
            continue
    # container for the chapter-final footnote section, headed by a
    # localised "Footnotes" heading taken from the translation table
    xmlNewFootnotes = etree.Element("div")
    xmlNewFootnotesHeader = etree.Element("h3")
    dictLangFootnotes = translation_xml.find("//entry[@name='footnotes']").attrib
    xmlNewFootnotesHeader.text = dictLangFootnotes[libeoaconvert.two_letter_language(xmlChapter.get("language"))]
    xmlNewFootnotes.append(xmlNewFootnotesHeader)
    for grouping, notes in groupings:
        # do for the new-style footnotes what was being done for the old
        for index, note in enumerate(notes):
            footnote_name = str(index + 1)
            if "lower-latin" == grouping:
                # alphabetic groupings are labelled a, b, ..., z, aa, ...
                footnote_name = alph_footnote_index(index)
            # bottom-of-chapter entry: "[<a id=fnN href=#body_fn-refN>N</a>]"
            para = etree.Element("p")
            para.text = "["
            note_link = etree.SubElement(para, "a")
            note_link.set("id", "fn" + footnote_name)
            note_link.set("href", "#body_fn-ref" + footnote_name)
            note_link.text = "%s" % footnote_name
            note_link.tail = "]"
            contentopf = bring_footnote_down_epub(note, footnote_name, para)
            xmlNewFootnotes.append(para)
    # NOTE(review): tmpFileName is not used in this pass -- possibly a leftover
    tmpFileName = "chapter" + (str(intTechnicalChapterNumber)) + ".xhtml"
    intFootnoteNumber = 1
    for xmlFootnote in xmlFootnotes:
        # Not numbered Equations may appear in a footnote, need to be treated differently
        xmlEquationsnonumber = xmlFootnote.findall(".//EOAequationnonumber")
        for xmlEquationnonumber in xmlEquationsnonumber:
            strFilename = xmlEquationnonumber.get("filename")
            xmlEquationnonumber.clear()
            xmlEquationnonumber.tag = "p"
            xmlIMG = etree.Element("img", src="images/"+ strFilename, alt="")
            xmlEquationnonumber.append(xmlIMG)
            shutil.copy(
                INPUT_DIR / "items" / strFilename,
                OUTPUT_DIR / "OEBPS/images" / strFilename
            )
            contentopf = addToContentopf(contentopf, "images/" + strFilename, strFilename, "png")
        tmp_fnstring = "fn" + str(intFootnoteNumber)
        tmp_fnrefstring = "body_fn-ref" + str(intFootnoteNumber)
        # assumes every footnote has at least one child element
        xmlFirstChild = xmlFootnote.getchildren()[0]
        # this is for the reference text: prepend "[N] " (as a link back to
        # the in-text marker) to the first child of the footnote
        if xmlFirstChild.text is None:
            xmlNewFootnoteRefBottom = etree.SubElement(xmlFirstChild, "a", href = "#" + tmp_fnrefstring, id = tmp_fnstring)
            xmlFirstChild.text = "["
            xmlNewFootnoteRefBottom.text = str(intFootnoteNumber)
            xmlNewFootnoteRefBottom.tail = "] "
            xmlFirstChild.insert(0, xmlNewFootnoteRefBottom)
        else:
            xmlNewFootnoteRefBottom = etree.Element("a", href = "#" + tmp_fnrefstring, id = tmp_fnstring)
            xmlNewFootnoteRefBottom.text = str(intFootnoteNumber)
            # keep the original beginning of the footnote after the "[N] "
            beginning_of_footnote = xmlFirstChild.text
            xmlFirstChild.text = "["
            xmlNewFootnoteRefBottom.tail = "] " + beginning_of_footnote
            xmlFirstChild.insert(0, xmlNewFootnoteRefBottom)
        #Preserve tail and children of current <note>-Tag
        xmlFootnoteContentsTail = xmlFootnote.tail
        xmlFootnoteChildren = xmlFootnote.getchildren()
        # Substitute current <note> with Number: "[<a>N</a>]" superscript
        xmlFootnote.clear()
        xmlFootnote.tag = "sup"
        xmlFootnote.text = "["
        xmlFootnote.tail = xmlFootnoteContentsTail
        xmlNewFootnoteRef = etree.SubElement(xmlFootnote, "a", href = "#" + tmp_fnstring, id = tmp_fnrefstring)
        xmlNewFootnoteRef.text = str(intFootnoteNumber)
        xmlNewFootnoteRef.tail = "]"
        # move the footnote body into the chapter-final footnote block
        if len(xmlFootnoteChildren) != 0:
            for xmlFootnoteChild in xmlFootnoteChildren:
                xmlNewFootnotes.append(xmlFootnoteChild)
        intFootnoteNumber += 1
    xmlChapter.append(xmlNewFootnotes)
    intTechnicalChapterNumber += 1
logging.info(f"{logseparator}Preparing Lists")
# Ordered and simple lists become <ol>/<ul>; description lists are left
# untouched here and picked up by the following pass.
for xmlChapter in xmlChapters:
    xmlLists = xmlChapter.findall(".//list")
    for xmlList in xmlLists:
        if xmlList.get("type") == "description":
            continue
        if xmlList.get("type") == "ordered":
            xmlList.tag = "ol"
            # NOTE(review): "..//item" searches from the *parent* element's
            # subtree; ".//item" (the first item of this very list) may
            # have been intended -- verify against sample input
            xmlFirstItem = xmlList.find("..//item")
            # use the first item's number as the <ol start="..."> value
            firstitemnumber = xmlFirstItem.get("id-text")
            xmlList.set("start", firstitemnumber)
            firstitemlabel = xmlFirstItem.get("label")
            xmlFirstItem.tag = "li"
            if firstitemlabel != f"({firstitemnumber})":
                # custom label: suppress the browser-generated numbering
                xmlFirstItem.set("style","list-style:none;")
            xmlListItems = xmlList.findall(".//item")
            for xmlListItem in xmlListItems:
                xmlListItem.tag = "li"
                itemnumber = xmlListItem.get("id-text")
                itemlabel = xmlListItem.get("label")
                if itemlabel != f"({itemnumber})":
                    # custom label: hide the list marker and prepend the
                    # label to the item's first paragraph instead
                    xmlListItem.set("style","list-style:none;")
                    itemparagraph = xmlListItem.find("p")
                    paratext = itemparagraph.text
                    itemparagraph.text = f"{itemlabel} {paratext}"
        if xmlList.get("type") == "simple":
            xmlList.tag = "ul"
            xmlListItems = xmlList.findall(".//item")
            for xmlListItem in xmlListItems:
                xmlListItem.tag = "li"
logging.info(f"{logseparator}Preparing Descriptions")
# Ordering dependency: the previous pass already retagged ordered and
# simple lists to <ol>/<ul>, so the <list> elements still remaining here
# are exactly the description lists it skipped. They become <dl>/<dt>/<dd>.
for xmlChapter in xmlChapters:
    xmlDescriptions = xmlChapter.findall(".//list")
    for xmlDescription in xmlDescriptions:
        xmlDescription.tag = "dl"
        del xmlDescription.attrib["type"]
        for xmlChild in xmlDescription.iterchildren():
            if xmlChild.tag == "label":
                xmlChild.tag = "dt"
            if xmlChild.tag == "item":
                xmlChild.tag = "dd"
                # drop internal ids; they carry no meaning in the output
                del xmlChild.attrib["id"]
                if xmlChild.get("id-text"):
                    del xmlChild.attrib["id-text"]
logging.info(f"{logseparator}Preparing Blockquotes")
# Paragraphs marked rend="quoted" become <blockquote><p>...</p></blockquote>.
for quoted_para in xmlEbookTree.findall(".//p"):
    if quoted_para.get("rend") != "quoted":
        continue
    old_text = quoted_para.text
    old_tail = quoted_para.tail
    old_children = list(quoted_para)
    quoted_para.clear()
    quoted_para.tag = "blockquote"
    inner_para = etree.Element("p")
    if old_text is not None:
        inner_para.text = old_text
    for former_child in old_children:
        inner_para.append(former_child)
    if old_tail is not None:
        # NB: the tail is re-attached to the inner <p>, mirroring the
        # previous behaviour of this pass
        inner_para.tail = old_tail
    quoted_para.append(inner_para)
logging.info(f"{logseparator}Preparing Theorems")
for xmlChapter in xmlChapters:
    xmlTheorems = xmlChapter.findall(".//theorem")
    for xmlTheorem in xmlTheorems:
        xmlTheoremHead = xmlTheorem.find(".//head")
        # NOTE(review): the next three variables are never used in this pass
        strTheoremTitel = xmlTheorem.find(".//head").text
        strTheoremText = xmlTheorem.find(".//p").text
        xmlTheoremTextTail = xmlTheorem.find(".//p").tail
        strTheoremNumber = xmlTheorem.get("id-text")
        # the theorem becomes a plain paragraph with a bold
        # "<title> <number>" head
        xmlTheorem.tag = "p"
        xmlTheoremHead.tag = "b"
        xmlTheoremHead.text = xmlTheoremHead.text + " " + strTheoremNumber
        del xmlTheorem.attrib["style"]
        del xmlTheorem.attrib["type"]
        del xmlTheorem.attrib["id-text"]
        del xmlTheorem.attrib["id"]
        # dissolve the inner <p> wrappers, keeping their contents
        etree.strip_tags(xmlTheorem, "p")
logging.info(f"{logseparator}Preparing Hyperlinks")
# Rewrite every <xref> using the chapter language (default: english).
for chapter in xmlChapters:
    chapter_language = chapter.get("language")
    # KT changing this after separating the big script
    effective_language = "english" if chapter_language is None else chapter_language
    for hyperlink in chapter.findall(".//xref"):
        libeoaconvert.format_hyperlinks_django_epub(hyperlink, effective_language)
# The following passes map the remaining inline markup onto plain XHTML:
# <hi rend="it"> -> <em>, <hi rend="bold"> -> <b>, EOAup/EOAdown -> sup/sub,
# and EOAst/EOAls/EOAcaps -> styled <span> elements.
logging.info(f"{logseparator}Convert emphasized text")
for chapter in xmlChapters:
    for element in chapter.findall(".//hi"):
        if element.get("rend") == "it":
            element.tag = "em"
            del element.attrib["rend"]
logging.info(f"{logseparator}Convert bold text")
for chapter in xmlChapters:
    for element in chapter.findall(".//hi"):
        if element.get("rend") == "bold":
            element.tag = "b"
            del element.attrib["rend"]
logging.info(f"{logseparator}Convert EOAup to <sup>")
for chapter in xmlChapters:
    for element in chapter.findall(".//EOAup"):
        element.tag = "sup"
logging.info(f"{logseparator}Convert EOAdown to <sub>")
for chapter in xmlChapters:
    for element in chapter.findall(".//EOAdown"):
        element.tag = "sub"
logging.info(f"{logseparator}Convert EOAst to <span>")
for chapter in xmlChapters:
    for element in chapter.findall(".//EOAst"):
        element.tag = "span"
        element.set("style", "text-decoration: line-through;")
logging.info(f"{logseparator}Convert EOAls to something nice")
for chapter in xmlChapters:
    for element in chapter.findall(".//EOAls"):
        element.tag = "span"
        element.set("style", "letter-spacing: 0.5em;")
logging.info(f"{logseparator}Convert EOAcaps to something nice")
for chapter in xmlChapters:
    for element in chapter.findall(".//EOAcaps"):
        element.tag = "span"
        element.set("style", "font-variant:small-caps;")
logging.info(f"{logseparator}Convert EOAineq into appropriate IMG-Tags")
# PNG renderings of inline equations are copied into the container and the
# <EOAineq> elements become <img> tags; the TeX source survives as alt text.
for xmlChapter in xmlChapters:
    xmlInlineEquations = xmlChapter.findall(".//EOAineq")
    for xmlInlineEquation in xmlInlineEquations:
        xmlInlineEquation.tag = "img"
        xmlInlineEquation.set("alt", xmlInlineEquation.get("TeX"))
        del xmlInlineEquation.attrib["TeX"]
        shutil.copy(
            INPUT_DIR / "items" / xmlInlineEquation.get("src"),
            OUTPUT_DIR / "OEBPS/images" / xmlInlineEquation.get("src")
        )
        xmlInlineEquation.set("src", "images/" + xmlInlineEquation.get("src"))
        # NOTE(review): at this point "src" already carries the "images/"
        # prefix, so it is used as both href *and* manifest id -- unlike the
        # other addToContentopf calls, where the id is the bare file name;
        # verify that addToContentopf tolerates this
        contentopf = addToContentopf(contentopf, xmlInlineEquation.get("src"), xmlInlineEquation.get("src"), "png")
logging.info(f"{logseparator}Convert EOAchem into appropriate IMG-Tags")
# Same treatment for inline chemical formulae.
for xmlChapter in xmlChapters:
    xml_inline_chems = xmlChapter.findall(".//EOAchem")
    for xml_inline_chem in xml_inline_chems:
        xml_inline_chem.tag = "img"
        xml_inline_chem.set("alt", xml_inline_chem.get("TeX"))
        del xml_inline_chem.attrib["TeX"]
        shutil.copy(
            INPUT_DIR / "items" / xml_inline_chem.get("src"),
            OUTPUT_DIR / "OEBPS/images" / xml_inline_chem.get("src")
        )
        xml_inline_chem.set("src", "images/" + xml_inline_chem.get("src"))
        contentopf = addToContentopf(contentopf, xml_inline_chem.get("src"), xml_inline_chem.get("src"), "png")
logging.info(f"{logseparator}Convert EOAinline into appropriate IMG-Tags")
# Inline images: copy the file, shrink it to icon size with GraphicsMagick
# and replace the element by an <img> tag.
for xmlChapter in xmlChapters:
    xmlInlineElements = xmlChapter.findall(".//EOAinline")
    for xmlInlineElement in xmlInlineElements:
        xmlInlineElement.tag = "img"
        xmlInlineElement.set("alt", "Too late")
        strInlineElementFilePath = xmlInlineElement.text
        # remove text from element. This is visible in epub (at least in calibre's e-book-viewer)
        # however, the text is taken as id in content.opf
        # set it to nil after the addToContentopf
        strInlineElementFileName = os.path.basename(strInlineElementFilePath)
        strInlineElementDirName = os.path.dirname(strInlineElementFilePath)
        # only the last path component of the directory is kept as prefix
        strInlineElementSubDirName = os.path.dirname(strInlineElementFilePath).split(os.path.sep)[-1]
        strNewImagePath = OUTPUT_DIR / "OEBPS/images" / Path(strInlineElementSubDirName + strInlineElementFileName)
        # trouble when there are subdirectories in Image path!
        # some thing goes wrong here: <EOAinline>Images/png_300dpi/A.png</EOAinline>
        shutil.copy(
            PUBLICATION_DIR / strInlineElementDirName / strInlineElementFileName,
            strNewImagePath
        )
        # shrink the copied image in place so it fits into a text line
        strCommand = f"{GM_PATH} convert {strNewImagePath} -resize 20x20 {strNewImagePath}"
        listArguments = shlex.split(strCommand)
        subprocess.check_output(listArguments, shell=False)
        xmlInlineElement.set("src", "images/" + strInlineElementSubDirName + strInlineElementFileName)
        # contentopf, Filename, FileID, Mediatype
        # <item id="Troublemaker" media-type="image/png" href="images/inlineA.jpg"/>
        # Mediatype should not be hard coded!!!
        # base this on file extension
        extension = strInlineElementFileName.split(".")[-1]
        contentopf = addToContentopf(contentopf, "images/" + strInlineElementSubDirName + strInlineElementFileName, xmlInlineElement.text, extension)
        xmlInlineElement.text = ""
logging.info(f"{logseparator}Epigraphs")
# Epigraph wrappers are renamed to "tagtobestripped" (the tag name suggests
# a later pass removes them); the children keep an "epigraph" class.
for chapter in xmlChapters:
    for epigraph_element in chapter.findall(".//epigraph"):
        epigraph_element.tag = "tagtobestripped"
        for epigraph_child in epigraph_element:
            epigraph_child.set("class", "epigraph")
logging.info(f"{logseparator}Preparing Verses")
# A verse becomes a single <p class="verse"> whose lines are separated by
# <br/> elements instead of individual <p> wrappers.
for chapter in xmlChapters:
    for verse in chapter.findall(".//EOAverse"):
        verse_lines = verse.getchildren()
        # every line but the last gets a trailing line break before the
        # wrapping <p> elements are dissolved
        for verse_line in verse_lines[:-1]:
            verse_line.append(etree.Element("br"))
        etree.strip_tags(verse, "p")
        verse.tag = "p"
        existing_class = verse.get("class")
        if existing_class is None:
            verse.set("class", "verse")
        else:
            verse.set("class", existing_class + " verse")
logging.info(f"{logseparator}Preparing Equations")
# Numbered display equations: copy the pre-rendered PNG, register it in
# content.opf and replace <EOAequation> by <p><img/></p>, followed by a
# separate "(<number>)" paragraph.
for xmlChapter in xmlChapters:
    xmlEquations = xmlChapter.findall(".//EOAequation")
    for xmlEquation in xmlEquations:
        # NOTE(review): strNumber duplicates strEquationNumber below and is unused
        strNumber = xmlEquation.get("number")
        strFilename = xmlEquation.get("filename")
        # Copy image of Equation
        shutil.copy(
            INPUT_DIR / "items" / strFilename,
            OUTPUT_DIR / "OEBPS/images" / strFilename
        )
        contentopf = addToContentopf(contentopf, "images/" + strFilename, strFilename, "png")
        # Find out Number of Equation to be appended in the last step
        strEquationNumber = xmlEquation.get("number")
        # Rework XML
        xmlEquation.clear()
        xmlEquation.tag = "p"
        xmlEquationImage = etree.Element("img")
        xmlEquationImage.set("src", "images/" + strFilename)
        xmlEquationImage.set("alt", "")
        xmlEquation.append(xmlEquationImage)
        xmlNew = etree.Element('p')
        xmlNew.text = "(" + strEquationNumber + ")"
        xmlEquation.addnext(xmlNew)
        # Parent tag of Equation should be <div> instead of <p>, so that it may be removed
        #xmlEquation.getparent().tag = "div"
# Un-numbered display equations: same treatment as above but without the
# trailing "(<number>)" paragraph.
for xmlChapter in xmlChapters:
    xmlEquations = xmlChapter.findall(".//EOAequationnonumber")
    for xmlEquation in xmlEquations:
        strFilename = xmlEquation.get("filename")
        # Copy image of Equation
        shutil.copy(
            INPUT_DIR / "items" / strFilename,
            OUTPUT_DIR / "OEBPS/images" / strFilename
        )
        contentopf = addToContentopf(contentopf, "images/" + strFilename, strFilename, "png")
        # Rework XML
        xmlEquation.clear()
        xmlEquation.tag = "p"
        xmlEquationImage = etree.Element("img")
        xmlEquationImage.set("src", "images/" + strFilename)
        xmlEquationImage.set("alt", "")
        xmlEquation.append(xmlEquationImage)
        # Parent tag of Equation should be <div> instead of <p>, so that it may be removed
        #xmlEquation.getparent().tag = "div"
# EOAequationarray is not converted properly so far; retagging the element
# to <div> merely makes the wrapper disappear, leaving only its children.
for chapter in xmlChapters:
    for equation_array in chapter.findall(".//EOAequationarray"):
        equation_array.tag = "div"
logging.info(f"{logseparator}Preparing Letterheads")
for chapter in xmlChapters:
    for letterhead in chapter.xpath(".//EOAletterhead"):
        # each metadata field becomes a paragraph whose first child is
        # rendered in italics
        for field_tag in ("Recipient", "Archive", "Additional", "Pages"):
            field = letterhead.find(".//" + field_tag)
            field.tag = "p"
            field.getchildren()[0].tag = "em"
        # frame the letterhead block with horizontal rules
        letterhead.insert(0, etree.Element("hr"))
        letterhead.insert(5, etree.Element("hr"))
logging.info(f"{logseparator}Preparing Transcriptions")
# TODO: May need rework concerning the right Column
for xmlChapter in xmlChapters:
    # Remove <Facsimilelink>
    etree.strip_elements(xmlChapter, "Facsimilelink")
    xmlTranscriptions = xmlChapter.xpath(".//EOAtranscripted")
    for xmlTranscription in xmlTranscriptions:
        logging.info("Processing Transcription")
        #logging.info (etree.tostring(xmlTranscription))
        # a transcription becomes a two-column table: a header row
        # (left/right header) plus one body row whose content is split at
        # the <pagebreak/> marker
        xmlTranscription.tag = "table"
        xmlHeader = xmlTranscription.find(".//EOAtranscriptedheader")
        xmlHeader.tag = "tr"
        xmlLeftHeader = xmlTranscription.find(".//Leftheader")
        xmlLeftHeader.tag = "td"
        xmlLeftHeader.set("style", "width: 50%")
        xmlRightHeader = xmlTranscription.find(".//Rightheader")
        xmlRightHeader.tag = "td"
        xmlTranscriptedtext = xmlTranscription.find(".//EOAtranscriptedtext")
        # change \n\n into </p><p> and pagebreak into </p><pagebreak><p> to create some valid markup
        strTranscriptedtext = etree.tostring(xmlTranscriptedtext, encoding="unicode")
        #strTranscriptedtext = re.sub (r"\n\n\n\n", "</p><p>", str(strTranscriptedtext), re.MULTILINE)
        #strTranscriptedtext = re.sub (r"\n\n\n", "</p><p>", str(strTranscriptedtext), re.MULTILINE)
        #strTranscriptedtext = re.sub (r"\n\n", "</p><p>", str(strTranscriptedtext))
        #strTranscriptedtext = re.sub (r"<pagebreak/>", "</p><pagebreak/><p>", strTranscriptedtext)
        xmlLeftColumn = etree.Element("td")
        xmlRightColumn = etree.Element("td")
        boolRightColumn = False
        # re-parse the serialised text and distribute its children onto the
        # two columns; a <pagebreak> element flips to the right-hand column
        # (the marker itself is dropped)
        xmlTemp = etree.XML(str(strTranscriptedtext))
        for xmlElement in xmlTemp.iterchildren():
            if xmlElement.tag == "pagebreak":
                boolRightColumn = True
                logging.info("Spaltenwechsel!")
                continue
            if boolRightColumn == False:
                xmlLeftColumn.append(xmlElement)
            if boolRightColumn == True:
                xmlRightColumn.append(xmlElement)
        # rebuild <EOAtranscriptedtext> as the table's body row
        xmlTranscriptedtext.clear()
        xmlTranscriptedtext.tag = "tr"
        xmlTranscriptedtext.set("valign", "top")
        xmlTranscriptedtext.append(xmlLeftColumn)
        xmlTranscriptedtext.append(xmlRightColumn)
logging.info(f"{logseparator}Preparing Tables")
# Convert each EOAtable wrapper into plain XHTML table markup: number the
# caption, translate the LaTeX-ish column specification (e.g. "L2cm R3cm")
# into per-cell align/width attributes, and handle header rows, colspans
# and embedded figures.
intChapterNumber = 1
for xmlChapter in xmlChapters:
    xmlTables = xmlChapter.findall(".//EOAtable")
    for xmlTable in xmlTables:
        xmlRawTable = xmlTable.find(".//table")
        xml_table_id = xmlRawTable.get("id")
        strTableCaption = xmlTable.find(".//EOAtablecaption").text or ""
        if strTableCaption != "nonumber":
            # Numbered table: build "<n> <caption>" paragraph after the table.
            intTableNumber = dictTables[xml_table_id]
            xmlTableCaption = etree.Element("p")
            xmlTableCaption.text = str(intTableNumber) + " " + strTableCaption
            # NOTE(review): getchildren() returns a list, never None, so this
            # condition is always true; kept as-is to preserve behaviour.
            if xmlTable.find(".//EOAtablecaption").getchildren() is not None:
                for xmlChild in xmlTable.find(".//EOAtablecaption").iterchildren():
                    xmlTableCaption.append(xmlChild)
            xmlRawTable.addnext(xmlTableCaption)
            xmlTable.find(".//EOAtablelabel").clear()
            xmlTable.remove(xmlTable.find(".//EOAtablelabel"))
        else:
            logging.info("Table has no caption")
            xmlTable.find(".//EOAtablecaption").clear()
            xmlTable.remove(xmlTable.find(".//EOAtablecaption"))
        # Analyze Width and Alignment of the Columns
        strColumnString = xmlTable.find(".//EOAtablecolumns").text
        strColumnString = re.sub(r"\|", "", strColumnString)
        xmlTable.remove(xmlTable.find(".//EOAtablecolumns"))
        # Each match is an alignment letter plus a width, e.g. "L2.5cm".
        reMatchObjects = re.findall(r'([L|R|C].*?[c|m]m)', strColumnString)
        intTableWidth = 0
        # Index 0 is a dummy so that column numbers can start at 1 below.
        listColumnAlignments = [None]
        listColumnWidths = [None]
        intNumberOfColumns = 0
        for strColumnDefinition in reMatchObjects:
            strColumnDefinition = strColumnDefinition.rstrip("cm")
            strColumnAlignment = strColumnDefinition[0]
            if strColumnAlignment == "L":
                strColumnAlignment = "left"
            if strColumnAlignment == "C":
                strColumnAlignment = "center"
            if strColumnAlignment == "R":
                strColumnAlignment = "right"
            listColumnAlignments.append(strColumnAlignment)
            # Convert cm to px with a fixed factor of 75 px/cm.
            intColumnWidth = int(float(strColumnDefinition.lstrip("LRC")) * 75)
            listColumnWidths.append(intColumnWidth)
            intTableWidth += intColumnWidth
            intNumberOfColumns += 1
        xmlRawTable.set("width", str(intTableWidth)+"px;")
        # NOTE(review): these deletions raise KeyError if the attribute is
        # absent (only "place" is guarded) — presumably always present in
        # the intermediate XML; confirm against the generator.
        del xmlRawTable.attrib["rend"]
        del xmlRawTable.attrib["id-text"]
        del xmlRawTable.attrib["id"]
        try:
            del xmlRawTable.attrib["place"]
        except KeyError:
            pass
        # Figure out and deal with the Header
        xmlHeader = xmlRawTable.find(".//row/cell/tableheader")
        if xmlHeader is not None:
            # A <tableheader> marker means the first row is a header row:
            # remove the marker and turn the first row's cells into <th>.
            xmlHeader.text = ""
            xmlHeader.getparent().text = xmlHeader.tail
            xmlHeader.getparent().remove(xmlHeader)
            xmlFirstRow = xmlRawTable.find(".//row")
            xmlFirstRow.tag = "tr"
            xmlFirstRowCells = xmlFirstRow.findall(".//cell")
            for xmlFirstRowCell in xmlFirstRowCells:
                xmlFirstRowCell.tag = "th"
        # Now Deal with the rest of the rows
        xmlTableRows = xmlRawTable.findall(".//row")
        for xmlTableRow in xmlTableRows:
            xmlTableCells = xmlTableRow.findall(".//cell")
            # intCurrentColumn tracks the logical column (1-based) so each
            # cell can pick up the right alignment/width from the lists.
            intCurrentColumn = 1
            logging.info(listColumnAlignments)
            for xmlTableCell in xmlTableCells:
                xmlTableCell.tag = "td"
                xmlTableCell.set("align",listColumnAlignments[intCurrentColumn])
                xmlTableCell.set("style","width: " + str(listColumnWidths[intCurrentColumn]) + "px;")
                # Deal with multicolumn
                if xmlTableCell.get("cols") is not None:
                    xmlTableCell.set("colspan", xmlTableCell.get("cols"))
                if intCurrentColumn > len(xmlTableCells):
                    intCurrentColumn = 1
                # Deal with multicolumn again, increase intCurrentColumn by the columns being spanned
                elif xmlTableCell.get("cols") is not None:
                    intCurrentColumn = intCurrentColumn + int(xmlTableCell.get("cols"))
                    del xmlTableCell.attrib["cols"]
                else:
                    intCurrentColumn += 1
                # deal with multirow
                if xmlTableCell.get("rowspan") is not None:
                    cellchildren = xmlTableCell.getchildren()
                    for child in cellchildren:
                        if child.tag == "figure":
                            # A figure inside a spanning cell becomes an <img>;
                            # copy the file into OEBPS/images (directory part
                            # flattened into the file name) and register it.
                            child.tag = "img"
                            imagepath = f"{child.get('file')}.{child.get('extension')}"
                            logging.debug(f"Source image: {PUBLICATION_DIR} {imagepath}")
                            strImageFileDir = os.path.dirname(imagepath)
                            strImageFileDir = re.sub("/", "", strImageFileDir)
                            strImageFileName = os.path.basename(imagepath)
                            logging.debug(f"Meant to be copied to {OUTPUT_DIR} /images/ {strImageFileDir}{strImageFileName}")
                            shutil.copy(
                                PUBLICATION_DIR / imagepath,
                                OUTPUT_DIR / "OEBPS" / "images" / (strImageFileDir + strImageFileName)
                            )
                            if child.get('extension') == "pdf":
                                # PDFs cannot be displayed in EPUB; sanitizeImage
                                # presumably converts to PNG — confirm in libeoaconvert.
                                strImageFilepath = libeoaconvert.sanitizeImage(
                                    OUTPUT_DIR / "OEBPS" / "images" / (strImageFileDir + strImageFileName),
                                    TEMP_DIR, GM_PATH, PDFCROP_EXEC
                                )
                                strImageFileName = strImageFileName.replace(".pdf", ".png")
                            strImageFileDir = f"images/{strImageFileDir + strImageFileName}"
                            child.set("src", strImageFileDir)
                            strImageFileNamewoSuffix, strImageFileName_Suffix = os.path.splitext(strImageFileName)
                            extension_and_mime = get_mimetype(strImageFileName_Suffix)
                            contentopf = addToContentopf(
                                contentopf,
                                strImageFileDir,
                                strImageFileDir + strImageFileNamewoSuffix + "-nonumber-" + extension_and_mime,
                                extension_and_mime
                            )
                            child.set("width", f"{str(listColumnWidths[intCurrentColumn])}px")
                            del child.attrib["rend"]
                            del child.attrib["file"]
                            del child.attrib["extension"]
            xmlTableRow.tag = "tr"
            xmlTableRow.set("valign", "top")
        # Unwrap the table: move it to be a sibling of its (now empty)
        # wrapper paragraph, then drop the wrapper.
        xmlTableParent = xmlTable.getparent()
        xmlTableParent.addnext(xmlTable)
        xtp = etree.tostring(xmlTableParent)
        xmlTableParent.getparent().remove(xmlTableParent)
        logging.info("Finished with that table.")
    intChapterNumber += 1
logging.info(f"{logseparator}Preparing Facsimiles")
# Replace every EOAfacsimilepage by an <img> element.  The referenced image
# is copied into OEBPS/images/ and registered in content.opf.
xmlParts = xmlEbookTree.findall(".//div0")
for xmlPart in xmlParts:
    xmlFacsimiles = xmlPart.findall(".//EOAfacsimilepage")
    for xmlFacsimile in xmlFacsimiles:
        strImageFile = xmlFacsimile.find(".//file").text
        # NOTE(review): strFacsimileLabel is read but never used below.
        strFacsimileLabel = xmlFacsimile.find(".//label").text
        facsimile_pagenumber = xmlFacsimile.find(".//pagenumber").text or ""
        etree.strip_elements(xmlFacsimile, "file")
        etree.strip_elements(xmlFacsimile, "label")
        # TODO: somehow deal with a (missing) file suffix here, and convert
        # files if necessary.
        strImageFile = strImageFile.rstrip("\n")
        # Flatten the directory part into the file name (slashes removed)
        # so that all images live directly under OEBPS/images/.
        strImageFileDir = os.path.dirname(strImageFile)
        strImageFileDir = re.sub("/", "", strImageFileDir)
        strImageFileName = os.path.basename(strImageFile)
        shutil.copy(
            PUBLICATION_DIR / strImageFile,
            OUTPUT_DIR / "OEBPS/images" / (strImageFileDir + strImageFileName)
        )
        # sanitizeImage post-processes the copied file in place (presumably
        # normalisation/conversion — see libeoaconvert); return value unused.
        strImageFilepath = libeoaconvert.sanitizeImage(
            OUTPUT_DIR / "OEBPS/images" / (strImageFileDir + strImageFileName),
            TEMP_DIR,
            GM_PATH,
            PDFCROP_EXEC
        )
        # Add copied file to contentopf
        img_base_file_name, img_file_extension = os.path.splitext(strImageFileName)
        contentopf = addToContentopf(contentopf, "images/" + strImageFileDir + strImageFileName, strImageFileDir + strImageFileName, img_file_extension[1:])
        # Resulting markup:
        # <img src="images/..." alt="Facsimile page N" style="width: 95%; height: auto;"/>
        facsimile_image_element = etree.Element(htmlns + "img")
        facsimile_image_element.set("src", "images/" + strImageFileDir + strImageFileName)
        facsimile_image_element.set("style", "width: 95%; height: auto;")
        facsimile_image_element.set("alt", "Facsimile page " + facsimile_pagenumber)
        xmlFacsimile.getparent().replace(xmlFacsimile, facsimile_image_element)
logging.info(f"{logseparator}Preparing Cross-References")
# Resolve every outermost EOAref element into plain text (or into an <a>
# link when it points at a Hyperimage target) using the lookup dictionaries
# filled earlier in the run.  Unknown reference types abort the conversion.
# restart chapter counter
intChapterNumber = 1
for xmlChapter in xmlChapters:
    # Only outermost EOAref elements; nested ones (collage sub-references)
    # are consumed while handling their parent.
    xmlReferences = xmlChapter.xpath(".//EOAref[not(parent::EOAref)]")
    for xmlReference in xmlReferences:
        # A 'hitarget' attribute on the inner <ref> marks a Hyperimage target.
        hitarget_id_list = xmlReference.xpath("./ref/@hitarget")
        hitarget_id = hitarget_id_list[0] if len(hitarget_id_list) == 1 else None
        reference_type = xmlReference.get("type")
        if reference_type == "text":
            # Literal reference text: keep it verbatim.
            tmpTail = xmlReference.tail or ""
            strResult = xmlReference.text
        elif reference_type == "collage":
            tmpTail = xmlReference.tail or ""
            logging.debug("Found reference to a Hyperimage collage.")
            # A collage reference bundles several numeric figure references;
            # concatenate their resolved figure numbers and tails.
            subreferences = xmlReference.xpath("./EOAref[@type='number']")
            strResult = ""
            for subref in subreferences:
                subref_tail = subref.tail or ""
                subref_target = subref.xpath("./ref/@target")[0]
                target_string = dictFigures[subref_target]
                strResult += f"{target_string}{subref_tail}"
        elif reference_type == "number":
            # Placeholder that survives into the output when no dictionary
            # knows the target — makes broken references visible to editors.
            strResult = "!!! Cross Reference !!!"
            xmlReferenceLabel = xmlReference.find("Label")
            xmlReferenceLabelText = xmlReferenceLabel.text
            xmlReferenceRef = xmlReference.find("ref")
            xmlReferenceRefTarget = xmlReferenceRef.get("target")
            # The cascade order is deliberate: when a target occurs in
            # several dictionaries, the last matching lookup wins.
            if xmlReferenceLabelText in dictEquations:
                logging.info("Found link to array:" + xmlReferenceLabelText)
                strResult = dictEquations[xmlReferenceLabelText]
            if xmlReferenceRefTarget in dictEquations:
                logging.info("Found link to equation:" + xmlReferenceRefTarget)
                strResult = dictEquations[xmlReferenceRefTarget]
            if xmlReferenceRefTarget in dictLists:
                logging.info("Found link to list")
                strResult = dictLists[xmlReferenceRefTarget]
            if xmlReferenceRefTarget in dictChapters:
                logging.info("Found link to chapter")
                strResult = dictChapters[xmlReferenceRefTarget]
            if xmlReferenceRefTarget in dictSections:
                logging.info("Found link to section")
                strResult = dictSections[xmlReferenceRefTarget]
            if xmlReferenceRefTarget in dictFigures:
                logging.info("Found link to figure")
                strResult = dictFigures[xmlReferenceRefTarget]
            if xmlReferenceRefTarget in dictFootnotes:
                logging.info("Found link to footnote")
                strResult = dictFootnotes[xmlReferenceRefTarget]
            if xmlReferenceRefTarget in dictTheorems:
                logging.info("Found link to theorem")
                strResult = dictTheorems[xmlReferenceRefTarget]
            if xmlReferenceRefTarget in dictTables:
                logging.info("Found link to table")
                strResult = dictTables[xmlReferenceRefTarget]
            tmpTail = xmlReference.tail or ""
        else:
            logging.error("Found unknown reference type: %s. Exiting", reference_type)
            # BUGFIX: exit with a non-zero status — the original sys.exit(0)
            # reported success to the calling process despite the error.
            sys.exit(1)
        xmlReference.clear()
        if args.hyperimage and hitarget_id and reference_type in ["collage", "number"]:
            # Turn the reference into a link into the Hyperimage web
            # presentation of this chapter.
            hyperimage_link = f"{publication_landingpage[:-11]}/{intChapterNumber}/index.html#{hitarget_id}"
            xmlReference.tag = "a"
            xmlReference.set("href", hyperimage_link)
        # ROBUSTNESS: some dictionaries store non-string values (e.g.
        # dictTables holds numbers); lxml rejects non-string .text
        # assignments, so coerce — but leave None untouched.
        if strResult is not None and not isinstance(strResult, str):
            strResult = str(strResult)
        xmlReference.text = strResult
        xmlReference.tail = tmpTail
    intChapterNumber += 1
# Substitute Page-References with their targets
for xmlChapter in xmlChapters:
    xmlReferences = xmlChapter.findall(".//EOApageref")
    for xmlReference in xmlReferences:
        # Placeholder stays visible in the output when the label is unknown.
        strResult = "!!! Page Reference !!!"
        xmlReferenceLabel = xmlReference.find("Label")
        xmlReferenceLabelText = xmlReferenceLabel.text
        logging.info(xmlReferenceLabelText)
        # NOTE: the original also read ./ref/@target here, but the value was
        # never used and the lookup crashed when <ref> was missing — removed
        # as dead code.
        if xmlReferenceLabelText in dictPagelabels:
            logging.info("Found link to page: " + xmlReferenceLabelText)
            strResult = dictPagelabels[xmlReferenceLabelText]
        # Replace the element's content with the resolved page label while
        # keeping its tail text in place.
        tmpTail = xmlReference.tail or ""
        xmlReference.clear()
        xmlReference.text = strResult
        xmlReference.tail = tmpTail
# Correcting References to Publications
# NOTE: This may be reworked in the future to enable popups in the ebook.
# NOTE: For the time being, the span is going to be removed (EOAcitation is
# stripped from the tree in the cleanup step further down).
for xmlChapter in xmlChapters:
    # Only spans marked rel="popover" are citation references.
    for popover_span in xmlChapter.findall(".//span[@rel='popover']"):
        popover_span.tag = "EOAcitation"
##############################################################
# Finish ePub Conversion, save File                          #
##############################################################
logging.info(f"{logseparator}Cleaning up XML")
# Empty every index marker but keep its tail text so the surrounding
# sentence stays intact.
for index_marker in xmlEbookTree.xpath(".//EOAindex | .//EOAindexperson | .//EOAindexlocation"):
    kept_tail = index_marker.tail or ""
    index_marker.clear()
    index_marker.tail = kept_tail
# Unwrap purely structural/intermediate tags: their children and text are
# merged into the parent element.
tags_to_strip = (
    "EOAlabel", "EOAindex", "EOApageref", "EOAcitenumeric", "EOAtable",
    "EOAref", "note", "div", "div2", "div3", "div4", "div5", "citetext",
    "newpage", "EOAciteyear", "EOAtablelabel", "hi", "pagebreak", "page",
    "pagestyle", "EOAcitation", "EOAciteauthoryear", "EOAcitemanual",
    "EOAprintbibliography", "EOAindexperson", "EOAprintindex",
    "EOAindexlocation", "EOAprintpersonindex", "EOAprintlocationindex",
    "anchor", "temp", "EOAletterhead", "EOAhifigure", "EOAtocentry",
    "tagtobestripped",
)
etree.strip_tags(xmlEbookTree, *tags_to_strip)
# Drop intermediate-format attributes everywhere ("id" used to be in this
# list as well).
attributes_to_strip = ("id-text", "noindent", "type", "label", "spacebefore", "rend", "hielement")
etree.strip_attributes(xmlEbookTree, *attributes_to_strip)
# Remove these elements entirely, including their content (tails dropped).
etree.strip_elements(xmlEbookTree, "citekey", "originalcontents", "elementtoberemoved", with_tail=False)
logging.info("Write every Part and Chapter into one file")
# Each div1 chapter becomes its own chapterN.xhtml file; a div0 part that
# contains chapters additionally gets a heading-only file of its own, the
# first time one of its chapters is encountered.  Every file is registered
# in both toc.ncx and content.opf.
xmlChapters = xmlEbookTree.findall("//div1")
listParts = []                    # ids of div0 parts already written
intTechnicalChapterNumber = 1     # running number for chapterN.xhtml
for xmlChapter in xmlChapters:
    # Load a fresh XHTML template for this output file.
    htmlChapter = etree.parse(str(EPUB_FILES / "epubchapter.xml"), xmlChapterParser)
    # Find out if the chapter sits inside a part; if that part has not been
    # emitted yet, write a file containing only the part heading first.
    xmlChapterParent = xmlChapter.getparent()
    if xmlChapterParent.tag == "div0" and xmlChapterParent.get("id") not in listParts:
        listParts.append(xmlChapterParent.get("id"))
        strPartTitle = xmlChapterParent.find(".//head").text
        htmlChapter.find(".//" + htmlns + "title").text = strPartTitle
        xmlNew = etree.Element('h1')
        xmlNew.text = strPartTitle
        htmlChapter.find(".//" + htmlns + "body").append(xmlNew)
        # Save the part file.  FIX: write with an explicit UTF-8 encoding in
        # a context manager (the original used open() without encoding,
        # i.e. the platform default, and never closed on error).
        tmpFileName = OUTPUT_DIR / ("OEBPS/chapter" + str(intTechnicalChapterNumber) + ".xhtml")
        with open(tmpFileName, "w", encoding="utf-8") as tmpFile:
            tmpFile.write(etree.tostring(htmlChapter, pretty_print=True, encoding="unicode"))
        # Register the part file in the navigation and the manifest.
        tocncx = addToTocncx(tocncx, htmlChapter.find(".//" + htmlns + "title").text, intTechnicalChapterNumber)
        contentopf = addToContentopf(contentopf, "chapter" + str(intTechnicalChapterNumber) + ".xhtml", "chapter" + str(intTechnicalChapterNumber), "xml")
        intTechnicalChapterNumber += 1
        # Start over with a clean template for the chapter itself.
        htmlChapter = etree.parse(str(EPUB_FILES / "epubchapter.xml"), xmlChapterParser)
    # Move all children of the div1 into the template body.  deepcopy is
    # required because a plain append() would detach them from the source tree.
    for xmlChild in xmlChapter.getchildren():
        htmlChapter.find(".//" + htmlns + "body").append(deepcopy(xmlChild))
    # Save the chapter file (same UTF-8 fix as above).
    tmpFileName = OUTPUT_DIR / ("OEBPS/chapter" + str(intTechnicalChapterNumber) + ".xhtml")
    with open(tmpFileName, "w", encoding="utf-8") as tmpFile:
        tmpFile.write(etree.tostring(htmlChapter, pretty_print=True, encoding="unicode"))
    # Register the chapter in the navigation and the manifest.
    tocncx = addToTocncx(tocncx, xmlChapter.find(".//h1").text, intTechnicalChapterNumber)
    contentopf = addToContentopf(contentopf, "chapter" + str(intTechnicalChapterNumber) + ".xhtml", "chapter" + str(intTechnicalChapterNumber), "xml")
    intTechnicalChapterNumber += 1
logging.info("Convert Facsimile-Parts")
# A div0 part that contains an EOAfacsimilepart marker is written as its own
# chapterN.xhtml file and registered in toc.ncx and content.opf.
xmlParts = xmlEbookTree.findall("//div0")
for xmlPart in xmlParts:
    logging.info(f"{logseparator}Working on Facsimile-Part")
    # Only parts that actually carry an EOAfacsimilepart child are converted.
    if bool(xmlPart.findall(".//EOAfacsimilepart")):
        htmlChapter = etree.parse(str(EPUB_FILES / "epubchapter.xml"), xmlChapterParser)
        # Change EOAfacsimilepart into the part's h1 headline.
        xmlHeadline = xmlPart.find(".//EOAfacsimilepart")
        xmlHeadline.tag = "h1"
        etree.strip_elements(xmlPart, "head")
        # Move all children of the div0 into the template body; deepcopy is
        # required because a plain append() would detach them from the tree.
        for xmlChild in xmlPart.getchildren():
            htmlChapter.find(".//" + htmlns + "body").append(deepcopy(xmlChild))
        # Save the chapter.  FIX: the original wrote the identical file twice
        # in a row — collapsed into a single write, now with explicit UTF-8
        # encoding in a context manager.
        tmpFileName = OUTPUT_DIR / ("OEBPS/chapter" + str(intTechnicalChapterNumber) + ".xhtml")
        with open(tmpFileName, "w", encoding="utf-8") as tmpFile:
            tmpFile.write(etree.tostring(htmlChapter, pretty_print=True, encoding="unicode"))
        # Add to TocNCX.  BUGFIX: the original read the title via the stale
        # loop variable xmlChapter (left over from the previous loop) with
        # the odd path "..//h1", so every facsimile part got the last
        # chapter's headline; use this part's own h1 instead.
        tocncx = addToTocncx(tocncx, xmlPart.find(".//h1").text, intTechnicalChapterNumber)
        contentopf = addToContentopf(contentopf, "chapter" + str(intTechnicalChapterNumber) + ".xhtml", "chapter" + str(intTechnicalChapterNumber), "xml")
        intTechnicalChapterNumber += 1
# Persist the navigation file.
tocncx_filename = OUTPUT_DIR / "OEBPS/toc.ncx"
logging.info("Saving toc.ncx")
tocncx.write(str(tocncx_filename), pretty_print=True, xml_declaration=True, encoding="utf-8")
# Persist the package manifest.
contentopf_filename = OUTPUT_DIR / "OEBPS/content.opf"
logging.info("Saving content.opf")
contentopf.write(str(contentopf_filename), pretty_print=True, xml_declaration=True, encoding="utf-8")
############################################################################
#                         Finishing various Stuff                          #
############################################################################
# Dump the intermediate tree for debugging purposes.
devel_ebook_file = TEMP_DIR / "Devel_ebook.xml"
logging.info(f"Write Temporary XML-Tree to {devel_ebook_file}.")
xmlEbookTree.write(str(devel_ebook_file), pretty_print=True, xml_declaration=True, encoding="utf-8")
# Unless suppressed on the command line, zip everything into the .epub.
if not args.no_epub:
    create_epub_container(f"{publication_series}{publication_number}", OUTPUT_DIR)
logging.info("Finished!")
# finis