Permalink
Switch branches/tags
Nothing to show
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
executable file 1512 lines (1342 sloc) 65.9 KB
#!/usr/bin/env python3
# -*- coding: utf-8; mode: python -*-
# Time-stamp: <2018-08-22 15:49:01 (kthoden)>
""" Convert a customized DocBook XML file into a set of files that
constitute the contents of an EPUB file.
Input file is a customized DocBook XML that has been generated either
with eoatex2imxml or tei2imxml.
"""
import os
import sys
import argparse
import re
import shutil
import configparser
import pickle
import shlex
import subprocess
import logging
from copy import deepcopy
from lxml import etree
import libeoaconvert
#####################
# Parsing arguments #
#####################
# Command-line interface: optional configuration file, font family and a
# switch to suppress figure captions.
parser = argparse.ArgumentParser()
parser.add_argument("-c", "--config", dest="CONFIG_FILE", help="Name of configuration file", metavar="CONFIGURATION")
parser.add_argument("-f", "--font", help="Font to be used, default is TeX Gyre Termes", default="termes")
parser.add_argument("-nc", "--nocaption", help="No captions for figures.", action="store_true")
args = parser.parse_args()

if args.CONFIG_FILE is not None:
    CONFIG_FILE = os.path.abspath(args.CONFIG_FILE)
else:
    # Default: config/eoaconvert.cfg next to this script. The script path is
    # made absolute first, because os.path.dirname(sys.argv[0]) is "" when
    # the script is started as "python3 script.py", which would turn the
    # default into "/config/eoaconvert.cfg" at the filesystem root.
    CONFIG_FILE = os.path.join(
        os.path.dirname(os.path.abspath(sys.argv[0])),
        "config",
        "eoaconvert.cfg",
    )
print("The config file is ", CONFIG_FILE)
##################################
# Reading the configuration file #
##################################
# Executable locations are read from the [Executables] section of the
# configuration file.
CONFIG = configparser.ConfigParser()
CONFIG.read(CONFIG_FILE)

########################
# Paths to executables #
########################
# Directory holding the EPUB template files shipped next to this script.
# The script path is made absolute first: os.path.dirname(sys.argv[0]) is ""
# when the script is started without a directory component, which would
# otherwise yield "/data/epub_files/" at the filesystem root.
# The trailing slash is kept because the path is used by plain concatenation.
EPUB_FILES = os.path.join(os.path.dirname(os.path.abspath(sys.argv[0])), "data", "epub_files") + "/"
GM_PATH = CONFIG['Executables']['graphicsmagic']
TL_PATH = CONFIG['Executables']['texlive']
print(GM_PATH)

# Caption prefix for figures, keyed by the publication language.
lang_dict = {"fig" : {"en" : "Fig.", "de" : "Abb."}}
tmpDir = os.getcwd() + "/tmp_files/"
# NOTE(review): this log file is opened here and not closed anywhere in the
# visible part of the script - presumably it is written to further down.
Datei = open('tmp_files/intermediate.log', 'w')
xmlTree = etree.parse("tmp_files/IntermediateXMLFile.xml")

# NOTE(review): pickle.load executes arbitrary code from the file; this is
# only acceptable because data.pickle is expected to come from the preceding
# conversion step of the same tool chain, never from an untrusted source.
with open('tmp_files/data.pickle', 'rb') as f:
    data = pickle.load(f)

# Lookup tables restored from the pickle, one per kind of numbered object.
dictSections = data["secdict"]
dictEquations = data["eqdict"]
dictLists = data["listdict"]
dictChapters = data["chapterdict"]
dictFigures = data["figdict"]
dictFootnotes = data["fndict"]
dictTheorems = data["theoremdict"]
dictTables = data["tabdict"]
dictPagelabels = data["pagelabeldict"]
def remove_processinginstruction(xml_tree, pi_name):
    """Remove all processing instructions named *pi_name* from *xml_tree*.

    Only instructions whose target equals *pi_name* are removed. Text that
    follows a removed instruction (its tail) is preserved by attaching it to
    the preceding sibling or, if there is none, to the parent's text.
    Instructions that sit outside the root element have no parent and are
    left in place.

    The tree is modified in place and also returned.
    """
    proc_insts = xml_tree.xpath("//processing-instruction('{}')".format(pi_name))

    for instruction in proc_insts:
        parent = instruction.getparent()
        if parent is None:
            # Top-level instruction next to the root element: nothing to
            # detach it from.
            continue

        # remove() drops the node together with its tail, so the tail has to
        # be re-attached to whatever precedes the instruction.
        trailing_text = instruction.tail
        if trailing_text:
            previous = instruction.getprevious()
            if previous is not None:
                previous.tail = (previous.tail or "") + trailing_text
            else:
                parent.text = (parent.text or "") + trailing_text
        parent.remove(instruction)

    logging.debug("Removed %s processing instructions of type %s.", len(proc_insts), pi_name)
    return xml_tree
# def remove_processinginstruction ends here
def addToContentopf(contentopf, Filename, FileID, Mediatype):
    """Register a file in the manifest (and spine) of content.opf (epub).

    *FileID* is sanitised once before use: every "_", "." and "/" is
    removed, then all leading digits are stripped, because the id may not
    start with a number. The sanitised id is the one that is compared
    against, and recorded in, the module-level list ``listContentopf``, so a
    file is added at most once.

    *Mediatype* is one of "txt", "otf", "xml", "jpg" or "png"; any other
    value raises KeyError. Files of type "xml" are additionally appended to
    the <spine>.

    Returns *contentopf*, which is modified in place.
    """
    FileID = re.sub(r"[_./]", "", FileID)
    FileID = re.sub(r"^[0-9]+", "", FileID)

    if FileID in listContentopf:
        # Something with this id is already part of the manifest.
        return contentopf

    dictMediatypes = {
        "txt" : "text/plain",
        "otf" : "application/vnd.ms-opentype",
        "xml" : "application/xhtml+xml",
        "jpg" : "image/jpeg",
        "png" : "image/png"
    }
    contentopfns = "{http://www.idpf.org/2007/opf}"

    xmlManifest = contentopf.find(".//" + contentopfns + "manifest")
    xmlItem = etree.Element("item")
    xmlItem.set("id", FileID)
    xmlItem.set("media-type", dictMediatypes[Mediatype])
    xmlItem.set("href", Filename)
    xmlManifest.append(xmlItem)

    # An XML file is a content document, so it also extends the <spine>.
    if Mediatype == "xml":
        xmlSpine = contentopf.find(".//" + contentopfns + "spine")
        xmlItemref = etree.Element("itemref")
        xmlItemref.set("idref", FileID)
        xmlSpine.append(xmlItemref)

    listContentopf.append(FileID)
    return contentopf
# def addToContentopf ends here
def addToTocncx(tocncx, Label, intTechnicalChapterNumber):
    """Append a navPoint for one chapter to the navMap of toc.ncx (epub).

    The navPoint gets the id "chapter<N>", a playOrder of N + 1, a label
    holding *Label* and a content link to "chapter<N>.xhtml", where N is
    *intTechnicalChapterNumber*. Returns *tocncx*, modified in place.
    """
    nav_map = tocncx.find(".//" + "{http://www.daisy.org/z3986/2005/ncx/}" + "navMap")
    chapter_slug = "chapter" + str(intTechnicalChapterNumber)

    nav_point = etree.Element("navPoint")
    nav_point.set("playOrder", str(intTechnicalChapterNumber + 1))
    nav_point.set("id", chapter_slug)

    # <navLabel><text>Label</text></navLabel>
    label_text = etree.SubElement(etree.SubElement(nav_point, "navLabel"), "text")
    label_text.text = Label

    # <content src="chapterN.xhtml"/>
    etree.SubElement(nav_point, "content").set("src", chapter_slug + ".xhtml")

    nav_map.append(nav_point)
    return tocncx
# def addToTocncx ends here
##############################################################
# Create .epub basic structure #
##############################################################
# Create folder structure for ebook.
# makedirs with exist_ok also completes a partially existing tree, which a
# single "does CONVERT/epub exist?" check followed by os.mkdir cannot do.
for strEpubSubdir in ("META-INF", "OEBPS/images", "OEBPS/fonts"):
    os.makedirs(os.getcwd() + "/CONVERT/epub/" + strEpubSubdir, exist_ok=True)

# Copy container.xml and mimetype
shutil.copy(EPUB_FILES + "epubcontainer.xml", os.getcwd() + "/CONVERT/epub/META-INF/container.xml")
shutil.copy(EPUB_FILES + "epubmimetype", os.getcwd() + "/CONVERT/epub/mimetype")

# Preparing content.opf
xmlContentopfParser = etree.XMLParser(no_network=False,load_dtd=False)
contentopf = etree.parse(EPUB_FILES + "epubcontentopf.xml", xmlContentopfParser)

# This list includes all files which have already been included to avoid duplicates
listContentopf = []

#########
# Fonts #
#########
libertine_fonts = ["GPL.txt", "LICENCE.txt", "LinLibertine_R.otf", "LinLibertine_RI.otf", "LinLibertine_RZ.otf", "LinLibertine_RZI.otf", "OFL-1.1.txt"]
termes_fonts = ["texgyretermes-bold.otf", "texgyretermes-bolditalic.otf", "texgyretermes-italic.otf", "texgyretermes-regular.otf"]

if args.font == "libertine":
    font_files = libertine_fonts
    strStylesheet = "eoa-epub-libertine.css"
else:
    if args.font != "termes":
        logging.info("Font not recognized, falling back to default.")
    # The fallback has to select the font list as well as the stylesheet;
    # otherwise the loop below fails with a NameError on font_files.
    font_files = termes_fonts
    strStylesheet = "eoa-epub-termes.css"
shutil.copy(EPUB_FILES + strStylesheet, os.getcwd() + "/CONVERT/epub/OEBPS/eoa-epub.css")

# Copy every font (and licence text) and register it in content.opf. The
# ids are numbered separately per file type.
otf_id_counter = 1
txt_id_counter = 1
for fontfile in font_files:
    shutil.copy(EPUB_FILES + fontfile, os.getcwd() + "/CONVERT/epub/OEBPS/fonts/")
    base_file_name, file_extension = os.path.splitext(fontfile)
    if file_extension == ".otf":
        contentopf = addToContentopf(contentopf, "fonts/" + fontfile, "otf-font" + str(otf_id_counter), file_extension[1:])
        otf_id_counter += 1
    elif file_extension == ".txt":
        contentopf = addToContentopf(contentopf, "fonts/" + fontfile, "font-txt" + str(txt_id_counter), file_extension[1:])
        txt_id_counter += 1
    else:
        print("Other file found. Exiting")
        # Non-zero status so that callers can tell the run was aborted.
        sys.exit(1)
# Shortcut for namespace
htmlns = "{http://www.w3.org/1999/xhtml}"
# Load Template for Chapter HTML
xmlChapterParser = etree.XMLParser(no_network=False,load_dtd=False) #resolve_entities=False
# Preparing toc.ncx
xmlTocncxParser = etree.XMLParser(no_network=False,load_dtd=False)
tocncx = etree.parse(EPUB_FILES + "epubtocncx.xml", xmlTocncxParser)

print("-----------------------------------------------------")
print("Preparing content.opf")
xmlMetadata = contentopf.find(".//{http://www.idpf.org/2007/opf}metadata")

# Prepare Metadata based on Publication.cfg
cfgPublication = configparser.RawConfigParser()
cfgPublication.read(os.getcwd() + "/CONVERT/publication.cfg")

# Prepare Author String: "A", "A and B", "A, B and C" or "A, B, C and D".
# All four options are read (a missing option still raises NoOptionError);
# empty slots are skipped, so a gap between two authors no longer leaves a
# dangling separator in the result.
listAuthors = [cfgPublication.get("Authors", "Author%d" % intAuthor) for intAuthor in range(1, 5)]
listAuthors = [strAuthor for strAuthor in listAuthors if strAuthor != ""]
if len(listAuthors) > 1:
    strAuthorString = ", ".join(listAuthors[:-1]) + " and " + listAuthors[-1]
else:
    strAuthorString = "".join(listAuthors)
xmlAuthor = etree.Element("{http://purl.org/dc/elements/1.1/}creator")
xmlAuthor.text = strAuthorString
xmlMetadata.append(xmlAuthor)

# Prepare Title-String
strTitleString = cfgPublication.get("Technical", "Title")
xmlTitle = etree.Element("{http://purl.org/dc/elements/1.1/}title")
xmlTitle.text = strTitleString
xmlMetadata.append(xmlTitle)

# Prepare Description via Subtitle
strSubtitleString = cfgPublication.get("Technical", "Subtitle")
if strSubtitleString != "":
    xmlSubtitle = etree.Element("{http://purl.org/dc/elements/1.1/}description")
    xmlSubtitle.text = strSubtitleString
    xmlMetadata.append(xmlSubtitle)

# Prepare Identifier
strIdentifier = "MPIWG:" + cfgPublication.get("Technical", "Serie") + cfgPublication.get("Technical", "Number")
xmlIdentifier = etree.Element("{http://purl.org/dc/elements/1.1/}identifier")
xmlIdentifier.text = strIdentifier
xmlIdentifier.set("id", "BookId")
xmlMetadata.append(xmlIdentifier)

# Prepare Type
xmlType = etree.Element("{http://purl.org/dc/elements/1.1/}type")
xmlType.text = "Text"
xmlMetadata.append(xmlType)

# Prepare Date
strPublicationDate = cfgPublication.get("Technical", "PublicationDate")
xmlDate = etree.Element("{http://purl.org/dc/elements/1.1/}date")
xmlDate.text = strPublicationDate
xmlDate.set("{http://www.idpf.org/2007/opf}event", "creation")
xmlMetadata.append(xmlDate)

# Prepare Publisher
xmlPublisher = etree.Element("{http://purl.org/dc/elements/1.1/}publisher")
xmlPublisher.text = "Edition Open Access"
xmlMetadata.append(xmlPublisher)

# Prepare Rights (the variable name is reused from the publisher entry)
xmlPublisher = etree.Element("{http://purl.org/dc/elements/1.1/}rights")
xmlPublisher.text = "Published under Creative Commons by-nc-sa 3.0 Germany Licence"
xmlMetadata.append(xmlPublisher)

# Prepare Source
xmlSource = etree.Element("{http://purl.org/dc/elements/1.1/}source")
xmlSource.text = "Max Planck Research Library for the History and Development of Knowledge"
xmlMetadata.append(xmlSource)

# Prepare Subject
strSubject = cfgPublication.get("General", "Keyword1")
xmlSubject = etree.Element("{http://purl.org/dc/elements/1.1/}subject")
xmlSubject.text = strSubject
xmlMetadata.append(xmlSubject)

# Prepare Language
strLanguage = cfgPublication.get("Technical", "Language")
xmlLanguage = etree.Element("{http://purl.org/dc/elements/1.1/}language")
xmlLanguage.text = strLanguage
xmlMetadata.append(xmlLanguage)

# Prepare Cover: a <meta name="cover"> entry pointing at the manifest item
# "cover_pic", plus manifest items for the image and the cover page.
xmlCover = etree.Element("meta")
xmlCover.set("content", "cover_pic")
xmlCover.set("name", "cover")
xmlMetadata.append(xmlCover)

xmlManifest = contentopf.find(".//{http://www.idpf.org/2007/opf}manifest")
xmlItem = etree.Element("item")
xmlItem.set("id", "cover_pic")
xmlItem.set("href", "images/cover.jpg")
xmlItem.set("media-type", "image/jpeg")
xmlManifest.append(xmlItem)
shutil.copy(os.getcwd() + "/CONVERT/cover.jpg", os.getcwd() + "/CONVERT/epub/OEBPS/images/")

xmlItem = etree.Element("item")
xmlItem.set("id", "cover")
xmlItem.set("href", "cover.xhtml")
xmlItem.set("media-type", "application/xhtml+xml")
xmlManifest.append(xmlItem)
shutil.copy(EPUB_FILES + "epubcover.xhtml", os.getcwd() + "/CONVERT/epub/OEBPS/cover.xhtml")
print("-------------------")
print("Preparing intro.xhtml")
print("-------------------")
# The "Sources" series has its own intro template.
if cfgPublication.get("Technical", "Serie") == "Sources":
    tmpFilePath = EPUB_FILES + "epubintro-sources.xhtml"
else:
    tmpFilePath = EPUB_FILES + "epubintro.xhtml"

# EPUB content documents are UTF-8, so the encoding is stated explicitly
# instead of depending on the locale.
with open(tmpFilePath, "r", encoding="utf-8") as tmpFile:
    strIntroHTML = tmpFile.read()

# Fill in the placeholders, in this order. Plain string replacement is used
# because the placeholders are literal words and the inserted values must be
# taken verbatim (re.sub would interpret backslashes in a title or name).
strIntroHTML = strIntroHTML.replace("author", strAuthorString)
strIntroHTML = strIntroHTML.replace("TITLE", strTitleString)
strIntroHTML = strIntroHTML.replace("year", cfgPublication.get("Technical", "PublicationYear"))
strIntroHTML = strIntroHTML.replace("series", cfgPublication.get("Technical", "Serie"))
strIntroHTML = strIntroHTML.replace("number", cfgPublication.get("Technical", "Number"))

# AdditionalInformation is optional; without it the placeholder is removed.
try:
    strAdditionalInformation = "<p>" + cfgPublication.get("General", "AdditionalInformation") + "</p>"
except configparser.NoOptionError:
    strAdditionalInformation = ""
strIntroHTML = strIntroHTML.replace("AdditionalInformation", strAdditionalInformation)

# Write the result; the context manager makes sure the file is flushed and
# closed.
tmpFilePath = os.getcwd() + "/CONVERT/epub/OEBPS/intro.xhtml"
with open(tmpFilePath, "w", encoding="utf-8") as tmpFile:
    tmpFile.write(strIntroHTML)
print("-------------------")
print("Preparing toc.ncx")
print("-------------------")
# The searches use the relative ".//" form, like the other lookups in this
# script; an absolute "//" path on a tree is only accepted for backwards
# compatibility.

# <head>: unique identifier of the book
xmlHead = tocncx.find(".//{http://www.daisy.org/z3986/2005/ncx/}head")
xmlMeta = etree.Element("meta")
xmlMeta.set("name", "dtb:uid")
xmlMeta.set("content", "MPIWG:" + cfgPublication.get("Technical", "Serie") + cfgPublication.get("Technical", "Number"))
xmlHead.append(xmlMeta)

# <docTitle>: title of the publication
xmlTitle = tocncx.find(".//{http://www.daisy.org/z3986/2005/ncx/}docTitle")
xmlText = etree.Element("text")
xmlText.text = strTitleString
xmlTitle.append(xmlText)

# <docAuthor>: author string
xmlAuthor = tocncx.find(".//{http://www.daisy.org/z3986/2005/ncx/}docAuthor")
xmlText = etree.Element("text")
xmlText.text = strAuthorString
xmlAuthor.append(xmlText)
##############################################################
# Convert Tralics-XML to Epub #
##############################################################
#xmlTree = remove_processinginstruction(xmlTree, 'hyperimage')
# Copy xmlTree to xmlEbookTree, so that the intermediate tree stays untouched
xmlEbookTree = deepcopy(xmlTree)
# xmlChapters is a list containing all chapters
xmlChapters = xmlEbookTree.findall(".//div1")

# Convert Chapters, Sections, Subsections and Subsubsections to h1, h2, h3, h4
# Insert Number from Dictionary where needed
print("-----------------------------------------------------")
print("Convert EOAChapter to H1")
for xmlChapter in xmlChapters:
    xmlChapter.find("head").tag = "h1"
    if xmlChapter.get("rend") != "nonumber":
        idChapter = xmlChapter.get("id")
        strHeadline = xmlChapter.find("h1").text or ""
        xmlChapter.find("h1").text = str(dictChapters[idChapter]) + ". " + strHeadline
    if xmlChapter.find(".//EOAauthor") is not None:
        # Repeat the chapter author in italics directly below the headline
        tmpXML = etree.Element("p")
        tmpXML.append(etree.Element("i"))
        tmpXML[0].text = xmlChapter.find(".//EOAauthor").text
        xmlChapter.insert(1, tmpXML)
        # Remove unwanted EOAauthor here
        xmlChapter.find(".//EOAauthor").text = ""
    # strip_tags works in place and returns None, so its result is not
    # assigned to anything.
    etree.strip_tags(xmlChapter, "EOAauthor")

print("-----------------------------------------------------")
print("Convert EOAsection to H2")
xmlSections = xmlEbookTree.findall(".//div2")
for xmlSection in xmlSections:
    xmlSection.find("head").tag = "h2"
    if xmlSection.get("rend") != "nonumber":
        idSection = xmlSection.get("id")
        strHeadline = xmlSection.find("h2").text or ""
        print(strHeadline)
        xmlSection.find("h2").text = str(dictSections[idSection]) + " " + strHeadline

print("-----------------------------------------------------")
print("Convert EOAsubsection to H3")
xmlSubsections = xmlEbookTree.findall(".//div3")
for xmlSubsection in xmlSubsections:
    xmlSubsection.find("head").tag = "h3"
    if xmlSubsection.get("rend") != "nonumber":
        idSection = xmlSubsection.get("id")
        strHeadline = xmlSubsection.find("h3").text or ""
        print(strHeadline)
        xmlSubsection.find("h3").text = str(dictSections[idSection]) + " " + strHeadline

# Subsubsections and paragraphs are only retagged, never numbered
print("-----------------------------------------------------")
print("Convert EOAsubsubsection to H4")
xmlSubsubsections = xmlEbookTree.findall(".//div4")
for xmlSubsubsection in xmlSubsubsections:
    xmlSubsubsection.find("head").tag = "h4"

print("-----------------------------------------------------")
print("Convert EOAparagraph to H5")
xmlParagraphs = xmlEbookTree.findall(".//div5")
for xmlParagraph in xmlParagraphs:
    print("Found a paragraph.")
    xmlParagraph.find("head").tag = "h5"
print("Preparing Figures")
xmlFigures = xmlEbookTree.xpath(".//EOAfigure[not(@type='hionly')] | .//EOAlsfigure[not(@type='hionly')]")
for xmlFigure in xmlFigures:
    # Copy File of the Image
    # If it's in a subfolder, name of folder and name of image will be merged
    strImageFileString = xmlFigure.find(".//file").text
    strImageFileString = strImageFileString.rstrip("\n")
    strImageFileDir = os.path.dirname(strImageFileString)
    # Remove / from path
    strImageFileDir = re.sub("/", "", strImageFileDir)
    strImageFileName = os.path.basename(strImageFileString)
    strImageFileNamewoSuffix, strImageFileName_Suffix = os.path.splitext(strImageFileName)
    shutil.copy(os.getcwd() + "/" + strImageFileString, os.getcwd() + "/CONVERT/epub/OEBPS/images/" + strImageFileDir + strImageFileName)

    # The suffix decides both the extension referenced in the EPUB and the
    # media type; PDF sources are referenced as PNG.
    if strImageFileName_Suffix.lower() == ".jpg":
        extension_and_mime = "jpg"
    elif strImageFileName_Suffix.lower() in [".png", ".pdf"]:
        extension_and_mime = "png"
    else:
        print("Found an unrecognized image suffix: %s" % strImageFileName_Suffix)
        # Non-zero status so that callers can tell the run was aborted.
        sys.exit(1)

    strImageFilepath = libeoaconvert.sanitizeImage(os.getcwd() + "/CONVERT/epub/OEBPS/images/" + strImageFileDir + strImageFileName, GM_PATH, TL_PATH)

    # Add copied file to contentopf. The href always uses "/", independent
    # of the separator of the operating system.
    content_opf_filename = "images/" + "{}{}.{}".format(strImageFileDir, strImageFileNamewoSuffix, extension_and_mime)
    content_opf_fileid = "{}{}{}".format(strImageFileDir, strImageFileNamewoSuffix, extension_and_mime)
    contentopf = addToContentopf(contentopf, content_opf_filename, content_opf_fileid, extension_and_mime)

    idFigure = xmlFigure.find(".//anchor").get("id")
    intFigureNumber = dictFigures[idFigure]

    # Landscape figures always use the full width
    if xmlFigure.tag == "EOAlsfigure":
        strImageWidth = "100"
    else:
        strImageWidth = xmlFigure.find(".//width").text
        strImageWidth = strImageWidth.rstrip("\n")

    # The caption has to be fetched before clear() removes all children of
    # the figure; looking it up afterwards can never find it.
    xmlFigureCaption = xmlFigure.find(".//caption")

    # Turn the figure into <p><img .../></p>
    xmlFigure.clear()
    xmlFigure.tag = "p"
    xmlFigureImage = etree.Element("img")
    xmlFigureImage.set("src", "images/" + strImageFileDir + strImageFileNamewoSuffix + "." + extension_and_mime)
    xmlFigureImage.set("alt", "")
    xmlFigureImage.set("style", "width: " + strImageWidth + "%")
    xmlFigure.append(xmlFigureImage)

    # Unless captions are switched off, the caption follows as a paragraph
    # of its own, prefixed with e.g. "Fig. 3: ".
    if not args.nocaption and xmlFigureCaption is not None:
        xmlFigureCaption.tag = "p"
        strFigureCaption = xmlFigureCaption.text or ""
        xmlFigureCaption.text = lang_dict["fig"][strLanguage] + " " + str(intFigureNumber) + ": " + strFigureCaption
        xmlFigure.addnext(xmlFigureCaption)

# Figures that are meant for hyperimage only are just retagged here
xml_figures_hyperimage = xmlEbookTree.xpath(".//EOAfigure[@type='hionly'] | .//EOAlsfigure[@type='hionly']")
logging.debug("found %s hyperimage figures" % len(xml_figures_hyperimage))
for fig in xml_figures_hyperimage:
    fig.tag = "EOAhifigure"
print("-----------------------------------------------------")
print("Preparing not numbered Figures")
xmlFigures = xmlEbookTree.findall(".//EOAfigurenonumber")
for xmlFigure in xmlFigures:
    # Source path of the image, relative to the working directory. Directory
    # and file name are merged into one flat name inside the EPUB.
    strImageFileString = xmlFigure.find(".//file").text.rstrip("\n")
    strImageFileDir, strImageFileName = os.path.split(strImageFileString)
    strImageFileDir = strImageFileDir.replace("/", "")
    strImageFileNamewoSuffix = os.path.splitext(strImageFileName)[0]

    # Copy the image into the EPUB tree and let libeoaconvert adjust it
    strEpubImagePath = os.getcwd() + "/CONVERT/epub/OEBPS/images/" + strImageFileDir + strImageFileName
    shutil.copy(os.getcwd() + "/" + strImageFileString, strEpubImagePath)
    strImageFilepath = libeoaconvert.sanitizeImage(strEpubImagePath, GM_PATH, TL_PATH)

    # Register the image in content.opf.
    # NOTE(review): the reference is always written with a .jpg extension,
    # whatever the suffix of the source file - confirm that this matches
    # what sanitizeImage leaves on disk.
    strImageHref = "images/" + strImageFileDir + strImageFileNamewoSuffix + ".jpg"
    contentopf = addToContentopf(contentopf, strImageHref, strImageFileDir + strImageFileNamewoSuffix + "-nonumber-jpg", "jpg")
    logging.debug("Added a nonumber figure")

    # Read the width before the element is emptied
    strImageWidth = xmlFigure.find(".//width").text.rstrip("\n")

    # Replace the figure by <p><img .../></p>
    xmlFigure.clear()
    xmlFigure.tag = "p"
    xmlFigureImage = etree.SubElement(xmlFigure, "img")
    xmlFigureImage.set("src", strImageHref)
    xmlFigureImage.set("alt", "")
    xmlFigureImage.set("style", "width: " + strImageWidth + "%")

print("-----------------------------------------------------")
print("Preparing Footnotes")
def alph_footnote_index(fndex):
    """
    Return the lowercase Latin footnote label for the zero-based index
    *fndex*.

    Labels need to support more than 26 values: after 'z' they continue
    with 'aa', 'ab', ... 'zz', 'aaa', and so on. A negative index raises
    ValueError.

    >>> alph_footnote_index(0)
    'a'
    >>> alph_footnote_index(1)
    'b'
    >>> alph_footnote_index(24)
    'y'
    >>> alph_footnote_index(25)
    'z'
    >>> alph_footnote_index(26)
    'aa'
    >>> alph_footnote_index(27)
    'ab'
    """
    if fndex < 0:
        # A negative index has no label; without this check the conversion
        # below would never terminate properly.
        raise ValueError("footnote index must not be negative: %r" % (fndex,))
    alphabet = "abcdefghijklmnopqrstuvwxyz"
    letters = []
    remaining = fndex
    while True:
        remaining, position = divmod(remaining, len(alphabet))
        letters.append(alphabet[position])
        if remaining == 0:
            break
        # There is no "zero" letter: 'aa' directly follows 'z', hence the
        # shift by one before the next, more significant letter.
        remaining -= 1
    # Letters were collected from the least significant one upwards.
    return "".join(reversed(letters))
# def alph_footnote_index ends here
def replace_footnote_equations(footnote):
    """
    Replace every unnumbered equation inside *footnote* by its image.

    Each EOAequationnonumber element below *footnote* is turned into a <p>
    holding an <img> that points to "images/<filename>". The image file is
    copied from items/ into CONVERT/epub/OEBPS/images/ below the current
    working directory and registered in content.opf as PNG.

    usage: contentopf = replace_footnote_equations(my_footnote)

    The updated content.opf is returned rather than written to the global
    variable; with no equation in the footnote this is the global contentopf
    itself.
    """
    result = contentopf
    cwd = os.getcwd()
    for equation in footnote.findall(".//EOAequationnonumber"):
        filename = equation.get("filename")
        equation.clear()
        equation.tag = "p"
        img = etree.Element("img", src="images/%s" % filename, alt="")
        equation.append(img)
        # The target is the OEBPS folder of the EPUB tree, the same folder
        # the images of all other equations and figures are copied to.
        shutil.copy("%s/items/%s" % (cwd, filename), "%s/CONVERT/epub/OEBPS/images/%s" % (cwd, filename))
        result = addToContentopf(result, "images/" + filename, filename, "png")
    print("einmal durch replace_footnote_equations")
    return result
# def replace_footnote_equations ends here
def replace_footnote_with_sup(note):
    """
    Empty *note* and turn it into a <sup> element, in place.

    Text, attributes and children of the note are discarded; only the text
    following the note (its tail) survives. Returns nothing.
    """
    # clear() wipes the tail as well, so it is saved and put back
    preserved_tail = note.tail
    note.clear()
    note.tag, note.tail = "sup", preserved_tail
# def replace_footnote_with_sup ends here
def bring_footnote_down_epub(footnote, footnote_name, destination):
    """
    Move the content of *footnote* into *destination* and leave a link behind.

    The footnote element itself becomes <sup>[<a href="#fn<name>"
    id="body_fn-ref<name>">name</a>]</sup>. Its former text is appended to
    the text at the end of *destination*, separated by a space, and its
    former children are appended to *destination* after that.

    usage: contentopf = bring_footnote_down_epub(my_footnote, "1", xmlNewFootnotes)

    Returns the content.opf as updated by replace_footnote_equations.
    """
    contentopf = replace_footnote_equations(footnote) # see usage note
    children = list(footnote)
    marker = "[%s]" % footnote_name

    # A footnote that starts with text is left alone. One that starts with a
    # child gets a space in front of that child's text; a completely empty
    # one receives its own marker as text.
    if footnote.text is None:
        if children:
            lead = children[0]
            lead.text = "" if lead.text is None else " " + lead.text
        else:
            footnote.text = marker
    carried_text = footnote.text or ""

    # What remains in the running text: [<a>name</a>] inside a <sup>
    replace_footnote_with_sup(footnote)
    footnote.text = "["
    back_ref = etree.SubElement(footnote, "a")
    back_ref.set("href", "#fn" + footnote_name)
    back_ref.set("id", "body_fn-ref" + footnote_name)
    back_ref.text = "%s" % footnote_name
    back_ref.tail = "]"

    # The former text of the footnote goes to the very end of the
    # destination: behind its last child if there is one, otherwise behind
    # its own text.
    existing = list(destination)
    if existing:
        anchor = existing[-1]
        anchor.tail = carried_text if anchor.tail is None else anchor.tail + " " + carried_text
    else:
        destination.text = carried_text if destination.text is None else destination.text + " " + carried_text

    for child in children:
        destination.append(child)
    return contentopf
# def bring_footnote_down_epub ends here
class FootnoteError(Exception):
    """
    Raised when a chapter mixes footnote styles.

    Only one type of footnote is supported per chapter, they cannot be
    combined.
    """
# class FootnoteError ends here
# Collect the footnotes of every chapter in a <div> at the end of that
# chapter and replace each footnote in the running text by a linked number.
# NOTE(review): the nesting below was reconstructed from the statement order
# of an unindented copy of this file - confirm against the original.
intTechnicalChapterNumber = 1
for xmlChapter in xmlChapters:
    # Footnotes come in two kinds: plain <note> elements and the groups
    # returned by get_bigfoot_data, which are iterated below as
    # (grouping, notes) pairs.
    groupings = libeoaconvert.get_bigfoot_data(xmlChapter)
    xmlFootnotes = list(xmlChapter.findall(".//note"))
    print("here come the footnotes. found", len(xmlFootnotes))
    has_old = 0 != len(xmlFootnotes)
    has_new = 0 != len(
        [ # flatten the association list whose values are lists, so we can take the length
            note
            for grouping, notes in groupings
            for note in notes
        ]
    )
    # the XOR case falls through, the AND is an error, and the NOR skips to the next chapter
    if has_old:
        if has_new:
            raise FootnoteError("Chapter %s contains both \\EOAfn and footnotes in the style of \\EOAfnalph" % xmlChapter.get("id-text"))
    else:
        if not has_new:
            continue
    # Container for all footnotes of this chapter; its heading is looked up
    # by the language of the chapter.
    xmlNewFootnotes = etree.Element("div")
    xmlNewFootnotesHeader = etree.Element("h3")
    xmlNewFootnotesHeader.text = libeoaconvert.dictLangFootnotes[libeoaconvert.two_letter_language(xmlChapter.get("language"))]
    xmlNewFootnotes.append(xmlNewFootnotesHeader)
    for grouping, notes in groupings:
        # do for the new-style footnotes what was being done for the old
        for index, note in enumerate(notes):
            # Numbered from 1 within each group, or lettered (a, b, ... aa)
            # for the "lower-latin" group.
            footnote_name = str(index + 1)
            if "lower-latin" == grouping:
                footnote_name = alph_footnote_index(index)
            # Entry at the bottom: "[<a id="fn...">name</a>]" followed by
            # whatever bring_footnote_down_epub moves out of the note.
            para = etree.Element("p")
            para.text = "["
            note_link = etree.SubElement(para, "a")
            note_link.set("id", "fn" + footnote_name)
            note_link.set("href", "#body_fn-ref" + footnote_name)
            note_link.text = "%s" % footnote_name
            note_link.tail = "]"
            contentopf = bring_footnote_down_epub(note, footnote_name, para)
            xmlNewFootnotes.append(para)
    # NOTE(review): tmpFileName is not read anywhere in this part of the
    # script - confirm whether it is still needed.
    tmpFileName = "chapter" + (str(intTechnicalChapterNumber)) + ".xhtml"
    intFootnoteNumber = 1
    for xmlFootnote in xmlFootnotes:
        # Not numbered Equations may appear in a footnote, need to be treated differently
        xmlEquationsnonumber = xmlFootnote.findall(".//EOAequationnonumber")
        for xmlEquationnonumber in xmlEquationsnonumber:
            # Replace the equation by <p><img/></p>, copy its image into the
            # EPUB tree and register it in content.opf.
            strFilename = xmlEquationnonumber.get("filename")
            xmlEquationnonumber.clear()
            xmlEquationnonumber.tag = "p"
            xmlIMG = etree.Element("img", src="images/"+ strFilename, alt="")
            xmlEquationnonumber.append(xmlIMG)
            shutil.copy(os.getcwd() + "/items/" + strFilename, os.getcwd() + "/CONVERT/epub/OEBPS/images/" + strFilename)
            contentopf = addToContentopf(contentopf, "images/" + strFilename, strFilename, "png")
        # Ids of the entry at the bottom (fn<N>) and of the reference in the
        # running text (body_fn-ref<N>); the two link to each other.
        tmp_fnstring = "fn" + str(intFootnoteNumber)
        tmp_fnrefstring = "body_fn-ref" + str(intFootnoteNumber)
        # Raises IndexError for a note without any child element.
        xmlFirstChild = xmlFootnote.getchildren()[0]
        # this is for the reference text
        if xmlFirstChild.text is None:
            # The first child starts with an element: put "[<a>N</a>] " in
            # front of it. The link is created at the end of the child and
            # then moved to the front by insert().
            xmlNewFootnoteRefBottom = etree.SubElement(xmlFirstChild, "a", href = "#" + tmp_fnrefstring, id = tmp_fnstring)
            xmlFirstChild.text = "["
            xmlNewFootnoteRefBottom.text = str(intFootnoteNumber)
            xmlNewFootnoteRefBottom.tail = "] "
            xmlFirstChild.insert(0, xmlNewFootnoteRefBottom)
        else:
            # The first child starts with text: that text becomes the tail
            # of the link, so that it follows the "[<a>N</a>] " prefix.
            xmlNewFootnoteRefBottom = etree.Element("a", href = "#" + tmp_fnrefstring, id = tmp_fnstring)
            xmlNewFootnoteRefBottom.text = str(intFootnoteNumber)
            beginning_of_footnote = xmlFirstChild.text
            xmlFirstChild.text = "["
            xmlNewFootnoteRefBottom.tail = "] " + beginning_of_footnote
            xmlFirstChild.insert(0, xmlNewFootnoteRefBottom)
        #Preserve tail and children of current <note>-Tag
        xmlFootnoteContentsTail = xmlFootnote.tail
        xmlFootnoteChildren = xmlFootnote.getchildren()
        # Substitute current <note> with Number
        xmlFootnote.clear()
        xmlFootnote.tag = "sup"
        xmlFootnote.text = "["
        xmlFootnote.tail = xmlFootnoteContentsTail
        xmlNewFootnoteRef = etree.SubElement(xmlFootnote, "a", href = "#" + tmp_fnstring, id = tmp_fnrefstring)
        xmlNewFootnoteRef.text = str(intFootnoteNumber)
        xmlNewFootnoteRef.tail = "]"
        # The former children of the note become the entry at the bottom
        if len(xmlFootnoteChildren) != 0:
            for xmlFootnoteChild in xmlFootnoteChildren:
                xmlNewFootnotes.append(xmlFootnoteChild)
        intFootnoteNumber += 1
    xmlChapter.append(xmlNewFootnotes)
    intTechnicalChapterNumber += 1
print("-----------------------------------------------------")
print("Preparing Lists")
# Turn the generic <list> elements into XHTML lists: type "ordered" becomes
# <ol> (with a "start" attribute taken from the first item's "id-text"),
# type "simple" becomes <ul>.  Lists of type "description" are skipped here;
# they are converted in the "Preparing Descriptions" section that follows.
for xmlChapter in xmlChapters:
    for xmlList in xmlChapter.findall(".//list"):
        strListType = xmlList.get("type")
        if strListType == "description":
            continue
        if strListType == "ordered":
            xmlList.tag = "ol"
            # Search inside *this* list.  The former path "..//item" started
            # at the parent element and could therefore return an item that
            # belongs to a different list under the same parent.
            xmlFirstItem = xmlList.find(".//item")
            if xmlFirstItem is None:
                # The items of a nested list have already been renamed to
                # <li> while the enclosing list was processed.
                xmlFirstItem = xmlList.find(".//li")
            listnumber = None
            if xmlFirstItem is not None:
                listnumber = xmlFirstItem.get("id-text")
            # lxml refuses None as an attribute value, so only set "start"
            # when a number is actually available.
            if listnumber is not None:
                xmlList.set("start", listnumber)
        if strListType == "simple":
            xmlList.tag = "ul"
        # Every item below the list (at any depth) becomes a list entry.
        for xmlListItem in xmlList.findall(".//item"):
            xmlListItem.tag = "li"
print("-----------------------------------------------------")
print("Preparing Descriptions")
# Every element still named <list> at this point is rewritten as a
# definition list: <list> -> <dl>, <label> -> <dt>, <item> -> <dd>.
# The "type" attribute of the list and the "id"/"id-text" attributes of its
# items are deleted (a missing attribute raises KeyError).
for xmlChapter in xmlChapters:
    for xmlDescription in xmlChapter.findall(".//list"):
        xmlDescription.tag = "dl"
        del xmlDescription.attrib["type"]
        for xmlEntry in xmlDescription.iterchildren():
            if xmlEntry.tag == "label":
                xmlEntry.tag = "dt"
            elif xmlEntry.tag == "item":
                xmlEntry.tag = "dd"
                for strAttribute in ("id", "id-text"):
                    del xmlEntry.attrib[strAttribute]
print("-----------------------------------------------------")
print("Preparing Blockquotes")
# A paragraph marked rend="quoted" is turned into a <blockquote> that wraps
# a fresh <p> carrying the original text and children.  The original tail
# text is attached to that inner <p>, not to the <blockquote> itself.
for xmlParagraph in xmlEbookTree.findall(".//p"):
    if xmlParagraph.get("rend") != "quoted":
        continue
    strQuoteText = xmlParagraph.text
    strQuoteTail = xmlParagraph.tail
    listQuoteChildren = xmlParagraph.getchildren()
    # clear() drops text, tail, attributes and children in one go
    xmlParagraph.clear()
    xmlParagraph.tag = "blockquote"
    xmlInner = etree.Element("p")
    xmlInner.text = strQuoteText
    for xmlQuoteChild in listQuoteChildren:
        xmlInner.append(xmlQuoteChild)
    xmlInner.tail = strQuoteTail
    xmlParagraph.append(xmlInner)
print("-----------------------------------------------------")
print("Preparing Theorems")
# Each <theorem> becomes a <p>: its <head> is turned into <b> and gets the
# theorem number (attribute "id-text") appended, the bookkeeping attributes
# are dropped and the inner <p> wrappers are dissolved into the new
# paragraph.
for xmlChapter in xmlChapters:
    for xmlTheorem in xmlChapter.findall(".//theorem"):
        xmlTheoremHead = xmlTheorem.find(".//head")
        strTheoremNumber = xmlTheorem.get("id-text")
        xmlTheorem.tag = "p"
        xmlTheoremHead.tag = "b"
        xmlTheoremHead.text = xmlTheoremHead.text + " " + strTheoremNumber
        # pop() with a default instead of del: an attribute that is not
        # present must not abort the whole conversion with a KeyError.
        for strAttribute in ("style", "type", "id-text", "id"):
            xmlTheorem.attrib.pop(strAttribute, None)
        # strip_tags only affects descendants, so the theorem itself (now
        # also called <p>) stays in place.
        etree.strip_tags(xmlTheorem, "p")
print("-----------------------------------------------------")
print("Preparing Hyperlinks")
# <xref url="..."> becomes <a href="...">.  URLs without an explicit
# http:// or https:// scheme get "http://" prepended, <allowbreak> children
# are removed together with their tails, and the link text is replaced by
# the URL itself.
for xmlChapter in xmlChapters:
    for xmlHyperlink in xmlChapter.findall(".//xref"):
        strURL = xmlHyperlink.get('url')
        if not strURL.startswith(("http://", "https://")):
            strURL = "http://" + strURL
        xmlHyperlink.tag = "a"
        del xmlHyperlink.attrib["url"]
        xmlHyperlink.set("href", strURL)
        etree.strip_elements(xmlHyperlink, 'allowbreak', with_tail=True)
        xmlHyperlink.text = strURL
print("-----------------------------------------------------")
print("Convert emphasized text")
# <hi rend="it"> is rewritten as <em>; the "rend" attribute is dropped.
for xmlChapter in xmlChapters:
    for xmlItalic in xmlChapter.findall(".//hi[@rend='it']"):
        xmlItalic.tag = "em"
        del xmlItalic.attrib["rend"]
print("-----------------------------------------------------")
print("Convert bold text")
# <hi rend="bold"> is rewritten as <b>; the "rend" attribute is dropped.
for xmlChapter in xmlChapters:
    for xmlBold in xmlChapter.findall(".//hi[@rend='bold']"):
        xmlBold.tag = "b"
        del xmlBold.attrib["rend"]
# Simple one-to-one renamings of inline markup.  Each entry lists the
# progress message, the source tag, the XHTML tag replacing it and an
# optional inline CSS style that is set on the renamed element.
listInlineConversions = (
    ("Convert EOAup to <sup>", "EOAup", "sup", None),
    ("Convert EOAdown to <sub>", "EOAdown", "sub", None),
    ("Convert EOAst to <span>", "EOAst", "span", "text-decoration: line-through;"),
    ("Convert EOAls to something nice", "EOAls", "span", "letter-spacing: 0.5em;"),
    ("Convert EOAcaps to something nice", "EOAcaps", "span", "font-variant:small-caps;"),
)
for strHeading, strSourceTag, strTargetTag, strInlineStyle in listInlineConversions:
    print("-----------------------------------------------------")
    print(strHeading)
    for xmlChapter in xmlChapters:
        for xmlInline in xmlChapter.findall(".//" + strSourceTag):
            xmlInline.tag = strTargetTag
            if strInlineStyle is not None:
                xmlInline.set("style", strInlineStyle)
print("-----------------------------------------------------")
print("Convert EOAineq into appropriate IMG-Tags")
# Inline equations become <img> elements: the TeX source moves into the
# "alt" attribute, the rendered image is copied from items/ into the EPUB
# image folder and registered in content.opf.
for xmlChapter in xmlChapters:
    for xmlInlineEquation in xmlChapter.findall(".//EOAineq"):
        xmlInlineEquation.tag = "img"
        xmlInlineEquation.set("alt", xmlInlineEquation.get("TeX"))
        del xmlInlineEquation.attrib["TeX"]
        strEquationFile = xmlInlineEquation.get("src")
        shutil.copy(
            os.getcwd() + "/items/" + strEquationFile,
            os.getcwd() + "/CONVERT/epub/OEBPS/images/" + strEquationFile,
        )
        strEquationHref = "images/" + strEquationFile
        xmlInlineEquation.set("src", strEquationHref)
        # The prefixed path is used both as file name and as id
        contentopf = addToContentopf(contentopf, strEquationHref, strEquationHref, "png")
print("-----------------------------------------------------")
print("Convert EOAchem into appropriate IMG-Tags")
# Inline chemical formulas are handled like inline equations: they become
# <img> elements whose "alt" holds the TeX source; the image is copied from
# items/ into the EPUB image folder and registered in content.opf.
for xmlChapter in xmlChapters:
    for xml_inline_chem in xmlChapter.findall(".//EOAchem"):
        xml_inline_chem.tag = "img"
        xml_inline_chem.set("alt", xml_inline_chem.get("TeX"))
        del xml_inline_chem.attrib["TeX"]
        chem_file = xml_inline_chem.get("src")
        shutil.copy(
            os.getcwd() + "/items/" + chem_file,
            os.getcwd() + "/CONVERT/epub/OEBPS/images/" + chem_file,
        )
        chem_href = "images/" + chem_file
        xml_inline_chem.set("src", chem_href)
        # The prefixed path is used both as file name and as id
        contentopf = addToContentopf(contentopf, chem_href, chem_href, "png")
print("-----------------------------------------------------")
print("Convert EOAinline into appropriate IMG-Tags")
# Inline images: the element text holds the path of the image relative to
# the working directory.  The image is copied into the EPUB image folder,
# scaled down to 20x20 with GraphicsMagick and the element is rewritten as
# <img>.
for xmlChapter in xmlChapters:
    for xmlInlineElement in xmlChapter.findall(".//EOAinline"):
        xmlInlineElement.tag = "img"
        xmlInlineElement.set("alt", "Too late")
        strInlineElementFilePath = xmlInlineElement.text
        strInlineElementFileName = os.path.basename(strInlineElementFilePath)
        strInlineElementDirName = os.path.dirname(strInlineElementFilePath)
        # The target name is built by gluing directory and file name
        # together.  A nested directory such as "Images/png_300dpi" used to
        # leave a slash in that name and thus pointed into a folder that
        # does not exist below images/.  Remove the slashes, as the
        # facsimile handling in this file does, so that everything lands
        # directly in the image folder.
        strFlatImageName = re.sub("/", "", strInlineElementDirName) + strInlineElementFileName
        strNewImagePath = os.getcwd() + "/CONVERT/epub/OEBPS/images/" + strFlatImageName
        shutil.copy(os.getcwd() + "/" + strInlineElementDirName + "/" + strInlineElementFileName, strNewImagePath)
        # Build the argument list directly: only the configured executable
        # string is split, so image paths containing spaces stay intact.
        listArguments = shlex.split(GM_PATH) + ["convert", strNewImagePath, "-resize", "20x20", strNewImagePath]
        subprocess.check_output(listArguments, shell=False)
        xmlInlineElement.set("src", "images/" + strFlatImageName)
        # The media type is derived from the file extension
        extension = strInlineElementFileName.split(".")[-1]
        # The original element text serves as id in content.opf, therefore
        # it may only be emptied after this call (left in place it would be
        # displayed by e-book readers).
        contentopf = addToContentopf(contentopf, "images/" + strFlatImageName, xmlInlineElement.text, extension)
        xmlInlineElement.text = ""
print("-----------------------------------------------------")
print("Preparing Verses")
# A verse becomes a single <p class="verse">.  Every line except the last
# one gets a trailing <br/>, then the <p> wrappers of the individual lines
# are dissolved.
for xmlChapter in xmlChapters:
    xml_verses = xmlChapter.findall(".//EOAverse")
    print(len(xml_verses))
    for xml_verse in xml_verses:
        for verse_line in xml_verse.getchildren()[:-1]:
            etree.SubElement(verse_line, "br")
        etree.strip_tags(xml_verse, "p")
        xml_verse.tag = "p"
        xml_verse.set("class", "verse")
print("-----------------------------------------------------")
print("Preparing Equations")
# Numbered equations: the rendered image is copied from items/ into the
# EPUB image folder and registered in content.opf.  The element itself is
# emptied and turned into a <p> holding the image; the equation number
# follows in a separate <p> right after it.
for xmlChapter in xmlChapters:
    for xmlEquation in xmlChapter.findall(".//EOAequation"):
        # Read both attributes before clear() wipes them
        strEquationNumber = xmlEquation.get("number")
        strFilename = xmlEquation.get("filename")
        shutil.copy(
            os.getcwd() + "/items/" + strFilename,
            os.getcwd() + "/CONVERT/epub/OEBPS/images/" + strFilename,
        )
        contentopf = addToContentopf(contentopf, "images/" + strFilename, strFilename, "png")
        # Rework XML
        xmlEquation.clear()
        xmlEquation.tag = "p"
        xmlEquationImage = etree.SubElement(xmlEquation, "img")
        xmlEquationImage.set("src", "images/" + strFilename)
        xmlEquationImage.set("alt", "")
        xmlNumberParagraph = etree.Element('p')
        xmlNumberParagraph.text = "(" + strEquationNumber + ")"
        xmlEquation.addnext(xmlNumberParagraph)
        # Note: the parent of the equation is left untouched here.
# Unnumbered equations: same treatment as the numbered ones, but without
# the additional paragraph carrying the equation number.
for xmlChapter in xmlChapters:
    for xmlEquation in xmlChapter.findall(".//EOAequationnonumber"):
        strFilename = xmlEquation.get("filename")
        shutil.copy(
            os.getcwd() + "/items/" + strFilename,
            os.getcwd() + "/CONVERT/epub/OEBPS/images/" + strFilename,
        )
        contentopf = addToContentopf(contentopf, "images/" + strFilename, strFilename, "png")
        # Rework XML
        xmlEquation.clear()
        xmlEquation.tag = "p"
        xmlEquationImage = etree.SubElement(xmlEquation, "img")
        xmlEquationImage.set("src", "images/" + strFilename)
        xmlEquationImage.set("alt", "")
        # Note: the parent of the equation is left untouched here.
# Equation arrays get no special treatment so far; the wrapper is merely
# renamed to <div>.  Since "div" is among the tags stripped during the final
# clean-up, the wrapper disappears later and only its children remain.
for xmlChapter in xmlChapters:
    for xmlEquationarray in xmlChapter.findall(".//EOAequationarray"):
        xmlEquationarray.tag = "div"
print("-----------------------------------------------------")
print("Preparing Letterheads")
# The four header lines of a letterhead each become a <p> whose first child
# is turned into <em>.  Horizontal rules are inserted at positions 0 and 5
# of the letterhead.
for xmlChapter in xmlChapters:
    for xmlLetterhead in xmlChapter.xpath(".//EOAletterhead"):
        for strLineTag in ("Recipient", "Archive", "Additional", "Pages"):
            xmlHeadLine = xmlLetterhead.find(".//" + strLineTag)
            xmlHeadLine.tag = "p"
            xmlHeadLine.getchildren()[0].tag = "em"
        xmlLetterhead.insert(0, etree.Element("hr"))
        xmlLetterhead.insert(5, etree.Element("hr"))
print("-----------------------------------------------------")
print("Preparing Transcriptions")
# TODO: May need rework concerning the right Column
# A transcription becomes a two-column table: one row for the headers and
# one row for the text.  Children of the transcribed text go into the left
# cell until the first <pagebreak> is met; everything after it goes into
# the right cell.  <Facsimilelink> elements are removed beforehand.
for xmlChapter in xmlChapters:
    etree.strip_elements(xmlChapter, "Facsimilelink")
    for xmlTranscription in xmlChapter.xpath(".//EOAtranscripted"):
        print("Processing Transcription")
        xmlTranscription.tag = "table"
        # Header row
        xmlTranscription.find(".//EOAtranscriptedheader").tag = "tr"
        xmlLeftHeader = xmlTranscription.find(".//Leftheader")
        xmlLeftHeader.tag = "td"
        xmlLeftHeader.set("style", "width: 50%")
        xmlTranscription.find(".//Rightheader").tag = "td"
        # Text row: work on a re-parsed copy of the text element so that
        # its children can be distributed over the two cells
        xmlTranscriptedtext = xmlTranscription.find(".//EOAtranscriptedtext")
        strTranscriptedtext = etree.tostring(xmlTranscriptedtext, encoding="unicode")
        xmlLeftColumn = etree.Element("td")
        xmlRightColumn = etree.Element("td")
        xmlCurrentColumn = xmlLeftColumn
        xmlTemp = etree.XML(str(strTranscriptedtext))
        for xmlElement in xmlTemp.iterchildren():
            if xmlElement.tag == "pagebreak":
                # Switch columns; the page break itself is not copied
                xmlCurrentColumn = xmlRightColumn
                print("Spaltenwechsel!")
                continue
            xmlCurrentColumn.append(xmlElement)
        xmlTranscriptedtext.clear()
        xmlTranscriptedtext.tag = "tr"
        xmlTranscriptedtext.set("valign", "top")
        xmlTranscriptedtext.append(xmlLeftColumn)
        xmlTranscriptedtext.append(xmlRightColumn)
print("-----------------------------------------------------")
print("Preparing Tables")
# Convert every <EOAtable> wrapper into a plain XHTML table: build a caption
# paragraph, derive column alignment and pixel widths from the column
# specification, rename rows and cells, and finally move the table out of
# its parent element, which is removed afterwards.
# NOTE(review): the nesting below was reconstructed from a copy of the file
# without indentation -- confirm against the original, in particular the
# if/elif/else chain that advances intCurrentColumn.
intChapterNumber = 1
for xmlChapter in xmlChapters:
    xmlTables = xmlChapter.findall(".//EOAtable")
    for xmlTable in xmlTables:
        xmlRawTable = xmlTable.find(".//table")
        strTableCaption = xmlTable.find(".//EOAtablecaption").text or ""
        # print("Working on ", strTableCaption)
        # The caption text "nonumber" marks a table that gets no caption
        if strTableCaption != "nonumber":
            intTableNumber = dictTables[xmlTable.find(".//EOAtablelabel").text]
            xmlTableCaption = etree.Element("p")
            xmlTableCaption.text = str(intTableNumber) + " " + strTableCaption
            # NOTE(review): getchildren() returns a list, so this test is
            # always true
            if xmlTable.find(".//EOAtablecaption").getchildren() is not None:
                for xmlChild in xmlTable.find(".//EOAtablecaption").iterchildren():
                    xmlTableCaption.append(xmlChild)
            # The caption paragraph is placed directly after the table
            xmlRawTable.addnext(xmlTableCaption)
        else:
            print("Table has no caption")
        # Caption and label elements are no longer needed
        xmlTable.find(".//EOAtablecaption").clear()
        xmlTable.remove(xmlTable.find(".//EOAtablecaption"))
        xmlTable.find(".//EOAtablelabel").clear()
        xmlTable.remove(xmlTable.find(".//EOAtablelabel"))
        # Analyze Width and Alignment of the Columns
        strColumnString = xmlTable.find(".//EOAtablecolumns").text
        # Vertical bars of the column specification are dropped
        strColumnString = re.sub(r"\|", "", strColumnString)
        xmlTable.remove(xmlTable.find(".//EOAtablecolumns"))
        # One match per column: an alignment letter up to the next "cm"
        reMatchObjects = re.findall(r'([L|R|C].*?cm)', strColumnString)
        intTableWidth = 0
        # Index 0 is a placeholder so that columns can be addressed 1-based
        listColumnAlignments = [None]
        listColumnWidths = [None]
        intNumberOfColumns = 0
        for strColumnDefinition in reMatchObjects:
            # rstrip() removes trailing characters from the given set, not
            # a literal suffix
            strColumnDefinition = strColumnDefinition.rstrip("cm")
            strColumnDefinition = strColumnDefinition.rstrip("mm")
            strColumnAlignment = strColumnDefinition[0]
            if strColumnAlignment == "L":
                strColumnAlignment = "left"
            if strColumnAlignment == "C":
                strColumnAlignment = "center"
            if strColumnAlignment == "R":
                strColumnAlignment = "right"
            listColumnAlignments.append(strColumnAlignment)
            # The numeric width is scaled by 75 to obtain a pixel value
            intColumnWidth = int(float(strColumnDefinition.lstrip("LRC")) * 75)
            listColumnWidths.append(intColumnWidth)
            intTableWidth += intColumnWidth
            intNumberOfColumns += 1
        xmlRawTable.set("width", str(intTableWidth)+"px;")
        # These attributes must be present; del raises KeyError otherwise
        del xmlRawTable.attrib["rend"]
        del xmlRawTable.attrib["id-text"]
        del xmlRawTable.attrib["id"]
        del xmlRawTable.attrib["place"]
        # Figure out and deal with the Header
        xmlHeader = xmlRawTable.find(".//row/cell/tableheader")
        if xmlHeader is not None:
            # The marker element is removed, its tail becomes the cell text
            xmlHeader.text = ""
            xmlHeader.getparent().text = xmlHeader.tail
            xmlHeader.getparent().remove(xmlHeader)
            # The first row is turned into a header row (<tr> with <th>)
            xmlFirstRow = xmlRawTable.find(".//row")
            xmlFirstRow.tag = "tr"
            xmlFirstRowCells = xmlFirstRow.findall(".//cell")
            for xmlFirstRowCell in xmlFirstRowCells:
                xmlFirstRowCell.tag = "th"
        # Now Deal with the rest of the rows
        xmlTableRows = xmlRawTable.findall(".//row")
        for xmlTableRow in xmlTableRows:
            xmlTableCells = xmlTableRow.findall(".//cell")
            intCurrentColumn = 1
            print(listColumnAlignments)
            for xmlTableCell in xmlTableCells:
                xmlTableCell.tag = "td"
                xmlTableCell.set("align",listColumnAlignments[intCurrentColumn])
                xmlTableCell.set("style","width: " + str(listColumnWidths[intCurrentColumn]) + "px;")
                # Deal with multicolumn
                if xmlTableCell.get("cols") is not None:
                    xmlTableCell.set("colspan", xmlTableCell.get("cols"))
                if intCurrentColumn > len(xmlTableCells):
                    intCurrentColumn = 1
                # Deal with multicolumn again, increase intCurrentColumn by the columns being spanned
                elif xmlTableCell.get("cols") is not None:
                    intCurrentColumn = intCurrentColumn + int(xmlTableCell.get("cols"))
                    del xmlTableCell.attrib["cols"]
                else:
                    intCurrentColumn += 1
            xmlTableRow.tag = "tr"
            xmlTableRow.set("valign", "top")
        # Move the table behind its parent and drop the (former) parent
        xmlTableParent = xmlTable.getparent()
        xmlTableParent.addnext(xmlTable)
        # Serialized parent, only consumed by the disabled debug call below
        xtp = etree.tostring(xmlTableParent)
        # libeoaconvert.deb_var(xtp)
        xmlTableParent.getparent().remove(xmlTableParent)
        print("Finished with that table.")
    intChapterNumber += 1
print("-----------------------------------------------------")
print("Preparing Facsimiles")
# Every facsimile page inside a part (<div0>) is replaced by an <img>.  The
# image file is copied into the EPUB image folder under a flat name (its
# directory name without slashes, followed by the file name), passed to
# libeoaconvert.sanitizeImage and registered in content.opf.
for xmlPart in xmlEbookTree.findall(".//div0"):
    for xmlFacsimile in xmlPart.findall(".//EOAfacsimilepage"):
        strImageFile = xmlFacsimile.find(".//file").text
        strFacsimileLabel = xmlFacsimile.find(".//label").text
        facsimile_pagenumber = xmlFacsimile.find(".//pagenumber").text or ""
        etree.strip_elements(xmlFacsimile, "file", "label")
        # TODO: deal with a missing file suffix here somehow, and convert
        # files where necessary
        strImageFile = strImageFile.rstrip("\n")
        strImageFileDir = re.sub("/", "", os.path.dirname(strImageFile))
        strImageFileName = os.path.basename(strImageFile)
        strFlatImageName = strImageFileDir + strImageFileName
        strEpubImagePath = os.getcwd() + "/CONVERT/epub/OEBPS/images/" + strFlatImageName
        shutil.copy(os.getcwd() + "/" + strImageFile, strEpubImagePath)
        strImageFilepath = libeoaconvert.sanitizeImage(strEpubImagePath, GM_PATH, TL_PATH)
        # Add copied file to contentopf; the media type is the file
        # extension without its leading dot
        img_base_file_name, img_file_extension = os.path.splitext(strImageFileName)
        contentopf = addToContentopf(contentopf, "images/" + strFlatImageName, strFlatImageName, img_file_extension[1:])
        # Result looks like:
        # <img src="images/ImagesFigure3.jpg" alt="" style="width: 99%"/>
        facsimile_image_element = etree.Element(htmlns + "img")
        facsimile_image_element.set("src", "images/" + strFlatImageName)
        facsimile_image_element.set("style", "width: 95%; height: auto;")
        facsimile_image_element.set("alt", "Facsimile page " + facsimile_pagenumber)
        xmlFacsimile.getparent().replace(xmlFacsimile, facsimile_image_element)
print("-----------------------------------------------------")
print("Preparing Cross-References")
# Every <EOAref> is replaced by the text its target resolves to.  The
# lookup dictionaries are consulted in a fixed order and a later hit
# overrides an earlier one; without any hit a placeholder text remains.
# (A lookup via xml:id was sketched here before but is not active.)
for xmlChapter in xmlChapters:
    for xmlReference in xmlChapter.findall(".//EOAref"):
        print("XXXXXXXX")
        strResult = "!!! Cross Reference !!!"
        xmlReferenceLabelText = xmlReference.find("Label").text
        xmlReferenceRefTarget = xmlReference.find("ref").get("target")
        # (key, dictionary, message, append key to message?)
        listLookups = (
            (xmlReferenceLabelText, dictEquations, "Verweis auf Array gefunden:", True),
            (xmlReferenceRefTarget, dictEquations, "Verweis auf Equation gefunden:", True),
            (xmlReferenceRefTarget, dictLists, "Verweis auf Liste gefunden", False),
            (xmlReferenceRefTarget, dictChapters, "Verweis auf Kapitel gefunden", False),
            (xmlReferenceRefTarget, dictSections, "Verweis auf Section gefunden", False),
            (xmlReferenceRefTarget, dictFigures, "Verweis auf Abbildung gefunden", False),
            (xmlReferenceRefTarget, dictFootnotes, "Verweis auf Fussnote gefunden", False),
            (xmlReferenceRefTarget, dictTheorems, "Verweis auf Theorem gefunden", False),
            (xmlReferenceLabelText, dictTables, "Verweis auf Tabelle gefunden", False),
        )
        for strKey, dictTargets, strMessage, boolAppendKey in listLookups:
            if strKey in dictTargets:
                if boolAppendKey:
                    print(strMessage + strKey)
                else:
                    print(strMessage)
                strResult = dictTargets[strKey]
        # clear() also removes the tail, so it is saved and restored
        tmpTail = xmlReference.tail or ""
        print("XXXXXXXX")
        xmlReference.clear()
        xmlReference.text = strResult
        xmlReference.tail = tmpTail
# Page references are replaced by the entry stored for their label in
# dictPagelabels; a placeholder text remains when the label is unknown.
for xmlChapter in xmlChapters:
    for xmlReference in xmlChapter.findall(".//EOApageref"):
        strResult = "!!! Page Reference !!!"
        xmlReferenceLabelText = xmlReference.find("Label").text
        print(xmlReferenceLabelText)
        # The target is read but plays no role in the lookup
        xmlReferenceRefTarget = xmlReference.find("ref").get("target")
        if xmlReferenceLabelText in dictPagelabels:
            print("Verweis auf Seite gefunden: " + xmlReferenceLabelText)
            strResult = dictPagelabels[xmlReferenceLabelText]
        # clear() also removes the tail, so it is saved and restored
        tmpTail = xmlReference.tail or ""
        xmlReference.clear()
        xmlReference.text = strResult
        xmlReference.tail = tmpTail
# References to publications
# NOTE: This may be reworked in the future to enable popups in the ebook
# NOTE: For the time being the span is renamed to EOAcitation, a tag that
# is stripped again during the final clean-up
for xmlChapter in xmlChapters:
    for xmlPublicationreference in xmlChapter.findall(".//span[@rel='popover']"):
        xmlPublicationreference.tag = "EOAcitation"
##############################################################
#           Finish ePub Conversion, save File                #
##############################################################
print("-----------------------------------------------------")
print("Cleaning up XML")
# Index entries are emptied (their tail text is kept) before the helper
# markup is dissolved.
for xmlIndexentry in xmlEbookTree.xpath(".//EOAindex | .//EOAindexperson | .//EOAindexlocation"):
    tmpTail = xmlIndexentry.tail or ""
    xmlIndexentry.clear()
    xmlIndexentry.tail = tmpTail
# Tags that are removed while their content stays in place
listObsoleteTags = (
    "EOAlabel", "EOAindex", "EOApageref", "EOAcitenumeric", "EOAtable",
    "EOAref", "note", "div", "div2", "div3", "div4", "div5", "citetext",
    "newpage", "EOAciteyear", "EOAtablelabel", "hi", "pagebreak", "page",
    "pagestyle", "EOAcitation", "EOAciteauthoryear", "EOAcitemanual",
    "EOAprintbibliography", "EOAindexperson", "EOAprintindex",
    "EOAindexlocation", "EOAprintpersonindex", "EOAprintlocationindex",
    "anchor", "temp", "EOAletterhead", "EOAhifigure",
)
etree.strip_tags(xmlEbookTree, *listObsoleteTags)
# Attributes without meaning in the XHTML output ("id" is kept on purpose)
listObsoleteAttributes = ("id-text", "noindent", "type", "label", "spacebefore", "rend")
etree.strip_attributes(xmlEbookTree, *listObsoleteAttributes)
# Cite keys are removed together with their content, but keep their tail
etree.strip_elements(xmlEbookTree, "citekey", with_tail=False)
# Write every Part and Chapter into one file.
# Each <div1> is copied into a fresh instance of the chapter template and
# saved as chapterN.xhtml.  When a chapter is the first one met inside a
# part (<div0>), a separate title page for that part is written first.
# Every written file is registered in toc.ncx and content.opf.
xmlChapters = xmlEbookTree.findall("//div1")
listParts = []
intTechnicalChapterNumber = 1
for xmlChapter in xmlChapters:
    # Load xmlHTMLTemplate
    htmlChapter = etree.parse(EPUB_FILES + "epubchapter.xml", xmlChapterParser)
    # Find out, if it's inside a part. If Part has not been worked on, then do it
    xmlChapterParent = xmlChapter.getparent()
    if xmlChapterParent.tag == "div0" and xmlChapterParent.get("id") not in listParts:
        listParts.append(xmlChapterParent.get("id"))
        strPartTitle = xmlChapterParent.find(".//head").text
        htmlChapter.find(".//" + htmlns + "title").text = strPartTitle
        xmlNew = etree.Element('h1')
        xmlNew.text = strPartTitle
        htmlChapter.find(".//" + htmlns + "body").append(xmlNew)
        # Save Part.  The serialized text carries no XML declaration, so the
        # file has to be UTF-8; do not rely on the locale's default encoding.
        tmpFileName = os.getcwd() + "/CONVERT/epub/OEBPS/chapter" + (str(intTechnicalChapterNumber)) + ".xhtml"
        tmpResult = etree.tostring(htmlChapter, pretty_print=True, encoding="unicode")
        with open(tmpFileName, "w", encoding="utf-8") as tmpFile:
            tmpFile.write(tmpResult)
        # Add to TocNCX
        tocncx = addToTocncx(tocncx, htmlChapter.find(".//" + htmlns + "title").text, intTechnicalChapterNumber)
        contentopf = addToContentopf(contentopf, "chapter" + str(intTechnicalChapterNumber) + ".xhtml", "chapter" + str(intTechnicalChapterNumber), "xml")
        intTechnicalChapterNumber += 1
        # Reset htmlChapter
        htmlChapter = etree.parse(EPUB_FILES + "epubchapter.xml", xmlChapterParser)
    # Read all children of the div1 and append them to htmlChapter
    xmlChildren = xmlChapter.getchildren()
    for xmlChild in xmlChildren:
        # Using deepcopy, because a plain append would move the element out
        # of the original tree
        htmlChapter.find(".//" + htmlns + "body").append(deepcopy(xmlChild))
    # Save Chapter (UTF-8, see above)
    tmpFileName = os.getcwd() + "/CONVERT/epub/OEBPS/chapter" + (str(intTechnicalChapterNumber)) + ".xhtml"
    tmpResult = etree.tostring(htmlChapter, pretty_print=True, encoding="unicode")
    with open(tmpFileName, "w", encoding="utf-8") as tmpFile:
        tmpFile.write(tmpResult)
    # Add to TocNCX
    tocncx = addToTocncx(tocncx, xmlChapter.find(".//h1").text, intTechnicalChapterNumber)
    # Add to content.opf
    contentopf = addToContentopf(contentopf, "chapter" + str(intTechnicalChapterNumber) + ".xhtml", "chapter" + str(intTechnicalChapterNumber), "xml")
    intTechnicalChapterNumber += 1
# Convert Facsimile-Parts.
# A part (<div0>) containing an <EOAfacsimilepart> element is written into
# a chapter file of its own: the element becomes the <h1> headline, <head>
# elements are removed and all children of the part are copied into a fresh
# instance of the chapter template.
xmlParts = xmlEbookTree.findall("//div0")
for xmlPart in xmlParts:
    print("-------------")
    print("Working on Facsimile-Part")
    print("-------------")
    # check if it has a child element EOAfacsimilepart
    xmlHeadline = xmlPart.find(".//EOAfacsimilepart")
    if xmlHeadline is None:
        continue
    htmlChapter = etree.parse(EPUB_FILES + "epubchapter.xml", xmlChapterParser)
    # Change EOAfacsimilepart into H1
    xmlHeadline.tag = "h1"
    etree.strip_elements(xmlPart, "head")
    # Read all children of the div0 and append them to htmlChapter
    for xmlChild in xmlPart.getchildren():
        # Using deepcopy, because a plain append would move the element out
        # of the original tree
        htmlChapter.find(".//" + htmlns + "body").append(deepcopy(xmlChild))
    # Save Chapter once.  The serialized text carries no XML declaration, so
    # the file has to be UTF-8; do not rely on the locale's default encoding.
    tmpFileName = os.getcwd() + "/CONVERT/epub/OEBPS/chapter" + (str(intTechnicalChapterNumber)) + ".xhtml"
    tmpResult = etree.tostring(htmlChapter, pretty_print=True, encoding="unicode")
    with open(tmpFileName, "w", encoding="utf-8") as tmpFile:
        tmpFile.write(tmpResult)
    # Add to TocNCX.  The entry is labelled with the headline of this part;
    # the title used to be looked up relative to whatever chapter the
    # preceding chapter loop had processed last.
    tocncx = addToTocncx(tocncx, xmlHeadline.text, intTechnicalChapterNumber)
    # Add to content.opf
    contentopf = addToContentopf(contentopf, "chapter" + str(intTechnicalChapterNumber) + ".xhtml", "chapter" + str(intTechnicalChapterNumber), "xml")
    intTechnicalChapterNumber += 1
# Saving toc.ncx
# Both package files are serialized without an XML declaration, so they are
# written explicitly as UTF-8 instead of the locale's default encoding; the
# with-statement closes the file even if writing fails.
tmpFileName = os.getcwd() + "/CONVERT/epub/OEBPS/toc.ncx"
tmpResult = etree.tostring(tocncx, pretty_print=True, encoding="unicode")
with open(tmpFileName, "w", encoding="utf-8") as tmpFile:
    tmpFile.write(tmpResult)
# Saving content.opf
tmpFileName = os.getcwd() + "/CONVERT/epub/OEBPS/content.opf"
tmpResult = etree.tostring(contentopf, pretty_print=True, encoding="unicode")
with open(tmpFileName, "w", encoding="utf-8") as tmpFile:
    tmpFile.write(tmpResult)
############################################################################
#                     Finishing various Stuff                              #
############################################################################
# Write Temporary XML-Tree: dump the fully converted tree for inspection.
# The text is serialized without an XML declaration, hence the explicit
# UTF-8 encoding; the with-statement closes the file even if writing fails.
ergebnis = etree.tostring(xmlEbookTree, pretty_print=True, encoding="unicode")
with open("tmp_files/Devel_ebook.xml", "w", encoding="utf-8") as ergebnisdatei:
    ergebnisdatei.write(ergebnis)