Permalink
Switch branches/tags
Nothing to show
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
executable file 1505 lines (1337 sloc) 65.7 KB
#!/usr/bin/env python3
# -*- coding: utf-8; mode: python -*-
# Time-stamp: <2018-06-01 16:17:44 (kthoden)>
import os
import sys
import argparse
import re
import shutil
import configparser
import pickle
import shlex
import subprocess
import logging
from copy import deepcopy
from lxml import etree
import libeoaconvert
#####################
# Parsing arguments #
#####################
# Command line: optional config file, font family for the EPUB, and a switch
# that suppresses figure captions.
parser = argparse.ArgumentParser()
parser.add_argument("-c", "--config", dest="CONFIG_FILE", help="Name of configuration file", metavar="CONFIGURATION")
parser.add_argument("-f", "--font", help="Font to be used, default is TeX Gyre Termes", default="termes")
parser.add_argument("-nc", "--nocaption", help="No captions for figures.", action="store_true")
args = parser.parse_args()

if args.CONFIG_FILE is not None:
    CONFIG_FILE = os.path.abspath(args.CONFIG_FILE)
else:
    # Default: config/eoaconvert.cfg next to this script.  The script path is
    # made absolute first, because dirname() of a bare script name is "" and
    # the concatenation would then point at "/config/eoaconvert.cfg".
    CONFIG_FILE = os.path.dirname(os.path.abspath(sys.argv[0])) + "/config/eoaconvert.cfg"

print("The config file is ", CONFIG_FILE)
##################################
# Reading the configuration file #
##################################
CONFIG = configparser.ConfigParser()
CONFIG.read(CONFIG_FILE)

########################
# Paths to executables #
########################
# Directory holding the EPUB template files, located relative to this script.
# The script path is made absolute first: dirname() of a bare script name is
# "" and the concatenation would otherwise yield "/data/epub_files/".
EPUB_FILES = os.path.dirname(os.path.abspath(sys.argv[0])) + "/data/epub_files/"
# TEMPLATE_PATH = CONFIG['Auxiliaries']['template_path']
GM_PATH = CONFIG['Executables']['graphicsmagic']
TL_PATH = CONFIG['Executables']['texlive']
# TEXBIN_PATH = CONFIG['Executables']['texbin']
# TRALICS_PATH_EXEC = CONFIG['Executables']['tralics_path_exec']
# TRALICS_PATH_LIB = CONFIG['Executables']['TRALICS_PATH_LIB']
# SUPPORT_PATH = CONFIG['Executables']['support_path']
# AUX_TeX_FILES_PATH = CONFIG['Executables']['aux_tex_files_path']
print(GM_PATH)

# Localized caption prefix for figures, keyed by language code.
lang_dict = {"fig" : {"en" : "Fig.", "de" : "Abb."}}

# Everything below is read relative to the current working directory.
tmpDir = os.getcwd() + "/tmp_files/"
# NOTE(review): this log file handle is not closed in the visible part of the
# file; presumably it is written to and closed further down - confirm.
Datei = open('tmp_files/intermediate.log', 'w')
xmlTree = etree.parse("tmp_files/IntermediateXMLFile.xml")

# NOTE(review): pickle.load can execute arbitrary code from the file.
# data.pickle is presumably produced by an earlier step of this tool chain;
# confirm it never comes from an untrusted source.
with open('tmp_files/data.pickle', 'rb') as f:
    data = pickle.load(f)

# Lookup tables taken from the pickled data.
dictSections = data["secdict"]
dictEquations = data["eqdict"]
dictLists = data["listdict"]
dictChapters = data["chapterdict"]
dictFigures = data["figdict"]
dictFootnotes = data["fndict"]
dictTheorems = data["theoremdict"]
dictTables = data["tabdict"]
dictPagelabels = data["pagelabeldict"]
def remove_processinginstruction(xml_tree, pi_name):
    """Remove processing instructions with a specific name.

    Every ``<?pi_name ...?>`` found by the XPath query on ``xml_tree`` is
    deleted in place.  Text that follows a removed instruction (its tail) is
    kept: it is attached to the preceding sibling or, if there is none, to
    the text of the parent element.

    Args:
        xml_tree: lxml tree or element to clean; it is modified in place.
        pi_name: target name of the processing instructions to remove.

    Returns:
        The ``xml_tree`` object that was passed in.
    """
    proc_insts = xml_tree.xpath("//processing-instruction('{}')".format(pi_name))

    # The instructions are removed one by one.  Calling
    # etree.strip_tags(parent, instruction.tag) instead is not selective: the
    # tag of a processing instruction is the generic ProcessingInstruction
    # factory, so every instruction below the parent would be stripped,
    # whatever its name.
    removed = 0
    for instruction in proc_insts:
        parent = instruction.getparent()
        if parent is None:
            # Instruction sits outside the root element; nothing to detach from.
            continue
        tail = instruction.tail
        if tail:
            # remove() drops the tail together with the node, so rescue it first.
            previous = instruction.getprevious()
            if previous is not None:
                previous.tail = (previous.tail or "") + tail
            else:
                parent.text = (parent.text or "") + tail
        parent.remove(instruction)
        removed += 1

    logging.debug("Removed %s processing instructions of type %s.", removed, pi_name)
    return xml_tree
# def remove_processinginstruction ends here
def addToContentopf(contentopf, Filename, FileID, Mediatype):
    """Function to add Elements to Content-OPF (epub)

    Registers ``Filename`` as an ``<item>`` in the manifest of ``contentopf``
    and, for media type ``"xml"``, also as an ``<itemref>`` in the spine.
    IDs that were already registered (tracked in the module-level list
    ``listContentopf``) are skipped.

    Args:
        contentopf: parsed content.opf tree; it is modified in place.
        Filename: value for the item's ``href`` attribute.
        FileID: raw id; ``_``, ``.`` and ``/`` are removed and all leading
            digits are stripped before it is used.
        Mediatype: one of ``txt``, ``otf``, ``xml``, ``jpg``, ``png``.

    Returns:
        The ``contentopf`` object that was passed in.

    Raises:
        KeyError: if ``Mediatype`` is not one of the known keys.
    """
    # Sanitizing FileID: drop the characters _ . and / from the id-attribute.
    FileID = re.sub(r"[_./]", "", FileID)
    # FileID may also not start with a number, so strip the whole digit
    # prefix (not just the first three digits).
    FileID = re.sub(r"^[0-9]+", "", FileID)

    # listContentopf is only read and appended to, so no global statement is needed.
    if FileID in listContentopf:
        return contentopf

    dictMediatypes = {
        "txt" : "text/plain",
        "otf" : "application/vnd.ms-opentype",
        "xml" : "application/xhtml+xml",
        "jpg" : "image/jpeg",
        "png" : "image/png"
    }
    contentopfns = "{http://www.idpf.org/2007/opf}"
    xmlManifest = contentopf.find(".//" + contentopfns + "manifest")
    xmlItem = etree.Element("item")
    xmlItem.set("id", FileID)
    xmlItem.set("media-type", dictMediatypes[Mediatype])
    xmlItem.set("href", Filename)
    xmlManifest.append(xmlItem)

    # if it's a XML-File also extend <spine>
    if Mediatype == "xml":
        xmlSpine = contentopf.find(".//" + contentopfns + "spine")
        xmlItemref = etree.Element("itemref")
        xmlItemref.set("idref", FileID)
        xmlSpine.append(xmlItemref)

    listContentopf.append(FileID)
    return contentopf
# def addToContentopf ends here
def addToTocncx(tocncx, Label, intTechnicalChapterNumber):
    """Append one chapter entry to the navMap of the EPUB table of contents.

    A ``navPoint`` is built that carries ``Label`` as its text and points to
    ``chapter<N>.xhtml``; its ``playOrder`` is ``N + 1``.  The modified
    ``tocncx`` is returned.
    """
    chapter_ref = "chapter" + str(intTechnicalChapterNumber)

    nav_point = etree.Element("navPoint")
    nav_point.set("playOrder", str(intTechnicalChapterNumber + 1))
    nav_point.set("id", chapter_ref)

    # <navLabel><text>Label</text></navLabel>
    label_text = etree.SubElement(etree.SubElement(nav_point, "navLabel"), "text")
    label_text.text = Label

    # <content src="chapterN.xhtml"/>
    etree.SubElement(nav_point, "content").set("src", chapter_ref + ".xhtml")

    nav_map = tocncx.find(".//" + "{http://www.daisy.org/z3986/2005/ncx/}" + "navMap")
    nav_map.append(nav_point)
    return tocncx
# def addToTocncx ends here
##############################################################
#               Create .epub basic structure                 #
##############################################################
# Build the EPUB directory skeleton once; an existing CONVERT/epub is left alone.
epub_root_dir = os.getcwd() + "/CONVERT/epub"
if not os.path.exists(epub_root_dir):
    for epub_sub_dir in ("", "/META-INF", "/OEBPS", "/OEBPS/images", "/OEBPS/fonts"):
        os.mkdir(os.getcwd() + "/CONVERT/epub" + epub_sub_dir)

# container.xml and mimetype come unchanged from the template directory
shutil.copy(EPUB_FILES + "epubcontainer.xml", os.getcwd() + "/CONVERT/epub/META-INF/container.xml")
shutil.copy(EPUB_FILES + "epubmimetype", os.getcwd() + "/CONVERT/epub/mimetype")

# content.opf starts out as the parsed template
xmlContentopfParser = etree.XMLParser(no_network=False, load_dtd=False)
contentopf = etree.parse(EPUB_FILES + "epubcontentopf.xml", xmlContentopfParser)

# IDs of all files already registered in content.opf, used to avoid duplicates
listContentopf = []
#########
# Fonts #
#########
# Copy the stylesheet and font files of the selected family into the EPUB and
# register every font file in content.opf.
libertine_fonts = ["GPL.txt", "LICENCE.txt", "LinLibertine_R.otf", "LinLibertine_RI.otf", "LinLibertine_RZ.otf", "LinLibertine_RZI.otf", "OFL-1.1.txt"]
termes_fonts = ["texgyretermes-bold.otf", "texgyretermes-bolditalic.otf", "texgyretermes-italic.otf", "texgyretermes-regular.otf"]

if args.font == "termes":
    font_files = termes_fonts
    shutil.copy(EPUB_FILES + "eoa-epub-termes.css", os.getcwd() + "/CONVERT/epub/OEBPS/eoa-epub.css")
elif args.font == "libertine":
    shutil.copy(EPUB_FILES + "eoa-epub-libertine.css", os.getcwd() + "/CONVERT/epub/OEBPS/eoa-epub.css")
    font_files = libertine_fonts
else:
    logging.info("Font not recognized, falling back to default.")
    # The fallback needs the font list as well as the stylesheet; without it
    # the loop below fails with a NameError.
    font_files = termes_fonts
    shutil.copy(EPUB_FILES + "eoa-epub-termes.css", os.getcwd() + "/CONVERT/epub/OEBPS/eoa-epub.css")

# Separate counters so that the generated IDs are unique per file type.
otf_id_counter = 1
txt_id_counter = 1

for fontfile in font_files:
    shutil.copy(EPUB_FILES + fontfile, os.getcwd() + "/CONVERT/epub/OEBPS/fonts/")

    base_file_name, file_extension = os.path.splitext(fontfile)

    # The extension without its dot doubles as the media-type key.
    if file_extension == ".otf":
        contentopf = addToContentopf(contentopf, "fonts/" + fontfile, "otf-font" + str(otf_id_counter), file_extension[1:])
        otf_id_counter += 1
    elif file_extension == ".txt":
        contentopf = addToContentopf(contentopf, "fonts/" + fontfile, "font-txt" + str(txt_id_counter), file_extension[1:])
        txt_id_counter += 1
    else:
        print("Other file found. Exiting")
        sys.exit()
# Shortcut for namespace
htmlns = "{http://www.w3.org/1999/xhtml}"

# Load Template for Chapter HTML
xmlChapterParser = etree.XMLParser(no_network=False,load_dtd=False) #resolve_entities=False

# Preparing toc.ncx
xmlTocncxParser = etree.XMLParser(no_network=False,load_dtd=False)
tocncx = etree.parse(EPUB_FILES + "epubtocncx.xml", xmlTocncxParser)

print("-----------------------------------------------------")
print("Preparing content.opf")
# All Dublin Core entries below are appended to the <metadata> element of content.opf
xmlMetadata = contentopf.find(".//{http://www.idpf.org/2007/opf}metadata")
# Prepare Metadata based on Publication.cfg
cfgPublication = configparser.RawConfigParser()
cfgPublication.read(os.getcwd() + "/CONVERT/publication.cfg")
# Prepare Author String
# Built from Author1..Author4: each non-empty later entry overwrites the string
# with a longer "A, B and C" style list.
# NOTE(review): the original indentation is lost; the checks are reproduced as
# independent (not nested) statements - confirm.
strAuthorString = cfgPublication.get("Authors", "Author1")
if cfgPublication.get("Authors", "Author2") != "":
    strAuthorString = cfgPublication.get("Authors", "Author1") + " and " + cfgPublication.get("Authors", "Author2")
if cfgPublication.get("Authors", "Author3") != "":
    strAuthorString = cfgPublication.get("Authors", "Author1") + ", " + cfgPublication.get("Authors", "Author2") + " and " + cfgPublication.get("Authors", "Author3")
if cfgPublication.get("Authors", "Author4") != "":
    strAuthorString = cfgPublication.get("Authors", "Author1") + ", " + cfgPublication.get("Authors", "Author2") + ", " + cfgPublication.get("Authors", "Author3") + " and " + cfgPublication.get("Authors", "Author4")
xmlAuthor = etree.Element("{http://purl.org/dc/elements/1.1/}creator")
xmlAuthor.text = strAuthorString
xmlMetadata.append(xmlAuthor)
# Prepare Title-String
strTitleString = cfgPublication.get("Technical", "Title")
xmlTitle = etree.Element("{http://purl.org/dc/elements/1.1/}title")
xmlTitle.text = strTitleString
xmlMetadata.append(xmlTitle)
# Prepare Description via Subtitle
# The description element is only written when a subtitle is configured.
strSubtitleString = cfgPublication.get("Technical", "Subtitle")
if strSubtitleString != "":
    xmlSubtitle = etree.Element("{http://purl.org/dc/elements/1.1/}description")
    xmlSubtitle.text = strSubtitleString
    xmlMetadata.append(xmlSubtitle)
# Prepare Identifier
# "MPIWG:" followed by series and number; the element gets the id "BookId"
strIdentifier = "MPIWG:" + cfgPublication.get("Technical", "Serie") + cfgPublication.get("Technical", "Number")
xmlIdentifier = etree.Element("{http://purl.org/dc/elements/1.1/}identifier")
xmlIdentifier.text = strIdentifier
xmlIdentifier.set("id", "BookId")
xmlMetadata.append(xmlIdentifier)
# Prepare Type
xmlType = etree.Element("{http://purl.org/dc/elements/1.1/}type")
xmlType.text = "Text"
xmlMetadata.append(xmlType)
#Prepare Date
strPublicationDate = cfgPublication.get("Technical", "PublicationDate")
xmlDate = etree.Element("{http://purl.org/dc/elements/1.1/}date")
xmlDate.text = strPublicationDate
xmlDate.set("{http://www.idpf.org/2007/opf}event", "creation")
xmlMetadata.append(xmlDate)
# Prepare Publisher
xmlPublisher = etree.Element("{http://purl.org/dc/elements/1.1/}publisher")
xmlPublisher.text = "Edition Open Access"
xmlMetadata.append(xmlPublisher)
# Prepare Rights
# (the variable name xmlPublisher is reused here for the rights element)
xmlPublisher = etree.Element("{http://purl.org/dc/elements/1.1/}rights")
xmlPublisher.text = "Published under Creative Commons by-nc-sa 3.0 Germany Licence"
xmlMetadata.append(xmlPublisher)
# Prepare Source
xmlSource = etree.Element("{http://purl.org/dc/elements/1.1/}source")
xmlSource.text = "Max Planck Research Library for the History and Development of Knowledge"
xmlMetadata.append(xmlSource)
# Prepare Subject
# Only Keyword1 is used as subject
strSubject = cfgPublication.get("General", "Keyword1")
xmlSubject = etree.Element("{http://purl.org/dc/elements/1.1/}subject")
xmlSubject.text = strSubject
xmlMetadata.append(xmlSubject)
# Prepare Language
# strLanguage is used again further down to pick the figure caption prefix
strLanguage = cfgPublication.get("Technical", "Language")
xmlLanguage = etree.Element("{http://purl.org/dc/elements/1.1/}language")
xmlLanguage.text = strLanguage
xmlMetadata.append(xmlLanguage)
#Prepare Cover
# <meta name="cover"> refers to the manifest item with id "cover_pic" added below
xmlCover = etree.Element("meta")
xmlCover.set("content", "cover_pic")
xmlCover.set("name", "cover")
xmlMetadata.append(xmlCover)
xmlManifest = contentopf.find(".//{http://www.idpf.org/2007/opf}manifest")
xmlItem = etree.Element("item")
xmlItem.set("id", "cover_pic")
xmlItem.set("href", "images/cover.jpg")
xmlItem.set("media-type", "image/jpeg")
xmlManifest.append(xmlItem)
# The cover image is taken from CONVERT/cover.jpg in the working directory
shutil.copy(os.getcwd() + "/CONVERT/cover.jpg", os.getcwd() + "/CONVERT/epub/OEBPS/images/")
# Second manifest item: the XHTML page for the cover, copied from the templates
xmlItem = etree.Element("item")
xmlItem.set("id", "cover")
xmlItem.set("href", "cover.xhtml")
xmlItem.set("media-type", "application/xhtml+xml")
xmlManifest.append(xmlItem)
shutil.copy(EPUB_FILES + "epubcover.xhtml", os.getcwd() + "/CONVERT/epub/OEBPS/cover.xhtml")
print("-------------------")
print("Preparing intro.xhtml")
print("-------------------")
# The series "Sources" has its own intro template.
if cfgPublication.get("Technical", "Serie") == "Sources":
    tmpFilePath = EPUB_FILES + "epubintro-sources.xhtml"
else:
    tmpFilePath = EPUB_FILES + "epubintro.xhtml"
with open(tmpFilePath, "r") as tmpFile:
    strIntroHTML = tmpFile.read()

# Fill the placeholders of the template.  Plain str.replace is used because
# the placeholders are literal words, while the replacement text comes from
# the configuration and must not be interpreted as a regex template (a
# backslash in a title would otherwise raise or be mangled).
# The substitutions are applied one after another, in this order.
strIntroHTML = strIntroHTML.replace("author", strAuthorString)
strIntroHTML = strIntroHTML.replace("TITLE", strTitleString)
strIntroHTML = strIntroHTML.replace("year", cfgPublication.get("Technical", "PublicationYear"))
strIntroHTML = strIntroHTML.replace("series", cfgPublication.get("Technical", "Serie"))
strIntroHTML = strIntroHTML.replace("number", cfgPublication.get("Technical", "Number"))

# AdditionalInformation is optional: without it the placeholder is just removed.
try:
    strIntroHTML = strIntroHTML.replace("AdditionalInformation", "<p>" + cfgPublication.get("General", "AdditionalInformation") + "</p>")
except configparser.NoOptionError:
    strIntroHTML = strIntroHTML.replace("AdditionalInformation", "")

tmpFilePath = os.getcwd() + "/CONVERT/epub/OEBPS/intro.xhtml"
# The context manager makes sure the file is flushed and closed.
with open(tmpFilePath, "w") as tmpFile:
    tmpFile.write(strIntroHTML)
print("-------------------")
print("Preparing toc.ncx")
print("-------------------")
# Fill the head of toc.ncx: unique id, document title and document author.
xmlHead = tocncx.find("//{http://www.daisy.org/z3986/2005/ncx/}head")
# dtb:uid carries the same "MPIWG:<series><number>" string as the identifier in content.opf
xmlMeta = etree.Element("meta")
xmlMeta.set("name", "dtb:uid")
xmlMeta.set("content", "MPIWG:" + cfgPublication.get("Technical", "Serie") + cfgPublication.get("Technical", "Number"))
xmlHead.append(xmlMeta)
# <docTitle><text>title</text></docTitle>
xmlTitle = tocncx.find("//{http://www.daisy.org/z3986/2005/ncx/}docTitle")
xmlText = etree.Element("text")
xmlText.text = strTitleString
xmlTitle.append(xmlText)
# <docAuthor><text>author string</text></docAuthor>
xmlAuthor = tocncx.find("//{http://www.daisy.org/z3986/2005/ncx/}docAuthor")
xmlText = etree.Element("text")
xmlText.text = strAuthorString
xmlAuthor.append(xmlText)
##############################################################
#                 Convert Tralics-XML to Epub                #
##############################################################
#xmlTree = remove_processinginstruction(xmlTree, 'hyperimage')
# Copy xmlTree to xmlEbookTree
# All conversions below work on this copy; xmlTree itself is not modified here.
xmlEbookTree = deepcopy(xmlTree)
# xmlChapters is a list containing all chapters
xmlChapters = xmlEbookTree.findall("//div1")
# Convert Chapters, Sections, Subsections and Subsubsections to h1, h2, h3, h4
# Insert Number from Dictionary where needed
print("-----------------------------------------------------")
print("Convert EOAChapter to H1")
for xmlChapter in xmlChapters:
    # The <head> child becomes the <h1> of the chapter
    xmlChapter.find("head").tag = "h1"
    # Numbered chapters get "<number>. " in front of the headline
    if xmlChapter.get("rend") != "nonumber":
        idChapter = xmlChapter.get("id")
        # print(idChapter + " konvertierung into h1")
        # print(dictChapters[idChapter])
        strHeadline = xmlChapter.find("h1").text or ""
        xmlChapter.find("h1").text = str(dictChapters[idChapter]) + ". " + strHeadline
    # A chapter author is shown as an italic paragraph inserted at child position 1
    if xmlChapter.find(".//EOAauthor") is not None:
        tmpXML = etree.Element("p")
        tmpXML.append(etree.Element("i"))
        tmpXML[0].text = xmlChapter.find(".//EOAauthor").text
        xmlChapter.insert(1, tmpXML)
        # Remove unwanted EOAauthor here
        xmlChapter.find(".//EOAauthor").text = ""
    # NOTE(review): the loop variable is rebound to the return value of
    # strip_tags and not used again in this iteration; the nesting of this
    # line is reconstructed - confirm.
    xmlChapter = etree.strip_tags(xmlChapter, "EOAauthor")
# print(dictSections)
print("-----------------------------------------------------")
print("Convert EOAsection to H2")
xmlSections = xmlEbookTree.findall(".//div2")
for xmlSection in xmlSections:
    xmlSection.find("head").tag = "h2"
    # Numbered sections get their number from dictSections, separated by a space
    if xmlSection.get("rend") != "nonumber":
        idSection = xmlSection.get("id")
        strHeadline = xmlSection.find("h2").text or ""
        print(strHeadline)
        xmlSection.find("h2").text = str(dictSections[idSection]) + " " + strHeadline
print("-----------------------------------------------------")
print("Convert EOAsubsection to H3")
xmlSubsections = xmlEbookTree.findall(".//div3")
for xmlSubsection in xmlSubsections:
    xmlSubsection.find("head").tag = "h3"
    # Subsection numbers are looked up in dictSections as well
    if xmlSubsection.get("rend") != "nonumber":
        idSection = xmlSubsection.get("id")
        strHeadline = xmlSubsection.find("h3").text or ""
        print(strHeadline)
        xmlSubsection.find("h3").text = str(dictSections[idSection]) + " " + strHeadline
print("-----------------------------------------------------")
print("Convert EOAsubsubsection to H4")
xmlSubsubsections = xmlEbookTree.findall(".//div4")
for xmlSubsubsection in xmlSubsubsections:
    # Subsubsections are only retagged; numbering is disabled (see commented code)
    xmlSubsubsection.find("head").tag = "h4"
    #if xmlSubsubsection.get("rend") != "nonumber":
        #idSection = xmlSubsection.get("id")
        #strHeadline = xmlSubsection.find("h4").text
        #xmlSubsection.find("h3").text = str(dictSections[idSection]) + " " + strHeadline
print("-----------------------------------------------------")
print("Convert EOAparagraph to H5")
xmlParagraphs = xmlEbookTree.findall(".//div5")
for xmlParagraph in xmlParagraphs:
    print("Found a paragraph.")
    xmlParagraph.find("head").tag = "h5"
print("-----------------------------------------------------")
print("Preparing Figures")
# Numbered figures that are not hyperimage-only: copy the image into the EPUB,
# register it in content.opf and replace the figure element by <p><img/></p>,
# followed by a caption paragraph unless --nocaption was given.
xmlFigures = xmlEbookTree.xpath(".//EOAfigure[not(@type='hionly')] | .//EOAlsfigure[not(@type='hionly')]")
for xmlFigure in xmlFigures:
    # Copy File of the Image
    # If it's in a subfolder, name of folder and name of image will be merged
    strImageFileString = xmlFigure.find(".//file").text
    strImageFileString = strImageFileString.rstrip("\n")
    strImageFileDir = os.path.dirname(strImageFileString)
    # Remove / from path
    strImageFileDir = re.sub("/", "", strImageFileDir)
    strImageFileName = os.path.basename(strImageFileString)
    strImageFileNamewoSuffix, strImageFileName_Suffix = os.path.splitext(strImageFileName)
    shutil.copy(os.getcwd() + "/" + strImageFileString, os.getcwd() + "/CONVERT/epub/OEBPS/images/" + strImageFileDir + strImageFileName)

    # jpg stays jpg; png and pdf sources are referenced as png in the EPUB
    if strImageFileName_Suffix.lower() == ".jpg":
        extension_and_mime = "jpg"
    elif strImageFileName_Suffix.lower() in [".png", ".pdf"]:
        extension_and_mime = "png"
    else:
        print("Found an unrecognized image suffix: %s" % strImageFileName_Suffix)
        sys.exit()

    strImageFilepath = libeoaconvert.sanitizeImage(os.getcwd() + "/CONVERT/epub/OEBPS/images/" + strImageFileDir + strImageFileName, GM_PATH, TL_PATH)

    # Add copied file to contentopf
    # hrefs inside the EPUB always use "/", independent of the local path separator
    content_opf_filename = "images/" + "{}{}.{}".format(strImageFileDir, strImageFileNamewoSuffix, extension_and_mime)
    content_opf_fileid = "{}{}{}".format(strImageFileDir, strImageFileNamewoSuffix, extension_and_mime)
    contentopf = addToContentopf(contentopf, content_opf_filename, content_opf_fileid, extension_and_mime)

    idFigure = xmlFigure.find(".//anchor").get("id")
    intFigureNumber = dictFigures[idFigure]

    # Landscape figures always use the full width
    if xmlFigure.tag == "EOAfigure":
        strImageWidth = xmlFigure.find(".//width").text
        strImageWidth = strImageWidth.rstrip("\n")
    if xmlFigure.tag == "EOAlsfigure":
        strImageWidth = "100"

    # The caption has to be fetched BEFORE clear(): clear() removes all
    # children, so a lookup afterwards can never find it.
    xmlFigureCaption = None
    if not args.nocaption:
        xmlFigureCaption = xmlFigure.find(".//caption")

    xmlFigure.clear()
    xmlFigure.tag = "p"
    xmlFigureImage = etree.Element("img")
    xmlFigureImage.set("src", "images/" + strImageFileDir + strImageFileNamewoSuffix + "." + extension_and_mime)
    xmlFigureImage.set("alt", "")
    xmlFigureImage.set("style", "width: " + strImageWidth + "%")
    xmlFigure.append(xmlFigureImage)

    # A figure without caption element simply gets no caption paragraph
    if xmlFigureCaption is not None:
        xmlFigureCaption.tag = "p"
        strFigureCaption = xmlFigureCaption.text or ""
        # FIX
        xmlFigureCaption.text = lang_dict["fig"][strLanguage] + " " + str(intFigureNumber) + ": " + strFigureCaption
        xmlFigure.addnext(xmlFigureCaption)
    # Change the tag of the parent <p>-Tag to <div> so that it may be removed
    #xmlFigure.getparent().tag = "div"

# Hyperimage-only figures are not converted, only renamed
xml_figures_hyperimage = xmlEbookTree.xpath(".//EOAfigure[@type='hionly'] | .//EOAlsfigure[@type='hionly']")
logging.debug("found %s hyperimage figures" % len(xml_figures_hyperimage))
for fig in xml_figures_hyperimage:
    fig.tag = "EOAhifigure"
print("-----------------------------------------------------")
print("Preparing not numbered Figures")
# Unnumbered figures are replaced by <p><img/></p>; no caption is produced.
xmlFigures = xmlEbookTree.findall(".//EOAfigurenonumber")
for xmlFigure in xmlFigures:
    # Copy File of the Image
    # If it's in a subfolder, name of folder and name of image will be merged
    strImageFileString = xmlFigure.find(".//file").text
    strImageFileString = strImageFileString.rstrip("\n")
    strImageFileDir = os.path.dirname(strImageFileString)
    strImageFileDir = re.sub("/", "", strImageFileDir)
    strImageFileName = os.path.basename(strImageFileString)
    strImageFileNamewoSuffix = os.path.splitext(strImageFileName)[0]
    shutil.copy(os.getcwd() + "/" + strImageFileString, os.getcwd() + "/CONVERT/epub/OEBPS/images/" + strImageFileDir + strImageFileName)
    strImageFilepath = libeoaconvert.sanitizeImage(os.getcwd() + "/CONVERT/epub/OEBPS/images/" + strImageFileDir + strImageFileName, GM_PATH, TL_PATH)
    # Add copied file to contentopf
    # NOTE(review): href, id and media type are hard-coded to jpg here,
    # whatever suffix the source file has - confirm this is intended.
    contentopf = addToContentopf(contentopf, "images/" + strImageFileDir + strImageFileNamewoSuffix + ".jpg", strImageFileDir + strImageFileNamewoSuffix + "-nonumber-jpg", "jpg")
    logging.debug("Added a nonumber figure")
    # The width has to be read before clear() removes the children
    strImageWidth = xmlFigure.find(".//width").text
    strImageWidth = strImageWidth.rstrip("\n")
    xmlFigure.clear()
    xmlFigure.tag = "p"
    xmlFigureImage = etree.Element("img")
    xmlFigureImage.set("src", "images/" + strImageFileDir + strImageFileNamewoSuffix + ".jpg")
    xmlFigureImage.set("alt", "")
    xmlFigureImage.set("style", "width: " + strImageWidth + "%")
    xmlFigure.append(xmlFigureImage)
print("-----------------------------------------------------")
print("Preparing Footnotes")
def alph_footnote_index(fndex):
    """
    lowercase Latin footnotes need to support more than 26 values
    These are zero-indexed.

    After 'z' the labels continue with 'aa', 'ab', ... 'zz', 'aaa', ...

    Raises ValueError for a negative index (the computation would otherwise
    never terminate).

    >>> alph_footnote_index(0)
    'a'
    >>> alph_footnote_index(1)
    'b'
    >>> alph_footnote_index(24)
    'y'
    >>> alph_footnote_index(25)
    'z'
    >>> alph_footnote_index(26)
    'aa'
    >>> alph_footnote_index(27)
    'ab'
    """
    if fndex < 0:
        raise ValueError("footnote index must not be negative: %r" % (fndex,))
    alphabet = "abcdefghijklmnopqrstuvwxyz"
    letters = []
    remaining = fndex
    while True:
        remaining, position = divmod(remaining, len(alphabet))
        letters.append(alphabet[position])
        if not remaining:
            break
        # there is no "zero" letter: a quotient of 1 stands for a leading 'a'
        remaining -= 1
    # letters were collected from the last one to the first one
    return "".join(reversed(letters))
# def alph_footnote_index ends here
def replace_footnote_equations(footnote):
    """
    captures reusable behavior from the existing code
    potentially, some of the old code could be replaced by calls to this helper

    Every EOAequationnonumber below ``footnote`` is turned into a <p> holding
    an <img> that points to images/<filename>; the image file is copied from
    items/ into the EPUB image directory and registered in content.opf.

    usage: contentopf = replace_footnote_equations(my_footnote)

    unfortunately, returning the result seemed like a better idea than mutating the global variable
    """
    # starts from the module-level contentopf; the updated tree is returned
    result = contentopf
    for equation in footnote.findall(".//EOAequationnonumber"):
        # the filename has to be read before clear() wipes the attributes
        filename = equation.get("filename")
        equation.clear()
        equation.tag = "p"
        img = etree.Element("img", src="images/%s" % filename, alt="")
        equation.append(img)
        cwd = os.getcwd()
        # The EPUB content directory is OEBPS (as created and used everywhere
        # else in this script), not "DEBPS".
        shutil.copy("%s/items/%s" % (cwd, filename), "%s/CONVERT/epub/OEBPS/images/%s" % (cwd, filename))
        result = addToContentopf(result, "images/" + filename, filename, "png")
    print("einmal durch replace_footnote_equations")
    return result
# def replace_footnote_equations ends here
def replace_footnote_with_sup(note):
    """
    Turn a footnote element into an empty <sup> element, in place.

    Children, text and attributes of the note are discarded by clear();
    only the tail (the text following the note) is carried over.
    Nothing is returned.
    """
    text_after_note = note.tail
    note.tag = "sup"
    # clear() also resets the tail, hence the save/restore around it
    note.clear()
    note.tail = text_after_note
# def replace_footnote_with_sup ends here
def bring_footnote_down_epub(footnote, footnote_name, destination):
    """
    Move the content of a footnote into ``destination`` and leave a
    "[<a>name</a>]" reference behind in the running text.

    The footnote element itself becomes a <sup> holding a link to
    "#fn<name>" (with id "body_fn-ref<name>").  The former text of the
    footnote is appended to the end of ``destination`` and its former
    children are moved there.

    usage: contentopf = bring_footnote_down_epub(my_footnote, "1", xmlNewFootnotes)

    The content.opf tree returned by replace_footnote_equations is handed
    back to the caller instead of mutating the global variable.
    """
    contentopf = replace_footnote_equations(footnote) # see usage note
    children = list(footnote)

    # Footnotes that start with text are left untouched at this point.
    if footnote.text is None:
        if children:
            # The footnote starts with a child: existing text of that child
            # gets a leading space, missing text becomes the empty string.
            lead = children[0]
            lead.text = "" if lead.text is None else " " + lead.text
        else:
            # a completely empty footnote only consists of its identifier
            footnote.text = "[%s]" % footnote_name
    carried_text = footnote.text or ""

    # What remains in the running text: <sup>[<a href="#fn..">name</a>]</sup>
    replace_footnote_with_sup(footnote)
    footnote.text = "["
    reference = etree.SubElement(footnote, "a")
    reference.set("href", "#fn" + footnote_name)
    reference.set("id", "body_fn-ref" + footnote_name)
    reference.text = "%s" % footnote_name
    reference.tail = "]"

    # The former footnote text goes to the very end of the destination:
    # behind the last child if there is one, otherwise onto its text.
    # Existing text there is separated from the new text by a space.
    dest_children = list(destination)
    if dest_children:
        anchor = dest_children[-1]
        lead_in = "" if anchor.tail is None else anchor.tail + " "
        anchor.tail = lead_in + carried_text
    else:
        lead_in = "" if destination.text is None else destination.text + " "
        destination.text = lead_in + carried_text

    # finally the former children of the footnote follow
    destination.extend(children)
    return contentopf
# def bring_footnote_down_epub ends here
class FootnoteError(Exception):
    """
    Signals that a chapter mixes footnote styles.

    Only one type of footnote is supported per chapter, so the styles
    must not be combined within a chapter.
    """
# class FootnoteError ends here
# Collect the footnotes of every chapter in a <div> appended to the end of the
# chapter and replace each footnote in the text by a "[n]" link.
# NOTE(review): the original indentation is lost; the nesting below is
# reconstructed from the control flow - confirm against the repository.
intTechnicalChapterNumber = 1
for xmlChapter in xmlChapters:
    # groupings: pairs of (grouping name, notes) for the \EOAfnalph style footnotes
    groupings = libeoaconvert.get_bigfoot_data(xmlChapter)
    # old style footnotes are the <note> elements of the chapter
    xmlFootnotes = list(xmlChapter.findall(".//note"))
    print("here come the footnotes. found", len(xmlFootnotes))
    has_old = 0 != len(xmlFootnotes)
    has_new = 0 != len(
        [ # flatten the association list whose values are lists, so we can take the length
            note
            for grouping, notes in groupings
            for note in notes
        ]
    )
    # the XOR case falls through, the AND is an error, and the NOR skips to the next chapter
    if has_old:
        if has_new:
            raise FootnoteError("Chapter %s contains both \\EOAfn and footnotes in the style of \\EOAfnalph" % xmlChapter.get("id-text"))
    else:
        if not has_new:
            # NOTE: skipping here also skips the chapter counter increment at the end of the loop
            continue
    # Container for all footnotes of the chapter, headed by a localized <h3>
    xmlNewFootnotes = etree.Element("div")
    xmlNewFootnotesHeader = etree.Element("h3")
    xmlNewFootnotesHeader.text = libeoaconvert.dictLangFootnotes[libeoaconvert.two_letter_language(xmlChapter.get("language"))]
    xmlNewFootnotes.append(xmlNewFootnotesHeader)
    for grouping, notes in groupings:
        # do for the new-style footnotes what was being done for the old
        for index, note in enumerate(notes):
            # numbering restarts for every grouping; "lower-latin" uses letters
            footnote_name = str(index + 1)
            if "lower-latin" == grouping:
                footnote_name = alph_footnote_index(index)
            # paragraph at the bottom: "[<a id="fn..">name</a>]" followed by the footnote content
            para = etree.Element("p")
            para.text = "["
            note_link = etree.SubElement(para, "a")
            note_link.set("id", "fn" + footnote_name)
            note_link.set("href", "#body_fn-ref" + footnote_name)
            note_link.text = "%s" % footnote_name
            note_link.tail = "]"
            contentopf = bring_footnote_down_epub(note, footnote_name, para)
            xmlNewFootnotes.append(para)
    tmpFileName = "chapter" + (str(intTechnicalChapterNumber)) + ".xhtml"
    # old style footnotes are numbered per chapter, starting at 1
    intFootnoteNumber = 1
    for xmlFootnote in xmlFootnotes:
        # Not numbered Equations may appear in a footnote, need to be treated differently
        xmlEquationsnonumber = xmlFootnote.findall(".//EOAequationnonumber")
        for xmlEquationnonumber in xmlEquationsnonumber:
            # same treatment as in replace_footnote_equations: <p><img/></p> plus copy and registration
            strFilename = xmlEquationnonumber.get("filename")
            xmlEquationnonumber.clear()
            xmlEquationnonumber.tag = "p"
            xmlIMG = etree.Element("img", src="images/"+ strFilename, alt="")
            xmlEquationnonumber.append(xmlIMG)
            shutil.copy(os.getcwd() + "/items/" + strFilename, os.getcwd() + "/CONVERT/epub/OEBPS/images/" + strFilename)
            contentopf = addToContentopf(contentopf, "images/" + strFilename, strFilename, "png")
        # ids of the footnote at the bottom and of its reference in the text
        tmp_fnstring = "fn" + str(intFootnoteNumber)
        tmp_fnrefstring = "body_fn-ref" + str(intFootnoteNumber)
        # a footnote without any child element raises IndexError here
        xmlFirstChild = xmlFootnote.getchildren()[0]
        # this is for the reference text
        # "[<a>n</a>] " is put in front of the content of the first child
        if xmlFirstChild.text is None:
            xmlNewFootnoteRefBottom = etree.SubElement(xmlFirstChild, "a", href = "#" + tmp_fnrefstring, id = tmp_fnstring)
            xmlFirstChild.text = "["
            xmlNewFootnoteRefBottom.text = str(intFootnoteNumber)
            xmlNewFootnoteRefBottom.tail = "] "
            # SubElement appended the link at the end; insert(0, ...) moves it to the front
            xmlFirstChild.insert(0, xmlNewFootnoteRefBottom)
        else:
            xmlNewFootnoteRefBottom = etree.Element("a", href = "#" + tmp_fnrefstring, id = tmp_fnstring)
            xmlNewFootnoteRefBottom.text = str(intFootnoteNumber)
            # the former leading text of the child continues behind the link
            beginning_of_footnote = xmlFirstChild.text
            xmlFirstChild.text = "["
            xmlNewFootnoteRefBottom.tail = "] " + beginning_of_footnote
            xmlFirstChild.insert(0, xmlNewFootnoteRefBottom)
        #Preserve tail and children of current <note>-Tag
        xmlFootnoteContentsTail = xmlFootnote.tail
        xmlFootnoteChildren = xmlFootnote.getchildren()
        # Substitute current <note> with Number
        xmlFootnote.clear()
        xmlFootnote.tag = "sup"
        xmlFootnote.text = "["
        xmlFootnote.tail = xmlFootnoteContentsTail
        xmlNewFootnoteRef = etree.SubElement(xmlFootnote, "a", href = "#" + tmp_fnstring, id = tmp_fnrefstring)
        xmlNewFootnoteRef.text = str(intFootnoteNumber)
        xmlNewFootnoteRef.tail = "]"
        # the former children of the note move to the footnote container
        if len(xmlFootnoteChildren) != 0:
            for xmlFootnoteChild in xmlFootnoteChildren:
                xmlNewFootnotes.append(xmlFootnoteChild)
        intFootnoteNumber += 1
    xmlChapter.append(xmlNewFootnotes)
    intTechnicalChapterNumber += 1
print("-----------------------------------------------------")
print("Preparing Lists")
# Turn <list type="ordered|simple"> into <ol>/<ul> and their <item>s into <li>.
# Description lists are skipped here; they keep the tag "list" and are handled
# by the "Preparing Descriptions" pass further down.
for xmlChapter in xmlChapters:
    for xmlList in xmlChapter.findall(".//list"):
        strListType = xmlList.get("type")
        if strListType == "description":
            continue
        if strListType == "ordered":
            xmlList.tag = "ol"
            # Search inside *this* list. The former path "..//item" started at
            # the parent element and could therefore pick up the first item of
            # an earlier sibling list, yielding a wrong start number.
            xmlFirstItem = xmlList.find(".//item")
            if xmlFirstItem is None:
                # Items of a nested list were already renamed to <li> while the
                # enclosing list was processed (findall returns document order).
                xmlFirstItem = xmlList.find(".//li")
            listnumber = None
            if xmlFirstItem is not None:
                listnumber = xmlFirstItem.get("id-text")
            # Without a usable number the reader falls back to the default start.
            if listnumber is not None:
                xmlList.set("start", listnumber)
        elif strListType == "simple":
            xmlList.tag = "ul"
        for xmlListItem in xmlList.findall(".//item"):
            xmlListItem.tag = "li"
print("-----------------------------------------------------")
print("Preparing Descriptions")
# Whatever still carries the tag <list> at this point is rewritten as a
# definition list: <list> -> <dl>, <label> -> <dt>, <item> -> <dd>.
for xmlChapter in xmlChapters:
    for xmlDefinitionList in xmlChapter.findall(".//list"):
        xmlDefinitionList.tag = "dl"
        del xmlDefinitionList.attrib["type"]
        for xmlEntry in xmlDefinitionList.iterchildren():
            if xmlEntry.tag == "label":
                xmlEntry.tag = "dt"
            elif xmlEntry.tag == "item":
                xmlEntry.tag = "dd"
                # Items carry numbering attributes that have no place on <dd>
                for strAttributeName in ("id", "id-text"):
                    del xmlEntry.attrib[strAttributeName]
print("-----------------------------------------------------")
print("Preparing Blockquotes")
# <p rend="quoted">…</p> becomes <blockquote><p>…</p></blockquote>.
# clear() wipes text, tail, attributes and children of the outer element, so
# all of them are saved first and re-attached to the new inner paragraph.
for xmlQuoted in xmlEbookTree.findall(".//p"):
    if xmlQuoted.get("rend") != "quoted":
        continue
    strQuotedText, strQuotedTail = xmlQuoted.text, xmlQuoted.tail
    listQuotedChildren = list(xmlQuoted)
    xmlQuoted.clear()
    xmlQuoted.tag = "blockquote"
    xmlInnerParagraph = etree.SubElement(xmlQuoted, "p")
    xmlInnerParagraph.text = strQuotedText
    xmlInnerParagraph.extend(listQuotedChildren)
    # The former tail of the quoted paragraph ends up on the inner <p>
    xmlInnerParagraph.tail = strQuotedTail
print("-----------------------------------------------------")
print("Preparing Theorems")
# A <theorem> becomes a single <p>: its <head> turns into <b>"title number"</b>
# and the paragraphs it contained are unwrapped into that <p>.
for xmlChapter in xmlChapters:
    for xmlTheorem in xmlChapter.findall(".//theorem"):
        xmlTheoremHead = xmlTheorem.find(".//head")
        strTheoremNumber = xmlTheorem.get("id-text")
        xmlTheorem.tag = "p"
        xmlTheoremHead.tag = "b"
        # Tolerate an empty head or a missing number instead of raising
        # TypeError on None + str.
        xmlTheoremHead.text = (xmlTheoremHead.text or "") + " " + (strTheoremNumber or "")
        # Drop the theorem bookkeeping attributes; a missing one is not an error
        for strAttributeName in ("style", "type", "id-text", "id"):
            xmlTheorem.attrib.pop(strAttributeName, None)
        # strip_tags never removes the element it is called on, so only the
        # nested paragraphs are unwrapped, not the theorem <p> itself.
        etree.strip_tags(xmlTheorem, "p")
print("-----------------------------------------------------")
print("Preparing Hyperlinks")
# <xref url="…"> becomes <a href="…">; the visible link text is the URL itself.
for xmlChapter in xmlChapters:
    for xmlLink in xmlChapter.findall(".//xref"):
        strURL = xmlLink.get("url")
        # Scheme-less URLs are assumed to be plain http
        if not strURL.startswith(("http://", "https://")):
            strURL = "http://" + strURL
        xmlLink.tag = "a"
        del xmlLink.attrib["url"]
        xmlLink.set("href", strURL)
        # Line-break hints inside the link are dropped together with their tails
        etree.strip_elements(xmlLink, "allowbreak", with_tail=True)
        xmlLink.text = strURL
print("-----------------------------------------------------")
print("Convert emphasized text")
# <hi rend="it"> -> <em>; other <hi> variants are left for later passes.
for xmlChapter in xmlChapters:
    for xmlHighlight in xmlChapter.findall(".//hi"):
        if xmlHighlight.get("rend") != "it":
            continue
        xmlHighlight.tag = "em"
        del xmlHighlight.attrib["rend"]
print("-----------------------------------------------------")
print("Convert bold text")
# <hi rend="bold"> -> <b>; every other <hi> is skipped.
for xmlChapter in xmlChapters:
    for xmlHighlight in xmlChapter.findall(".//hi"):
        if xmlHighlight.get("rend") != "bold":
            continue
        xmlHighlight.tag = "b"
        del xmlHighlight.attrib["rend"]
print("-----------------------------------------------------")
print("Convert EOAup to <sup>")
# Superscripts only need their tag renamed.
for xmlChapter in xmlChapters:
    for xmlSuperscript in xmlChapter.findall(".//EOAup"):
        xmlSuperscript.tag = "sup"
print("-----------------------------------------------------")
print("Convert EOAdown to <sub>")
# Subscripts only need their tag renamed.
for xmlChapter in xmlChapters:
    for xmlSubscript in xmlChapter.findall(".//EOAdown"):
        xmlSubscript.tag = "sub"
print("-----------------------------------------------------")
print("Convert EOAst to <span>")
# Struck-out text is expressed as an inline-styled <span>.
for xmlChapter in xmlChapters:
    for xmlStruck in xmlChapter.findall(".//EOAst"):
        xmlStruck.tag = "span"
        xmlStruck.set("style", "text-decoration: line-through;")
print("-----------------------------------------------------")
print("Convert EOAls to something nice")
# Letter-spaced text is expressed as an inline-styled <span>.
for xmlChapter in xmlChapters:
    for xmlSpaced in xmlChapter.findall(".//EOAls"):
        xmlSpaced.tag = "span"
        xmlSpaced.set("style", "letter-spacing: 0.5em;")
print("-----------------------------------------------------")
print("Convert EOAcaps to something nice")
# Small caps are expressed as an inline-styled <span>.
for xmlChapter in xmlChapters:
    for xmlSmallCaps in xmlChapter.findall(".//EOAcaps"):
        xmlSmallCaps.tag = "span"
        xmlSmallCaps.set("style", "font-variant:small-caps;")
print("-----------------------------------------------------")
print("Convert EOAineq into appropriate IMG-Tags")
# Inline equations become <img>: the TeX source moves into "alt", the rendered
# image is copied into the ePub and registered in content.opf.
for xmlChapter in xmlChapters:
    for xmlInlineEquation in xmlChapter.findall(".//EOAineq"):
        xmlInlineEquation.tag = "img"
        xmlInlineEquation.set("alt", xmlInlineEquation.get("TeX"))
        del xmlInlineEquation.attrib["TeX"]
        strEquationImage = xmlInlineEquation.get("src")
        strEquationHref = "images/" + strEquationImage
        shutil.copy(
            os.getcwd() + "/items/" + strEquationImage,
            os.getcwd() + "/CONVERT/epub/OEBPS/" + strEquationHref,
        )
        xmlInlineEquation.set("src", strEquationHref)
        # The href (including "images/") doubles as the manifest id here
        contentopf = addToContentopf(contentopf, strEquationHref, strEquationHref, "png")
print("-----------------------------------------------------")
print("Convert EOAchem into appropriate IMG-Tags")
# Inline chemical formulas are treated exactly like inline equations: tag
# becomes <img>, TeX source moves to "alt", image is copied and registered.
for xmlChapter in xmlChapters:
    for xml_inline_chem in xmlChapter.findall(".//EOAchem"):
        xml_inline_chem.tag = "img"
        xml_inline_chem.set("alt", xml_inline_chem.get("TeX"))
        del xml_inline_chem.attrib["TeX"]
        chem_image_name = xml_inline_chem.get("src")
        chem_image_href = "images/" + chem_image_name
        shutil.copy(
            os.getcwd() + "/items/" + chem_image_name,
            os.getcwd() + "/CONVERT/epub/OEBPS/" + chem_image_href,
        )
        xml_inline_chem.set("src", chem_image_href)
        # The href (including "images/") doubles as the manifest id here
        contentopf = addToContentopf(contentopf, chem_image_href, chem_image_href, "png")
print("-----------------------------------------------------")
print("Convert EOAinline into appropriate IMG-Tags")
# <EOAinline>path/to/image.png</EOAinline> becomes a small inline <img>:
# the image is copied into the ePub image folder, shrunk to 20x20 with
# GraphicsMagick and registered in content.opf.
for xmlChapter in xmlChapters:
    for xmlInlineElement in xmlChapter.findall(".//EOAinline"):
        xmlInlineElement.tag = "img"
        xmlInlineElement.set("alt", "Too late")
        strInlineElementFilePath = xmlInlineElement.text
        strInlineElementFileName = os.path.basename(strInlineElementFilePath)
        strInlineElementDirName = os.path.dirname(strInlineElementFilePath)
        # Flatten the directory part into the file name, as the facsimile
        # section does: "Images/png_300dpi/A.png" -> "Imagespng_300dpiA.png".
        # Without removing the slashes a nested source directory would point
        # the copy at a sub-directory of images/ that does not exist.
        strFlatImageName = re.sub("/", "", strInlineElementDirName) + strInlineElementFileName
        strNewImagePath = os.getcwd() + "/CONVERT/epub/OEBPS/images/" + strFlatImageName
        shutil.copy(os.getcwd() + "/" + strInlineElementDirName + "/" + strInlineElementFileName, strNewImagePath)
        # Only the configured executable string is shell-split; the image path
        # is passed as one argument so that blanks in it cannot break the call.
        listArguments = shlex.split(GM_PATH) + ["convert", strNewImagePath, "-resize", "20x20", strNewImagePath]
        subprocess.check_output(listArguments, shell=False)
        xmlInlineElement.set("src", "images/" + strFlatImageName)
        # Media type is derived from the file extension rather than hard coded
        extension = strInlineElementFileName.split(".")[-1]
        # The original element text serves as the id in content.opf ...
        contentopf = addToContentopf(contentopf, "images/" + strFlatImageName, xmlInlineElement.text, extension)
        # ... and is blanked afterwards, otherwise readers display it
        # (seen at least in calibre's e-book viewer).
        xmlInlineElement.text = ""
print("-----------------------------------------------------")
print("Preparing Verses")
# An <EOAverse> holding one child per line becomes a single
# <p class="verse"> whose lines are separated by <br/>.
for xmlChapter in xmlChapters:
    xml_verses = xmlChapter.findall(".//EOAverse")
    print(len(xml_verses))
    for xml_verse in xml_verses:
        # Every line except the last one gets a trailing line break
        for xml_verse_line in list(xml_verse)[:-1]:
            etree.SubElement(xml_verse_line, "br")
        etree.strip_tags(xml_verse, "p")
        xml_verse.tag = "p"
        xml_verse.set("class", "verse")
print("-----------------------------------------------------")
print("Preparing Equations")
# Numbered equations: <EOAequation> -> <p><img/></p> followed by a sibling
# <p>(number)</p>. The rendered image is copied and registered in content.opf.
for xmlChapter in xmlChapters:
    for xmlEquation in xmlChapter.findall(".//EOAequation"):
        # Both attributes have to be read before clear() wipes them
        strEquationNumber = xmlEquation.get("number")
        strFilename = xmlEquation.get("filename")
        shutil.copy(os.getcwd() + "/items/" + strFilename, os.getcwd() + "/CONVERT/epub/OEBPS/images/" + strFilename)
        contentopf = addToContentopf(contentopf, "images/" + strFilename, strFilename, "png")
        xmlEquation.clear()
        xmlEquation.tag = "p"
        xmlEquationImage = etree.SubElement(xmlEquation, "img")
        xmlEquationImage.set("src", "images/" + strFilename)
        xmlEquationImage.set("alt", "")
        xmlNumberParagraph = etree.Element("p")
        xmlNumberParagraph.text = "(" + strEquationNumber + ")"
        xmlEquation.addnext(xmlNumberParagraph)
        # Parent tag of Equation should be <div> instead of <p>, so that it may be removed
        #xmlEquation.getparent().tag = "div"
# Unnumbered equations: same treatment, but without the number paragraph.
for xmlChapter in xmlChapters:
    for xmlEquation in xmlChapter.findall(".//EOAequationnonumber"):
        strFilename = xmlEquation.get("filename")
        shutil.copy(os.getcwd() + "/items/" + strFilename, os.getcwd() + "/CONVERT/epub/OEBPS/images/" + strFilename)
        contentopf = addToContentopf(contentopf, "images/" + strFilename, strFilename, "png")
        xmlEquation.clear()
        xmlEquation.tag = "p"
        xmlEquationImage = etree.SubElement(xmlEquation, "img")
        xmlEquationImage.set("src", "images/" + strFilename)
        xmlEquationImage.set("alt", "")
        # Parent tag of Equation should be <div> instead of <p>, so that it may be removed
        #xmlEquation.getparent().tag = "div"
# EOAequationarray not handled so far. However: my solution (renaming
# the div) just makes the element disappear, leaving only its children!
for xmlChapter in xmlChapters:
    for xmlEquationarray in xmlChapter.findall(".//EOAequationarray"):
        xmlEquationarray.tag = "div"
print("-----------------------------------------------------")
print("Preparing Letterheads")
# Each of the four letterhead fields becomes a <p> whose first child is
# rendered as <em>; the block is then framed by two horizontal rules.
for xmlChapter in xmlChapters:
    for xmlLetterhead in xmlChapter.xpath(".//EOAletterhead"):
        for strFieldName in ("Recipient", "Archive", "Additional", "Pages"):
            xmlField = xmlLetterhead.find(".//" + strFieldName)
            xmlField.tag = "p"
            xmlField[0].tag = "em"
        xmlLetterhead.insert(0, etree.Element("hr"))
        # Position 5 is right behind the opening rule plus the four fields
        xmlLetterhead.insert(5, etree.Element("hr"))
print("-----------------------------------------------------")
print("Preparing Transcriptions")
# TODO: May need rework concerning the right Column
# An <EOAtranscripted> block becomes a two-column <table>: one header row and
# one text row. Everything before the first <pagebreak> goes into the left
# cell, everything after it into the right cell.
for xmlChapter in xmlChapters:
    # Facsimile links are not used in the ePub
    etree.strip_elements(xmlChapter, "Facsimilelink")
    for xmlTranscription in xmlChapter.xpath(".//EOAtranscripted"):
        print("Processing Transcription")
        xmlTranscription.tag = "table"
        xmlTranscription.find(".//EOAtranscriptedheader").tag = "tr"
        xmlLeftHeader = xmlTranscription.find(".//Leftheader")
        xmlLeftHeader.tag = "td"
        xmlLeftHeader.set("style", "width: 50%")
        xmlTranscription.find(".//Rightheader").tag = "td"
        xmlTranscriptedtext = xmlTranscription.find(".//EOAtranscriptedtext")
        # Work on a re-parsed copy so that the original element can be
        # emptied and refilled below
        xmlTemp = etree.XML(etree.tostring(xmlTranscriptedtext, encoding="unicode"))
        xmlLeftColumn = etree.Element("td")
        xmlRightColumn = etree.Element("td")
        xmlCurrentColumn = xmlLeftColumn
        for xmlElement in xmlTemp.iterchildren():
            if xmlElement.tag == "pagebreak":
                # Column change: the page break itself is not copied
                xmlCurrentColumn = xmlRightColumn
                print("Spaltenwechsel!")
                continue
            xmlCurrentColumn.append(xmlElement)
        xmlTranscriptedtext.clear()
        xmlTranscriptedtext.tag = "tr"
        xmlTranscriptedtext.set("valign", "top")
        xmlTranscriptedtext.append(xmlLeftColumn)
        xmlTranscriptedtext.append(xmlRightColumn)
print("-----------------------------------------------------")
print("Preparing Tables")
# Converts every <EOAtable> wrapper into XHTML table markup: a caption
# paragraph is placed after the raw <table>, column widths and alignments are
# derived from the <EOAtablecolumns> spec, <row>/<cell> are renamed to
# <tr>/<th>/<td>, and finally the table is moved behind its enclosing element,
# which is then removed.
# NOTE(review): intChapterNumber is incremented but never read in this section.
intChapterNumber = 1
for xmlChapter in xmlChapters:
xmlTables = xmlChapter.findall(".//EOAtable")
for xmlTable in xmlTables:
xmlRawTable = xmlTable.find(".//table")
# An empty caption element yields "" rather than None
strTableCaption = xmlTable.find(".//EOAtablecaption").text or ""
# print("Working on ", strTableCaption)
# The literal caption text "nonumber" marks a table without number and caption
if strTableCaption != "nonumber":
# dictTables is keyed by the text of the table label
intTableNumber = dictTables[xmlTable.find(".//EOAtablelabel").text]
xmlTableCaption = etree.Element("p")
xmlTableCaption.text = str(intTableNumber) + " " + strTableCaption
# NOTE(review): getchildren() returns a list, so this test is presumably
# always true; the loop below simply does nothing for a caption without children
if xmlTable.find(".//EOAtablecaption").getchildren() is not None:
# append() moves the caption's child elements into the new paragraph
for xmlChild in xmlTable.find(".//EOAtablecaption").iterchildren():
xmlTableCaption.append(xmlChild)
xmlRawTable.addnext(xmlTableCaption)
else:
print("Table has no caption")
# Caption and label elements have served their purpose and are removed
xmlTable.find(".//EOAtablecaption").clear()
xmlTable.remove(xmlTable.find(".//EOAtablecaption"))
xmlTable.find(".//EOAtablelabel").clear()
xmlTable.remove(xmlTable.find(".//EOAtablelabel"))
# Analyze Width and Alignment of the Columns
strColumnString = xmlTable.find(".//EOAtablecolumns").text
# Vertical rules in the column spec are irrelevant here
strColumnString = re.sub(r"\|", "", strColumnString)
xmlTable.remove(xmlTable.find(".//EOAtablecolumns"))
# One match per column: alignment letter followed by a width ending in "cm".
# NOTE(review): inside [...] the "|" is a literal character, not an
# alternation; harmless here because the pipes were removed above.
reMatchObjects = re.findall(r'([L|R|C].*?cm)', strColumnString)
intTableWidth = 0
# Index 0 is a placeholder so that column numbers can start at 1
listColumnAlignments = [None]
listColumnWidths = [None]
intNumberOfColumns = 0
for strColumnDefinition in reMatchObjects:
# NOTE(review): rstrip() removes a set of trailing characters, not a suffix;
# since the regex only matches specs ending in "cm", the "mm" call looks
# like a no-op — confirm whether millimetre widths were meant to be supported
strColumnDefinition = strColumnDefinition.rstrip("cm")
strColumnDefinition = strColumnDefinition.rstrip("mm")
strColumnAlignment = strColumnDefinition[0]
if strColumnAlignment == "L":
strColumnAlignment = "left"
if strColumnAlignment == "C":
strColumnAlignment = "center"
if strColumnAlignment == "R":
strColumnAlignment = "right"
listColumnAlignments.append(strColumnAlignment)
# Width in the spec is multiplied by 75 and truncated to an integer
# (presumably pixels per centimetre — TODO confirm)
intColumnWidth = int(float(strColumnDefinition.lstrip("LRC")) * 75)
listColumnWidths.append(intColumnWidth)
intTableWidth += intColumnWidth
intNumberOfColumns += 1
xmlRawTable.set("width", str(intTableWidth)+"px;")
# These attributes must all be present, otherwise del raises KeyError
del xmlRawTable.attrib["rend"]
del xmlRawTable.attrib["id-text"]
del xmlRawTable.attrib["id"]
del xmlRawTable.attrib["place"]
# Figure out and deal with the Header
xmlHeader = xmlRawTable.find(".//row/cell/tableheader")
if xmlHeader is not None:
# The marker element is dropped; its tail becomes the text of the cell
xmlHeader.text = ""
xmlHeader.getparent().text = xmlHeader.tail
xmlHeader.getparent().remove(xmlHeader)
# The first row becomes <tr> and its cells become <th>
xmlFirstRow = xmlRawTable.find(".//row")
xmlFirstRow.tag = "tr"
xmlFirstRowCells = xmlFirstRow.findall(".//cell")
for xmlFirstRowCell in xmlFirstRowCells:
xmlFirstRowCell.tag = "th"
# Now Deal with the rest of the rows
# (rows already renamed to <tr> no longer match ".//row")
xmlTableRows = xmlRawTable.findall(".//row")
for xmlTableRow in xmlTableRows:
xmlTableCells = xmlTableRow.findall(".//cell")
# Column counter, 1-based to match the placeholder at index 0 of the lists
intCurrentColumn = 1
print(listColumnAlignments)
for xmlTableCell in xmlTableCells:
xmlTableCell.tag = "td"
xmlTableCell.set("align",listColumnAlignments[intCurrentColumn])
xmlTableCell.set("style","width: " + str(listColumnWidths[intCurrentColumn]) + "px;")
# Deal with multicolumn
if xmlTableCell.get("cols") is not None:
xmlTableCell.set("colspan", xmlTableCell.get("cols"))
# Wrap around once the counter has run past the number of cells in this row
if intCurrentColumn > len(xmlTableCells):
intCurrentColumn = 1
# Deal with multicolumn again, increase intCurrentColumn by the columns being spanned
elif xmlTableCell.get("cols") is not None:
intCurrentColumn = intCurrentColumn + int(xmlTableCell.get("cols"))
del xmlTableCell.attrib["cols"]
else:
intCurrentColumn += 1
xmlTableRow.tag = "tr"
xmlTableRow.set("valign", "top")
# Move the table behind its enclosing element and drop that element
xmlTableParent = xmlTable.getparent()
xmlTableParent.addnext(xmlTable)
# Serialized parent, only needed by the debugging call below
xtp = etree.tostring(xmlTableParent)
# libeoaconvert.deb_var(xtp)
xmlTableParent.getparent().remove(xmlTableParent)
print("Finished with that table.")
intChapterNumber += 1
print("-----------------------------------------------------")
print("Preparing Facsimiles")
# Every <EOAfacsimilepage> is replaced by an <img> pointing at a copy of the
# scan inside the ePub image folder; the copy is registered in content.opf.
for xmlPart in xmlEbookTree.findall(".//div0"):
    for xmlFacsimile in xmlPart.findall(".//EOAfacsimilepage"):
        strImageFile = xmlFacsimile.find(".//file").text
        strFacsimileLabel = xmlFacsimile.find(".//label").text
        facsimile_pagenumber = xmlFacsimile.find(".//pagenumber").text or ""
        etree.strip_elements(xmlFacsimile, "file")
        etree.strip_elements(xmlFacsimile, "label")
        # TODO: handle a (missing) file suffix somehow, and convert files if necessary
        strImageFile = strImageFile.rstrip("\n")
        strImageFileDir, strImageFileName = os.path.split(strImageFile)
        # The directory part is flattened into the file name (slashes removed)
        strFlatImageName = strImageFileDir.replace("/", "") + strImageFileName
        strTargetImagePath = os.getcwd() + "/CONVERT/epub/OEBPS/images/" + strFlatImageName
        shutil.copy(os.getcwd() + "/" + strImageFile, strTargetImagePath)
        strImageFilepath = libeoaconvert.sanitizeImage(strTargetImagePath, GM_PATH, TL_PATH)
        # Add copied file to contentopf; the media type comes from the
        # extension without its leading dot
        img_file_extension = os.path.splitext(strImageFileName)[1]
        contentopf = addToContentopf(contentopf, "images/" + strFlatImageName, strFlatImageName, img_file_extension[1:])
        # Resulting markup, e.g.: <img src="images/ImagesFigure3.jpg" alt="…" style="…"/>
        facsimile_image_element = etree.Element(htmlns + "img")
        facsimile_image_element.set("src", "images/" + strFlatImageName)
        facsimile_image_element.set("style", "width: 95%; height: auto;")
        facsimile_image_element.set("alt", "Facsimile page " + facsimile_pagenumber)
        xmlFacsimile.getparent().replace(xmlFacsimile, facsimile_image_element)
print("-----------------------------------------------------")
print("Preparing Cross-References")
# Every <EOAref> is replaced by the number of the thing it points to. The
# registries are consulted in a fixed order and a later hit overrides an
# earlier one; if nothing matches, a conspicuous placeholder text remains.
for xmlChapter in xmlChapters:
    for xmlReference in xmlChapter.findall(".//EOAref"):
        print("XXXXXXXX")
        strResult = "!!! Cross Reference !!!"
        strLabelText = xmlReference.find("Label").text
        strRefTarget = xmlReference.find("ref").get("target")
        # (lookup key, registry, log message, append the key to the message?)
        tupLookups = (
            (strLabelText, dictEquations, "Verweis auf Array gefunden:", True),
            (strRefTarget, dictEquations, "Verweis auf Equation gefunden:", True),
            (strRefTarget, dictLists, "Verweis auf Liste gefunden", False),
            (strRefTarget, dictChapters, "Verweis auf Kapitel gefunden", False),
            (strRefTarget, dictSections, "Verweis auf Section gefunden", False),
            (strRefTarget, dictFigures, "Verweis auf Abbildung gefunden", False),
            (strRefTarget, dictFootnotes, "Verweis auf Fussnote gefunden", False),
            (strRefTarget, dictTheorems, "Verweis auf Theorem gefunden", False),
            (strLabelText, dictTables, "Verweis auf Tabelle gefunden", False),
        )
        for strKey, dictRegistry, strMessage, boolAppendKey in tupLookups:
            if strKey in dictRegistry:
                if boolAppendKey:
                    print(strMessage + strKey)
                else:
                    print(strMessage)
                strResult = dictRegistry[strKey]
        # clear() would also drop the tail, so it is saved and restored
        tmpTail = xmlReference.tail or ""
        print("XXXXXXXX")
        xmlReference.clear()
        xmlReference.text = strResult
        xmlReference.tail = tmpTail
# Substitute Page-References with their targets.
# The label text is looked up in dictPagelabels; without a match a
# conspicuous placeholder text is written instead.
for xmlChapter in xmlChapters:
    for xmlReference in xmlChapter.findall(".//EOApageref"):
        strLabelText = xmlReference.find("Label").text
        print(strLabelText)
        # Not used for the lookup, but a missing <ref> still raises here
        strRefTarget = xmlReference.find("ref").get("target")
        strResult = "!!! Page Reference !!!"
        if strLabelText in dictPagelabels:
            print("Verweis auf Seite gefunden: " + strLabelText)
            strResult = dictPagelabels[strLabelText]
        # clear() would also drop the tail, so it is saved and restored
        tmpTail = xmlReference.tail or ""
        xmlReference.clear()
        xmlReference.text = strResult
        xmlReference.tail = tmpTail
# Correcting References to Publications
# NOTE: This may be reworked in the future to enable popups in the ebook
# NOTE: For the time being, the span is going to be removed: it is renamed to
# EOAcitation here, and that tag is stripped during the XML clean-up.
for xmlChapter in xmlChapters:
    for xmlSpan in xmlChapter.findall(".//span"):
        if xmlSpan.get("rel") != "popover":
            continue
        xmlSpan.tag = "EOAcitation"
##############################################################
#          Finish ePub Conversion, save File                 #
##############################################################
print("-----------------------------------------------------")
print("Cleaning up XML")
# Index entries are emptied (text, attributes, children) but keep their tail
for xmlIndexentry in xmlEbookTree.xpath(".//EOAindex | .//EOAindexperson | .//EOAindexlocation"):
    strIndexTail = xmlIndexentry.tail or ""
    xmlIndexentry.clear()
    xmlIndexentry.tail = strIndexTail
# Tags that are unwrapped: the element disappears, its content stays in place
tupObsoleteTags = (
    "EOAlabel", "EOAindex", "EOApageref", "EOAcitenumeric", "EOAtable",
    "EOAref", "note", "div", "div2", "div3", "div4", "div5", "citetext",
    "newpage", "EOAciteyear", "EOAtablelabel", "hi", "pagebreak", "page",
    "pagestyle", "EOAcitation", "EOAciteauthoryear", "EOAcitemanual",
    "EOAprintbibliography", "EOAindexperson", "EOAprintindex",
    "EOAindexlocation", "EOAprintpersonindex", "EOAprintlocationindex",
    "anchor", "temp", "EOAletterhead", "EOAhifigure",
)
etree.strip_tags(xmlEbookTree, *tupObsoleteTags)
# Attributes without meaning in XHTML (the list also contained "id" once)
tupObsoleteAttributes = ("id-text", "noindent", "type", "label", "spacebefore", "rend")
etree.strip_attributes(xmlEbookTree, *tupObsoleteAttributes)
# Cite keys are removed together with their content; their tail text is kept
etree.strip_elements(xmlEbookTree, "citekey", with_tail=False)
# Write every Part and Chapter into one file.
# Each <div1> becomes chapterN.xhtml. When a chapter is the first one seen of
# a <div0> part, a separate title page for that part is written first. Every
# file written is added to toc.ncx and content.opf.
xmlChapters = xmlEbookTree.findall("//div1")
listParts = []
intTechnicalChapterNumber = 1
for xmlChapter in xmlChapters:
    # Load the XHTML template
    htmlChapter = etree.parse(EPUB_FILES + "epubchapter.xml", xmlChapterParser)
    # Find out if it's inside a part. If the part has not been worked on, then do it
    xmlChapterParent = xmlChapter.getparent()
    if xmlChapterParent.tag == "div0" and xmlChapterParent.get("id") not in listParts:
        listParts.append(xmlChapterParent.get("id"))
        strPartTitle = xmlChapterParent.find(".//head").text
        htmlChapter.find(".//" + htmlns + "title").text = strPartTitle
        xmlNew = etree.Element('h1')
        xmlNew.text = strPartTitle
        htmlChapter.find(".//" + htmlns + "body").append(xmlNew)
        # Save Part. The serialization carries no XML declaration, so the file
        # has to be UTF-8; the context manager closes it even if writing fails.
        tmpFileName = os.getcwd() + "/CONVERT/epub/OEBPS/chapter" + (str(intTechnicalChapterNumber)) + ".xhtml"
        tmpResult = etree.tostring(htmlChapter, pretty_print=True, encoding="unicode")
        with open(tmpFileName, "w", encoding="utf-8") as tmpFile:
            tmpFile.write(tmpResult)
        # Add to TocNCX and content.opf
        tocncx = addToTocncx(tocncx, htmlChapter.find(".//" + htmlns + "title").text, intTechnicalChapterNumber)
        contentopf = addToContentopf(contentopf, "chapter" + str(intTechnicalChapterNumber) + ".xhtml", "chapter" + str(intTechnicalChapterNumber), "xml")
        intTechnicalChapterNumber += 1
        # Reset htmlChapter to a fresh template for the chapter itself
        htmlChapter = etree.parse(EPUB_FILES + "epubchapter.xml", xmlChapterParser)
    # Read all children of the div1 and attach them to htmlChapter.
    # deepcopy is needed because a plain append would move the original.
    xmlChapterBody = htmlChapter.find(".//" + htmlns + "body")
    for xmlChild in xmlChapter.getchildren():
        xmlChapterBody.append(deepcopy(xmlChild))
    # Save Chapter (UTF-8, see above)
    tmpFileName = os.getcwd() + "/CONVERT/epub/OEBPS/chapter" + (str(intTechnicalChapterNumber)) + ".xhtml"
    tmpResult = etree.tostring(htmlChapter, pretty_print=True, encoding="unicode")
    with open(tmpFileName, "w", encoding="utf-8") as tmpFile:
        tmpFile.write(tmpResult)
    # Add to TocNCX and content.opf
    tocncx = addToTocncx(tocncx, xmlChapter.find(".//h1").text, intTechnicalChapterNumber)
    contentopf = addToContentopf(contentopf, "chapter" + str(intTechnicalChapterNumber) + ".xhtml", "chapter" + str(intTechnicalChapterNumber), "xml")
    intTechnicalChapterNumber += 1
# Convert Facsimile-Parts.
# Every <div0> that contains an <EOAfacsimilepart> is written out as one
# XHTML file of its own, titled by that element, and added to toc.ncx and
# content.opf.
xmlParts = xmlEbookTree.findall("//div0")
for xmlPart in xmlParts:
    print("-------------")
    print("Working on Facsimile-Part")
    print("-------------")
    # Only parts that contain an EOAfacsimilepart element are of interest
    xmlHeadline = xmlPart.find(".//EOAfacsimilepart")
    if xmlHeadline is None:
        continue
    htmlChapter = etree.parse(EPUB_FILES + "epubchapter.xml", xmlChapterParser)
    # Change EOAfacsimilepart into H1
    xmlHeadline.tag = "h1"
    etree.strip_elements(xmlPart, "head")
    # Read all children of the div0 and attach them to htmlChapter.
    # deepcopy is needed because a plain append would move the original.
    xmlPartBody = htmlChapter.find(".//" + htmlns + "body")
    for xmlChild in xmlPart.getchildren():
        xmlPartBody.append(deepcopy(xmlChild))
    # Save Chapter once (the file used to be written twice with identical
    # content). The serialization carries no XML declaration, so the file has
    # to be UTF-8; the context manager closes it even if writing fails.
    tmpFileName = os.getcwd() + "/CONVERT/epub/OEBPS/chapter" + (str(intTechnicalChapterNumber)) + ".xhtml"
    tmpResult = etree.tostring(htmlChapter, pretty_print=True, encoding="unicode")
    with open(tmpFileName, "w", encoding="utf-8") as tmpFile:
        tmpFile.write(tmpResult)
    # Add to TocNCX, titled by this part's own headline. The title used to be
    # read via the variable xmlChapter left over from the chapter loop, i.e.
    # from wherever the last chapter of the book happened to live.
    tocncx = addToTocncx(tocncx, xmlHeadline.text, intTechnicalChapterNumber)
    contentopf = addToContentopf(contentopf, "chapter" + str(intTechnicalChapterNumber) + ".xhtml", "chapter" + str(intTechnicalChapterNumber), "xml")
    intTechnicalChapterNumber += 1
# Saving toc.ncx
# The serialization carries no XML declaration, so the file has to be UTF-8
# regardless of the platform default; the context manager closes the file
# even if writing fails.
tmpFileName = os.getcwd() + "/CONVERT/epub/OEBPS/toc.ncx"
tmpResult = etree.tostring(tocncx, pretty_print=True, encoding="unicode")
with open(tmpFileName, "w", encoding="utf-8") as tmpFile:
    tmpFile.write(tmpResult)
# Saving content.opf
# The serialization carries no XML declaration, so the file has to be UTF-8
# regardless of the platform default; the context manager closes the file
# even if writing fails.
tmpFileName = os.getcwd() + "/CONVERT/epub/OEBPS/content.opf"
tmpResult = etree.tostring(contentopf, pretty_print=True, encoding="unicode")
with open(tmpFileName, "w", encoding="utf-8") as tmpFile:
    tmpFile.write(tmpResult)
############################################################################
#                     Finishing various Stuff                              #
############################################################################
# Write Temporary XML-Tree: a dump of the fully converted tree for inspection.
# Written as UTF-8 explicitly, because the serialization has no XML
# declaration; the context manager closes the file even if writing fails.
ergebnis = etree.tostring(xmlEbookTree, pretty_print=True, encoding="unicode")
with open("tmp_files/Devel_ebook.xml", "w", encoding="utf-8") as ergebnisdatei:
    ergebnisdatei.write(ergebnis)