Permalink
Switch branches/tags
Nothing to show
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
executable file 1561 lines (1436 sloc) 75.7 KB
#!/usr/bin/env python3
# -*- coding: utf-8; mode: python -*-
# Time-stamp: <2018-06-04 15:09:32 (kthoden)>
import pickle
import os
import sys
import re
import shutil
import shlex
import subprocess
import argparse
import configparser
import libeoaconvert
import logging
from copy import deepcopy
from lxml import etree
#####################
# Parsing arguments #
#####################
# The only command line option is the path of the configuration file.
parser = argparse.ArgumentParser()
parser.add_argument("-c", "--config", dest="CONFIG_FILE", help="Name of configuration file", metavar="CONFIGURATION")
args = parser.parse_args()
if args.CONFIG_FILE is not None:
    CONFIG_FILE = os.path.abspath(args.CONFIG_FILE)
else:
    # Fall back to config/eoaconvert.cfg next to this script.  The script path
    # is made absolute first: when the script is started without a directory
    # component, dirname(sys.argv[0]) is "" and plain string concatenation
    # would point at "/config/eoaconvert.cfg" in the filesystem root.
    CONFIG_FILE = os.path.join(os.path.dirname(os.path.abspath(sys.argv[0])), "config", "eoaconvert.cfg")
##################################
# Reading the configuration file #
##################################
CONFIG = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed.  Stop with a clear message here instead of
# failing below with a KeyError on the missing 'General' section.
if not CONFIG.read(CONFIG_FILE):
    sys.exit("Could not read configuration file %s." % CONFIG_FILE)
######################
# Setting up logging #
######################
# LOGFILE is read from the configuration but is not passed to basicConfig
# below, so log records go to the default stream handler.
LOGFILE = CONFIG['General']['logfile']
LOGLEVEL = CONFIG['General']['loglevel']
logging.basicConfig(level=LOGLEVEL, format='%(asctime)s - %(levelname)s - %(message)s')
logging.debug("The configfile is %s.", CONFIG_FILE)
########################
# Paths to executables #
########################
# Both values are handed to the image conversion helpers further down.
GM_PATH = CONFIG['Executables']['graphicsmagic']
TL_PATH = CONFIG['Executables']['texlive']
###########################################
# Loading data from first conversion step #
###########################################
# NOTE(security): unpickling can execute arbitrary code.  This is only safe as
# long as tmp_files/data.pickle is the file written by the first conversion
# step and has not been tampered with.
with open('tmp_files/data.pickle', 'rb') as f:
    data = pickle.load(f)
# Lookup tables keyed by the names used in the pickle.
dictChapters = data["chapterdict"]
dictEquations = data["eqdict"]
dictLists = data["listdict"]
dictTheorems = data["theoremdict"]
dictSections = data["secdict"]
dictFigures = data["figdict"]
dictFootnotes = data["fndict"]
dictTables = data["tabdict"]
dictPagelabels = data["pagelabeldict"]
# Directory for debugging output; exist_ok makes re-runs safe.
os.makedirs(os.path.join(os.getcwd(), "debug"), exist_ok=True)
xmlTree = etree.parse("tmp_files/IntermediateXMLFile.xml")
libeoaconvert.debug_xml_here(xmlTree, "fresh")
print("""
############################################################################
# Convert tralics-XML to Django Data Structure #
############################################################################
""")
# Create django File Structure.  Every directory is created independently so
# that a partially existing tree (or a missing CONVERT directory) is completed
# instead of being skipped or causing a crash.
for strDjangoDirectory in (
        os.path.join(os.getcwd(), "CONVERT", "django", "images", "embedded"),
        os.path.join(os.getcwd(), "CONVERT", "django", "files"),
):
    os.makedirs(strDjangoDirectory, exist_ok=True)
# Create empty xmlTree
xmlEOAdocument = etree.Element("EOAdocument")
xmlDjangoTree = etree.ElementTree(xmlEOAdocument)
etree.strip_attributes(xmlTree, "noindent")
# Remove temp-Tag
etree.strip_tags(xmlTree, "temp")
libeoaconvert.debug_xml_here(xmlTree, "afterstriptags")
# Write Temporary XML-Maintree.  The text is written as UTF-8 explicitly so
# the result does not depend on the locale's default encoding.
ergebnis = etree.tostring(xmlTree, pretty_print=True, encoding="unicode")
with open("tmp_files/Devel_django.xml", "w", encoding="utf-8") as ergebnisdatei:
    ergebnisdatei.write(ergebnis)
# Find all Chapters from the original tralics XML
xmlChapters = xmlTree.findall("//div1")
def replace_footnote_with_sup(note):
    """Turn a footnote element into an empty ``sup`` element in place.

    The element's children, attributes and text are discarded by
    ``clear()``; only the text following the element (its tail) is kept.
    The helper bundles behaviour that occurred in several places of the
    existing code.

    Args:
        note: the footnote element to convert; it is modified in place.
    """
    # clear() also drops the tail, so remember it and put it back afterwards.
    preserved_tail = note.tail
    note.clear()
    note.tag = "sup"
    note.tail = preserved_tail
# def replace_footnote_with_sup ends here
def alph_footnote_index(fndex):
    """Return the lowercase Latin label for a zero-based footnote index.

    Lowercase Latin footnotes need to support more than 26 values, so the
    labels continue with two letters after ``z`` ("aa", "ab", ...), then
    three letters after ``zz``, and so on.

    Args:
        fndex: zero-based footnote index; must not be negative.

    Returns:
        The label as a string of lowercase letters.

    Raises:
        ValueError: if ``fndex`` is negative.

    >>> alph_footnote_index(0)
    'a'
    >>> alph_footnote_index(1)
    'b'
    >>> alph_footnote_index(24)
    'y'
    >>> alph_footnote_index(25)
    'z'
    >>> alph_footnote_index(26)
    'aa'
    >>> alph_footnote_index(27)
    'ab'
    """
    if fndex < 0:
        # A negative index has no label; fail loudly instead of looping.
        raise ValueError("footnote index must not be negative: %r" % (fndex,))
    alphabet = "abcdefghijklmnopqrstuvwxyz"
    letters = []
    remaining = fndex
    while True:
        remaining, remainder = divmod(remaining, len(alphabet))
        letters.append(alphabet[remainder])
        if not remaining:
            break
        # There is no "zero" letter: after 'z' comes 'aa', not 'ba', so the
        # carry into the next position is reduced by one.
        remaining -= 1
    # Letters were collected from the last position to the first.
    return "".join(reversed(letters))
# def alph_footnote_index ends here
def debug_chapters(xmlEOAchapters):
    """Write each chapter to its own file for debugging.

    The files are named ``debug/debug-chapter-NN.xml`` below the current
    working directory, numbered from 1 in iteration order.  The ``debug``
    directory must already exist.

    Args:
        xmlEOAchapters: iterable of chapter elements to serialize.
    """
    for chap_num, chapter in enumerate(xmlEOAchapters, start=1):
        tmp_filename = "%s/debug/debug-chapter-%02d.xml" % (os.getcwd(), chap_num)
        tmp_result = etree.tostring(chapter, pretty_print=True, encoding="unicode")
        # The context manager closes the file even if writing fails; UTF-8 is
        # requested explicitly so the output does not depend on the locale.
        with open(tmp_filename, "w", encoding="utf-8") as tmp_file:
            tmp_file.write(tmp_result)
# def debug_chapters ends here
def gettext(xmlElement):
    """Return the text of an element with all markup of its subtree removed.

    The element's own text, the text of every descendant and the tail text
    following each descendant are concatenated in document order.

    Args:
        xmlElement: the element whose text content is collected.

    Returns:
        The concatenated text; an empty string if there is none.
    """
    fragments = [xmlElement.text or ""]
    for child in xmlElement:
        # Text inside the child first, then the text that follows it.
        fragments.append(gettext(child))
        fragments.append(child.tail or "")
    return "".join(fragments)
# def gettext ends here
def djangoParseObject(xmlElement, indent=False, listtype=None, listnumber=0, uid=None):
    """Convert one element of the intermediate XML into its Django export form.

    The function dispatches on the element's tag or, for tables and
    equations, on the descendants it contains.  The order of the branches
    matters because several tests use ``findall`` on the whole subtree.
    Most branches stamp the objects they create with an ``order`` attribute
    taken from the module-level counter ``intObjectNumber`` and increment
    that counter.  The figure and equation branches copy or convert image
    files below ``CONVERT/django/images`` as a side effect.

    Args:
        xmlElement: the element to convert.  It is frequently modified in
            place, and its children may be moved into the result.
        indent: if true, the result gets the attribute ``indent="True"``.
        listtype: if not None, stored in the result's ``listtype`` attribute.
        listnumber: if not 0, stored in the result's ``listnumber`` attribute.
        uid: if not None, stored in the result's ``id`` attribute.

    Returns:
        The resulting element.  Branches that produce several objects, or
        none at all, return them wrapped in an element with the tag ``temp``.
    """
    # NOTE(review): the nesting of this function was reconstructed from its
    # logic; spots where more than one reading is possible are marked below.
    # Get Dictionaries of Numbers via Global Variables
    global dictChapters
    global dictFigures
    global dictEquations
    global dictSections
    global dictFootnotes
    global dictPagelabels
    global dictTables
    global dictLists
    global intObjectNumber
    # Check what kind of Element we have and change the data
    # Presumably this filters out comments and processing instructions, whose
    # tag is not a string in lxml -- TODO confirm.
    if isinstance(xmlElement.tag, str):
        if xmlElement.tag == "EOAtranscripted":
            xmlResult = etree.Element("temp")
            xmlEOATranscription = etree.Element("EOAtranscription")
            xmlEOATranscription.set("order", str(intObjectNumber))
            intObjectNumber += 1
            # Move both headers into the transcription, without their <p> wrappers
            xmlLeftheader = xmlElement.find(".//Leftheader")
            etree.strip_tags(xmlLeftheader, "p")
            xmlEOATranscription.append(xmlLeftheader)
            xmlRightheader = xmlElement.find(".//Rightheader")
            etree.strip_tags(xmlRightheader, "p")
            xmlEOATranscription.append(xmlRightheader)
            xmlTranscriptedtext = xmlElement.find(".//EOAtranscriptedtext")
            # change \n\n into </p><p> and pagebreak into </p><pagebreak><p> to create some valid markup
            # (the substitutions themselves are commented out; the text is only
            # serialized here and parsed again below)
            strTranscriptedtext = etree.tostring(xmlTranscriptedtext, encoding="unicode")
            #strTranscriptedtext = re.sub (r"\n\n", "</p><p>", str(strTranscriptedtext))
            #strTranscriptedtext = re.sub (r"<p><pagebreak/></p>", "<pagebreak/>", strTranscriptedtext)
            xmlLeftColumn = etree.Element("EOAtranscriptionleft")
            xmlRightColumn = etree.Element("EOAtranscriptionright")
            boolRightColumn = False
            xmlTemp = etree.XML(str(strTranscriptedtext))
            # Everything before the first <pagebreak> goes to the left column,
            # everything after it to the right one; pagebreaks are dropped.
            # Note that the loop variable rebinds the parameter xmlElement.
            for xmlElement in xmlTemp.iterchildren():
                if xmlElement.tag == "pagebreak":
                    boolRightColumn = True
                    continue
                if boolRightColumn == False:
                    xmlLeftColumn.append(xmlElement)
                if boolRightColumn == True:
                    xmlRightColumn.append(xmlElement)
            xmlEOATranscription.append(xmlLeftColumn)
            xmlEOATranscription.append(xmlRightColumn)
            # Convert Images within the transcription
            logging.debug("EOAfigurenonumber")
            xmlFigures = xmlEOATranscription.findall(".//EOAfigurenonumber")
            logging.debug(xmlFigures)
            if xmlFigures is not None:
                for xmlFigure in xmlFigures:
                    strImageFileString = xmlFigure.find(".//file").text
                    strImageFileString = strImageFileString.rstrip("\n")
                    # The directory name, with slashes removed, becomes a
                    # prefix of the target file name
                    strImageFileDir = os.path.dirname(strImageFileString)
                    strImageFileDir = re.sub("/", "", strImageFileDir)
                    strImageFileName = os.path.basename(strImageFileString)
                    strImageFileNamewoSuffix = os.path.splitext(strImageFileName)[0]
                    # Write a version resized with "-resize 250x250\>" into images/embedded
                    strCommand = GM_PATH + " convert " + os.getcwd() + "/" + strImageFileString + " -resize 250x250\\> " + os.getcwd() + "/CONVERT/django/images/embedded/" + strImageFileDir + strImageFileName
                    listArguments = shlex.split(strCommand)
                    subprocess.check_output(listArguments, shell=False)
                    # NOTE(review): tmpStrTail is saved here but never written
                    # back after clear(); compare replace_footnote_with_sup.
                    tmpStrTail = xmlFigure.tail
                    xmlFigure.clear()
                    xmlFigure.tag = "img"
                    xmlFigure.set("src", strImageFileDir + strImageFileName)
                    xmlFigure.set("alt", "")
            xmlResult.append(xmlEOATranscription)
        elif xmlElement.tag == "EOAletterhead":
            # Collect recipient, archive, additional and pages in a fresh EOAletterhead
            xmlResult = etree.Element("temp")
            xmlEOAletterhead = etree.Element("EOAletterhead")
            xmlEOAletterrecipient = xmlElement.find(".//Recipient")
            xmlEOAletterhead.append(xmlEOAletterrecipient)
            xmlEOAletterarchive = xmlElement.find(".//Archive")
            xmlEOAletterhead.append(xmlEOAletterarchive)
            xmlEOAletteradditional = xmlElement.find(".//Additional")
            xmlEOAletterhead.append(xmlEOAletteradditional)
            xmlEOAletterpages = xmlElement.find(".//Pages")
            xmlEOAletterhead.append(xmlEOAletterpages)
            xmlEOAletterhead.set("order", str(intObjectNumber))
            intObjectNumber += 1
            xmlResult.append(xmlEOAletterhead)
        elif xmlElement.tag == "EOAfigurenonumber":
            # elif xmlElement.findall(".//EOAfigurenonumber"):
            xmlResult = etree.Element("temp")
            # Create basic Element EOAfigurenonumber
            xmlEOAfigure = etree.Element("EOAfigurenonumber")
            # Copy Image
            strImageFileString = xmlElement.find(".//file").text
            strImageFileString = strImageFileString.rstrip("\n")
            strImageFileDir = os.path.dirname(strImageFileString)
            strImageFileDir = re.sub("/", "", strImageFileDir)
            strImageFileName = os.path.basename(strImageFileString)
            strImageFileNamewoSuffix = os.path.splitext(strImageFileName)[0]
            shutil.copy(os.getcwd() + "/" + strImageFileString, os.getcwd() + "/CONVERT/django/images/" + strImageFileDir + strImageFileName)
            xmlEOAfigure.set("file", strImageFileDir + strImageFileName)
            xmlEOAfigure.set("width", xmlElement.find(".//width").text + "px;")
            xmlEOAfigure.set("order", str(intObjectNumber))
            intObjectNumber += 1
            xmlResult.append(xmlEOAfigure)
        elif xmlElement.tag == "EOAfigure":
            xmlResult = etree.Element("temp")
            # Create basic Element EOAfigure
            xmlEOAfigure = etree.Element("EOAfigure")
            # Copy Image
            # NOTE(review): it is assumed that everything down to the "id"
            # attribute belongs to the else branch, i.e. that "hionly" figures
            # yield an empty result -- confirm against the original file.
            if xmlElement.get("type") == "hionly":
                logging.debug("Found hyperimage figure, continuing")
                pass
            else:
                strImageFileString = xmlElement.find(".//file").text
                strImageFileString = strImageFileString.rstrip("\n")
                strImageFileDir = os.path.dirname(strImageFileString)
                strImageFileDir = re.sub("/", "", strImageFileDir)
                strImageFileName = os.path.basename(strImageFileString)
                strImageFileNamewoSuffix = os.path.splitext(strImageFileName)[0]
                shutil.copy(os.getcwd() + "/" + strImageFileString, os.getcwd() + "/CONVERT/django/images/" + strImageFileDir + strImageFileName)
                logging.debug("Django figure %s." % strImageFileName)
                # yellow
                # PDF figures go through libeoaconvert.sanitizeImage and are
                # referenced with a .png suffix afterwards
                if os.path.splitext(strImageFileName)[1].lower() == ".pdf":
                    logging.debug("Found a PDF file")
                    strImageFilepath = libeoaconvert.sanitizeImage(os.getcwd() + "/CONVERT/django/images/" + strImageFileDir + strImageFileName, GM_PATH, TL_PATH)
                    xmlEOAfigure.set("file", strImageFileDir + strImageFileName.replace(".pdf", ".png"))
                    logging.debug("The filename is %s" % xmlEOAfigure.get("file"))
                else:
                    xmlEOAfigure.set("file", strImageFileDir + strImageFileName)
                xmlEOAfigure.set("width", xmlElement.find(".//width").text + "px;")
                xmlEOAfigure.set("order", str(intObjectNumber))
                # Insert Caption
                xmlEOAfigure.append(xmlElement.find(".//caption"))
                xmlResult.append(xmlEOAfigure)
                intObjectNumber += 1
                # Insert visual Number and uid
                strFigureNumber = dictFigures[xmlElement.find(".//anchor").get("id")]
                xmlEOAfigure.set("number", strFigureNumber)
                strFigureUID = xmlElement.find(".//anchor").get("id")
                xmlEOAfigure.set("id", strFigureUID)
        elif xmlElement.findall(".//EOAtable"):
            # The element contains a table: reuse the raw <table> and rewrite
            # its rows and cells as tr/th/td
            xmlResult = etree.Element("EOAtable")
            xmlRawTable = xmlElement.find(".//table")
            xmlResult.set("order", str(intObjectNumber))
            intObjectNumber += 1
            xmlResult.append(xmlRawTable)
            # Copy Number, Label and Caption
            # A caption text of "nonumber" marks a table without numbering
            if xmlElement.find(".//EOAtablecaption").text != "nonumber":
                xmlResult.append(xmlElement.find(".//EOAtablecaption"))
                xmlResult.set("label", xmlElement.find(".//EOAtablelabel").text)
                xmlResult.set("number", dictTables[xmlElement.find(".//EOAtablelabel").text])
                xmlResult.set("id", xmlRawTable.get("id"))
            else:
                xmlElement.set("numbering", "false")
            #if xmlElement.find(".//EOAtablelabel").text is not None:
            # Transform width of Columns
            strColumnString = xmlElement.find(".//EOAtablecolumns").text
            strColumnString = re.sub(r"\|", "", strColumnString)
            reMatchObjects = re.findall(r'([L|R|C].*?cm)', strColumnString)
            intTableWidth = 0
            # Index 0 is a placeholder so that columns can be addressed
            # starting at 1, matching intCurrentColumn below
            listColumnAlignments = [None]
            listColumnWidths = [None]
            intNumberOfColumns = 0
            for strColumnDefinition in reMatchObjects:
                strColumnDefinition = strColumnDefinition.rstrip("cm")
                strColumnAlignment = strColumnDefinition[0]
                if strColumnAlignment == "L":
                    strColumnAlignment = "left"
                if strColumnAlignment == "C":
                    strColumnAlignment = "center"
                if strColumnAlignment == "R":
                    strColumnAlignment = "right"
                listColumnAlignments.append(strColumnAlignment)
                # presumably converts a width in cm into pixels at 75 px per cm -- TODO confirm
                intColumnWidth = int(float(strColumnDefinition.lstrip("LRC")) * 75)
                listColumnWidths.append(intColumnWidth)
                intTableWidth += intColumnWidth
                intNumberOfColumns += 1
            xmlRawTable.set("width", str(intTableWidth))
            # Figure out and deal with the Header
            xmlHeader = xmlRawTable.find(".//row/cell/tableheader")
            if xmlHeader is not None:
                # Remove the marker element, keep the text following it and
                # turn the first row into header cells
                xmlHeader.text = ""
                xmlHeader.getparent().text = xmlHeader.tail
                xmlHeader.getparent().remove(xmlHeader)
                xmlFirstRow = xmlRawTable.find(".//row")
                xmlFirstRow.tag = "tr"
                xmlFirstRowCells = xmlFirstRow.findall(".//cell")
                for xmlFirstRowCell in xmlFirstRowCells:
                    xmlFirstRowCell.tag = "th"
            # Now Deal with the rest of the rows
            xmlTableRows = xmlRawTable.findall(".//row")
            for xmlTableRow in xmlTableRows:
                xmlTableCells = xmlTableRow.findall(".//cell")
                intCurrentColumn = 1
                for xmlTableCell in xmlTableCells:
                    xmlTableCell.tag = "td"
                    xmlTableCell.set("align",listColumnAlignments[intCurrentColumn])
                    xmlTableCell.set("style","width: " + str(listColumnWidths[intCurrentColumn]) + ";")
                    # Deal with multicolumn
                    if xmlTableCell.get("cols") is not None:
                        xmlTableCell.set("colspan", xmlTableCell.get("cols"))
                    if intCurrentColumn > len(xmlTableCells):
                        intCurrentColumn = 1
                    # Deal with multicolumn again, increase intCurrentColumn by the columns being spanned
                    elif xmlTableCell.get("cols") is not None:
                        intCurrentColumn = intCurrentColumn + int(xmlTableCell.get("cols"))
                        del xmlTableCell.attrib["cols"]
                    else:
                        intCurrentColumn += 1
                xmlTableRow.tag = "tr"
                xmlTableRow.set("valign", "top")
        elif xmlElement.tag == "list" and xmlElement.get('type') != 'description':
            # Lists are flattened: the children of every item are converted
            # recursively and appended side by side to the temp element
            xmlResult = etree.Element("temp")
            if xmlElement.get('type') == 'ordered':
                # Change first item into EOAlistfirstitem
                # NOTE(review): "..//item" is an unusual path expression -- verify that it matches the intended item
                xmlFirstItem = xmlElement.find("..//item")
                xmlFirstItemElement = xmlFirstItem.getchildren()[0]
                xmlResult.append(djangoParseObject(xmlFirstItemElement,indent=True, listtype="ordered", listnumber=xmlFirstItem.get("id-text"), uid=xmlFirstItem.get("id")))
                # Process Child Elements which are Part of this item
                if len(xmlFirstItem.getchildren()) >= 1:
                    for xmlChild in xmlFirstItem.iterchildren():
                        xmlResult.append(djangoParseObject(xmlChild,indent=True))
                xmlFirstItem.getparent().remove(xmlFirstItem)
                # Process remaining items in this list
                tmpIntNumber = 2
                for xmlItem in xmlElement.iterchildren():
                    # The first child of an item carries the list number and uid
                    xmlItemElement = xmlItem.getchildren()[0]
                    xmlResult.append(djangoParseObject(xmlItemElement,indent=True,listtype="ordered",listnumber=xmlItem.get("id-text"), uid=xmlItem.get("id")))
                    tmpIntNumber += 1
                    if len(xmlItem.getchildren()) >= 1:
                        for xmlChild in xmlItem.iterchildren():
                            xmlResult.append(djangoParseObject(xmlChild, indent=True))
                    xmlItem.getparent().remove(xmlItem)
            if xmlElement.get('type') == 'simple':
                # The tag of the first child decides between plain items and
                # label/item pairs
                xml_first_child = xmlElement.getchildren()[0]
                if xml_first_child.tag == 'item':
                    logging.debug("a simple list with no special items")
                    # Change first item into EOAlistfirstitem
                    xmlFirstItem = xmlElement.find("..//item")
                    xmlFirstItemElement = xmlFirstItem.getchildren()[0]
                    xmlResult.append(djangoParseObject(xmlFirstItemElement,indent=True,listtype="unordered", listnumber="-"))
                    # Process Child Elements which are Part of this item
                    if len(xmlFirstItem.getchildren()) >= 1:
                        logging.debug("len xmlFirstItem.getchildren is greater or equal 1")
                        for xmlChild in xmlFirstItem.iterchildren():
                            xmlResult.append(djangoParseObject(xmlChild,indent=True))
                    xmlFirstItem.getparent().remove(xmlFirstItem)
                    for xmlItem in xmlElement.iterchildren():
                        xmlItemElement = xmlItem.getchildren()[0]
                        xmlResult.append(djangoParseObject(xmlItemElement,indent=True))
                        if len(xmlItem.getchildren()) >= 1:
                            for xmlChild in xmlItem.iterchildren():
                                xmlResult.append(djangoParseObject(xmlChild,indent=True))
                        xmlItem.getparent().remove(xmlItem)
                ####################
                # Work in progress #
                ####################
                elif xml_first_child.tag == 'label':
                    logging.debug("a simple list with named items")
                    # Change first item into EOAlistfirstitem
                    xmlFirstItem = xmlElement.find("..//item")
                    xmlFirstItemElement = xmlFirstItem.getchildren()[0]
                    logging.debug(xmlFirstItemElement.text)
                    # debugging
                    logging.debug(etree.tostring(xmlFirstItemElement))
                    # end of debugging
                    # The text of the label is used as the list number
                    xml_first_label = xmlElement.find("..//label")
                    listnumber_text = xml_first_label.text
                    xmlResult.append(djangoParseObject(xmlFirstItemElement,indent=True,listtype="unordered custom", listnumber=listnumber_text))
                    logging.debug("The length of the children of the first item: %s." % len(xmlFirstItem.getchildren()))
                    # Process Child Elements which are Part of this item
                    if len(xmlFirstItem.getchildren()) >= 1:
                        logging.debug("len xmlFirstItem.getchildren is greater or equal 1")
                        for xmlChild in xmlFirstItem.iterchildren():
                            xmlResult.append(djangoParseObject(xmlChild,indent=True))
                    xmlFirstItem.getparent().remove(xmlFirstItem)
                    xml_first_label.getparent().remove(xml_first_label)
                    # The remaining labels and items are paired up in document order
                    all_the_labels = xmlElement.findall("label")
                    all_the_items = xmlElement.findall("item")
                    logging.debug("itemlength %s." % len(all_the_items))
                    logging.debug("labellength %s." % len(all_the_labels))
                    for listlabel, listitem in zip(all_the_labels, all_the_items):
                        logging.debug("listitem text %s." % listitem.text)
                        logging.debug("listlabel text %s." % listlabel.text)
                        xml_item_element = listitem.getchildren()[0]
                        xmlResult.append(djangoParseObject(xml_item_element, indent=True, listnumber=listlabel.text))
                        listlabel.getparent().remove(listlabel)
                        listitem.getparent().remove(listitem)
                    # for xmlItem in xmlElement.iterchildren():
                    # print("So many items have we: ", len(xmlItem))
                    # xmlItemElement = xmlItem.getchildren()[0]
                    # xmlResult.append(djangoParseObject(xmlItemElement,indent=True))
                    # if len(xmlItem.getchildren()) >= 1:
                    # for xmlChild in xmlItem.iterchildren():
                    # xmlResult.append(djangoParseObject(xmlChild,indent=True))
                    # xmlItem.getparent().remove(xmlItem)
                ###########################
                # End of work in progress #
                ###########################
        elif xmlElement.tag == "list" and xmlElement.get('type') == 'description':
            logging.debug("A description")
            xmlResult = etree.Element("temp")
            # Consume the children pairwise (label, item) until none are left:
            # the label is moved into the description, the item is removed
            while len(xmlElement.getchildren()) != 0:
                xmlDescription = etree.Element("EOAdescription")
                xmlDescription.set("order", str(intObjectNumber))
                xmlLabel = xmlElement.getchildren()[0]
                xmlItem = xmlElement.getchildren()[1]
                if len(xmlItem.getchildren()) > 0:
                    xmlContent = xmlItem.getchildren()[0]
                else:
                    # An item without children gets an empty paragraph as content
                    xmlContent = etree.Element("p")
                xmlLabel.tag = "description"
                xmlDescription.append(xmlLabel)
                xmlDescription.append(xmlContent)
                xmlResult.append(xmlDescription)
                intObjectNumber += 1
                if len(xmlItem.getchildren()) > 0:
                    for xmlChild in xmlItem.iterchildren():
                        xmlResult.append(djangoParseObject(xmlChild,indent=True))
                xmlItem.getparent().remove(xmlItem)
        elif xmlElement.tag == "theorem":
            # Head and first paragraph are moved into a new EOAtheorem
            xmlTheoremHead = xmlElement.find(".//head")
            xmlTheoremText = xmlElement.find(".//p")
            strTheoremNumber = xmlElement.get("id-text")
            strTheoremID = xmlElement.get("id")
            xmlResult = etree.Element("EOAtheorem")
            xmlResult.append(xmlTheoremHead)
            xmlResult.append(xmlTheoremText)
            xmlResult.set("order", str(intObjectNumber))
            xmlResult.set("number", strTheoremNumber)
            xmlResult.set("uid", strTheoremID)
            intObjectNumber += 1
        elif xmlElement.findall(".//EOAequationarray"):
            # One EOAequation object per equation found in the subtree; the
            # rendered image is copied from items/ to the django images
            xmlResult = etree.Element("temp")
            for xmlEquation in xmlElement.findall(".//EOAequation"):
                xmlEOAequation = etree.Element("EOAequation")
                xmlEOAequation.set("order", str(intObjectNumber))
                intObjectNumber += 1
                xmlEOAequation.set("number", xmlEquation.get("number"))
                xmlEOAequation.set("filename", xmlEquation.get("filename"))
                if xmlEquation.get("label") is not None:
                    xmlEOAequation.set("label", xmlEquation.get("label"))
                shutil.copy(os.getcwd() + "/items/" + xmlEquation.get("filename"), os.getcwd() + "/CONVERT/django/images/")
                xmlEOAequation.set("TeX", xmlEquation.get("TeX"))
                # (the label is set a second time here; same value as above)
                if xmlEquation.get("label") is not None:
                    xmlEOAequation.set("label", xmlEquation.get("label"))
                xmlResult.append(xmlEOAequation)
        elif xmlElement.findall(".//EOAequationarraynonumber"):
            # Unnumbered variant: the number attribute is set to an empty string
            xmlResult = etree.Element("temp")
            for xmlEquation in xmlElement.findall(".//EOAequationarraynonumber"):
                xmlEOAequation = etree.Element("EOAequation")
                xmlEOAequation.set("order", str(intObjectNumber))
                intObjectNumber += 1
                xmlEOAequation.set("number", "")
                xmlEOAequation.set("filename", xmlEquation.get("filename"))
                shutil.copy(os.getcwd() + "/items/" + xmlEquation.get("filename"), os.getcwd() + "/CONVERT/django/images/")
                xmlEOAequation.set("TeX", xmlEquation.get("TeX"))
                xmlResult.append(xmlEOAequation)
        elif xmlElement.tag == "EOAequationnonumber":
            # Process one unnumbered equation which is not encapsulated
            xmlResult = etree.Element("EOAequation")
            xmlResult.set("order", str(intObjectNumber))
            intObjectNumber += 1
            xmlResult.set("filename", xmlElement.get("filename"))
            xmlResult.set("TeX", xmlElement.get("TeX"))
            shutil.copy(os.getcwd() + "/items/" + xmlElement.get("filename"), os.getcwd() + "/CONVERT/django/images/")
            xmlResult.set("number", "")
        elif xmlElement.findall(".//EOAequation"):
            # Process various Equations which may be encapsulated within <p>
            xmlEquations = xmlElement.findall(".//EOAequation")
            xmlResult = etree.Element("temp")
            for xmlEquation in xmlEquations:
                # Create basic Element EOAequation
                xmlEOAequation = etree.Element("EOAequation")
                xmlEOAequation.set("order", str(intObjectNumber))
                intObjectNumber += 1
                xmlEOAequation.set("number", xmlEquation.get("number"))
                xmlEOAequation.set("TeX", xmlEquation.get("TeX"))
                if xmlEquation.get("uid") is not None:
                    xmlEOAequation.set("uid", xmlEquation.get("uid"))
                shutil.copy(os.getcwd() + "/items/" + xmlEquation.get("filename"), os.getcwd() + "/CONVERT/django/images/")
                xmlEOAequation.set("filename", xmlEquation.get("filename"))
                xmlResult.append(xmlEOAequation)
        elif xmlElement.tag == "EOAequation":
            # Process one EOAequation which is not encapsulated
            xmlResult = etree.Element("EOAequation")
            xmlResult.set("order", str(intObjectNumber))
            intObjectNumber += 1
            xmlResult.set("number", xmlElement.get("number"))
            xmlResult.set("TeX", xmlElement.get("TeX"))
            if xmlElement.get("uid") is not None:
                xmlResult.set("uid", xmlElement.get("uid"))
            shutil.copy(os.getcwd() + "/items/" + xmlElement.get("filename"), os.getcwd() + "/CONVERT/django/images/")
            xmlResult.set("filename", xmlElement.get("filename"))
        elif xmlElement.tag == "div3":
            # div3 becomes EOAsubsection: the head is moved over first, the
            # remaining children are converted recursively
            xmlResult = etree.Element("EOAsubsection")
            xmlResult.set("order", str(intObjectNumber))
            intObjectNumber += 1
            xmlResult.append(xmlElement.find("head"))
            for xmlChild in xmlElement.iterchildren():
                xmlResult.append(djangoParseObject(xmlChild))
        elif xmlElement.tag == "div4":
            # div4 becomes EOAsubsubsection, handled like div3
            xmlResult = etree.Element("EOAsubsubsection")
            xmlResult.set("order", str(intObjectNumber))
            intObjectNumber += 1
            xmlResult.append(xmlElement.find("head"))
            for xmlChild in xmlElement.iterchildren():
                xmlResult.append(djangoParseObject(xmlChild))
        elif xmlElement.tag == "EOAverse":
            # A verse becomes a single paragraph: copies of the <p> lines are
            # joined with <br> and the <p> tags are stripped afterwards
            xmlResult = etree.Element("EOAparagraph")
            xmlResult.set("style", "verse")
            xmlResult.set("order", str(intObjectNumber))
            intObjectNumber += 1
            xml_verselines = xmlElement.findall("p")
            xmlResult.append(deepcopy(xml_verselines[0]))
            for xml_verseline in xml_verselines[1:]:
                linebreak = etree.Element("br")
                xmlResult.append(linebreak)
                copied_line = deepcopy(xml_verseline)
                xmlResult.append(copied_line)
            etree.strip_tags(xmlResult, "p")
        elif xmlElement.tag == "EOAbox":
            logging.debug("Found a box")
            xmlResult = etree.Element("temp")
            xmlResult.set("style", "box")
            # The head becomes a paragraph of its own whose <p> is turned into <b>
            box_header = xmlElement.find("head")
            box_header.tag = "EOAparagraph"
            box_header.set("style", "box")
            box_header.set("order", str(intObjectNumber))
            head_contents = box_header.find("p")
            head_contents.tag = "b"
            # etree.strip_tags(box_header, "p")
            xmlResult.append(box_header)
            intObjectNumber += 1
            # question: what to do about paragraph equivalent objects?
            box_elements = xmlElement.getchildren()
            logging.debug(len(box_elements))
            # NOTE(review): it is assumed that only <p> children are moved
            # into the result and that other children are left behind -- confirm.
            for box_element in box_elements:
                if box_element.tag == "p":
                    box_element.tag = "EOAparagraph"
                    box_element.set("style", "box")
                    box_element.set("order", str(intObjectNumber))
                    xmlResult.append(box_element)
                    intObjectNumber += 1
        elif xmlElement.tag == "EOAtocentry":
            # throw them out for the time being
            xmlResult = etree.Element("temp")
        elif xmlElement.tag == "pagebreak":
            # throw them out for the time being
            xmlResult = etree.Element("temp")
        else:
            # Everything else is treated as a paragraph and renamed in place
            xmlElement.tag = "EOAparagraph"
            quoted_paragraph = xmlElement.get("rend")
            if quoted_paragraph is not None and quoted_paragraph == "quoted":
                xmlElement.set("rend", "quoted")
            xmlElement.set("order", str(intObjectNumber))
            intObjectNumber += 1
            xmlResult = xmlElement
    else:
        # Elements whose tag is not a string are reported and passed through unchanged
        print("SPECIAL: %s - %s" % (xmlElement, xmlElement.text))
        xmlResult = xmlElement
    # Apply the optional attributes requested by the caller
    if indent==True:
        xmlResult.set("indent", "True")
    if listtype != None:
        xmlResult.set("listtype", listtype)
    if listnumber != 0:
        xmlResult.set("listnumber", listnumber)
    if uid != None:
        xmlResult.set("id", uid)
    return xmlResult
# def djangoParseObject ends here
def make_index(index_hits, index_type):
    """Build the printable index element from a sequence of index hits.

    The hits are grouped by their ``main`` attribute and, below that, by
    their ``secondary`` attribute.  The main entries are sorted
    case-insensitively and grouped into one ``EOAindexsection`` per first
    character; every hit becomes an ``EOAindexlink`` carrying its
    ``chapterorder`` and ``elementorder``.

    Args:
        index_hits: iterable of elements with the attributes ``main``,
            ``chapterorder`` and ``elementorder`` and, optionally,
            ``display``, ``secondary`` and ``bold``.
        index_type: ``"regular"`` yields an ``EOAprintindex`` element, any
            other value an ``EOAprint<index_type>index`` element.

    Returns:
        The index element.  It has no children if ``index_hits`` is empty.
    """
    dictIndex = {}
    for xmlEOAindex in index_hits:
        strMainEntry = xmlEOAindex.get("main")
        str_display_entry = xmlEOAindex.get("display")
        # If strMainEntry not in Index, then create new index element
        if strMainEntry not in dictIndex:
            dictIndex[strMainEntry] = {
                "display_string": "",
                "listMainentries": [],
                "dictSubentries": {},
            }
        dictEntry = dictIndex[strMainEntry]
        # store the display string here; without one, the main entry is shown.
        # NOTE(review): this is done for every hit, so the last hit of a main
        # entry decides -- confirm that this matches the original nesting.
        if str_display_entry is not None:
            dictEntry["display_string"] = str_display_entry
        else:
            dictEntry["display_string"] = strMainEntry
        strSubEntry = xmlEOAindex.get("secondary")
        if strSubEntry is None:
            # if entry has no subentry then append it to listMainentries
            dictEntry["listMainentries"].append(xmlEOAindex)
        else:
            # if entry has subentry, proceed on the second level
            dictEntry["dictSubentries"].setdefault(strSubEntry, []).append(xmlEOAindex)
    # Sort the main index
    listSortedKeys = sorted(dictIndex.keys(), key=str.lower)
    if index_type == "regular":
        new_index_element = "EOAprintindex"
    else:
        new_index_element = "EOAprint%sindex" % index_type
    # Create new and empty xmlTree for xmlEOAindex
    xmlEOAprintindex = etree.Element(new_index_element)
    xmlEOAindexsection = None
    listFirstChars = []
    for strSortedKey in listSortedKeys:
        strFirstChar = strSortedKey[0].upper()
        if strFirstChar not in listFirstChars:
            logging.debug("Beginning a new letter: %s." % strFirstChar)
            listFirstChars.append(strFirstChar)
            # Close the section of the previous letter before opening a new one
            if xmlEOAindexsection is not None:
                xmlEOAprintindex.append(xmlEOAindexsection)
            xmlEOAindexsection = etree.Element("EOAindexsection")
            xmlEOAindexsection.set("Character", strFirstChar)
        # beginning a new entry
        xmlEOAindexentry = etree.Element("EOAindexentry")
        xmlEOAindexentry.set("main", strSortedKey)
        xmlEOAindexentry.set("display", dictIndex[strSortedKey]["display_string"])
        for xmlMainelement in dictIndex[strSortedKey]["listMainentries"]:
            print(xmlMainelement.get("chapterorder") + ":" + xmlMainelement.get("elementorder"))
            xmlEOAindexlink = etree.Element("EOAindexlink")
            xmlEOAindexlink.set("chapterorder", xmlMainelement.get("chapterorder"))
            xmlEOAindexlink.set("elementorder", xmlMainelement.get("elementorder"))
            if xmlMainelement.get("bold") is not None:
                xmlEOAindexlink.set("bold", "True")
            xmlEOAindexentry.append(xmlEOAindexlink)
        # If there are any subentries, process them now
        dictSubentries = dictIndex[strSortedKey]["dictSubentries"]
        if len(dictSubentries) > 0:
            logging.debug("Processing Subentries")
            for strSortedSubKey in sorted(dictSubentries):
                xmlEOAindexsubentry = etree.Element("EOAindexsubentry")
                xmlEOAindexsubentry.set("secondary", strSortedSubKey)
                for xmlSubElement in dictSubentries[strSortedSubKey]:
                    strSubEntry = xmlSubElement.get("secondary")
                    # Add the links pointing to the subentry here
                    xmlEOAindexlink = etree.Element("EOAindexlink")
                    xmlEOAindexlink.set("chapterorder", xmlSubElement.get("chapterorder"))
                    xmlEOAindexlink.set("elementorder", xmlSubElement.get("elementorder"))
                    xmlEOAindexsubentry.append(xmlEOAindexlink)
                    if xmlSubElement.get("bold") is not None:
                        xmlEOAindexlink.set("bold", "True")
                    logging.debug(strSubEntry)
                xmlEOAindexentry.append(xmlEOAindexsubentry)
        xmlEOAindexsection.append(xmlEOAindexentry)
    # The last section is still open.  Without any hits there is no section at
    # all, and appending None would raise a TypeError.
    if xmlEOAindexsection is not None:
        xmlEOAprintindex.append(xmlEOAindexsection)
    return xmlEOAprintindex
# def make_index ends here
def djangoParseHeadline(xmlElement):
    """Split the EOAauthor of a headline into one element per author.

    The first ``EOAauthor`` found below *xmlElement* is removed. Its text is
    split at commas and at the connectives ", and ", " and " and " und ".
    Every resulting name is stripped of surrounding whitespace and appended
    to *xmlElement* as a new ``EOAauthor`` element; empty fragments (for
    example from a trailing comma) are skipped.

    Args:
        xmlElement: the headline element; it is modified in place.

    Returns:
        The same *xmlElement*, for convenient chaining.
    """
    xmlAuthors = xmlElement.find(".//EOAauthor")
    if xmlAuthors is not None:
        # An empty <EOAauthor/> has no text at all
        strAuthors = xmlAuthors.text or ""
        # NOTE(review): remove() only accepts direct children, while the
        # search above also matches nested elements -- confirm that
        # EOAauthor always sits directly below the headline.
        xmlElement.remove(xmlAuthors)
        strAuthors = re.sub("(, and | and | und )", ",", strAuthors)
        listAuthors = strAuthors.split(",")
        logging.debug(listAuthors)
        for strRawAuthor in listAuthors:
            # Remove spaces before and after the author string
            strAuthor = strRawAuthor.strip()
            if not strAuthor:
                continue
            xmlAuthor = etree.Element("EOAauthor")
            xmlAuthor.text = strAuthor
            xmlElement.append(xmlAuthor)
    return xmlElement
# def djangoParseHeadline ends here
# Iterate over Chapters, Sections, Subsections, and Subsubsections and
# Put all on one level: EOAchapter
#
# Every heading and every converted content object ends up as a direct child
# of its EOAchapter.  Headings receive a running "order" taken from
# intObjectNumber, which restarts at 1 for each chapter.
intChapterNumber = 1
listPartIDs = []
for xmlChapter in xmlChapters:
    intObjectNumber = 1
    # Process Chapter Title
    xmlEOAchapter = etree.Element("EOAchapter")
    xmlEOAchapter.set("type", "regular")
    xmlLanguage = xmlChapter.get("language")
    if xmlLanguage is not None:
        # KT changing this after separating the big script
        strLanguage = xmlLanguage
    else:
        # Chapters without an explicit language are treated as English
        strLanguage = "english"
    xmlEOAchapter.set("language", strLanguage)
    xmlEOAchapter.set("order", str(intChapterNumber))
    if xmlChapter.get("rend") != "nonumber":
        xmlEOAchapter.set("id", xmlChapter.get("id"))
    xmlChapterHeadline = xmlChapter.find(".//head")
    if xmlChapter.get("id") in dictChapters:
        xmlEOAchapter.set("number", dictChapters[xmlChapter.get("id")])
    else:
        xmlEOAchapter.set("number", "")
    print("-----------------------------------------------------")
    print(gettext(xmlChapterHeadline))
    xmlEOAchapter.append(djangoParseHeadline(xmlChapterHeadline))
    # Deal with EOAauthor
    if xmlChapter.find(".//EOAauthor") is not None:
        xmlEOAchapter.append(xmlChapter.find(".//EOAauthor"))
    # Attach enclosing Part to Chapter, see django structure for this purpose.
    # Only the first chapter of a part receives the part headline.
    if xmlChapter.getparent().tag == "div0":
        if xmlChapter.getparent().get("id") not in listPartIDs:
            listPartIDs.append(xmlChapter.getparent().get("id"))
            xmlPartHeadline = xmlChapter.getparent().find("head")
            xmlPartHeadline.tag = "EOAparthtml"
            xmlEOAchapter.append(xmlPartHeadline)
    # Append Chapter to xmlEOAdocument
    xmlEOAdocument.append(xmlEOAchapter)
    # iterate over children of Chapter
    for xmlChapterChild in xmlChapter.iterchildren():
        if xmlChapterChild.tag == "div2":
            # Process Section Title
            xmlEOAsection = etree.Element("EOAsection")
            xmlEOAsection.set("order", str(intObjectNumber))
            if xmlChapterChild.get("rend") != "nonumber":
                xmlEOAsection.set("id", xmlChapterChild.get("id"))
                xmlEOAsection.set("number", dictSections[xmlChapterChild.get("id")])
            intObjectNumber += 1
            # The heading has to come from the section itself.  Searching
            # the whole chapter (as this code used to do) returns whichever
            # <head> happens to be first in the chapter at that moment.
            xmlHead = xmlChapterChild.find(".//head")
            logging.debug("Section '%s'" % gettext(xmlHead))
            xmlEOAsection.append(djangoParseHeadline(xmlHead))
            xmlEOAchapter.append(xmlEOAsection)
            # Iterate over Children of Section
            for xmlSectionChild in xmlChapterChild.iterchildren():
                if xmlSectionChild.tag == "div3":
                    # Process Subsection Title
                    xmlEOAsubsection = etree.Element("EOAsubsection")
                    xmlEOAsubsection.set("order", str(intObjectNumber))
                    if xmlSectionChild.get("rend") != "nonumber":
                        xmlEOAsubsection.set("id", xmlSectionChild.get("id"))
                        xmlEOAsubsection.set("number", dictSections[xmlSectionChild.get("id")])
                    intObjectNumber += 1
                    xmlHead = xmlSectionChild.find(".//head")
                    logging.debug("Subsection '%s'" % gettext(xmlHead))
                    xmlEOAsubsection.append(djangoParseHeadline(xmlHead))
                    xmlEOAchapter.append(xmlEOAsubsection)
                    # Iterate over children of Subsection
                    for xmlSubsectionChild in xmlSectionChild.iterchildren():
                        if xmlSubsectionChild.tag == "div4":
                            # Process Subsubsection Title
                            xmlEOAsubsubsection = etree.Element("EOAsubsubsection")
                            xmlEOAsubsubsection.set("order", str(intObjectNumber))
                            intObjectNumber += 1
                            xmlHead = xmlSubsectionChild.find(".//head")
                            logging.debug(gettext(xmlHead))
                            xmlEOAsubsubsection.append(djangoParseHeadline(xmlHead))
                            xmlEOAchapter.append(xmlEOAsubsubsection)
                            # Iterate over children of Subsubsection
                            for xmlSubsubsectionChild in xmlSubsectionChild.iterchildren():
                                xmlEOAchapter.append(djangoParseObject(xmlSubsubsectionChild))
                        else:
                            xmlEOAchapter.append(djangoParseObject(xmlSubsectionChild))
                elif xmlSectionChild.tag == "div4":
                    # A subsubsection placed directly below a section
                    xmlEOAsubsubsection = etree.Element("EOAsubsubsection")
                    xmlEOAsubsubsection.set("order", str(intObjectNumber))
                    intObjectNumber += 1
                    xmlHead = xmlSectionChild.find(".//head")
                    xmlEOAsubsubsection.append(djangoParseHeadline(xmlHead))
                    xmlEOAchapter.append(xmlEOAsubsubsection)
                    # Iterate over children of Subsubsection
                    for xmlSubsubsectionChild in xmlSectionChild.iterchildren():
                        if xmlSubsubsectionChild.tag == "div5":
                            logging.debug("jubel")
                            # although it's div5, promote it to subsubsection
                            xmlEOAparasection = etree.Element("EOAsubsubsection")
                            xmlEOAparasection.set("order", str(intObjectNumber))
                            intObjectNumber += 1
                            xmlHead = xmlSubsubsectionChild.find(".//head")
                            logging.debug(gettext(xmlHead))
                            xmlEOAparasection.append(djangoParseHeadline(xmlHead))
                            xmlEOAchapter.append(xmlEOAparasection)
                            for xmlParasectionChild in xmlSubsubsectionChild.iterchildren():
                                xmlEOAchapter.append(djangoParseObject(xmlParasectionChild))
                        else:
                            xmlEOAchapter.append(djangoParseObject(xmlSubsubsectionChild))
                else:
                    xmlEOAchapter.append(djangoParseObject(xmlSectionChild))
        else:
            xmlEOAchapter.append(djangoParseObject(xmlChapterChild))
    intChapterNumber += 1
libeoaconvert.debug_xml_here(xmlTree, "afterchapter")
print("----------------------------------------------")
print("Processing Facsimile Parts")
# objectify.deannotate() is called below; lxml is already a dependency of
# this script, so importing the submodule here adds no new requirement.
from lxml import objectify

listModes = ["text", "textPollux", "xml"]
strBasicURL = "http://mpdl-system.mpiwg-berlin.mpg.de/mpdl/interface/page-fragment.xql?document="
parserECHO = etree.XMLParser()
xmlParts = xmlTree.findall("//div0")
intFacNumber = 1
for xmlPart in xmlParts:
    intObjectNumber = 1
    intFacPartNumber = 1
    # Parts without facsimiles are handled by the regular chapter loop
    if xmlPart.find(".//EOAfacsimilepart") is None:
        continue
    xmlEOAfacsimilepart = etree.Element("EOAfacsimilepart")
    # Facsimile parts continue the chapter numbering
    xmlEOAfacsimilepart.set("order", str(intChapterNumber))
    xmlEOAfacsimileparthead = xmlPart.find(".//head")
    for xmlChild in xmlEOAfacsimileparthead:
        if xmlChild.tag == "hi":
            xmlChild.tag = "em"
            # pop() instead of del: a <hi> without "rend" must not abort the run
            xmlChild.attrib.pop("rend", None)
    xmlEOAfacsimilepart.append(xmlEOAfacsimileparthead)
    intChapterNumber += 1
    xmlEOAdocument.append(xmlEOAfacsimilepart)
    xmlFacsimilepages = xmlPart.findall(".//EOAfacsimilepage")
    intFacPageNumber = 1
    for xmlFacsimilepage in xmlFacsimilepages:
        strImageFile = xmlFacsimilepage.find(".//file").text
        strLabel = xmlFacsimilepage.find(".//label").text
        strPagenumber = xmlFacsimilepage.find(".//pagenumber").text or ""
        xmlEOAfacsimilepage = etree.Element("EOAfacsimilepage")
        xmlEOAfacsimilepage.set("order", str(intObjectNumber))
        # TODO: somehow cope with a (missing) file suffix here, and convert files if necessary
        strImageFile = strImageFile.rstrip("\n")
        # The directory name (without slashes) is prefixed to the file name,
        # so all images can live in one flat target directory
        strImageFileDir = os.path.dirname(strImageFile)
        strImageFileDir = re.sub("/", "", strImageFileDir)
        strImageFileName = os.path.basename(strImageFile)
        shutil.copy(os.getcwd() + "/" + strImageFile, os.getcwd() + "/CONVERT/django/images/" + strImageFileDir + strImageFileName)
        intObjectNumber += 1
        # Download transcription for this Page
        if xmlFacsimilepage.find(".//fulltext").text is not None:
            logging.debug("Found a link to full text.")
            # <fulltext> holds "document,page"
            listFulltext = re.split(",", xmlFacsimilepage.find(".//fulltext").text)
            strFacsimileURL = listFulltext[0]
            strFacsimilePage = listFulltext[1]
            for strMode in listModes:
                strURL = strBasicURL + strFacsimileURL + "&pn=" + strFacsimilePage + "&mode=" + strMode
                logging.debug("Processing Facsimile : " + strURL)
                xmlECHOtree = etree.parse(strURL, parserECHO)
                # Remove ECHO-namespaces
                objectify.deannotate(xmlECHOtree, xsi_nil=True)
                etree.cleanup_namespaces(xmlECHOtree)
                xmlDivs = xmlECHOtree.findall(".//div")
                for xmlDiv in xmlDivs:
                    if xmlDiv.get("class") == "pageContent":
                        # Create new EOA-Element
                        xmlEOAfacsimileelement = etree.Element("EOAfacsimileelement")
                        xmlEOAfacsimileelement.set("type", strMode)
                        # Fix Images in the <div>-Element
                        xmlImages = xmlDiv.findall(".//img")
                        intFacImgNumber = 1
                        for xmlImage in xmlImages:
                            strImageSrc = xmlImage.get("src")
                            strSupplementName = "facsupplements_" + str(intFacNumber) + "_" + str(intFacPageNumber) + "_" + str(intFacImgNumber) + ".jpg"
                            # Pass the arguments as a list so that the URL is
                            # never re-tokenised
                            listArguments = ["curl", strImageSrc, "-o", "CONVERT/django/images/" + strSupplementName]
                            try:
                                exeShell = subprocess.check_output(listArguments, shell=False, universal_newlines=True)
                                # Point the src of the img element to the local copy
                                xmlImage.set("src", strSupplementName)
                            except (subprocess.SubprocessError, OSError, ValueError, TypeError):
                                # Download failed: mark the image so that the
                                # tag is stripped further below
                                xmlImage.tag = "temp"
                            intFacImgNumber += 1
                        xmlEOAfacsimileelement.append(xmlDiv)
                        xmlEOAfacsimilepage.append(xmlEOAfacsimileelement)
        intFacPageNumber += 1
        xmlEOAfacsimilepage.set("file", strImageFileDir + strImageFileName)
        xmlEOAfacsimilepage.set("label", str(strLabel))
        xmlEOAfacsimilepage.set("pagenumber", str(strPagenumber))
        xmlEOAfacsimilepart.append(xmlEOAfacsimilepage)
    # Was "=+ 1", which reset the counter to 1 instead of incrementing it
    intFacNumber += 1
etree.strip_tags(xmlDjangoTree, "temp")
print("----------------------------------------------")
print("Processing and linking Footnotes for django")
def bring_footnote_down_django(footnote, fragment, footnote_number, object_number, unique_id, destination):
    """
    Turn an inline footnote into a link and append its content to *destination*.

    The footnote element is handed to replace_footnote_with_sup(), gets the
    class "footnote" and an <a href="#fragment"> child showing
    *footnote_number*.  The former text and children of the footnote are
    moved into a new EOAfootnote element (order = *object_number*,
    number = *footnote_number*, id = *unique_id*, anchor = "order" of the
    nearest ancestor that has one), which is appended to *destination*.

    usage: intObjectNumber = bring_footnote_down_django(xmlFootnote, "fn"+str(intFootnoteNumber), str(intFootnoteNumber), intObjectNumber, tmpStrUID, xmlResult)

    Returns *object_number* + 1, so the caller can keep its running counter
    up to date without a global variable being mutated here.
    """
    # Remember the content first; it is attached to the new element below
    former_children = list(footnote.getchildren())
    body_text = footnote.text if footnote.text else ""

    replace_footnote_with_sup(footnote)
    footnote.set("class", "footnote")
    link = etree.Element("a", href="#" + fragment)
    link.text = footnote_number
    footnote.append(link)

    entry = etree.Element("EOAfootnote")
    entry.set("order", str(object_number))
    entry.set("number", footnote_number)
    # First ancestor carrying an "order"; StopIteration if there is none
    ancestor_orders = (
        ancestor.get("order")
        for ancestor in footnote.iterancestors()
        if ancestor.get("order") is not None
    )
    entry.set("anchor", next(ancestor_orders))
    entry.set("id", unique_id)
    entry.text = body_text

    for child in former_children:
        if child.tag == "EOAequationnonumber":
            # The image of the equation has to be shipped with the django output
            here = os.getcwd()
            shutil.copy(
                "%s/items/%s" % (here, child.get("filename")),
                "%s/CONVERT/django/images/" % here,
            )
        entry.append(child)

    destination.append(entry)
    return object_number + 1
# def bring_footnote_down_django ends here
# Move the footnotes of every chapter to the end of that chapter: the inline
# note is replaced by a numbered link and its content is collected in a
# trailing section made of EOAfootnote elements.
xmlEOAchapters = xmlEOAdocument.findall(".//EOAchapter")
debug_chapters(xmlEOAchapters)
for xmlEOAchapter in xmlEOAchapters:
    # New-style footnotes come as (grouping, notes) pairs, old-style ones
    # are plain <note> elements
    groupings = libeoaconvert.get_bigfoot_data(xmlEOAchapter)
    has_old = 0 != len(xmlEOAchapter.findall(".//note"))
    has_new = 0 != len(
        [ # flatten
            note
            for grouping, notes in groupings
            for note in notes
        ]
    )
    # XOR falls through, AND is an error (that should have already been thrown during the epub phase), and NOR skips to the next chapter
    if has_old:
        if has_new:
            raise FootnoteError("This chapter contains both old-style footnotes and new-style footnotes")
    else:
        if not has_new:
            continue
    # Find out running order of last item the chapter
    # For each footnote, first fix the EOAequationnonumber inside <p>
    # Then process the child elements of each footnote and attach them to the new footnote
    # Possibly check whether a paragraph is indented; if so, wrap it in a blockquote
    xmlElement = xmlEOAchapter[(len(xmlEOAchapter)-1)]
    logging.debug(etree.tostring(xmlElement))
    # The footnote section continues the numbering after the last element
    intObjectNumber = (int(xmlElement.get("order")) + 1)
    intFootnoteNumber = 1
    # "temp" is only a container; the tag itself is stripped after the loop
    xmlResult = etree.Element("temp")
    xmlEOAsection = etree.Element("EOAsection")
    xmlEOAsection.set("order", str(intObjectNumber))
    intObjectNumber += 1
    xmlHead = etree.Element("head")
    # Heading of the footnote section in the language of the chapter
    xmlHead.text = libeoaconvert.dictLangFootnotes[libeoaconvert.two_letter_language(xmlEOAchapter.get("language"))]
    xmlEOAsection.append(xmlHead)
    xmlResult.append(xmlEOAsection)
    for grouping, notes in groupings:
        for index, note in enumerate(notes):
            # do for the new-style notes what the old code did for the other footnotes
            fntext = str(index+1)
            if "lower-latin" == grouping:
                fntext = alph_footnote_index(index)
            unique_id = "fn%s" % fntext
            intObjectNumber = bring_footnote_down_django(note, unique_id, fntext, intObjectNumber, unique_id, xmlResult)
    intFootnoteNumber = 1
    xmlFootnotes = xmlEOAchapter.findall(".//note")
    for xmlFootnote in xmlFootnotes:
        # Save everything that clear() would throw away
        xmlFootnoteContent = xmlFootnote.getchildren()
        strFootnoteText = xmlFootnote.text or ""
        tmpTail = xmlFootnote.tail
        tmpStrUID = xmlFootnote.get("id")
        xmlFootnote.clear()
        xmlFootnote.tail = tmpTail
        # The inline note becomes <sup class="footnote"><a href="#fnN">N</a></sup>
        xmlFootnote.tag = "sup"
        xmlFootnote.set("class", "footnote")
        xmlFootnoteLink = etree.Element("a")
        xmlFootnoteLink.set("href", "#fn" + str(intFootnoteNumber))
        xmlFootnoteLink.text = str(intFootnoteNumber)
        xmlFootnote.append(xmlFootnoteLink)
        xmlEOAfootnote = etree.Element("EOAfootnote")
        xmlEOAfootnote.set("order", str(intObjectNumber))
        intObjectNumber += 1
        xmlEOAfootnote.set("number", str(intFootnoteNumber))
        # The anchor is the "order" of the nearest ancestor that has one.
        # NOTE(review): if no ancestor has an "order", the value of the
        # previous footnote is reused (or the name is undefined).
        for xmlParent in xmlFootnote.iterancestors():
            if xmlParent.get("order") is not None:
                strFootnoteAnchorNumber = xmlParent.get("order")
                break
        xmlEOAfootnote.set("anchor", strFootnoteAnchorNumber)
        xmlEOAfootnote.set("id", tmpStrUID)
        xmlEOAfootnote.text = strFootnoteText
        for xmlElement in xmlFootnoteContent:
            if xmlElement.tag == "EOAequationnonumber":
                # Copy the equation image into the django image directory
                shutil.copy(os.getcwd() + "/items/" + xmlElement.get("filename"), os.getcwd() + "/CONVERT/django/images/")
            xmlEOAfootnote.append(xmlElement)
        xmlResult.append(xmlEOAfootnote)
        intFootnoteNumber += 1
    xmlEOAchapter.append(xmlResult)
# Remove temp-Tag
etree.strip_tags(xmlDjangoTree, "temp")
# print("----------------------------------------------")
# print("Processing Verses")
# for xmlEOAchapter in xmlEOAchapters:
# verses = xmlEOAchapter.findall(".//EOAverse")
# print("Found lotsa verses: ", len(verses))
print("----------------------------------------------")
print("Processing various Elements")
# Elements that only have to be renamed and, optionally, decorated with
# fixed attributes: (source tag, target tag, attributes to set)
listCitationAttributes = [("class", "citation"), ("rel", "popover")]
listTagConversions = [
    # superscript and subscript
    ("EOAup", "sup", []),
    ("EOAdown", "sub", []),
    # strike-out, letter-spacing and small caps
    ("EOAst", "span", [("style", "text-decoration: line-through;")]),
    ("EOAls", "span", [("style", "letter-spacing: 0.5em;")]),
    ("EOAcaps", "span", [("style", "font-variant:small-caps;")]),
    # citations become spans that carry the hooks for the popover links
    ("EOAcitenumeric", "span", listCitationAttributes),
    ("EOAciteauthoryear", "span", listCitationAttributes),
    ("EOAciteyear", "span", listCitationAttributes),
    ("EOAcitemanual", "span", listCitationAttributes),
]
for xmlEOAchapter in xmlEOAchapters:
    # Convert italic and bold highlighting; other kinds of <hi> stay as they are
    for xmlHighlight in xmlEOAchapter.findall(".//hi"):
        strRend = xmlHighlight.get("rend")
        if strRend == "it":
            xmlHighlight.tag = "em"
            del xmlHighlight.attrib["rend"]
        elif strRend == "bold":
            xmlHighlight.tag = "b"
            del xmlHighlight.attrib["rend"]
    # Convert xref into a hyperlink whose text is the URL itself
    for xmlHyperlink in xmlEOAchapter.findall(".//xref"):
        strURL = xmlHyperlink.get('url')
        if not strURL.startswith(("http://", "https://")):
            strURL = "http://" + strURL
        xmlHyperlink.tag = "a"
        del xmlHyperlink.attrib["url"]
        xmlHyperlink.set("href", strURL)
        etree.strip_elements(xmlHyperlink, "allowbreak", with_tail=True)
        xmlHyperlink.text = strURL
    # Plain renamings, see listTagConversions
    for strOldTag, strNewTag, listAttributes in listTagConversions:
        for xmlNode in xmlEOAchapter.findall(".//" + strOldTag):
            xmlNode.tag = strNewTag
            for strName, strValue in listAttributes:
                xmlNode.set(strName, strValue)
    # Convert EOAineq and EOAchem into appropriate IMG-Tags
    for strEquationTag in ("EOAineq", "EOAchem"):
        for xmlInlineEquation in xmlEOAchapter.findall(".//" + strEquationTag):
            xmlInlineEquation.tag = "img"
            xmlInlineEquation.set("class", "EOAineq")
            xmlInlineEquation.set("alt", xmlInlineEquation.get("TeX"))
            shutil.copy(os.getcwd() + "/items/" + xmlInlineEquation.get("src"), os.getcwd() + "/CONVERT/django/images/" + xmlInlineEquation.get("src"))
    # Convert EOAinline into appropriate IMG-Tags
    for xmlInlineElement in xmlEOAchapter.findall(".//EOAinline"):
        xmlInlineElement.tag = "img"
        xmlInlineElement.set("alt", "")
        xmlInlineElement.set("class", "eoainlineimage")
        # The element text is the path of the image file
        strInlineElementFilePath = xmlInlineElement.text
        strInlineElementFileName = os.path.basename(strInlineElementFilePath)
        strInlineElementDirName = os.path.dirname(strInlineElementFilePath)
        xmlInlineElement.text = None
        xmlInlineElement.set("src", strInlineElementDirName + strInlineElementFileName)
        strNewImagePath = os.getcwd() + "/CONVERT/django/images/embedded/" + strInlineElementDirName + strInlineElementFileName
        shutil.copy(os.getcwd() + "/" + strInlineElementDirName + "/" + strInlineElementFileName, strNewImagePath)
        # Scale the copy down in place.  The file names are passed as
        # separate arguments so that paths containing spaces or quotes
        # survive; only the configured executable string is tokenised.
        listArguments = shlex.split(GM_PATH) + ["convert", strNewImagePath, "-resize", "20x20", strNewImagePath]
        subprocess.check_output(listArguments, shell=False)
def _orders_of_matching_targets(strPath, strAttribute, strWanted, strMessage, tplOrders):
    """Look up the link coordinates of a cross-reference target.

    Every element of xmlEOAdocument found via *strPath* whose attribute
    *strAttribute* equals *strWanted* counts as a target; *strMessage* is
    logged for each of them.

    Returns a tuple (chapter order, element order) taken from the target and
    its EOAchapter ancestor.  *tplOrders* is returned unchanged if nothing
    matches or the match is not inside an EOAchapter.
    """
    strChapterOrder, strObjectOrder = tplOrders
    for xmlCandidate in xmlEOAdocument.findall(strPath):
        if xmlCandidate.get(strAttribute) != strWanted:
            continue
        logging.debug(strMessage)
        for xmlParent in xmlCandidate.iterancestors():
            if xmlParent.tag == "EOAchapter":
                strChapterOrder = xmlParent.get("order")
                strObjectOrder = xmlCandidate.get("order")
    return strChapterOrder, strObjectOrder
# def _orders_of_matching_targets ends here

print("----------------------------------------------")
print("Processing Cross References")
# Substitute References with their targets (with links).
# The checks below are deliberately independent "if"s: when a label or
# target is known to several dictionaries, the last match wins.
for xmlEOAchapter in xmlEOAchapters:
    xmlReferences = xmlEOAchapter.findall(".//EOAref")
    for xmlReference in xmlReferences:
        # Placeholders that stay visible if the target cannot be resolved
        strResult = "!!! Cross Reference !!!"
        strChapterOrder = ""
        strObjectOrder = ""
        xmlReferenceLabel = xmlReference.find("Label")
        xmlReferenceLabelText = xmlReferenceLabel.text
        xmlReferenceRef = xmlReference.find("ref")
        xmlReferenceRefTarget = xmlReferenceRef.get("target")
        if xmlReferenceLabelText in dictEquations:
            # Grab Number from Dictionary, then find the equation by its label
            strResult = dictEquations[xmlReferenceLabelText]
            strChapterOrder, strObjectOrder = _orders_of_matching_targets(
                ".//EOAequation", "label", xmlReferenceLabelText,
                "Successfully found link to array formula: %s" % strResult,
                (strChapterOrder, strObjectOrder))
        if xmlReferenceRefTarget in dictEquations:
            # Grab Number from Dictionary, then find the equation by its uid
            strResult = dictEquations[xmlReferenceRefTarget]
            strChapterOrder, strObjectOrder = _orders_of_matching_targets(
                ".//EOAequation", "uid", xmlReferenceRefTarget,
                "Successfully found link to normal formula: %s" % strResult,
                (strChapterOrder, strObjectOrder))
        if xmlReferenceRefTarget in dictLists:
            logging.debug("Found link to list.")
            strResult = dictLists[xmlReferenceRefTarget]
            listEOAlistitems = xmlEOAdocument.xpath("//EOAchapter/*[contains(@id, $targetuid)]", targetuid = xmlReferenceRefTarget)
            if listEOAlistitems:
                xmlEOAlistitem = listEOAlistitems[0]
                for xmlParent in xmlEOAlistitem.iterancestors():
                    if xmlParent.tag == "EOAchapter":
                        strChapterOrder = xmlParent.get("order")
                        strObjectOrder = xmlEOAlistitem.get("order")
            else:
                # Used to end the whole conversion with an IndexError
                logging.warning("No element found for list reference %s." % xmlReferenceRefTarget)
        if xmlReferenceRefTarget in dictChapters:
            logging.debug("Found link to chapter.")
            strResult = dictChapters[xmlReferenceRefTarget]
            # A separate name is used here so that the variable of the
            # enclosing chapter loop is not rebound
            for xmlTargetChapter in xmlEOAdocument.findall(".//EOAchapter"):
                if xmlTargetChapter.get("id") == xmlReferenceRefTarget:
                    logging.debug("Successfully handled link to a chapter: %s" % strResult)
                    strObjectOrder = "top"
                    strChapterOrder = xmlTargetChapter.get("order")
        if xmlReferenceRefTarget in dictTheorems:
            logging.debug("Found link to ein Theorem")
            strResult = dictTheorems[xmlReferenceRefTarget]
            strChapterOrder, strObjectOrder = _orders_of_matching_targets(
                ".//EOAtheorem", "uid", xmlReferenceRefTarget,
                "Successfully handled link to a theorem: %s " % strResult,
                (strChapterOrder, strObjectOrder))
        if xmlReferenceRefTarget in dictSections:
            logging.debug("Found link to section")
            strResult = dictSections[xmlReferenceRefTarget]
            # Sections and subsections share one dictionary
            strChapterOrder, strObjectOrder = _orders_of_matching_targets(
                ".//EOAsection", "id", xmlReferenceRefTarget,
                "Successfully handled link to section: %s " % strResult,
                (strChapterOrder, strObjectOrder))
            strChapterOrder, strObjectOrder = _orders_of_matching_targets(
                ".//EOAsubsection", "id", xmlReferenceRefTarget,
                "Successfully handled link to subsection %s: " % strResult,
                (strChapterOrder, strObjectOrder))
        if xmlReferenceRefTarget in dictFigures:
            logging.debug("Found link to figure")
            strResult = dictFigures[xmlReferenceRefTarget]
            strChapterOrder, strObjectOrder = _orders_of_matching_targets(
                ".//EOAfigure", "id", xmlReferenceRefTarget,
                "Successfully handled link to figure: %s" % strResult,
                (strChapterOrder, strObjectOrder))
        if xmlReferenceRefTarget in dictFootnotes:
            logging.debug("Found link to footnote")
            strResult = dictFootnotes[xmlReferenceRefTarget]
            strChapterOrder, strObjectOrder = _orders_of_matching_targets(
                ".//EOAfootnote", "id", xmlReferenceRefTarget,
                "Successfully handled link to footnote: %s" % strResult,
                (strChapterOrder, strObjectOrder))
        if xmlReferenceLabelText in dictTables:
            logging.debug("Found link to table")
            strResult = dictTables[xmlReferenceLabelText]
            strChapterOrder, strObjectOrder = _orders_of_matching_targets(
                ".//EOAtable", "label", xmlReferenceLabelText,
                "Successfully handled link to table: %s" % strResult,
                (strChapterOrder, strObjectOrder))
        # Replace the reference by a link; clear() also drops the tail,
        # so it is saved and restored
        tmpTail = xmlReference.tail or ""
        xmlReference.clear()
        xmlReference.text = strResult
        xmlReference.tail = tmpTail
        xmlReference.tag = "a"
        xmlReference.set("href", "../" + strChapterOrder + "/index.html#" + strObjectOrder)
print("----------------------------------------------")
print("Processing Page References")
# The facsimile pages are not touched while references are resolved, so
# one lookup is enough for all of them
xmlEOAfacsimilepages = xmlEOAdocument.findall(".//EOAfacsimilepage")
for xmlEOAchapter in xmlEOAchapters:
    xmlPageReferences = xmlEOAchapter.findall(".//EOApageref")
    for xmlReference in xmlPageReferences:
        # Reset the placeholder for every reference; otherwise an unknown
        # label silently shows the page number of the previous reference
        strResult = "!!! Page Reference !!!"
        xmlReferenceLabel = xmlReference.find("Label")
        xmlReferenceLabelText = xmlReferenceLabel.text
        xmlReferenceRef = xmlReference.find("ref")
        xmlReferenceRefTarget = xmlReferenceRef.get("target")
        if xmlReferenceLabelText in dictPagelabels:
            logging.debug("Found link to page: %s" % xmlReferenceLabelText)
            strResult = dictPagelabels[xmlReferenceLabelText]
        xmlReference.text = strResult
        # Iterate over a copy, because children are removed on the way
        for xmlChild in list(xmlReference):
            xmlReference.remove(xmlChild)
        # Check, if EOApageref points to a Facsimile-Page
        # If yes, make a href to the facsimile
        for xmlEOAfacsimilepage in xmlEOAfacsimilepages:
            if xmlEOAfacsimilepage.get("label") == xmlReferenceLabelText:
                logging.debug("Found cross reference to facsimile.")
                xmlReference.tag = "a"
                strPartOrder = xmlEOAfacsimilepage.getparent().get("order")
                strFacsimileOrder = xmlEOAfacsimilepage.get("order")
                logging.debug(strFacsimileOrder)
                xmlReference.set("href", "../" + strPartOrder + "/" + strFacsimileOrder + ".html")
print("----------------------------------------------")
print("Normalizing Index Entries")
# An index entry has the makeindex form
#     sortkey@display!subsortkey@subdisplay|addition
# It is taken apart here and stored in attributes of the (emptied) element.
for xmlEOAchapter in xmlEOAchapters:
    xml_EOA_indices = xmlEOAchapter.xpath(".//EOAindex | .//EOAindexperson | .//EOAindexlocation")
    for xmlEOAindex in xml_EOA_indices:
        # Using the gettext function here, because of subelements
        strEOAindextext = gettext(xmlEOAindex)
        strEOAindextext = strEOAindextext.replace("\n", " ")
        # The content now lives in strEOAindextext, so empty the element
        for sub_element in list(xmlEOAindex):
            xmlEOAindex.remove(sub_element)
        xmlEOAindex.text = None
        listFirstPart = strEOAindextext.split("|")
        tmpEntry = listFirstPart[0]
        listSecondPart = tmpEntry.split("!")
        strMainEntry = listSecondPart[0]
        # Check if a sortkey is present via @
        listSortKey = strMainEntry.split("@")
        if len(listSortKey) == 2:
            xmlEOAindex.set("main", listSortKey[0])
            xmlEOAindex.set("display", listSortKey[1])
        else:
            xmlEOAindex.set("main", strMainEntry)
        if len(listSecondPart) > 1:
            strSecondPart = listSecondPart[1]
            listSecondarySortkey = strSecondPart.split("@")
            if len(listSecondarySortkey) == 2:
                xmlEOAindex.set("secondary", listSecondarySortkey[0])
                xmlEOAindex.set("secondarydisplay", listSecondarySortkey[1])
            else:
                xmlEOAindex.set("secondary", strSecondPart)
        if len(listFirstPart) > 1:
            strAddition = listFirstPart[1]
            if strAddition == "textbf":
                xmlEOAindex.set("bold", "true")
            # Only the leading keyword is cut off.  Deleting every
            # occurrence of "see" (as re.sub did) also damaged targets
            # that merely contain these letters.
            if strAddition.startswith("seealso"):
                xmlEOAindex.set("seealso", strAddition[len("seealso"):])
                # Entries containing seealso are omitted for the time being
                xmlEOAindex.tag = "temp"
            elif strAddition.startswith("see"):
                xmlEOAindex.set("see", strAddition[len("see"):])
                # Entries containing see are omitted for the time being
                xmlEOAindex.tag = "temp"
        # Figure out parent chapter number and parent Element order.
        # No break: for "elementorder" the outermost ancestor below the
        # chapter that carries an order wins.
        for xmlParent in xmlEOAindex.iterancestors():
            if xmlParent.get("order") is not None and xmlParent.tag != "EOAchapter":
                xmlEOAindex.set("elementorder", xmlParent.get("order"))
            if xmlParent.get("order") is not None and xmlParent.tag == "EOAchapter":
                xmlEOAindex.set("chapterorder", xmlParent.get("order"))
etree.strip_tags(xmlDjangoTree, "temp")
print("----------------------------------------------")
print("Removing Duplicate Index Entries")
# Within one direct child of a chapter, every combination of main entry
# and subentry is kept only once; surplus entries are retagged "temp".
for xmlEOAchapter in xmlEOAchapters:
    for xmlChild in xmlEOAchapter.iterchildren():
        # main entry -> subentries already seen in this element
        dictEntries = {}
        xml_EOA_indices = xmlChild.xpath(".//EOAindex | .//EOAindexperson | .//EOAindexlocation")
        for xmlEOAindex in xml_EOA_indices:
            strEntry = xmlEOAindex.get("main")
            strSubentry = xmlEOAindex.get("secondary")
            if strEntry not in dictEntries:
                # Remember the subentry of the first occurrence as well,
                # otherwise its first repetition would not be recognised
                dictEntries[strEntry] = [strSubentry]
                continue
            if strSubentry in dictEntries[strEntry] or strSubentry is None:
                # The see/seealso markers belong to the index entry, not to
                # the enclosing element that used to be inspected here
                if (xmlEOAindex.get("see") is None) and (xmlEOAindex.get("seealso") is None):
                    xmlEOAindex.tag = "temp"
            else:
                dictEntries[strEntry].append(strSubentry)
print("----------------------------------------------")
print("Removing Index Entries in Footnotes")
# Index entries located inside an EOAfootnote are retagged "temp"
strIndexPath = ".//EOAindex | .//EOAindexperson | .//EOAindexlocation"
for xmlEOAchapter in xmlEOAchapters:
    for xmlChild in xmlEOAchapter.iterchildren():
        # Not read in this step; reset as before because it is a
        # module-level name
        dictEntries = {}
        xml_EOA_indices = xmlChild.xpath(strIndexPath)
        for xmlEOAindex in xml_EOA_indices:
            # The tag filter yields one hit per enclosing footnote
            for xmlParent in xmlEOAindex.iterancestors("EOAfootnote"):
                xmlEOAindex.tag = "temp"
                logging.debug("Found index in footnote")
print("----------------------------------------------")
print("Sorting and Creating Regular Index")
# None means: no entries were found, so there is nothing to append
xml_eoa_print_regular_index = None
xml_regular_EOAindices = xmlDjangoTree.findall("//EOAindex")
if len(xml_regular_EOAindices) != 0:
    logging.debug("Sorting %s entries for regular index." % str(len(xml_regular_EOAindices)))
    xml_eoa_print_regular_index = make_index(xml_regular_EOAindices, index_type = "regular")
libeoaconvert.debug_xml_here(xmlDjangoTree, "djangotree")
libeoaconvert.debug_xml_here(xmlEOAdocument, "xmleoadocument")
libeoaconvert.debug_xml_here(xmlTree, "xmltree")
# If EOAprintindex is found, append xml_eoa_print_regular_index to xmlEOAdocument
xmlPrintindex = xmlTree.find(".//EOAprintindex")
if xmlPrintindex is not None:
    # Remove <p><EOAprintindex/></p> from xmlDjangoTree
    print("found an index")
    xmlPrintindex.tag = "temp"
    xmlPrintindex.getparent().tag = "temp"
    if xml_eoa_print_regular_index is not None:
        xmlEOAdocument.append(xml_eoa_print_regular_index)
    else:
        # Used to fail with a NameError
        logging.warning("EOAprintindex found, but there are no entries for the regular index.")
else:
    print("found no index")
print("----------------------------------------------")
print("Sorting and Creating Person Index")
xml_eoa_print_person_index = None
xml_person_EOAindices = xmlDjangoTree.findall("//EOAindexperson")
if len(xml_person_EOAindices) != 0:
    xml_eoa_print_person_index = make_index(xml_person_EOAindices, index_type = "person")
# If EOAprintpersonindex is found, append xml_eoa_print_person_index to xmlEOAdocument
xmlPrintindex = xmlTree.find(".//EOAprintpersonindex")
if xmlPrintindex is not None:
    # Remove <p><EOAprintpersonindex/></p> from xmlDjangoTree
    xmlPrintindex.tag = "temp"
    xmlPrintindex.getparent().tag = "temp"
    if xml_eoa_print_person_index is not None:
        xmlEOAdocument.append(xml_eoa_print_person_index)
    else:
        logging.warning("EOAprintpersonindex found, but there are no entries for the person index.")
# doing the same for location index
print("----------------------------------------------")
print("Sorting and Creating Location Index")
xml_eoa_print_location_index = None
xml_location_EOAindices = xmlDjangoTree.findall("//EOAindexlocation")
if len(xml_location_EOAindices) != 0:
    xml_eoa_print_location_index = make_index(xml_location_EOAindices, index_type = "location")
# If EOAprintlocationindex is found, append xml_eoa_print_location_index to xmlEOAdocument
xmlPrintindex = xmlTree.find(".//EOAprintlocationindex")
if xmlPrintindex is not None:
    xmlPrintindex.tag = "temp"
    xmlPrintindex.getparent().tag = "temp"
    if xml_eoa_print_location_index is not None:
        xmlEOAdocument.append(xml_eoa_print_location_index)
    else:
        logging.warning("EOAprintlocationindex found, but there are no entries for the location index.")
############################################################################
# Cleaning up                                                              #
############################################################################
# TODO: delete unnecessary attributes such as id
# TODO: delete unnecessary tags such as EOAlabel
# strip_tags() keeps the content of the listed tags, strip_elements() drops
# the elements including their content (with_tail=False keeps the text
# that follows them)
etree.strip_tags(xmlDjangoTree, "temp", "citetext", "EOAprintbibliography")
etree.strip_elements(xmlDjangoTree, "citekey", with_tail=False)
etree.strip_attributes(xmlDjangoTree, "id-text", "id", "noindent", "type", "label", "spacebefore")#, "rend")
############################################################################
# Save xmlDjangoTree                                                       #
############################################################################
tmpResult = etree.tostring(xmlDjangoTree, pretty_print=True, encoding="unicode")
# The serialisation carries no XML declaration, so the file has to be
# UTF-8 regardless of the locale; the with-block closes it even if
# writing fails
with open("CONVERT/django/Django.xml", "w", encoding="utf-8") as tmpFile:
    tmpFile.write(tmpResult)
logging.debug("Wrote Django.xml")