Skip to content
Navigation Menu
Toggle navigation
Sign in
In this repository
All GitHub Enterprise
↵
Jump to
↵
No suggested jump to results
In this repository
All GitHub Enterprise
↵
Jump to
↵
In this organization
All GitHub Enterprise
↵
Jump to
↵
In this repository
All GitHub Enterprise
↵
Jump to
↵
Sign in
Reseting focus
You signed in with another tab or window.
Reload
to refresh your session.
You signed out in another tab or window.
Reload
to refresh your session.
You switched accounts on another tab or window.
Reload
to refresh your session.
Dismiss alert
{{ message }}
EditionOpenAccess
/
EOASkripts
Public
Notifications
You must be signed in to change notification settings
Fork
0
Star
2
Code
Issues
34
Pull requests
0
Actions
Security
Insights
Additional navigation options
Code
Issues
Pull requests
Actions
Security
Insights
Files
3b6fa57
docs
scripts
src
bibformat
config
data
stylesheets
utils
bib_add_keyword.py
create_tmpbib.py
eoatex2imxml.py
eoatex2pdf.py
find_chapters.py
fix_tei.py
idassigner.py
imxml2django.py
imxml2epub.py
imxml2tei.py
mkimage.py
parsezotero.py
process_eoa_latex.py
process_tei.py
tei2eoatex.py
tei2html.py
tei2imxml.py
tei_add_bibl.py
tei_pickle.py
.dockerignore
.gitignore
.init-container.sh
Dockerfile
LICENSE
README.md
dependencies.conf
docker-compose.yaml
requirements.txt
Breadcrumbs
EOASkripts
/
src
/
imxml2django.py
Blame
Blame
Latest commit
kthoden
Specify where cover was copied from
Apr 9, 2020
3b6fa57
·
Apr 9, 2020
History
History
executable file
·
1962 lines (1802 loc) · 92.8 KB
Breadcrumbs
EOASkripts
/
src
/
imxml2django.py
Top
File metadata and controls
Code
Blame
executable file
·
1962 lines (1802 loc) · 92.8 KB
Raw
#!/usr/bin/env python3 # -*- coding: utf-8; mode: python -*- # Time-stamp: <2020-04-08 17:46:10 (kthoden)> """ Create an XML file that can be inserted into the Django database of an EOAv1 installation. Input file is a customized DocBook XML that has been generated either with eoatex2imxml or tei2imxml. """ from utils.load_config import load_config, exec_command, check_executable import utils.libeoaconvert as libeoaconvert import pickle import os import sys import re import shutil import shlex import subprocess import argparse import configparser import logging from copy import deepcopy from lxml import etree from pathlib import Path import time BASE_DIR = Path( __file__ ).resolve().parent SCRIPT_PATH = Path( __file__ ) SCRIPT_NAME = SCRIPT_PATH.name DEFAULT_INPUT_DIR = \ Path(os.environ['INPUT_DIR'] if 'INPUT_DIR' in os.environ else './input') DEFAULT_OUTPUT_DIR = \ Path(os.environ['OUTPUT_DIR'] if 'OUTPUT_DIR' in os.environ else './output') ##################### # Parsing arguments # ##################### parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter ) parser.add_argument( "-c", "--config", default = BASE_DIR / "config" / "eoaconvert.cfg", dest="CONFIG_FILE", help="Name of configuration file", metavar="CONFIGURATION", type = Path, ) parser.add_argument( "--log-level", default = "INFO", help="log level: choose between DEBUG, INFO, WARNING, ERROR, CRITICAL" ) parser.add_argument( "-p", "--checkpublicationcfg", help="Check the publication.cfg for completeness.", action="store_true" ) parser.add_argument( "-i", "--input-dir", help = f"directory containing some intermediate xml created by previous steps. default: {DEFAULT_OUTPUT_DIR}/PUBLICATION_NAME/imxml", type = Path, ) parser.add_argument( "-o", "--output-dir", help = f"output directory. default: {DEFAULT_OUTPUT_DIR}/PUBLICATION_NAME/django", type = Path, ) parser.add_argument( "PUBLICATION_DIR", help = "directory containing the publication (including resources like pictures, etc.)", type = Path, ) args = parser.parse_args() ######################## # Paths to executables # ######################## GM_PATH = "gm" PDFCROP_EXEC = "pdfcrop" # (part of texlive distribution): ############################ # Paths: ############################ PUBLICATION_DIR = args.PUBLICATION_DIR INPUT_DIR = \ args.input_dir if args.input_dir is not None else DEFAULT_OUTPUT_DIR / PUBLICATION_DIR.resolve().stem / "imxml" OUTPUT_DIR = \ args.output_dir if args.output_dir is not None else (DEFAULT_OUTPUT_DIR / PUBLICATION_DIR.resolve().stem) / "django" LOG_DIR = OUTPUT_DIR / "log" LOG_FILE = (LOG_DIR / SCRIPT_NAME) . with_suffix( ".log" ) TEMP_DIR = OUTPUT_DIR / "tmp_files" DEBUG_DIR = OUTPUT_DIR / "debug" config_file = args.CONFIG_FILE print("The configfile is %s." % config_file) ################################## # Reading the configuration file # ################################## CONFIG = load_config( config_file, args.log_level, LOG_FILE, ) ############################ # Paths to auxiliary files # ############################ TRANSLATION_FILE = BASE_DIR / CONFIG['Auxiliaries']['TRANSLATIONS'] # prepare: logging.debug("PATH: {}".format( os.environ['PATH'] )) check_executable( GM_PATH ) check_executable( PDFCROP_EXEC ) if not TEMP_DIR.exists(): os.makedirs( TEMP_DIR ) if not DEBUG_DIR.exists(): os.makedirs( DEBUG_DIR ) # Check for folder and necessary files logging.info(f"The publication.cfg file is missing in django directory.") if os.path.exists(INPUT_DIR / "publication.cfg"): shutil.copy(INPUT_DIR / "publication.cfg", OUTPUT_DIR) logging.info(f"Copied from {INPUT_DIR}.") else: logging.error(f"Found no publication.cfg in {INPUT_DIR}. Exiting") sys.exit( 1 ) if os.path.exists(INPUT_DIR / "Cover.jpg"): shutil.copy(INPUT_DIR / "Cover.jpg", OUTPUT_DIR / "Cover.jpg") logging.info("Copied cover image from input directory.") else: logging.error("No coverfile found. You can create a temporary one with the mkimage.py script") sys.exit( 1 ) ########################################### # Loading data from first conversion step # ########################################### with open(INPUT_DIR / "tmp_files" / 'data.pickle', 'rb') as f: data = pickle.load(f) dictChapters = data["chapterdict"] dictEquations = data["eqdict"] dictLists = data["listdict"] dictTheorems = data["theoremdict"] dictSections = data["secdict"] dictFigures = data["figdict"] dictFootnotes = data["fndict"] dictTables = data["tabdict"] dictPagelabels = data["pagelabeldict"] if not os.path.exists(DEBUG_DIR): os.mkdir(DEBUG_DIR) xmlTree = etree.parse( str(INPUT_DIR / "IntermediateXMLFile.xml") ) libeoaconvert.debug_xml_here( xmlTree, "fresh", DEBUG_DIR ) print(""" ############################################################################ # Convert tralics-XML to Django Data Structure # ############################################################################ """) if not os.path.exists(OUTPUT_DIR / "images"): os.mkdir(OUTPUT_DIR / "images") if not os.path.exists(OUTPUT_DIR / "images" / "embedded"): os.mkdir(OUTPUT_DIR / "images" / "embedded") if not os.path.exists(OUTPUT_DIR / "files"): os.mkdir(OUTPUT_DIR / "files") # Create empty xmlTree xmlEOAdocument = etree.Element("EOAdocument") xmlDjangoTree = etree.ElementTree(xmlEOAdocument) etree.strip_attributes(xmlTree, "noindent") # Remove temp-Tag etree.strip_tags(xmlTree, "temp") libeoaconvert.debug_xml_here( xmlTree, "afterstriptags", DEBUG_DIR ) # Write Temporary XML-Maintree ergebnisdatei = open(TEMP_DIR / "Devel_django.xml", "w") ergebnis = etree.tostring(xmlTree, pretty_print=True, encoding="unicode") ergebnisdatei.write(ergebnis) ergebnisdatei.close() # Find all Chapters from the original tralics XML xmlChapters = xmlTree.findall("//div1") def replace_footnote_with_sup(note): """ captures reusable behavior from the existing code potentially, some of the old code could be replaced by calls to this helper this behavior showed up in a few places I thought I would be able to extract a little more, but this was all that was actually common """ tail = note.tail note.clear() note.tail = tail note.tag = "sup" # def replace_footnote_with_sup ends here def alph_footnote_index(fndex): """ lowercase Latin footnotes need to support more than 26 values These are zero-indexed. >>> alph_footnote_index(0) 'a' >>> alph_footnote_index(1) 'b' >>> alph_footnote_index(24) 'y' >>> alph_footnote_index(25) 'z' >>> alph_footnote_index(26) 'aa' >>> alph_footnote_index(27) 'ab' """ alphabet = "abcdefghijklmnopqrstuvwxyz" quotient, remainder = divmod(fndex, len(alphabet)) if not quotient: return alphabet[fndex] return alph_footnote_index(quotient - 1) + alph_footnote_index(remainder) # def alph_footnote_index ends here def debug_chapters(xmlEOAchapters): """Write individual chapters to files""" chap_num = 1 for chapter in xmlEOAchapters: tmp_filename = DEBUG_DIR / ("debug-chapter-%02d.xml" % chap_num) tmp_file = open (tmp_filename, "w") tmp_result = etree.tostring(chapter, pretty_print=True, encoding="unicode") tmp_file.write(tmp_result) tmp_file.close() chap_num += 1 # def debug_chapters ends here def djangoParseObject(xmlElement, indent=False, listtype=None, listnumber=0, uid=None): # Get Dictionaries of Numbers via Global Variables global dictChapters global dictFigures global dictEquations global dictSections global dictFootnotes global dictPagelabels global dictTables global dictLists global intObjectNumber # Check what kind of Element we have and change the data if isinstance(xmlElement.tag, str): if xmlElement.tag == "EOAtranscripted": xmlResult = etree.Element("temp") xmlEOATranscription = etree.Element("EOAtranscription") xmlEOATranscription.set("order", str(intObjectNumber)) intObjectNumber += 1 xmlLeftheader = xmlElement.find(".//Leftheader") etree.strip_tags(xmlLeftheader, "p") xmlEOATranscription.append(xmlLeftheader) xmlRightheader = xmlElement.find(".//Rightheader") etree.strip_tags(xmlRightheader, "p") xmlEOATranscription.append(xmlRightheader) xmlTranscriptedtext = xmlElement.find(".//EOAtranscriptedtext") # change \n\n into </p><p> and pagebreak intto </p><pagebreak><p> to create some valid markup strTranscriptedtext = etree.tostring(xmlTranscriptedtext, encoding="unicode") #strTranscriptedtext = re.sub (r"\n\n", "</p><p>", str(strTranscriptedtext)) #strTranscriptedtext = re.sub (r"<p><pagebreak/></p>", "<pagebreak/>", strTranscriptedtext) xmlLeftColumn = etree.Element("EOAtranscriptionleft") xmlRightColumn = etree.Element("EOAtranscriptionright") boolRightColumn = False xmlTemp = etree.XML(str(strTranscriptedtext)) for xmlElement in xmlTemp.iterchildren(): if xmlElement.tag == "pagebreak": boolRightColumn = True continue if boolRightColumn == False: xmlLeftColumn.append(xmlElement) if boolRightColumn == True: xmlRightColumn.append(xmlElement) xmlEOATranscription.append(xmlLeftColumn) xmlEOATranscription.append(xmlRightColumn) # Convert Images within the transcription logging.debug("EOAfigurenonumber") xmlFigures = xmlEOATranscription.findall(".//EOAfigurenonumber") logging.debug(xmlFigures) if xmlFigures is not None: for xmlFigure in xmlFigures: # example 'images/1.jpg' strImageFileString = xmlFigure.find(".//file").text strImageFileString = strImageFileString.rstrip("\n") strImageFileDir = os.path.dirname(strImageFileString) strImageFileDir = re.sub("/", "", strImageFileDir) strImageFileName = os.path.basename(strImageFileString) strImageFileNamewoSuffix = os.path.splitext(strImageFileName)[0] strCommand = "{cmd} convert {arg1} -resize 250x250\\> {arg2}".format( cmd = GM_PATH, arg1 = PUBLICATION_DIR / strImageFileString, arg2 = OUTPUT_DIR / "images/embedded" / (strImageFileDir + strImageFileName), ) # strCommand = GM_PATH + " convert " + os.getcwd() + "/" + strImageFileString + " -resize 250x250\\> " + os.getcwd() + "/CONVERT/django/images/embedded/" + strImageFileDir + strImageFileName listArguments = shlex.split(strCommand) subprocess.check_output(listArguments, shell=False) tmpStrTail = xmlFigure.tail xmlFigure.clear() xmlFigure.tag = "img" xmlFigure.set("src", strImageFileDir + strImageFileName) xmlFigure.set("alt", "") xmlResult.append(xmlEOATranscription) elif xmlElement.tag == "EOAletterhead": xmlResult = etree.Element("temp") xmlEOAletterhead = etree.Element("EOAletterhead") xmlEOAletterrecipient = xmlElement.find(".//Recipient") xmlEOAletterhead.append(xmlEOAletterrecipient) xmlEOAletterarchive = xmlElement.find(".//Archive") xmlEOAletterhead.append(xmlEOAletterarchive) xmlEOAletteradditional = xmlElement.find(".//Additional") xmlEOAletterhead.append(xmlEOAletteradditional) xmlEOAletterpages = xmlElement.find(".//Pages") xmlEOAletterhead.append(xmlEOAletterpages) xmlEOAletterhead.set("order", str(intObjectNumber)) intObjectNumber += 1 xmlResult.append(xmlEOAletterhead) elif xmlElement.tag == "EOAfigurenonumber": xmlResult = etree.Element("temp") xmlEOAfigure = etree.Element("EOAfigurenonumber") # Copy Image strImageFileString = xmlElement.find(".//file").text strImageFileString = strImageFileString.rstrip("\n") strImageFileDir = os.path.dirname(strImageFileString) strImageFileDir = re.sub("/", "", strImageFileDir) strImageFileName = os.path.basename(strImageFileString) strImageFileNamewoSuffix = os.path.splitext(strImageFileName)[0] shutil.copy( PUBLICATION_DIR / strImageFileString, OUTPUT_DIR / "images" / (strImageFileDir + strImageFileName) ) xmlEOAfigure.set("file", strImageFileDir + strImageFileName) xmlEOAfigure.set("width", xmlElement.find(".//width").text + "px;") xmlEOAfigure.set("order", str(intObjectNumber)) intObjectNumber += 1 xmlResult.append(xmlEOAfigure) elif xmlElement.tag == "EOAfigure": hi_figure_types = ["hitrue", "hionly", "hionlycollage", "hionlysub"] xmlResult = etree.Element("temp") # Create basic Element EOAfigure xmlEOAfigure = etree.Element("EOAfigure") figure_type = xmlElement.get("type") strImageFileString = xmlElement.find(".//file").text strImageFileString = strImageFileString.rstrip("\n") strImageFileDir = os.path.dirname(strImageFileString) strImageFileDir = re.sub("/", "", strImageFileDir) strImageFileName = os.path.basename(strImageFileString) logging.debug("This is figure %s", strImageFileName) strImageFileNamewoSuffix = os.path.splitext(strImageFileName)[0] # Copy Image if figure_type in ["hionly", "hionlycollage", "hionlysub"]: logging.debug(f"Found hyperimage figure ({figure_type}), no need to copy them.") xmlEOAfigure.set("file", strImageFileDir + strImageFileName) pass else: shutil.copy( PUBLICATION_DIR / strImageFileString, OUTPUT_DIR / "images" / (strImageFileDir + strImageFileName) ) logging.debug("Django figure %s." % strImageFileName) # yellow if os.path.splitext(strImageFileName)[1].lower() == ".pdf": logging.debug(f"""Found a PDF file: {OUTPUT_DIR / "images" / (strImageFileDir + strImageFileName)}""") strImageFilepath = libeoaconvert.sanitizeImage( OUTPUT_DIR / "images" / (strImageFileDir + strImageFileName), TEMP_DIR, # os.getcwd() + "/CONVERT/django/images/" + strImageFileDir + strImageFileName, GM_PATH, PDFCROP_EXEC ) xmlEOAfigure.set("file", strImageFileDir + strImageFileName.replace(".pdf", ".png")) logging.debug("The filename is %s" % xmlEOAfigure.get("file")) else: xmlEOAfigure.set("file", strImageFileDir + strImageFileName) if figure_type in hi_figure_types: xmlEOAfigure.set("hielement", xmlElement.get("hielement")) if figure_type in ["hionly", "hionlycollage", "hionlysub"]: logging.debug(f"Found hyperimage figure ({figure_type}), no need for caption and size information.") strFigureNumber = dictFigures[xmlElement.find(".//anchor").get("id")] xmlEOAfigure.set("number", strFigureNumber) else: xmlEOAfigure.set("width", xmlElement.find(".//width").text + "px;") xmlEOAfigure.append(xmlElement.find(".//caption")) # Insert visual Number and uid strFigureNumber = dictFigures[xmlElement.find(".//anchor").get("id")] xmlEOAfigure.set("number", strFigureNumber) strFigureUID = xmlElement.find(".//anchor").get("id") xmlEOAfigure.set("id", strFigureUID) xmlEOAfigure.set("order", str(intObjectNumber)) xmlResult.append(xmlEOAfigure) intObjectNumber += 1 elif xmlElement.findall(".//EOAtable"): xmlResult = etree.Element("EOAtable") xmlRawTable = xmlElement.find(".//table") xmlResult.set("order", str(intObjectNumber)) intObjectNumber += 1 xmlResult.append(xmlRawTable) # Copy Number, Label and Caption if xmlElement.find(".//EOAtablecaption").text != "nonumber": xmlResult.append(xmlElement.find(".//EOAtablecaption")) xmlResult.set("label", xmlElement.find(".//EOAtablelabel").text) table_id = xmlRawTable.get("id") table_label = xmlRawTable.get("id") xmlResult.set("number", dictTables[table_label]) xmlResult.set("id", xmlRawTable.get("id")) else: xmlElement.set("numbering", "false") #if xmlElement.find(".//EOAtablelabel").text is not None: # Transform width of Columns strColumnString = xmlElement.find(".//EOAtablecolumns").text strColumnString = re.sub(r"\|", "", strColumnString) reMatchObjects = re.findall(r'([L|R|C].*?cm)', strColumnString) intTableWidth = 0 listColumnAlignments = [None] listColumnWidths = [None] intNumberOfColumns = 0 for strColumnDefinition in reMatchObjects: strColumnDefinition = strColumnDefinition.rstrip("cm") strColumnAlignment = strColumnDefinition[0] if strColumnAlignment == "L": strColumnAlignment = "left" if strColumnAlignment == "C": strColumnAlignment = "center" if strColumnAlignment == "R": strColumnAlignment = "right" listColumnAlignments.append(strColumnAlignment) intColumnWidth = int(float(strColumnDefinition.lstrip("LRC")) * 75) listColumnWidths.append(intColumnWidth) intTableWidth += intColumnWidth intNumberOfColumns += 1 xmlRawTable.set("width", str(intTableWidth)) # Figure out and deal with the Header xmlHeader = xmlRawTable.find(".//row/cell/tableheader") if xmlHeader is not None: xmlHeader.text = "" xmlHeader.getparent().text = xmlHeader.tail xmlHeader.getparent().remove(xmlHeader) xmlFirstRow = xmlRawTable.find(".//row") xmlFirstRow.tag = "tr" xmlFirstRowCells = xmlFirstRow.findall(".//cell") for xmlFirstRowCell in xmlFirstRowCells: xmlFirstRowCell.tag = "th" # Now Deal with the rest of the rows xmlTableRows = xmlRawTable.findall(".//row") for xmlTableRow in xmlTableRows: xmlTableCells = xmlTableRow.findall(".//cell") intCurrentColumn = 1 for xmlTableCell in xmlTableCells: xmlTableCell.tag = "td" xmlTableCell.set("align",listColumnAlignments[intCurrentColumn]) xmlTableCell.set("style","width: " + str(listColumnWidths[intCurrentColumn]) + ";") # Deal with multicolumn if xmlTableCell.get("cols") is not None: xmlTableCell.set("colspan", xmlTableCell.get("cols")) if intCurrentColumn > len(xmlTableCells): intCurrentColumn = 1 # Deal with multicolumn again, increase intCurrentColumn by the columns being spanned elif xmlTableCell.get("cols") is not None: intCurrentColumn = intCurrentColumn + int(xmlTableCell.get("cols")) del xmlTableCell.attrib["cols"] else: intCurrentColumn += 1 # deal with multirow if xmlTableCell.get("rowspan") is not None: cellchildren = xmlTableCell.getchildren() for child in cellchildren: if child.tag == "figure": child.tag = "img" imagepath = f"{child.get('file')}.{child.get('extension')}" logging.debug(f"{imagepath}") strImageFileDir = os.path.dirname(imagepath) strImageFileDir = re.sub("/", "", strImageFileDir) strImageFileName = os.path.basename(imagepath) logging.debug(f"{strImageFileDir} and {strImageFileName}") shutil.copy( PUBLICATION_DIR / imagepath, OUTPUT_DIR / "images" / (strImageFileDir + strImageFileName) ) if child.get('extension') == "pdf": strImageFilepath = libeoaconvert.sanitizeImage( OUTPUT_DIR / "images" / (strImageFileDir + strImageFileName), TEMP_DIR, GM_PATH, PDFCROP_EXEC ) child.set("src", f"{strImageFileDir + strImageFileName}".replace(".pdf", ".png")) else: child.set("src", strImageFileDir) child.set("width", f"{str(listColumnWidths[intCurrentColumn])}px") del child.attrib["rend"] del child.attrib["file"] del child.attrib["extension"] xmlTableRow.tag = "tr" xmlTableRow.set("valign", "top") elif xmlElement.tag == "list" and xmlElement.get('type') != 'description': xmlResult = etree.Element("temp") if xmlElement.get('type') == 'ordered': # Change first item into EOAlistfirstitem xmlFirstItem = xmlElement.find("..//item") xmlFirstItemElement = xmlFirstItem.getchildren()[0] xmlResult.append(djangoParseObject(xmlFirstItemElement,indent=True, listtype="ordered", listnumber=xmlFirstItem.get("label"), uid=xmlFirstItem.get("id"))) # Process Child Elements which are Part of this item if len(xmlFirstItem.getchildren()) >= 1: for xmlChild in xmlFirstItem.iterchildren(): xmlResult.append(djangoParseObject(xmlChild,indent=True)) xmlFirstItem.getparent().remove(xmlFirstItem) # Process remaining items in this list tmpIntNumber = 2 for xmlItem in xmlElement.iterchildren(): xmlItemElement = xmlItem.getchildren()[0] xmlResult.append(djangoParseObject(xmlItemElement,indent=True,listtype="ordered",listnumber=xmlItem.get("label"), uid=xmlItem.get("id"))) tmpIntNumber += 1 if len(xmlItem.getchildren()) >= 1: for xmlChild in xmlItem.iterchildren(): xmlResult.append(djangoParseObject(xmlChild, indent=True)) xmlItem.getparent().remove(xmlItem) if xmlElement.get('type') == 'simple': xml_first_child = xmlElement.getchildren()[0] if xml_first_child.tag == 'item': logging.debug("a simple list with no special items") # Change first item into EOAlistfirstitem xmlFirstItem = xmlElement.find("..//item") xmlFirstItemElement = xmlFirstItem.getchildren()[0] xmlResult.append(djangoParseObject(xmlFirstItemElement,indent=True,listtype="unordered", listnumber="-")) # Process Child Elements which are Part of this item if len(xmlFirstItem.getchildren()) >= 1: logging.debug("len xmlFirstItem.getchildren is greater or equal 1") for xmlChild in xmlFirstItem.iterchildren(): xmlResult.append(djangoParseObject(xmlChild,indent=True)) xmlFirstItem.getparent().remove(xmlFirstItem) for xmlItem in xmlElement.iterchildren(): xmlItemElement = xmlItem.getchildren()[0] xmlResult.append(djangoParseObject(xmlItemElement,indent=True)) if len(xmlItem.getchildren()) >= 1: for xmlChild in xmlItem.iterchildren(): xmlResult.append(djangoParseObject(xmlChild,indent=True)) xmlItem.getparent().remove(xmlItem) ############# # Baustelle # ############# elif xml_first_child.tag == 'label': logging.debug("a simple list with named items") # Change first item into EOAlistfirstitem xmlFirstItem = xmlElement.find("..//item") xmlFirstItemElement = xmlFirstItem.getchildren()[0] logging.debug(xmlFirstItemElement.text) # debugging logging.debug(etree.tostring(xmlFirstItemElement)) # end of debugging xml_first_label = xmlElement.find("..//label") listnumber_text = xml_first_label.text xmlResult.append(djangoParseObject(xmlFirstItemElement,indent=True,listtype="unordered custom", listnumber=listnumber_text)) logging.debug("The length of the children of the first item: %s." % len(xmlFirstItem.getchildren())) # Process Child Elements which are Part of this item if len(xmlFirstItem.getchildren()) >= 1: logging.debug("len xmlFirstItem.getchildren is greater or equal 1") for xmlChild in xmlFirstItem.iterchildren(): xmlResult.append(djangoParseObject(xmlChild,indent=True)) xmlFirstItem.getparent().remove(xmlFirstItem) xml_first_label.getparent().remove(xml_first_label) all_the_labels = xmlElement.findall("label") all_the_items = xmlElement.findall("item") logging.debug("itemlength %s." % len(all_the_items)) logging.debug("labellength %s." % len(all_the_labels)) for listlabel, listitem in zip(all_the_labels, all_the_items): logging.debug("listitem text %s." % listitem.text) logging.debug("listlabel text %s." % listlabel.text) xml_item_element = listitem.getchildren()[0] xmlResult.append(djangoParseObject(xml_item_element, indent=True, listnumber=listlabel.text)) listlabel.getparent().remove(listlabel) listitem.getparent().remove(listitem) # for xmlItem in xmlElement.iterchildren(): # print("So many items have we: ", len(xmlItem)) # xmlItemElement = xmlItem.getchildren()[0] # xmlResult.append(djangoParseObject(xmlItemElement,indent=True)) # if len(xmlItem.getchildren()) >= 1: # for xmlChild in xmlItem.iterchildren(): # xmlResult.append(djangoParseObject(xmlChild,indent=True)) # xmlItem.getparent().remove(xmlItem) ################## # Ende Baustelle # ################## elif xmlElement.tag == "list" and xmlElement.get('type') == 'description': logging.debug("A description") xmlResult = etree.Element("temp") while len(xmlElement.getchildren()) != 0: xmlDescription = etree.Element("EOAdescription") xmlDescription.set("order", str(intObjectNumber)) xmlLabel = xmlElement.getchildren()[0] label_children = xmlLabel.getchildren() if label_children: last_child = label_children[-1] if last_child.tail.endswith(":"): last_child.tail = last_child.tail[:-1] else: if xmlLabel.text.endswith(":"): xmlLabel.text = xmlLabel.text[:-1] xmlItem = xmlElement.getchildren()[1] if len(xmlItem.getchildren()) > 0: xmlContent = xmlItem.getchildren()[0] else: xmlContent = etree.Element("p") xmlLabel.tag = "description" xmlDescription.append(xmlLabel) xmlDescription.append(xmlContent) xmlResult.append(xmlDescription) intObjectNumber += 1 if len(xmlItem.getchildren()) > 0: for xmlChild in xmlItem.iterchildren(): xmlResult.append(djangoParseObject(xmlChild,indent=True)) xmlItem.getparent().remove(xmlItem) elif xmlElement.tag == "theorem": xmlTheoremHead = xmlElement.find(".//head") xmlTheoremText = xmlElement.find(".//p") strTheoremNumber = xmlElement.get("id-text") strTheoremID = xmlElement.get("id") xmlResult = etree.Element("EOAtheorem") xmlResult.append(xmlTheoremHead) xmlResult.append(xmlTheoremText) xmlResult.set("order", str(intObjectNumber)) xmlResult.set("number", strTheoremNumber) xmlResult.set("uid", strTheoremID) intObjectNumber += 1 elif xmlElement.findall(".//EOAequationarray"): xmlResult = etree.Element("temp") for xmlEquation in xmlElement.findall(".//EOAequation"): xmlEOAequation = etree.Element("EOAequation") xmlEOAequation.set("order", str(intObjectNumber)) intObjectNumber += 1 xmlEOAequation.set("number", xmlEquation.get("number")) xmlEOAequation.set("filename", xmlEquation.get("filename")) if xmlEquation.get("label") is not None: xmlEOAequation.set("label", xmlEquation.get("label")) shutil.copy( INPUT_DIR / "items" /xmlEquation.get("filename"), OUTPUT_DIR / "images/" ) # shutil.copy(os.getcwd() + "/items/" + xmlEquation.get("filename"), os.getcwd() + "/CONVERT/django/images/") xmlEOAequation.set("TeX", xmlEquation.get("TeX")) if xmlEquation.get("label") is not None: xmlEOAequation.set("label", xmlEquation.get("label")) xmlResult.append(xmlEOAequation) elif xmlElement.findall(".//EOAequationarraynonumber"): xmlResult = etree.Element("temp") for xmlEquation in xmlElement.findall(".//EOAequationarraynonumber"): xmlEOAequation = etree.Element("EOAequation") xmlEOAequation.set("order", str(intObjectNumber)) intObjectNumber += 1 xmlEOAequation.set("number", "") xmlEOAequation.set("filename", xmlEquation.get("filename")) shutil.copy( INPUT_DIR / "items" / xmlEquation.get("filename"), OUTPUT_DIR / "images/" ) # shutil.copy(os.getcwd() + "/items/" + xmlEquation.get("filename"), os.getcwd() + "/CONVERT/django/images/") xmlEOAequation.set("TeX", xmlEquation.get("TeX")) xmlResult.append(xmlEOAequation) elif xmlElement.tag == "EOAequationnonumber": # Process one EOAequation which is not encapsulated xmlResult = etree.Element("EOAequation") xmlResult.set("order", str(intObjectNumber)) intObjectNumber += 1 xmlResult.set("filename", xmlElement.get("filename")) xmlResult.set("TeX", xmlElement.get("TeX")) shutil.copy( INPUT_DIR / "items" / xmlElement.get("filename"), OUTPUT_DIR / "images/" ) # shutil.copy(os.getcwd() + "/items/" + xmlElement.get("filename"), os.getcwd() + "/CONVERT/django/images/") xmlResult.set("number", "") elif xmlElement.findall(".//EOAequation"): # Process various Equations which may be encapsulated within <p> xmlEquations = xmlElement.findall(".//EOAequation") xmlResult = etree.Element("temp") for xmlEquation in xmlEquations: # Create basic Element EOAequation xmlEOAequation = etree.Element("EOAequation") xmlEOAequation.set("order", str(intObjectNumber)) intObjectNumber += 1 xmlEOAequation.set("number", xmlEquation.get("number")) xmlEOAequation.set("TeX", xmlEquation.get("TeX")) if xmlEquation.get("uid") is not None: xmlEOAequation.set("uid", xmlEquation.get("uid")) shutil.copy( INPUT_DIR / "items" / xmlEquation.get("filename"), OUTPUT_DIR / "images/" ) # shutil.copy(os.getcwd() + "/items/" + xmlEquation.get("filename"), os.getcwd() + "/CONVERT/django/images/") xmlEOAequation.set("filename", xmlEquation.get("filename")) xmlResult.append(xmlEOAequation) elif xmlElement.tag == "EOAequation": # Process one EOAequation which is not encapsulated xmlResult = etree.Element("EOAequation") xmlResult.set("order", str(intObjectNumber)) intObjectNumber += 1 xmlResult.set("number", xmlElement.get("number")) xmlResult.set("TeX", xmlElement.get("TeX")) if xmlElement.get("uid") is not None: xmlResult.set("uid", xmlElement.get("uid")) shutil.copy( INPUT_DIR / "items" / xmlElement.get("filename"), OUTPUT_DIR / "images/" ) # shutil.copy(os.getcwd() + "/items/" + xmlElement.get("filename"), os.getcwd() + "/CONVERT/django/images/") xmlResult.set("filename", xmlElement.get("filename")) elif xmlElement.tag == "div3": xmlResult = etree.Element("EOAsubsection") xmlResult.set("order", str(intObjectNumber)) intObjectNumber += 1 xmlResult.append(xmlElement.find("head")) for xmlChild in xmlElement.iterchildren(): xmlResult.append(djangoParseObject(xmlChild)) elif xmlElement.tag == "div4": xmlResult = etree.Element("EOAsubsubsection") xmlResult.set("order", str(intObjectNumber)) intObjectNumber += 1 xmlResult.append(xmlElement.find("head")) for xmlChild in xmlElement.iterchildren(): xmlResult.append(djangoParseObject(xmlChild)) elif xmlElement.tag == "epigraph": xmlResult = etree.Element("EOAparagraph") xmlResult.set("class", "epigraph") xmlResult.set("order", str(intObjectNumber)) intObjectNumber += 1 x_children = xmlElement.getchildren() first_element = True for child in x_children: if child.tag == "p": child.tag = "tagtobestripped" linebreak = etree.Element("br") xmlResult.append(linebreak) if not first_element: paragraphbreak = etree.Element("br") xmlResult.append(paragraphbreak) xmlResult.append(deepcopy(child)) elif child.tag == "EOAverse": if not first_element: paragraphbreak = etree.Element("br") xmlResult.append(paragraphbreak) verse_result = treat_verselines(child) xmlResult.append(verse_result) first_element = False elif xmlElement.tag == "EOAverse": xmlResult = etree.Element("EOAparagraph") if xmlElement.get("class") is not None: xmlResult.set("class", xmlElement.get("class")) xmlResult.set("style", "verse") xmlResult.set("order", str(intObjectNumber)) intObjectNumber += 1 xml_verselines = xmlElement.findall("p") xmlResult.append(deepcopy(xml_verselines[0])) for xml_verseline in xml_verselines[1:]: linebreak = etree.Element("br") xmlResult.append(linebreak) copied_line = deepcopy(xml_verseline) xmlResult.append(copied_line) etree.strip_tags(xmlResult, "p") elif xmlElement.tag == "EOAbox": logging.debug("Found a box") xmlResult = etree.Element("temp") xmlResult.set("style", "box") box_header = xmlElement.find("head") box_header.tag = "EOAparagraph" box_header.set("style", "box") box_header.set("order", str(intObjectNumber)) head_contents = box_header.find("p") head_contents.tag = "b" # etree.strip_tags(box_header, "p") xmlResult.append(box_header) intObjectNumber += 1 # question: what to do about paragraph equivalent objects? box_elements = xmlElement.getchildren() logging.debug(len(box_elements)) for box_element in box_elements: if box_element.tag == "p": box_element.tag = "EOAparagraph" box_element.set("style", "box") box_element.set("order", str(intObjectNumber)) xmlResult.append(box_element) intObjectNumber += 1 elif xmlElement.tag == "p" and xmlElement.get("class") == "divider": xmlElement.tag = "EOAparagraph" xmlElement.set("order", str(intObjectNumber)) intObjectNumber += 1 xmlResult = xmlElement elif xmlElement.tag == "EOAtocentry": # throw them out for the time being xmlResult = etree.Element("temp") elif xmlElement.tag == "pagebreak": # throw them out for the time being xmlResult = etree.Element("temp") else: if xmlElement.getchildren() == [] and not xmlElement.text: logging.debug(f"Removing empty paragraph") xmlResult = etree.Element("temp") else: xmlElement.tag = "EOAparagraph" logging.debug(f"The beginning of this paragraph is: '{libeoaconvert.gettext(xmlElement)[:40]}…'") quoted_paragraph = xmlElement.get("rend") if quoted_paragraph is not None and quoted_paragraph == "quoted": xmlElement.set("rend", "quoted") xmlElement.set("order", str(intObjectNumber)) intObjectNumber += 1 xmlResult = xmlElement else: logging.info("SPECIAL: %s - %s" % (xmlElement, xmlElement.text)) xmlResult = xmlElement if indent==True: xmlResult.set("indent", "True") if listtype != None: xmlResult.set("listtype", listtype) if listnumber != 0: xmlResult.set("listnumber", listnumber) if uid != None: xmlResult.set("id", uid) return xmlResult # def djangoParseObject ends here def make_index(index_hits, index_type): """Make an index""" dictIndex = {} for xmlEOAindex in index_hits: strMainEntry = xmlEOAindex.get("main") str_display_entry = xmlEOAindex.get("display") # If strMainEntry not in Index, then create new index element if strMainEntry not in dictIndex: dictIndex[strMainEntry] = {} dictIndex[strMainEntry]["display_string"] = "" dictIndex[strMainEntry]["listMainentries"] = [] dictIndex[strMainEntry]["dictSubentries"] = {} # store the display string here. if str_display_entry is not None: dictIndex[strMainEntry]["display_string"] = str_display_entry else: dictIndex[strMainEntry]["display_string"] = strMainEntry # if entry has no subentry then append it to listMainentries if strMainEntry in dictIndex and xmlEOAindex.get("secondary") == None: dictIndex[strMainEntry]["listMainentries"].append(xmlEOAindex) # if entry has subentry, proceed on the second level if strMainEntry in dictIndex and xmlEOAindex.get("secondary") is not None: # put the next line in anyway # dictIndex[strMainEntry]["listMainentries"].append(xmlEOAindex) strSubEntry = xmlEOAindex.get("secondary") # if strSubEntry is not in dictSubentries, then create new list if strSubEntry not in dictIndex[strMainEntry]["dictSubentries"]: dictIndex[strMainEntry]["dictSubentries"][strSubEntry] = [] dictIndex[strMainEntry]["dictSubentries"][strSubEntry].append(xmlEOAindex) else: dictIndex[strMainEntry]["dictSubentries"][strSubEntry].append(xmlEOAindex) # Sort the main index listSortedKeys = sorted(dictIndex.keys(), key=str.lower) if index_type == "regular": new_index_element = "EOAprintindex" else: new_index_element = "EOAprint%sindex" % index_type # Create new and empty xmlTree for xmlEOAindex xmlEOAprintindex = etree.Element(new_index_element) xmlEOAindexsection = None listFirstChars = [] for strSortedKey in listSortedKeys: strFirstChar = strSortedKey[0].upper() if strFirstChar not in listFirstChars: logging.debug("Beginning a new letter: %s." % strFirstChar) listFirstChars.append(strFirstChar) if xmlEOAindexsection is not None: xmlEOAprintindex.append(xmlEOAindexsection) xmlEOAindexsection = etree.Element("EOAindexsection") xmlEOAindexsection.set("Character", strFirstChar) # beginning a new entry xmlEOAindexentry = etree.Element("EOAindexentry") xmlEOAindexentry.set("main", strSortedKey) xmlEOAindexentry.set("display", dictIndex[strSortedKey]["display_string"]) logging.debug("Index entry: %s." % strSortedKey) for xmlMainelement in dictIndex[strSortedKey]["listMainentries"]: logging.info(xmlMainelement.get("chapterorder") + ":" + xmlMainelement.get("elementorder")) xmlEOAindexlink = etree.Element("EOAindexlink") xmlEOAindexlink.set("chapterorder", xmlMainelement.get("chapterorder")) xmlEOAindexlink.set("elementorder", xmlMainelement.get("elementorder")) if xmlMainelement.get("bold") is not None: xmlEOAindexlink.set("bold", "True") xmlEOAindexentry.append(xmlEOAindexlink) # If there are any subentries, process them now if len(dictIndex[strSortedKey]["dictSubentries"]) > 0: logging.debug("Processing Subentries") listSortedSubKeys = sorted(dictIndex[strSortedKey]["dictSubentries"]) for strSortedSubKey in listSortedSubKeys: xmlEOAindexsubentry = etree.Element("EOAindexsubentry") xmlEOAindexsubentry.set("secondary", strSortedSubKey) for xmlSubElement in dictIndex[strSortedKey]["dictSubentries"][strSortedSubKey]: strSubEntry = xmlSubElement.get("secondary") # Hier noch die Links auf den Untereintrag einfügen xmlEOAindexlink = etree.Element("EOAindexlink") xmlEOAindexlink.set("chapterorder", xmlSubElement.get("chapterorder")) xmlEOAindexlink.set("elementorder", xmlSubElement.get("elementorder")) xmlEOAindexsubentry.append(xmlEOAindexlink) if xmlSubElement.get("bold") is not None: xmlEOAindexlink.set("bold", "True") logging.debug(strSubEntry) xmlEOAindexentry.append(xmlEOAindexsubentry) xmlEOAindexsection.append(xmlEOAindexentry) # if xmlEOAindexsection is not None: xmlEOAprintindex.append(xmlEOAindexsection) return(xmlEOAprintindex) # def make_index ends here def djangoParseHeadline(xmlElement): # Parse EOAauthor and append it to the Chapter Information xmlAuthors = xmlElement.find(".//EOAauthor") if xmlAuthors is not None: strAuthors = xmlAuthors.text xmlElement.remove(xmlAuthors) strAuthors = re.sub("(, and | and | und )", ",", strAuthors) listAuthors = re.split("\,", strAuthors) logging.debug(listAuthors) if len(listAuthors) >= 1: for i in range(len(listAuthors)): xmlAuthor = etree.Element("EOAauthor") # Remove Spaces before and after AuthorString if listAuthors[i][0] == " ": strAuthor = listAuthors[i][1:] elif listAuthors[i].endswith(" "): strAuthor = listAuthors[i][:-1] else: strAuthor = listAuthors[i] xmlAuthor.text = strAuthor xmlElement.append(xmlAuthor) return xmlElement # def djangoParseHeadline ends here def check_publication_cfg(configuration_file): """Check the configuration file before uploading This function is adapted from the publicationimport script. """ logging.debug("Checking configuration file %s.", configuration_file) config = configparser.ConfigParser() try: config.read(configuration_file) except configparser.ParsingError as err: logging.error(err) technical_items = ["Serie", "Number", "Title", "Subtitle", "PublicationDate", "Language", "License", "ISBN", "Price", "Shoplink"] general_items = ["BriefDescription", "DetailedDescription", "Submitter", "EditorialCoordination", "Copyediting", "Translator", "Dedication"] authors_items = ["Author1", "Author2", "Author3", "Author4", "Author5", "Zusatz"] categories = {"Technical" : technical_items, "General" : general_items, "Authors" : authors_items} for cat in categories: for item in categories[cat]: try: config[cat][item] except KeyError: logging.error("%s is missing in configuration.", item) return # def check_publication_cfg ends here def treat_verselines(verse_element): "Dissolve verselines to lines with linebreak milestones" xml_result = etree.Element("tagtobestripped") xml_verselines = verse_element.findall("p") for xml_verseline in xml_verselines: xml_verseline.tag = "tagtobestripped" xml_result.append(deepcopy(xml_verselines[0])) for xml_verseline in xml_verselines[1:]: linebreak = etree.Element("br") xml_result.append(linebreak) copied_line = deepcopy(xml_verseline) xml_result.append(copied_line) return xml_result # def treat_verselines ends here # Iterate over Chapters, Sections, Subsections, and Subsubsections and # Put all on one level: EOAchapter intChapterNumber = 1 listPartIDs = [] for xmlChapter in xmlChapters: intObjectNumber = 1 # Process Chapter Title xmlEOAchapter = etree.Element("EOAchapter") xmlEOAchapter.set("type","regular") xmlLanguage = xmlChapter.get("language") if xmlLanguage is not None: # KT changing this after separating the big script strLanguage = xmlLanguage #or "english" else: strLanguage = "english" xmlEOAchapter.set("language", strLanguage) # xmlEOAchapter.set("language", xmlChapter.get("language")) xmlEOAchapter.set("order", str(intChapterNumber)) if xmlChapter.get("n") != "nonumber": xmlEOAchapter.set("id", xmlChapter.get("id")) xmlChapterHeadline = xmlChapter.find(".//head") if xmlChapter.get("id") in dictChapters: xmlEOAchapter.set("number", dictChapters[xmlChapter.get("id")]) else: xmlEOAchapter.set("number", "") logging.info("-----------------------------------------------------") logging.info(libeoaconvert.gettext(xmlChapterHeadline)) xmlEOAchapter.append(djangoParseHeadline(xmlChapterHeadline)) # Deal with EOAauthor if xmlChapter.find(".//EOAauthor") is not None: xmlEOAchapter.append(xmlChapter.find(".//EOAauthor")) # Attache enclosing Part to Chapter, see django structure for this purpose if xmlChapter.getparent().tag == "div0": if xmlChapter.getparent().get("id") not in listPartIDs: listPartIDs.append(xmlChapter.getparent().get("id")) xmlPartHeadline = xmlChapter.getparent().find("head") xmlPartHeadline.tag = "EOAparthtml" xmlEOAchapter.append(xmlPartHeadline) # Append Chapter to xmlEOAdocument xmlEOAdocument.append(xmlEOAchapter) # iterate over children of Chapter for xmlChapterChild in xmlChapter.iterchildren(): if xmlChapterChild.tag == "div2": # Process Section Title xmlEOAsection = etree.Element("EOAsection") xmlEOAsection.set("order", str(intObjectNumber)) if xmlChapterChild.get("n") != "nonumber": xmlEOAsection.set("id", xmlChapterChild.get("id")) xmlEOAsection.set("number", dictSections[xmlChapterChild.get("id")]) intObjectNumber += 1 xmlHead = xmlChapter.find(".//head") logging.debug("Section '%s'" % libeoaconvert.gettext(xmlHead)) xmlEOAsection.append(djangoParseHeadline(xmlHead)) xmlEOAchapter.append(xmlEOAsection) # Iterate over Children of Section for xmlSectionChild in xmlChapterChild.iterchildren(): if xmlSectionChild.tag == "div3": # Process Subsection Title xmlEOAsubsection = etree.Element("EOAsubsection") xmlEOAsubsection.set("order", str(intObjectNumber)) if xmlSectionChild.get("n") != "nonumber": xmlEOAsubsection.set("id", xmlSectionChild.get("id")) xmlEOAsubsection.set("number", dictSections[xmlSectionChild.get("id")]) intObjectNumber += 1 xmlHead = xmlSectionChild.find(".//head") logging.debug("Subsection '%s'" % libeoaconvert.gettext(xmlHead)) xmlEOAsubsection.append(djangoParseHeadline(xmlHead)) xmlEOAchapter.append(xmlEOAsubsection) # Iterate over children of Subsection for xmlSubsectionChild in xmlSectionChild.iterchildren(): if xmlSubsectionChild.tag == "div4": # Process Subsubsection Title xmlEOAsubsubsection = etree.Element("EOAsubsubsection") xmlEOAsubsubsection.set("order", str(intObjectNumber)) intObjectNumber += 1 xmlHead = xmlSubsectionChild.find(".//head") logging.debug(libeoaconvert.gettext(xmlHead)) xmlEOAsubsubsection.append(djangoParseHeadline(xmlHead)) xmlEOAchapter.append(xmlEOAsubsubsection) # Iterate over children of Subsubsection for xmlSubsubsectionChild in xmlSubsectionChild.iterchildren(): xmlEOAchapter.append(djangoParseObject(xmlSubsubsectionChild)) else: xmlEOAchapter.append(djangoParseObject(xmlSubsectionChild)) elif xmlSectionChild.tag == "div4": # Process Subsubsection Title xmlEOAsubsubsection = etree.Element("EOAsubsubsection") xmlEOAsubsubsection.set("order", str(intObjectNumber)) intObjectNumber += 1 xmlHead = xmlSectionChild.find(".//head") xmlEOAsubsubsection.append(djangoParseHeadline(xmlHead)) xmlEOAchapter.append(xmlEOAsubsubsection) # Iterate over children of Subsubsection for xmlSubsubsectionChild in xmlSectionChild.iterchildren(): if xmlSubsubsectionChild.tag == "div5": logging.debug("jubel") # although it's div5, promote it to subsubsection xmlEOAparasection = etree.Element("EOAsubsubsection") # xmlEOAparasection = etree.Element("EOAparasection") xmlEOAparasection.set("order", str(intObjectNumber)) intObjectNumber += 1 xmlHead = xmlSubsubsectionChild.find(".//head") logging.debug(libeoaconvert.gettext(xmlHead)) xmlEOAparasection.append(djangoParseHeadline(xmlHead)) xmlEOAchapter.append(xmlEOAparasection) for xmlParasectionChild in xmlSubsubsectionChild.iterchildren(): xmlEOAchapter.append(djangoParseObject(xmlParasectionChild)) else: xmlEOAchapter.append(djangoParseObject(xmlSubsubsectionChild)) else: xmlEOAchapter.append(djangoParseObject(xmlSectionChild)) else: xmlEOAchapter.append(djangoParseObject(xmlChapterChild)) intChapterNumber += 1 libeoaconvert.debug_xml_here( xmlTree, "afterchapter", DEBUG_DIR ) logging.info("----------------------------------------------") logging.info("Processing Facsimile Parts") listModes = ["text", "textPollux", "xml"] strBasicURL = "http://mpdl-system.mpiwg-berlin.mpg.de/mpdl/interface/page-fragment.xql?document=" parserECHO = etree.XMLParser() xmlParts = xmlTree.findall("//div0") intFacNumber = 1 for xmlPart in xmlParts: intObjectNumber = 1 intFacPartNumber = 1 if xmlPart.find(".//EOAfacsimilepart") is None: continue xmlEOAfacsimilepart = etree.Element("EOAfacsimilepart") xmlEOAfacsimilepart.set("order", str(intChapterNumber)) xmlEOAfacsimileparthead = xmlPart.find(".//head") for xmlChild in xmlEOAfacsimileparthead: if xmlChild.tag == "hi": xmlChild.tag = "em" del xmlChild.attrib["rend"] xmlEOAfacsimilepart.append(xmlEOAfacsimileparthead) intChapterNumber += 1 xmlEOAdocument.append(xmlEOAfacsimilepart) xmlFacsimilepages = xmlPart.findall(".//EOAfacsimilepage") intFacPageNumber = 1 for xmlFacsimilepage in xmlFacsimilepages: strImageFile = xmlFacsimilepage.find(".//file").text strLabel = xmlFacsimilepage.find(".//label").text strPagenumber = xmlFacsimilepage.find(".//pagenumber").text or "" xmlEOAfacsimilepage = etree.Element("EOAfacsimilepage") xmlEOAfacsimilepage.set("order", str(intObjectNumber)) # TODO: Hier noch irgendwie (fehlendem) Suffix der Datei umgehen. Und ggf. Dateien Konvertieren strImageFile = strImageFile.rstrip("\n") strImageFileDir = os.path.dirname(strImageFile) strImageFileDir = re.sub("/", "", strImageFileDir) strImageFileName = os.path.basename(strImageFile) shutil.copy( PUBLICATION_DIR / strImageFile, OUTPUT_DIR / "images" / (strImageFileDir + strImageFileName) ) # shutil.copy(os.getcwd() + "/" + strImageFile, os.getcwd() + "/CONVERT/django/images/" + strImageFileDir + strImageFileName) intObjectNumber += 1 # Download transcription for this Page fulltext_string = xmlFacsimilepage.find(".//fulltext").text if fulltext_string is not None: logging.debug(f"Found a link to full text: {fulltext_string}") if fulltext_string.find(",") == -1: logging.info("Fulltext is linked in the document.") # hier weiter!!! else: strFacsimileURL = re.split(",", fulltext_string)[0] strFacsimilePage = re.split(",", fulltext_string)[1] for strMode in listModes: strURL = strBasicURL + strFacsimileURL + "&pn=" + strFacsimilePage + "&mode=" + strMode logging.debug("Processing Facsimile : " + strURL) xmlECHOtree = etree.parse(strURL, parserECHO) # Remove ECHO-namespaces objectify.deannotate(xmlECHOtree, xsi_nil=True) etree.cleanup_namespaces(xmlECHOtree) xmlDivs = xmlECHOtree.findall(".//div") for xmlDiv in xmlDivs: if xmlDiv.get("class") == "pageContent": # Create new EOA-Element xmlEOAfacsimileelement = etree.Element("EOAfacsimileelement") xmlEOAfacsimileelement.set("type", strMode) # Fix Images in the <div>-Element xmlImages = xmlDiv.findall(".//img") intFacImgNumber = 1 for xmlImage in xmlImages: strImageSrc = xmlImage.get("src") strCommand = "{cmd} {src} -o {dst}".format( cmd = curl, src = strImageSrc, dst = OUTPUT_DIR / "images" / ("facsupplements_" + str(intFacNumber) + "_" + str(intFacPageNumber) + "_" + str(intFacImgNumber) + ".jpg") ) # strCommand = "curl " + strImageSrc + " -o CONVERT/django/images/facsupplements_" + str(intFacNumber) + "_" + str(intFacPageNumber) + "_" + str(intFacImgNumber) + ".jpg" listArguments = shlex.split(strCommand) try: exeShell = subprocess.check_output(listArguments, shell=False, universal_newlines=True) xmlImage.set("src", "facsupplements_" + str(intFacNumber) + "_" + str(intFacPageNumber) + "_" + str(intFacImgNumber) + ".jpg") except: xmlImage.tag = "temp" intFacImgNumber += 1 # Change of scr of img-Element xmlEOAfacsimileelement.append(xmlDiv) xmlEOAfacsimilepage.append(xmlEOAfacsimileelement) intFacPageNumber += 1 xmlEOAfacsimilepage.set("file", strImageFileDir + strImageFileName) xmlEOAfacsimilepage.set("label", str(strLabel)) xmlEOAfacsimilepage.set("pagenumber", str(strPagenumber)) xmlEOAfacsimilepart.append(xmlEOAfacsimilepage) intFacNumber =+ 1 etree.strip_tags(xmlDjangoTree, "temp") logging.info("----------------------------------------------") logging.info("Processing and linking Footnotes for django") def bring_footnote_down_django(footnote, fragment, footnote_number, object_number, unique_id, destination): """ captures reusable behavior from the existing code potentially, some of the old code could be replaced by calls to this helper usage: intObjectNumber = bring_footnote_down_django(xmlFootnote, "fn"+str(intFootnoteNumber), str(intFootnoteNumber), intObjectNumber, tmpStrUID, xmlResult) unfortunately, returning the result seemed like a better idea than mutating the global variable """ kids = list(footnote.getchildren()) footnote_text = footnote.text or "" replace_footnote_with_sup(footnote) footnote.set("class", "footnote") anchor = etree.Element("a") anchor.set("href", "#" + fragment) # "fn" + str(intFootnoteNumber) anchor.text = footnote_number # str(intFootnoteNumber) footnote.append(anchor) foot = etree.Element("EOAfootnote") foot.set("order", str(object_number)) object_number += 1 foot.set("number", footnote_number) anchor_number = next( iter( ( parent.get("order") for parent in footnote.iterancestors() if parent.get("order") is not None ) ) ) foot.set("anchor", anchor_number) foot.set("id", unique_id) foot.text = footnote_text for kid in kids: if "EOAequationnonumber" == kid.tag: cwd = os.getcwd() shutil.copy( "%s/items/%s" % (cwd, kid.get("filename")), "%s/images/" % cwd, ) foot.append(kid) destination.append(foot) return object_number # def bring_footnote_down_django ends here xmlEOAchapters = xmlEOAdocument.findall(".//EOAchapter") debug_chapters(xmlEOAchapters) translation_xml = etree.parse( str( TRANSLATION_FILE ) ) dictLangFootnotes = translation_xml.find("//entry[@name='footnotes']").attrib for xmlEOAchapter in xmlEOAchapters: groupings = libeoaconvert.get_bigfoot_data(xmlEOAchapter) has_old = 0 != len(xmlEOAchapter.findall(".//note[@place='Inline']")) has_new = 0 != len( [ # flatten note for grouping, notes in groupings for note in notes ] ) # XOR falls through, AND is an error (that should have already been thrown during the epub phase), and NOR skips to the next chapter if has_old: if has_new: raise FootnoteError("This chapter contains both old-style footnotes and new-style footnotes") else: if not has_new: continue # Find out running order of last item the chapter # Hier pro FN zunächst die EOAequationnonumber in <p> korrigieren # Dann pro FN die Kindelemente abarbeiten und an die neue FN dran hängen # Ggf. aufpassen, ob ein Absatz mit indent versehen ist, dann blockquote drum herum machen xmlElement = xmlEOAchapter[(len(xmlEOAchapter)-1)] logging.debug(etree.tostring(xmlElement)) intObjectNumber = (int(xmlElement.get("order")) + 1) intFootnoteNumber = 1 xmlResult = etree.Element("temp") xmlEOAsection = etree.Element("EOAsection") xmlEOAsection.set("order", str(intObjectNumber)) intObjectNumber += 1 xmlHead = etree.Element("head") xmlHead.text = dictLangFootnotes[libeoaconvert.two_letter_language(xmlEOAchapter.get("language"))] xmlEOAsection.append(xmlHead) xmlResult.append(xmlEOAsection) for grouping, notes in groupings: for index, note in enumerate(notes): # do for the new-style notes what the old code did for the other footnotes fntext = str(index+1) if "lower-latin" == grouping: fntext = alph_footnote_index(index) unique_id = "fn%s" % fntext intObjectNumber = bring_footnote_down_django(note, unique_id, fntext, intObjectNumber, unique_id, xmlResult) intFootnoteNumber = 1 xmlFootnotes = xmlEOAchapter.findall(".//note[@place='Inline']") for xmlFootnote in xmlFootnotes: xmlFootnoteContent = xmlFootnote.getchildren() strFootnoteText = xmlFootnote.text or "" tmpTail = xmlFootnote.tail tmpStrUID = xmlFootnote.get("id") logging.debug(f"Looking at footnote {tmpStrUID}.") xmlFootnote.clear() xmlFootnote.tail = tmpTail xmlFootnote.tag = "sup" xmlFootnote.set("class", "footnote") xmlFootnoteLink = etree.Element("a") xmlFootnoteLink.set("href", "#fn" + str(intFootnoteNumber)) xmlFootnoteLink.text = str(intFootnoteNumber) xmlFootnote.append(xmlFootnoteLink) xmlEOAfootnote = etree.Element("EOAfootnote") xmlEOAfootnote.set("order", str(intObjectNumber)) intObjectNumber += 1 xmlEOAfootnote.set("number", str(intFootnoteNumber)) for xmlParent in xmlFootnote.iterancestors(): if xmlParent.get("order") is not None: strFootnoteAnchorNumber = xmlParent.get("order") break xmlEOAfootnote.set("anchor", strFootnoteAnchorNumber) xmlEOAfootnote.set("id", tmpStrUID) xmlEOAfootnote.text = strFootnoteText for xmlElement in xmlFootnoteContent: if xmlElement.tag != "p": surrounding_p = etree.fromstring("""<p></p>""") if xmlElement.tag == "EOAequationnonumber": shutil.copy( PUBLICATION_DIR / "items" / xmlElement.get("filename"), OUTPUT_DIR / "images/" ) elif xmlElement.tag == "EOAverse": verse_tail = xmlElement.tail xmlElement.tail = "" xmlElement.tag = "span" xmlElement.set("style", "verse") versecontent_bytes = etree.tostring(xmlElement) versecontent_string = versecontent_bytes.decode("utf-8") xml_verselines = versecontent_string.split("\n") logging.debug("Removing surrounding EOAverse tags") xml_result_string = xml_verselines[0] for xml_verseline in xml_verselines[1:]: xml_result_string += f"<br/>{xml_verseline}" verse_paragraph = etree.fromstring(xml_result_string) xmlElement.tail = verse_tail surrounding_p.append(xmlElement) elif xmlElement.tag == "span": surrounding_p.append(xmlElement) elif xmlElement.tag == "EOAfigurenonumber": surrounding_p = etree.fromstring("""<img/>""") strImageFileString = xmlElement.find(".//file").text strImageFileString = strImageFileString.rstrip("\n") strImageFileDir = os.path.dirname(strImageFileString) strImageFileDir = re.sub("/", "", strImageFileDir) strImageFileName = os.path.basename(strImageFileString) strImageFileNamewoSuffix = os.path.splitext(strImageFileName)[0] shutil.copy( PUBLICATION_DIR / strImageFileString, OUTPUT_DIR / "images" / "embedded" / (strImageFileDir + strImageFileName) ) surrounding_p.set("src", strImageFileDir + strImageFileName) surrounding_p.set("width", xmlElement.find(".//width").text + "%;") xmlElement = surrounding_p else: logging.debug("Footnote paragraph") xmlEOAfootnote.append(xmlElement) xmlResult.append(xmlEOAfootnote) intFootnoteNumber += 1 xmlEOAchapter.append(xmlResult) # Remove temp-Tag etree.strip_tags(xmlDjangoTree, "temp") logging.info("----------------------------------------------") logging.info("Processing various Elements") for xmlEOAchapter in xmlEOAchapters: xmlEmphasized = xmlEOAchapter.findall(".//hi") for xmlEmph in xmlEmphasized: if xmlEmph.get("rend") == "it": xmlEmph.tag = "em" del xmlEmph.attrib["rend"] xmlHyperlinks = xmlEOAchapter.findall(".//xref") for xmlHyperlink in xmlHyperlinks: libeoaconvert.format_hyperlinks_django_epub(xmlHyperlink, strLanguage) # Convert bold text xmlBolds = xmlEOAchapter.findall(".//EOAbold") for xmlBold in xmlBolds: if xmlBold.get("rend") == "bold": xmlBold.tag = "b" del xmlBold.attrib["rend"] # Convert EOAup to <sup> xmlUps = xmlEOAchapter.findall(".//EOAup") for xmlUp in xmlUps: xmlUp.tag = "sup" # Convert EOAdown to <sub> xmlDowns = xmlEOAchapter.findall(".//EOAdown") for xmlDown in xmlDowns: xmlDown.tag = "sub" # Convert EOAst to <span> xmlStrikeouts = xmlEOAchapter.findall(".//EOAst") for xmlStrikeout in xmlStrikeouts: xmlStrikeout.tag = "span" xmlStrikeout.set("style", "text-decoration: line-through;") # Convert letter-spacing into something nice xmlLetterspaceds = xmlEOAchapter.findall(".//EOAls") for xmlLetterspaced in xmlLetterspaceds: xmlLetterspaced.tag = "span" xmlLetterspaced.set("style", "letter-spacing: 0.5em;") # Convert letter-spacing into something nice xmlCaps = xmlEOAchapter.findall(".//EOAcaps") for xmlCap in xmlCaps: xmlCap.tag = "span" xmlCap.set("style", "font-variant:small-caps;") # Convert EOAineq into appropriate IMG-Tags xmlInlineEquations = xmlEOAchapter.findall(".//EOAineq") for xmlInlineEquation in xmlInlineEquations: xmlInlineEquation.tag = "img" xmlInlineEquation.set("class", "EOAineq") xmlInlineEquation.set("alt", xmlInlineEquation.get("TeX")) shutil.copy( INPUT_DIR / "items" / xmlInlineEquation.get("src"), OUTPUT_DIR / "images" / xmlInlineEquation.get("src") ) # shutil.copy(os.getcwd() + "/items/" + xmlInlineEquation.get("src"), os.getcwd() + "/CONVERT/django/images/" + xmlInlineEquation.get("src")) # Convert EOAchem into appropriate IMG-Tags xml_inline_chems = xmlEOAchapter.findall(".//EOAchem") for xml_inline_chem in xml_inline_chems: xml_inline_chem.tag = "img" xml_inline_chem.set("class", "EOAineq") xml_inline_chem.set("alt", xml_inline_chem.get("TeX")) shutil.copy( INPUT_DIR / "items" / xml_inline_chem.get("src"), OUTPUT_DIR / "images" / xml_inline_chem.get("src") ) # shutil.copy(os.getcwd() + "/items/" + xml_inline_chem.get("src"), os.getcwd() + "/CONVERT/django/images/" + xml_inline_chem.get("src")) # Convert EOAinline into appropriate IMG-Tags xmlInlineElements = xmlEOAchapter.findall(".//EOAinline") for xmlInlineElement in xmlInlineElements: xmlInlineElement.tag = "img" xmlInlineElement.set("class", "EOAinline") xmlInlineElement.set("alt", "") xmlInlineElement.set("class", "eoainlineimage") strInlineElementFilePath = xmlInlineElement.text strInlineElementFileName = os.path.basename(strInlineElementFilePath) strInlineElementDirName = os.path.dirname(strInlineElementFilePath) strInlineElementSubDirName = os.path.dirname(strInlineElementFilePath).split(os.path.sep)[-1] xmlInlineElement.text = None xmlInlineElement.set("src", strInlineElementSubDirName + strInlineElementFileName) logging.debug(f"{strInlineElementDirName} is dirname, {strInlineElementFileName} is filename/basepath") logging.debug(f"""copy from {PUBLICATION_DIR / strInlineElementDirName / strInlineElementFileName} to {OUTPUT_DIR / "images/embedded" / (strInlineElementDirName + strInlineElementFileName)}""") shutil.copy( PUBLICATION_DIR / strInlineElementDirName / strInlineElementFileName, OUTPUT_DIR / "images/embedded" / (strInlineElementSubDirName + strInlineElementFileName) ) # shutil.copy(os.getcwd() + "/" + strInlineElementDirName + "/" + strInlineElementFileName, os.getcwd() + "/CONVERT/django/images/embedded/" + strInlineElementDirName + strInlineElementFileName) strNewImagePath = OUTPUT_DIR / "images/embedded" / (strInlineElementSubDirName + strInlineElementFileName) # strNewImagePath = os.getcwd() + "/CONVERT/django/images/embedded/" + strInlineElementDirName + strInlineElementFileName strCommand = GM_PATH + " convert " + str(strNewImagePath) + " -resize 20x20 " + str(strNewImagePath) listArguments = shlex.split(strCommand) subprocess.check_output(listArguments, shell=False) # Change EOAcitenumeric into a span to create approriate link xmlEOAcitenumerics = xmlEOAchapter.findall(".//EOAcitenumeric") for xmlEOAcitenumeric in xmlEOAcitenumerics: xmlEOAcitenumeric.tag = "span" xmlEOAcitenumeric.set("class", "citation") xmlEOAcitenumeric.set("rel", "popover") # Change EOAciteauthoryear into a span to create approriate link xmlEOAciteauthoryears = xmlEOAchapter.findall(".//EOAciteauthoryear") for xmlEOAciteauthoryear in xmlEOAciteauthoryears: xmlEOAciteauthoryear.tag = "span" xmlEOAciteauthoryear.set("class", "citation") xmlEOAciteauthoryear.set("rel", "popover") # Change EOAciteauthoryear into a span to create approriate link xmlEOAciteyears = xmlEOAchapter.findall(".//EOAciteyear") for xmlEOAciteyear in xmlEOAciteyears: xmlEOAciteyear.tag = "span" xmlEOAciteyear.set("class", "citation") xmlEOAciteyear.set("rel", "popover") # Change EOAciteauthoryear into a span to create approriate link xmlEOAcitemanuals = xmlEOAchapter.findall(".//EOAcitemanual") for xmlEOAcitemanual in xmlEOAcitemanuals: xmlEOAcitemanual.tag = "span" xmlEOAcitemanual.set("class", "citation") xmlEOAcitemanual.set("rel", "popover") logging.info("----------------------------------------------") logging.info("Processing Cross References") # Substitute References with their targets (wit links) for xmlEOAchapter in xmlEOAchapters: # for hyperimage collages originalcontents = xmlEOAchapter.findall(".//originalcontents") if originalcontents is not None: for originalcontent in originalcontents: previous_element = originalcontent.getprevious() if originalcontent.getparent().tag == "EOAref": pass elif previous_element.tag != "EOAref": logging.error("Found a stray originalcontents element.") else: oc_tail = originalcontent.tail originalcontent.tail = "" previous_element.append(originalcontent) if previous_element.tail is not None: logging.warning("Appending the old tail of EOAref") previous_element.tail += oc_tail else: previous_element.tail = oc_tail else: logging.debug("No originalcontents elements found.") xmlReferences = xmlEOAchapter.findall(".//EOAref") for xmlReference in xmlReferences: strResult = "!!! Cross Reference !!!" strChapterOrder = "" strObjectOrder = "" xmlReferenceLabel = xmlReference.find("Label") xmlReferenceLabelText = xmlReferenceLabel.text xmlReferenceRef = xmlReference.find("ref") xmlReferenceRefTarget = xmlReferenceRef.get("target") if xmlReferenceLabelText in dictEquations: # Grab Number from Dictionary strResult = dictEquations[xmlReferenceLabelText] # Go through all equations and find the corresponding Equation xmlEOAequations = xmlEOAdocument.findall(".//EOAequation") for xmlEOAequation in xmlEOAequations: tmpReferenceLabelText = xmlEOAequation.get("label") if xmlReferenceLabelText == tmpReferenceLabelText: logging.debug("Successfully found link to array formula: %s" % strResult) for xmlParent in xmlEOAequation.iterancestors(): if xmlParent.tag == "EOAchapter": strChapterOrder = xmlParent.get("order") strObjectOrder = xmlEOAequation.get("order") if xmlReferenceRefTarget in dictEquations: # Grab Number from Dictionary strResult = dictEquations[xmlReferenceRefTarget] # Go through all equations and find the corresponding Equation xmlEOAequations = xmlEOAdocument.findall(".//EOAequation") for xmlEOAequation in xmlEOAequations: tmpReferenceRefTarget = xmlEOAequation.get("uid") if xmlReferenceRefTarget == tmpReferenceRefTarget: logging.debug("Successfully found link to normal formula: %s" % strResult) for xmlParent in xmlEOAequation.iterancestors(): if xmlParent.tag == "EOAchapter": strChapterOrder = xmlParent.get("order") strObjectOrder = xmlEOAequation.get("order") if xmlReferenceRefTarget in dictLists: logging.debug("Found link to list.") strResult = dictLists[xmlReferenceRefTarget] xmlEOAlistitem = xmlEOAdocument.xpath("//EOAchapter/*[contains(@id, $targetuid)]", targetuid = xmlReferenceRefTarget)[0] for xmlParent in xmlEOAlistitem.iterancestors(): if xmlParent.tag == "EOAchapter": strChapterOrder = xmlParent.get("order") strObjectOrder = xmlEOAlistitem.get("order") if xmlReferenceRefTarget in dictChapters: logging.debug("Found link to chapter.") strResult = dictChapters[xmlReferenceRefTarget] for xmlEOAchapter in xmlEOAdocument.findall(".//EOAchapter"): if xmlEOAchapter.get("id") == xmlReferenceRefTarget: logging.debug("Successfully handled link to a chapter: %s" % strResult) strObjectOrder = "top" strChapterOrder = xmlEOAchapter.get("order") if xmlReferenceRefTarget in dictTheorems: logging.debug("Found link to ein Theorem") strResult = dictTheorems[xmlReferenceRefTarget] for xmlEOAtheorem in xmlEOAdocument.findall(".//EOAtheorem"): if xmlEOAtheorem.get("uid") == xmlReferenceRefTarget: logging.debug("Successfully handled link to a theorem: %s " % strResult) for xmlParent in xmlEOAtheorem.iterancestors(): if xmlParent.tag == "EOAchapter": strObjectOrder = xmlEOAtheorem.get("order") strChapterOrder = xmlParent.get("order") if xmlReferenceRefTarget in dictSections: logging.debug("Found link to section") strResult = dictSections[xmlReferenceRefTarget] xmlEOAsections = xmlEOAdocument.findall(".//EOAsection") for xmlEOAsection in xmlEOAsections: tmpReferenceRefTarget = xmlEOAsection.get("id") if xmlReferenceRefTarget == tmpReferenceRefTarget: logging.debug("Successfully handled link to section: %s " % strResult) for xmlParent in xmlEOAsection.iterancestors(): if xmlParent.tag == "EOAchapter": strChapterOrder = xmlParent.get("order") strObjectOrder = xmlEOAsection.get("order") xmlEOAsubsections = xmlEOAdocument.findall(".//EOAsubsection") for xmlEOAsubsection in xmlEOAsubsections: tmpReferenceRefTarget = xmlEOAsubsection.get("id") if xmlReferenceRefTarget == tmpReferenceRefTarget: logging.debug("Successfully handled link to subsection %s: " % strResult) for xmlParent in xmlEOAsubsection.iterancestors(): if xmlParent.tag == "EOAchapter": strChapterOrder = xmlParent.get("order") strObjectOrder = xmlEOAsubsection.get("order") if xmlReferenceRefTarget in dictFigures: logging.debug("Found link to figure") strResult = dictFigures[xmlReferenceRefTarget] xmlEOAfigures = xmlEOAdocument.findall(".//EOAfigure") for xmlEOAfigure in xmlEOAfigures: tmpReferenceRefTarget = xmlEOAfigure.get("id") if xmlReferenceRefTarget == tmpReferenceRefTarget: logging.debug("Successfully handled link to figure: %s" % strResult) for xmlParent in xmlEOAfigure.iterancestors(): if xmlParent.tag == "EOAchapter": strChapterOrder = xmlParent.get("order") strObjectOrder = xmlEOAfigure.get("order") if xmlReferenceRefTarget in dictFootnotes: logging.debug("Found link to footnote") strResult = dictFootnotes[xmlReferenceRefTarget] xmlEOAfootnotes = xmlEOAdocument.findall(".//EOAfootnote") for xmlEOAfootnote in xmlEOAfootnotes: tmpReferenceRefTarget = xmlEOAfootnote.get("id") if xmlReferenceRefTarget == tmpReferenceRefTarget: logging.debug("Successfully handled link to footnote: %s" % strResult) for xmlParent in xmlEOAfootnote.iterancestors(): if xmlParent.tag == "EOAchapter": strChapterOrder = xmlParent.get("order") strObjectOrder = xmlEOAfootnote.get("order") if xmlReferenceRefTarget in dictTables: logging.debug("Found link to table") strResult = dictTables[xmlReferenceRefTarget] xmlEOAtables = xmlEOAdocument.findall(".//EOAtable") for xmlEOAtable in xmlEOAtables: tmpReferenceRefTarget = xmlEOAtable.get("label") if xmlReferenceLabelText == tmpReferenceRefTarget: logging.debug("Successfully handled link to table: %s" % strResult) for xmlParent in xmlEOAtable.iterancestors(): if xmlParent.tag == "EOAchapter": strChapterOrder = xmlParent.get("order") strObjectOrder = xmlEOAtable.get("order") tmpTail = xmlReference.tail or "" originalcontents = xmlReference.find("originalcontents") if xmlReference.get("type") == "collage": ref_is_collage = True else: ref_is_collage = False xmlReference.clear() if originalcontents is not None: logging.info("Found originalcontents") xmlReference.append(originalcontents) else: xmlReference.text = strResult xmlReference.tail = tmpTail xmlReference.tag = "a" # hyperimage if xmlReferenceRef.get("data-hilayer"): xmlReference.set("data-hilayer", xmlReferenceRef.get("data-hilayer")) if xmlReference.text: logging.debug(xmlReference.text) xmlReference.text if xmlReferenceRef.get("hitarget"): xmlReference.set("class", "HILink") href_string = "#" + xmlReferenceRef.get("hitarget") elif strObjectOrder: href_string = "../" + strChapterOrder + "/index.html#" + strObjectOrder else: href_string = "strChapterOrder missing" logging.warning("strObjectOrder is missing!") xmlReference.set("href", href_string) if ref_is_collage: xmlReference.set("type", "collage") else: pass logging.info("----------------------------------------------") logging.info("Processing Page References") for xmlEOAchapter in xmlEOAchapters: xmlPageReferences = xmlEOAchapter.findall(".//EOApageref") strResult = "!!! Page Reference !!!" for xmlReference in xmlPageReferences: xmlReferenceLabel = xmlReference.find("Label") xmlReferenceLabelText = xmlReferenceLabel.text xmlReferenceRef = xmlReference.find("ref") xmlReferenceRefTarget = xmlReferenceRef.get("target") if xmlReferenceLabelText in dictPagelabels: logging.debug("Found link to page: %s" % xmlReferenceLabelText) strResult = dictPagelabels[xmlReferenceLabelText] xmlReference.text = strResult for xmlChild in xmlReference.iterchildren(): xmlReference.remove(xmlChild) # Check, if EOApageref points to a Facsimile-Page # If yes, make a href to the facsimile xmlEOAfacsimilepages = xmlEOAdocument.findall(".//EOAfacsimilepage") for xmlEOAfacsimilepage in xmlEOAfacsimilepages: if xmlEOAfacsimilepage.get("label") == xmlReferenceLabelText: logging.debug("Found cross reference to facsimile.") xmlReference.tag = "a" strPartOrder = xmlEOAfacsimilepage.getparent().get("order") strFacsimileOrder = xmlEOAfacsimilepage.get("order") logging.debug(strFacsimileOrder) xmlReference.set("href", "../" + strPartOrder + "/" + strFacsimileOrder + ".html") logging.info("----------------------------------------------") logging.info("Normalizing Index Entries") for xmlEOAchapter in xmlEOAchapters: xml_EOA_indices = xmlEOAchapter.xpath(".//EOAindex | .//EOAindexperson | .//EOAindexlocation") for xmlEOAindex in xml_EOA_indices: # Using the gettext function here, because of subelements # strEOAindextext = xmlEOAindex.text strEOAindextext = libeoaconvert.gettext(xmlEOAindex) strEOAindextext = strEOAindextext.replace("\n", " ") index_children = xmlEOAindex.getchildren() if index_children is not None: for sub_element in index_children: xmlEOAindex.remove(sub_element) xmlEOAindex.text = None listFirstPart = re.split('\|', strEOAindextext) tmpEntry = listFirstPart[0] listSecondPart = re.split('\!', tmpEntry) strMainEntry = listSecondPart[0] # Check if a sortkey is present via @ listSortKey = re.split('@', strMainEntry) if len(listSortKey) == 2: xmlEOAindex.set("main", listSortKey[0]) xmlEOAindex.set("display", listSortKey[1]) else: xmlEOAindex.set("main", strMainEntry) if len(listSecondPart) > 1: strSecondPart = listSecondPart[1] listSecondarySortkey = re.split('@', strSecondPart) if len(listSecondarySortkey) == 2: xmlEOAindex.set("secondary", listSecondarySortkey[0]) xmlEOAindex.set("secondarydisplay", listSecondarySortkey[1]) else: xmlEOAindex.set("secondary", strSecondPart) if len(listFirstPart) > 1: strAddition = listFirstPart[1] if strAddition == "textbf": xmlEOAindex.set("bold", "true") tmpseealso = re.match('seealso', strAddition) if tmpseealso != None: tmpAddition = re.sub('seealso', '', strAddition) xmlEOAindex.set("seealso", tmpAddition) # Entries containing seealso are omitted for the time being xmlEOAindex.tag = "temp" tmpsee = re.match('^see(?!also)', strAddition) if tmpsee != None: tmpAddition = re.sub('see', '', strAddition) xmlEOAindex.set("see", tmpAddition) # Entries containing seealso are omitted for the time being xmlEOAindex.tag = "temp" # Figure out parent chapter number and parent Element order for xmlParent in xmlEOAindex.iterancestors(): if xmlParent.get("order") != None and xmlParent.tag != "EOAchapter": xmlEOAindex.set("elementorder", xmlParent.get("order")) if xmlParent.get("order") != None and xmlParent.tag == "EOAchapter": xmlEOAindex.set("chapterorder", xmlParent.get("order")) # logging.info(etree.tostring(xmlEOAindex)) etree.strip_tags(xmlDjangoTree, "temp") logging.info("----------------------------------------------") logging.info("Removing Duplicate Index Entries") for xmlEOAchapter in xmlEOAchapters: for xmlChild in xmlEOAchapter.iterchildren(): dictEntries = {} xml_EOA_indices = xmlChild.xpath(".//EOAindex | .//EOAindexperson | .//EOAindexlocation") for xmlEOAindex in xml_EOA_indices: listEntry = [] strEntry = xmlEOAindex.get("main") if strEntry in dictEntries: strSubentry = xmlEOAindex.get("secondary") if strSubentry in dictEntries[strEntry] or strSubentry == None: if (xmlChild.get("see") is None) and (xmlChild.get("seealso") is None): xmlEOAindex.tag = "temp" else: dictEntries[strEntry].append(strSubentry) else: dictEntries[strEntry] = listEntry logging.info("----------------------------------------------") logging.info("Removing Index Entries in Footnotes") for xmlEOAchapter in xmlEOAchapters: for xmlChild in xmlEOAchapter.iterchildren(): dictEntries = {} xml_EOA_indices = xmlChild.xpath(".//EOAindex | .//EOAindexperson | .//EOAindexlocation") for xmlEOAindex in xml_EOA_indices: for xmlParent in xmlEOAindex.iterancestors(): if xmlParent.tag == "EOAfootnote": xmlEOAindex.tag = "temp" logging.debug("Found index in footnote") logging.info("----------------------------------------------") logging.info("Sorting and Creating Regular Index") xml_regular_EOAindices = xmlDjangoTree.findall("//EOAindex") if len(xml_regular_EOAindices) != 0:# is not None: logging.debug("Sorting %s entries for regular index." % str(len(xml_regular_EOAindices))) xml_eoa_print_regular_index = make_index(xml_regular_EOAindices, index_type = "regular") libeoaconvert.debug_xml_here( xmlDjangoTree, "djangotree", DEBUG_DIR ) libeoaconvert.debug_xml_here( xmlEOAdocument, "xmleoadocument", DEBUG_DIR ) libeoaconvert.debug_xml_here( xmlTree, "xmltree", DEBUG_DIR ) # If EOAprintindex is found, append xml_eoa_print_regular_index to xmlEOAdocument xmlPrintindex = xmlTree.find(".//EOAprintindex") if xmlPrintindex is not None != 0: # Remove <p><EOAprintindex/></p> from xmlDjangoTree logging.info("found an index") xmlPrintindex.tag = "temp" xmlPrintindex.getparent().tag = "temp" xmlEOAdocument.append(xml_eoa_print_regular_index) else: logging.info("found no index") logging.info("----------------------------------------------") logging.info("Sorting and Creating Person Index") xml_person_EOAindices = xmlDjangoTree.findall("//EOAindexperson") if len(xml_person_EOAindices) != 0:# is not None: xml_eoa_print_person_index = make_index(xml_person_EOAindices, index_type = "person") # If EOAprintpersonindex is found, append xml_eoa_print_person_index to xmlEOAdocument # xmlPrintindex = xmlDjangoTree.find(".//EOAprintpersonindex") xmlPrintindex = xmlTree.find("//EOAprintpersonindex") if xmlPrintindex is not None != 0: # Remove <p><EOAprintindex/></p> from xmlDjangoTree xmlPrintindex.tag = "temp" xmlPrintindex.getparent().tag = "temp" xmlEOAdocument.append(xml_eoa_print_person_index) # doing the same for location index logging.info("----------------------------------------------") logging.info("Sorting and Creating Location Index") xml_location_EOAindices = xmlDjangoTree.findall("//EOAindexlocation") if len(xml_location_EOAindices) != 0:# is not None: xml_eoa_print_location_index = make_index(xml_location_EOAindices, index_type = "location") # If EOAprintlocationindex is found, append xml_eoa_print_location_index to xmlEOAdocument xmlPrintindex = xmlTree.find(".//EOAprintlocationindex") if xmlPrintindex is not None != 0: xmlPrintindex.tag = "temp" xmlPrintindex.getparent().tag = "temp" xmlEOAdocument.append(xml_eoa_print_location_index) ############################################################################ # Cleaning up # ############################################################################ # TODO: Die unnötigen Attribute wie id löschen # TODO: Die unnötigen Tags wie EOAlabel löschen collagelinks = xmlDjangoTree.xpath(".//a[@type='collage']/originalcontents/a") for link in collagelinks: link.tag = "temp" etree.strip_tags(xmlDjangoTree, "temp", "citetext", "EOAprintbibliography", "originalcontents", "tagtobestripped") etree.strip_elements(xmlDjangoTree, "citekey", "elementtobestripped", with_tail=False) etree.strip_attributes(xmlDjangoTree, "id-text", "id", "noindent", "type", "label", "spacebefore")#, "rend") ############################################################################ # Save xmlDjangoTree # ############################################################################ tmpFile = open( OUTPUT_DIR / "Django.xml", "w") tmpResult = etree.tostring(xmlDjangoTree, pretty_print=True, encoding="unicode") tmpFile.write(tmpResult) tmpFile.close() logging.debug(f"Wrote {OUTPUT_DIR}/Django.xml.") if args.checkpublicationcfg: check_publication_cfg(INPUT_DIR / "publication.cfg") else: pass
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
You can’t perform that action at this time.