Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
EOASkripts/src/imxml2django.py
Go to file. This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
executable file
1996 lines (1830 sloc)
94.4 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# -*- coding: utf-8; mode: python -*- | |
# Time-stamp: <2020-09-29 11:38:15 (kthoden)> | |
""" | |
Create an XML file that can be inserted into the Django database | |
of an EOAv1 installation. | |
Input file is a customized DocBook XML that has been generated either | |
with eoatex2imxml or tei2imxml. | |
""" | |
from utils.load_config import load_config, exec_command, check_executable | |
import utils.libeoaconvert as libeoaconvert | |
import pickle | |
import os | |
import sys | |
import re | |
import shutil | |
import shlex | |
import subprocess | |
import argparse | |
import configparser | |
import logging | |
from copy import deepcopy | |
from lxml import etree | |
from pathlib import Path | |
import time | |
# Location of this script; BASE_DIR is the (resolved) directory that contains it.
SCRIPT_PATH = Path(__file__)
SCRIPT_NAME = SCRIPT_PATH.name
BASE_DIR = SCRIPT_PATH.resolve().parent

# Default directories; the INPUT_DIR / OUTPUT_DIR environment variables
# take precedence over the relative fallbacks.
DEFAULT_INPUT_DIR = Path(os.environ.get('INPUT_DIR', './input'))
DEFAULT_OUTPUT_DIR = Path(os.environ.get('OUTPUT_DIR', './output'))
##################### | |
# Parsing arguments # | |
##################### | |
parser = argparse.ArgumentParser(
    formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)

# Command-line interface as a table of (flags, add_argument keywords).
# Entries are registered in the order given here.
_ARGUMENT_SPECS = (
    (
        ("-c", "--config"),
        {
            "default": BASE_DIR / "config" / "eoaconvert.cfg",
            "dest": "CONFIG_FILE",
            "help": "Name of configuration file",
            "metavar": "CONFIGURATION",
            "type": Path,
        },
    ),
    (
        ("--log-level",),
        {
            "default": "INFO",
            "help": "log level: choose between DEBUG, INFO, WARNING, ERROR, CRITICAL",
        },
    ),
    (
        ("-p", "--checkpublicationcfg"),
        {
            "help": "Check the publication.cfg for completeness.",
            "action": "store_true",
        },
    ),
    (
        ("-i", "--input-dir"),
        {
            "help": f"directory containing some intermediate xml created by previous steps. default: {DEFAULT_OUTPUT_DIR}/PUBLICATION_NAME/imxml",
            "type": Path,
        },
    ),
    (
        ("-o", "--output-dir"),
        {
            "help": f"output directory. default: {DEFAULT_OUTPUT_DIR}/PUBLICATION_NAME/django",
            "type": Path,
        },
    ),
    (
        ("PUBLICATION_DIR",),
        {
            "help": "directory containing the publication (including resources like pictures, etc.)",
            "type": Path,
        },
    ),
)
for _flags, _options in _ARGUMENT_SPECS:
    parser.add_argument(*_flags, **_options)

args = parser.parse_args()
######################## | |
# Paths to executables # | |
######################## | |
GM_PATH = "gm"
PDFCROP_EXEC = "pdfcrop"  # (part of texlive distribution)

############################
# Paths:
############################
PUBLICATION_DIR = args.PUBLICATION_DIR
# Without -i/-o the directories are derived from the name of the resolved
# publication directory below DEFAULT_OUTPUT_DIR.
INPUT_DIR = (
    args.input_dir
    if args.input_dir is not None
    else DEFAULT_OUTPUT_DIR / PUBLICATION_DIR.resolve().stem / "imxml"
)
OUTPUT_DIR = (
    args.output_dir
    if args.output_dir is not None
    else (DEFAULT_OUTPUT_DIR / PUBLICATION_DIR.resolve().stem) / "django"
)
LOG_DIR = OUTPUT_DIR / "log"
LOG_FILE = (LOG_DIR / SCRIPT_NAME).with_suffix(".log")
TEMP_DIR = OUTPUT_DIR / "tmp_files"
DEBUG_DIR = OUTPUT_DIR / "debug"

config_file = args.CONFIG_FILE
print("The configfile is %s." % config_file)

##################################
# Reading the configuration file #
##################################
CONFIG = load_config(
    config_file,
    args.log_level,
    LOG_FILE,
)

############################
# Paths to auxiliary files #
############################
TRANSLATION_FILE = BASE_DIR / CONFIG['Auxiliaries']['TRANSLATIONS']

# prepare:
logging.debug("PATH: %s", os.environ['PATH'])
check_executable(GM_PATH)
check_executable(PDFCROP_EXEC)

# exist_ok avoids the check-then-create race of "if not exists: makedirs".
TEMP_DIR.mkdir(parents=True, exist_ok=True)
DEBUG_DIR.mkdir(parents=True, exist_ok=True)

# Check for folder and necessary files.
# publication.cfg is always (re)copied from the input directory; the
# "missing" notice is only logged when the file really is not there yet.
if not (OUTPUT_DIR / "publication.cfg").exists():
    logging.info("The publication.cfg file is missing in django directory.")
if (INPUT_DIR / "publication.cfg").exists():
    shutil.copy(INPUT_DIR / "publication.cfg", OUTPUT_DIR)
    logging.info(f"Copied from {INPUT_DIR}.")
else:
    logging.error(f"Found no publication.cfg in {INPUT_DIR}. Exiting")
    sys.exit(1)

# The cover image is mandatory as well.
if (INPUT_DIR / "Cover.jpg").exists():
    shutil.copy(INPUT_DIR / "Cover.jpg", OUTPUT_DIR / "Cover.jpg")
    logging.info("Copied cover image from input directory.")
else:
    logging.error("No coverfile found. You can create a temporary one with the mkimage.py script")
    sys.exit(1)
########################################### | |
# Loading data from first conversion step # | |
########################################### | |
# NOTE(review): pickle.load can execute arbitrary code from the file it
# reads; data.pickle must only ever come from our own first conversion step.
with open(INPUT_DIR / "tmp_files" / 'data.pickle', 'rb') as f:
    data = pickle.load(f)

# Lookup tables produced by the first conversion step.
dictChapters = data["chapterdict"]
dictEquations = data["eqdict"]
dictLists = data["listdict"]
dictTheorems = data["theoremdict"]
dictSections = data["secdict"]
dictFigures = data["figdict"]
dictFootnotes = data["fndict"]
dictTables = data["tabdict"]
dictPagelabels = data["pagelabeldict"]

DEBUG_DIR.mkdir(parents=True, exist_ok=True)

xmlTree = etree.parse(str(INPUT_DIR / "IntermediateXMLFile.xml"))
libeoaconvert.debug_xml_here(
    xmlTree,
    "fresh",
    DEBUG_DIR
)

print("""
############################################################################
# Convert tralics-XML to Django Data Structure #
############################################################################
""")

# Output sub-directories for images, embedded images and files.
for _output_subdir in (
    OUTPUT_DIR / "images",
    OUTPUT_DIR / "images" / "embedded",
    OUTPUT_DIR / "files",
):
    _output_subdir.mkdir(parents=True, exist_ok=True)

# Create empty xmlTree
xmlEOAdocument = etree.Element("EOAdocument")
xmlDjangoTree = etree.ElementTree(xmlEOAdocument)

etree.strip_attributes(xmlTree, "noindent")
# Remove temp-Tag
etree.strip_tags(xmlTree, "temp")
libeoaconvert.debug_xml_here(
    xmlTree,
    "afterstriptags",
    DEBUG_DIR
)

# Write Temporary XML-Maintree.  The context manager closes the file even
# if writing fails; the explicit encoding keeps non-ASCII text from
# depending on the platform's default encoding.
ergebnis = etree.tostring(xmlTree, pretty_print=True, encoding="unicode")
with open(TEMP_DIR / "Devel_django.xml", "w", encoding="utf-8") as ergebnisdatei:
    ergebnisdatei.write(ergebnis)

# Find all Chapters from the original tralics XML
xmlChapters = xmlTree.findall("//div1")
def replace_footnote_with_sup(note):
    """
    Turn a footnote element into an empty ``sup`` element, in place.

    Text, children and attributes of ``note`` are discarded; only the tail
    (the text that follows the element) survives.  Nothing is returned.
    """
    preserved_tail = note.tail
    # clear() also resets the tail, hence the copy taken above.
    note.clear()
    note.tag, note.tail = "sup", preserved_tail
# def replace_footnote_with_sup ends here
def alph_footnote_index(fndex):
    """
    Return the lowercase Latin label for a zero-based footnote index.

    Labels continue past 26 values by adding letters: after 'z' comes
    'aa', after 'zz' comes 'aaa'.

    Raises ValueError for a negative index (which previously recursed
    until the interpreter's recursion limit was hit).

    >>> alph_footnote_index(0)
    'a'
    >>> alph_footnote_index(1)
    'b'
    >>> alph_footnote_index(24)
    'y'
    >>> alph_footnote_index(25)
    'z'
    >>> alph_footnote_index(26)
    'aa'
    >>> alph_footnote_index(27)
    'ab'
    """
    if fndex < 0:
        raise ValueError("footnote index must not be negative: %r" % (fndex,))
    alphabet = "abcdefghijklmnopqrstuvwxyz"
    letters = []
    # Letters are produced from the last position to the first.
    while True:
        fndex, remainder = divmod(fndex, len(alphabet))
        letters.append(alphabet[remainder])
        if not fndex:
            break
        # There is no zero digit in this numbering, so the quotient is
        # shifted down by one before it is converted in turn.
        fndex -= 1
    return "".join(reversed(letters))
# def alph_footnote_index ends here
def debug_chapters(xmlEOAchapters):
    """Write individual chapters to files.

    Each element of ``xmlEOAchapters`` is pretty-printed into
    ``DEBUG_DIR / "debug-chapter-NN.xml"``, with NN counting from 01 in
    iteration order.
    """
    for chap_num, chapter in enumerate(xmlEOAchapters, start=1):
        tmp_filename = DEBUG_DIR / ("debug-chapter-%02d.xml" % chap_num)
        tmp_result = etree.tostring(chapter, pretty_print=True, encoding="unicode")
        # The context manager closes the file even if the write fails; the
        # explicit encoding keeps non-ASCII text independent of the locale.
        with open(tmp_filename, "w", encoding="utf-8") as tmp_file:
            tmp_file.write(tmp_result)
# def debug_chapters ends here
def djangoParseObject(xmlElement, indent=False, listtype=None, listnumber=0, uid=None):
    """Convert one element of the intermediate XML into its Django-import form.

    The tag of ``xmlElement`` (or, for tables and equations, the presence of
    certain descendants) selects a conversion branch; the branches are tried
    in the order they appear below.  Most branches build an ``EOA...``
    element, stamp it with the running ``order`` number from the module-level
    counter ``intObjectNumber`` and advance that counter.  Figure, table and
    equation branches copy image files below ``OUTPUT_DIR / "images"``.
    Many branches move children out of ``xmlElement`` or retag it, so the
    input tree is modified.

    The module-level lookup tables ``dictFigures`` and ``dictTables`` are
    read for figure and table numbers.

    Args:
        xmlElement: the element to convert.  Nodes whose tag is not a string
            are logged and returned unchanged.
        indent: when equal to True, the result gets ``indent="True"``.
        listtype: when not None, stored in the result's ``listtype`` attribute.
        listnumber: when different from 0, stored in the ``listnumber``
            attribute.
        uid: when not None, stored in the result's ``id`` attribute.

    Returns:
        The converted element.  A ``temp`` element is returned as a container
        for several results or as an empty placeholder for dropped content.
    """
    # Only the counter is rebound here; the lookup dictionaries are just read.
    global intObjectNumber

    hi_only_types = ["hionly", "hionlycollage", "hionlysub"]

    # Check what kind of Element we have and change the data
    if not isinstance(xmlElement.tag, str):
        logging.info("SPECIAL: %s - %s" % (xmlElement, xmlElement.text))
        xmlResult = xmlElement

    elif xmlElement.tag == "EOAtranscripted":
        xmlResult = etree.Element("temp")
        xmlEOATranscription = etree.Element("EOAtranscription")
        xmlEOATranscription.set("order", str(intObjectNumber))
        intObjectNumber += 1
        xmlLeftheader = xmlElement.find(".//Leftheader")
        etree.strip_tags(xmlLeftheader, "p")
        xmlEOATranscription.append(xmlLeftheader)
        xmlRightheader = xmlElement.find(".//Rightheader")
        etree.strip_tags(xmlRightheader, "p")
        xmlEOATranscription.append(xmlRightheader)
        xmlTranscriptedtext = xmlElement.find(".//EOAtranscriptedtext")
        # The transcribed text is serialised and re-parsed, so the columns
        # are filled from a copy.
        strTranscriptedtext = etree.tostring(xmlTranscriptedtext, encoding="unicode")
        xmlLeftColumn = etree.Element("EOAtranscriptionleft")
        xmlRightColumn = etree.Element("EOAtranscriptionright")
        boolRightColumn = False
        xmlTemp = etree.XML(str(strTranscriptedtext))
        # Children before the first <pagebreak/> go into the left column,
        # everything after it into the right one; pagebreaks are dropped.
        # (A dedicated loop name keeps the xmlElement parameter intact.)
        for xmlColumnChild in xmlTemp.iterchildren():
            if xmlColumnChild.tag == "pagebreak":
                boolRightColumn = True
                continue
            if boolRightColumn:
                xmlRightColumn.append(xmlColumnChild)
            else:
                xmlLeftColumn.append(xmlColumnChild)
        xmlEOATranscription.append(xmlLeftColumn)
        xmlEOATranscription.append(xmlRightColumn)
        # Convert Images within the transcription
        logging.debug("EOAfigurenonumber")
        xmlFigures = xmlEOATranscription.findall(".//EOAfigurenonumber")
        logging.debug(xmlFigures)
        for xmlFigure in xmlFigures:
            # example 'images/1.jpg'
            strImageFileString = xmlFigure.find(".//file").text
            strImageFileString = strImageFileString.rstrip("\n")
            strImageFileDir = os.path.dirname(strImageFileString)
            strImageFileDir = re.sub("/", "", strImageFileDir)
            strImageFileName = os.path.basename(strImageFileString)
            # Arguments are passed as a list, so paths containing spaces or
            # quotes reach gm unchanged.
            listArguments = [
                GM_PATH,
                "convert",
                str(PUBLICATION_DIR / strImageFileString),
                "-resize",
                "250x250>",
                str(OUTPUT_DIR / "images/embedded" / (strImageFileDir + strImageFileName)),
            ]
            subprocess.check_output(listArguments, shell=False)
            # clear() also drops the tail, i.e. the text following the
            # figure; keep it and put it back afterwards.
            tmpStrTail = xmlFigure.tail
            xmlFigure.clear()
            xmlFigure.tail = tmpStrTail
            xmlFigure.tag = "img"
            xmlFigure.set("src", strImageFileDir + strImageFileName)
            xmlFigure.set("alt", "")
        xmlResult.append(xmlEOATranscription)

    elif xmlElement.tag == "EOAletterhead":
        xmlResult = etree.Element("temp")
        xmlEOAletterhead = etree.Element("EOAletterhead")
        xmlEOAletterrecipient = xmlElement.find(".//Recipient")
        xmlEOAletterhead.append(xmlEOAletterrecipient)
        xmlEOAletterarchive = xmlElement.find(".//Archive")
        xmlEOAletterhead.append(xmlEOAletterarchive)
        xmlEOAletteradditional = xmlElement.find(".//Additional")
        xmlEOAletterhead.append(xmlEOAletteradditional)
        xmlEOAletterpages = xmlElement.find(".//Pages")
        xmlEOAletterhead.append(xmlEOAletterpages)
        xmlEOAletterhead.set("order", str(intObjectNumber))
        intObjectNumber += 1
        xmlResult.append(xmlEOAletterhead)

    elif xmlElement.tag == "EOAfigurenonumber":
        xmlResult = etree.Element("temp")
        xmlEOAfigure = etree.Element("EOAfigurenonumber")
        # Copy Image; directory and file name are joined into one flat name
        strImageFileString = xmlElement.find(".//file").text
        strImageFileString = strImageFileString.rstrip("\n")
        strImageFileDir = os.path.dirname(strImageFileString)
        strImageFileDir = re.sub("/", "", strImageFileDir)
        strImageFileName = os.path.basename(strImageFileString)
        shutil.copy(
            PUBLICATION_DIR / strImageFileString,
            OUTPUT_DIR / "images" / (strImageFileDir + strImageFileName)
        )
        style_attribute = xmlElement.get("style")
        if style_attribute is not None:
            xmlEOAfigure.set("style", style_attribute)
        xmlEOAfigure.set("file", strImageFileDir + strImageFileName)
        xmlEOAfigure.set("width", xmlElement.find(".//width").text + "px;")
        xmlEOAfigure.set("order", str(intObjectNumber))
        intObjectNumber += 1
        xmlResult.append(xmlEOAfigure)

    elif xmlElement.tag == "EOAfigure":
        hi_figure_types = ["hitrue", "hionly", "hionlycollage", "hionlysub"]
        xmlResult = etree.Element("temp")
        # Create basic Element EOAfigure
        xmlEOAfigure = etree.Element("EOAfigure")
        figure_type = xmlElement.get("type")
        strImageFileString = xmlElement.find(".//file").text
        strImageFileString = strImageFileString.rstrip("\n")
        strImageFileDir = os.path.dirname(strImageFileString)
        strImageFileDir = re.sub("/", "", strImageFileDir)
        strImageFileName = os.path.basename(strImageFileString)
        logging.debug("This is figure %s", strImageFileName)
        # Copy Image
        if figure_type in hi_only_types:
            logging.debug(f"Found hyperimage figure ({figure_type}), no need to copy them.")
            xmlEOAfigure.set("file", strImageFileDir + strImageFileName)
        else:
            shutil.copy(
                PUBLICATION_DIR / strImageFileString,
                OUTPUT_DIR / "images" / (strImageFileDir + strImageFileName)
            )
            logging.debug("Django figure %s." % strImageFileName)
            if os.path.splitext(strImageFileName)[1].lower() == ".pdf":
                logging.debug(f"""Found a PDF file: {OUTPUT_DIR / "images" / (strImageFileDir + strImageFileName)}""")
                libeoaconvert.sanitizeImage(
                    OUTPUT_DIR / "images" / (strImageFileDir + strImageFileName),
                    TEMP_DIR,
                    GM_PATH,
                    PDFCROP_EXEC
                )
                # The figure is referenced under a .png name from here on.
                xmlEOAfigure.set("file", strImageFileDir + strImageFileName.replace(".pdf", ".png"))
                logging.debug("The filename is %s" % xmlEOAfigure.get("file"))
            else:
                xmlEOAfigure.set("file", strImageFileDir + strImageFileName)
        if figure_type in hi_figure_types:
            xmlEOAfigure.set("hielement", xmlElement.get("hielement"))
        if figure_type in hi_only_types:
            logging.debug(f"Found hyperimage figure ({figure_type}), no need for caption and size information.")
            strFigureNumber = dictFigures[xmlElement.find(".//anchor").get("id")]
            xmlEOAfigure.set("number", strFigureNumber)
        else:
            xmlEOAfigure.set("width", xmlElement.find(".//width").text + "px;")
            xmlEOAfigure.append(xmlElement.find(".//caption"))
            # Insert visual Number and uid
            strFigureNumber = dictFigures[xmlElement.find(".//anchor").get("id")]
            xmlEOAfigure.set("number", strFigureNumber)
            strFigureUID = xmlElement.find(".//anchor").get("id")
            xmlEOAfigure.set("id", strFigureUID)
        xmlEOAfigure.set("order", str(intObjectNumber))
        xmlResult.append(xmlEOAfigure)
        intObjectNumber += 1

    elif xmlElement.findall(".//EOAtable"):
        xmlResult = etree.Element("EOAtable")
        xmlRawTable = xmlElement.find(".//table")
        xmlResult.set("order", str(intObjectNumber))
        intObjectNumber += 1
        xmlResult.append(xmlRawTable)
        # Copy Number, Label and Caption
        if xmlElement.find(".//EOAtablecaption").text != "nonumber":
            xmlResult.append(xmlElement.find(".//EOAtablecaption"))
            xmlResult.set("label", xmlElement.find(".//EOAtablelabel").text)
            table_label = xmlRawTable.get("id")
            xmlResult.set("number", dictTables[table_label])
            xmlResult.set("id", xmlRawTable.get("id"))
        else:
            xmlElement.set("numbering", "false")
        # Transform width of Columns: each column definition is an
        # alignment letter (L, R or C) followed by a width
        strColumnString = xmlElement.find(".//EOAtablecolumns").text
        strColumnString = re.sub(r"\|", "", strColumnString)
        reMatchObjects = re.findall(r'([L|R|C].*?[c|m]m)', strColumnString)
        intTableWidth = 0
        # Index 0 is a placeholder; columns are counted from 1.
        listColumnAlignments = [None]
        listColumnWidths = [None]
        alignment_names = {"L": "left", "C": "center", "R": "right"}
        for strColumnDefinition in reMatchObjects:
            strColumnDefinition = strColumnDefinition.rstrip("cm")
            logging.info(strColumnDefinition)
            strColumnAlignment = strColumnDefinition[0]
            strColumnAlignment = alignment_names.get(strColumnAlignment, strColumnAlignment)
            listColumnAlignments.append(strColumnAlignment)
            # The width is scaled by 75 for the width/style attributes.
            intColumnWidth = int(float(strColumnDefinition.lstrip("LRC")) * 75)
            listColumnWidths.append(intColumnWidth)
            intTableWidth += intColumnWidth
        xmlRawTable.set("width", str(intTableWidth))
        # Figure out and deal with the Header
        xmlHeader = xmlRawTable.find(".//row/cell/tableheader")
        if xmlHeader is not None:
            xmlHeader.text = ""
            xmlHeader.getparent().text = xmlHeader.tail
            xmlHeader.getparent().remove(xmlHeader)
            xmlFirstRow = xmlRawTable.find(".//row")
            xmlFirstRow.tag = "tr"
            xmlFirstRowCells = xmlFirstRow.findall(".//cell")
            for xmlFirstRowCell in xmlFirstRowCells:
                xmlFirstRowCell.tag = "th"
        # Now Deal with the rest of the rows
        xmlTableRows = xmlRawTable.findall(".//row")
        for xmlTableRow in xmlTableRows:
            xmlTableCells = xmlTableRow.findall(".//cell")
            intCurrentColumn = 1
            for xmlTableCell in xmlTableCells:
                xmlTableCell.tag = "td"
                xmlTableCell.set("align", listColumnAlignments[intCurrentColumn])
                xmlTableCell.set("style", "width: " + str(listColumnWidths[intCurrentColumn]) + ";")
                # Deal with multicolumn
                if xmlTableCell.get("cols") is not None:
                    xmlTableCell.set("colspan", xmlTableCell.get("cols"))
                if intCurrentColumn > len(xmlTableCells):
                    intCurrentColumn = 1
                # Deal with multicolumn again, increase intCurrentColumn by the columns being spanned
                elif xmlTableCell.get("cols") is not None:
                    intCurrentColumn = intCurrentColumn + int(xmlTableCell.get("cols"))
                    del xmlTableCell.attrib["cols"]
                else:
                    intCurrentColumn += 1
                # deal with multirow
                if xmlTableCell.get("rowspan") is not None:
                    for child in list(xmlTableCell):
                        if child.tag == "figure":
                            child.tag = "img"
                            imagepath = f"{child.get('file')}.{child.get('extension')}"
                            logging.debug(f"{imagepath}")
                            strImageFileDir = os.path.dirname(imagepath)
                            strImageFileDir = re.sub("/", "", strImageFileDir)
                            strImageFileName = os.path.basename(imagepath)
                            logging.debug(f"{strImageFileDir} and {strImageFileName}")
                            shutil.copy(
                                PUBLICATION_DIR / imagepath,
                                OUTPUT_DIR / "images" / (strImageFileDir + strImageFileName)
                            )
                            if child.get('extension') == "pdf":
                                libeoaconvert.sanitizeImage(
                                    OUTPUT_DIR / "images" / (strImageFileDir + strImageFileName),
                                    TEMP_DIR, GM_PATH, PDFCROP_EXEC
                                )
                                child.set("src", f"{strImageFileDir + strImageFileName}".replace(".pdf", ".png"))
                            else:
                                # Reference the copied file itself, not
                                # just its directory prefix.
                                child.set("src", strImageFileDir + strImageFileName)
                            child.set("width", f"{str(listColumnWidths[intCurrentColumn])}px")
                            del child.attrib["rend"]
                            del child.attrib["file"]
                            del child.attrib["extension"]
            xmlTableRow.tag = "tr"
            xmlTableRow.set("valign", "top")

    elif xmlElement.tag == "list" and xmlElement.get('type') != 'description':
        xmlResult = etree.Element("temp")
        if xmlElement.get('type') == 'ordered':
            # The first item is handled separately from the remaining ones
            xmlFirstItem = xmlElement.find("..//item")
            xmlFirstItemElement = xmlFirstItem[0]
            xmlResult.append(djangoParseObject(
                xmlFirstItemElement,
                indent=True,
                listtype="ordered",
                listnumber=xmlFirstItem.get("label"),
                uid=xmlFirstItem.get("id"),
            ))
            # Process Child Elements which are Part of this item
            for xmlChild in xmlFirstItem.iterchildren():
                xmlResult.append(djangoParseObject(xmlChild, indent=True))
            xmlFirstItem.getparent().remove(xmlFirstItem)
            # Process remaining items in this list
            for xmlItem in xmlElement.iterchildren():
                xmlItemElement = xmlItem[0]
                xmlResult.append(djangoParseObject(
                    xmlItemElement,
                    indent=True,
                    listtype="ordered",
                    listnumber=xmlItem.get("label"),
                    uid=xmlItem.get("id"),
                ))
                for xmlChild in xmlItem.iterchildren():
                    xmlResult.append(djangoParseObject(xmlChild, indent=True))
                xmlItem.getparent().remove(xmlItem)
        if xmlElement.get('type') == 'simple':
            xml_first_child = xmlElement[0]
            if xml_first_child.tag == 'item':
                logging.debug("a simple list with no special items")
                # The first item carries the list type and the "-" marker
                xmlFirstItem = xmlElement.find("..//item")
                xmlFirstItemElement = xmlFirstItem[0]
                xmlResult.append(djangoParseObject(xmlFirstItemElement, indent=True, listtype="unordered", listnumber="-"))
                # Process Child Elements which are Part of this item
                if len(xmlFirstItem) >= 1:
                    logging.debug("len xmlFirstItem.getchildren is greater or equal 1")
                    for xmlChild in xmlFirstItem.iterchildren():
                        xmlResult.append(djangoParseObject(xmlChild, indent=True))
                xmlFirstItem.getparent().remove(xmlFirstItem)
                for xmlItem in xmlElement.iterchildren():
                    xmlItemElement = xmlItem[0]
                    xmlResult.append(djangoParseObject(xmlItemElement, indent=True))
                    for xmlChild in xmlItem.iterchildren():
                        xmlResult.append(djangoParseObject(xmlChild, indent=True))
                    xmlItem.getparent().remove(xmlItem)
            ############################
            # Work in progress: begin  #
            ############################
            elif xml_first_child.tag == 'label':
                logging.debug("a simple list with named items")
                # The first item carries the list type and its label text
                xmlFirstItem = xmlElement.find("..//item")
                xmlFirstItemElement = xmlFirstItem[0]
                logging.debug(xmlFirstItemElement.text)
                # debugging
                logging.debug(etree.tostring(xmlFirstItemElement))
                # end of debugging
                xml_first_label = xmlElement.find("..//label")
                listnumber_text = xml_first_label.text
                xmlResult.append(djangoParseObject(xmlFirstItemElement, indent=True, listtype="unordered custom", listnumber=listnumber_text))
                logging.debug("The length of the children of the first item: %s." % len(xmlFirstItem))
                # Process Child Elements which are Part of this item
                if len(xmlFirstItem) >= 1:
                    logging.debug("len xmlFirstItem.getchildren is greater or equal 1")
                    for xmlChild in xmlFirstItem.iterchildren():
                        xmlResult.append(djangoParseObject(xmlChild, indent=True))
                xmlFirstItem.getparent().remove(xmlFirstItem)
                xml_first_label.getparent().remove(xml_first_label)
                # Remaining labels and items are paired up in document order
                all_the_labels = xmlElement.findall("label")
                all_the_items = xmlElement.findall("item")
                logging.debug("itemlength %s." % len(all_the_items))
                logging.debug("labellength %s." % len(all_the_labels))
                for listlabel, listitem in zip(all_the_labels, all_the_items):
                    logging.debug("listitem text %s." % listitem.text)
                    logging.debug("listlabel text %s." % listlabel.text)
                    xml_item_element = listitem[0]
                    xmlResult.append(djangoParseObject(xml_item_element, indent=True, listnumber=listlabel.text))
                    listlabel.getparent().remove(listlabel)
                    listitem.getparent().remove(listitem)
            ##########################
            # Work in progress: end  #
            ##########################

    elif xmlElement.tag == "list" and xmlElement.get('type') == 'description':
        logging.debug("A description")
        xmlResult = etree.Element("temp")
        # Each pass consumes one label/item pair from the front of the list.
        while len(xmlElement) != 0:
            xmlDescription = etree.Element("EOAdescription")
            xmlDescription.set("order", str(intObjectNumber))
            xmlLabel = xmlElement[0]
            # Strip one trailing colon from the label.  The colon sits in
            # the tail of the last child if the label has children, else in
            # the label text; either may be missing (None).
            if len(xmlLabel):
                last_child = xmlLabel[-1]
                if last_child.tail and last_child.tail.endswith(":"):
                    last_child.tail = last_child.tail[:-1]
            elif xmlLabel.text and xmlLabel.text.endswith(":"):
                xmlLabel.text = xmlLabel.text[:-1]
            xmlItem = xmlElement[1]
            if len(xmlItem) > 0:
                xmlContent = xmlItem[0]
            else:
                xmlContent = etree.Element("p")
            xmlLabel.tag = "description"
            xmlDescription.append(xmlLabel)
            xmlDescription.append(xmlContent)
            xmlResult.append(xmlDescription)
            intObjectNumber += 1
            # Whatever is left in the item follows as separate objects.
            for xmlChild in xmlItem.iterchildren():
                xmlResult.append(djangoParseObject(xmlChild, indent=True))
            xmlItem.getparent().remove(xmlItem)

    elif xmlElement.tag == "theorem":
        xmlTheoremHead = xmlElement.find(".//head")
        xmlTheoremText = xmlElement.find(".//p")
        strTheoremNumber = xmlElement.get("id-text")
        strTheoremID = xmlElement.get("id")
        xmlResult = etree.Element("EOAtheorem")
        xmlResult.append(xmlTheoremHead)
        xmlResult.append(xmlTheoremText)
        xmlResult.set("order", str(intObjectNumber))
        xmlResult.set("number", strTheoremNumber)
        xmlResult.set("uid", strTheoremID)
        intObjectNumber += 1

    elif xmlElement.findall(".//EOAequationarray"):
        xmlResult = etree.Element("temp")
        for xmlEquation in xmlElement.findall(".//EOAequation"):
            xmlEOAequation = etree.Element("EOAequation")
            xmlEOAequation.set("order", str(intObjectNumber))
            intObjectNumber += 1
            xmlEOAequation.set("number", xmlEquation.get("number"))
            xmlEOAequation.set("filename", xmlEquation.get("filename"))
            if xmlEquation.get("label") is not None:
                xmlEOAequation.set("label", xmlEquation.get("label"))
            shutil.copy(
                INPUT_DIR / "items" / xmlEquation.get("filename"),
                OUTPUT_DIR / "images/"
            )
            xmlEOAequation.set("TeX", xmlEquation.get("TeX"))
            xmlResult.append(xmlEOAequation)

    elif xmlElement.findall(".//EOAequationarraynonumber"):
        xmlResult = etree.Element("temp")
        for xmlEquation in xmlElement.findall(".//EOAequationarraynonumber"):
            xmlEOAequation = etree.Element("EOAequation")
            xmlEOAequation.set("order", str(intObjectNumber))
            intObjectNumber += 1
            xmlEOAequation.set("number", "")
            xmlEOAequation.set("filename", xmlEquation.get("filename"))
            shutil.copy(
                INPUT_DIR / "items" / xmlEquation.get("filename"),
                OUTPUT_DIR / "images/"
            )
            xmlEOAequation.set("TeX", xmlEquation.get("TeX"))
            xmlResult.append(xmlEOAequation)

    elif xmlElement.tag == "EOAequationnonumber":
        # Process one EOAequation which is not encapsulated
        xmlResult = etree.Element("EOAequation")
        xmlResult.set("order", str(intObjectNumber))
        intObjectNumber += 1
        xmlResult.set("filename", xmlElement.get("filename"))
        xmlResult.set("TeX", xmlElement.get("TeX"))
        shutil.copy(
            INPUT_DIR / "items" / xmlElement.get("filename"),
            OUTPUT_DIR / "images/"
        )
        xmlResult.set("number", "")

    elif xmlElement.findall(".//EOAequation"):
        # Process various Equations which may be encapsulated within <p>
        xmlEquations = xmlElement.findall(".//EOAequation")
        xmlResult = etree.Element("temp")
        for xmlEquation in xmlEquations:
            # Create basic Element EOAequation
            xmlEOAequation = etree.Element("EOAequation")
            xmlEOAequation.set("order", str(intObjectNumber))
            intObjectNumber += 1
            xmlEOAequation.set("number", xmlEquation.get("number"))
            xmlEOAequation.set("TeX", xmlEquation.get("TeX"))
            if xmlEquation.get("uid") is not None:
                xmlEOAequation.set("uid", xmlEquation.get("uid"))
            shutil.copy(
                INPUT_DIR / "items" / xmlEquation.get("filename"),
                OUTPUT_DIR / "images/"
            )
            xmlEOAequation.set("filename", xmlEquation.get("filename"))
            xmlResult.append(xmlEOAequation)

    elif xmlElement.tag == "EOAequation":
        # Process one EOAequation which is not encapsulated
        xmlResult = etree.Element("EOAequation")
        xmlResult.set("order", str(intObjectNumber))
        intObjectNumber += 1
        xmlResult.set("number", xmlElement.get("number"))
        xmlResult.set("TeX", xmlElement.get("TeX"))
        if xmlElement.get("uid") is not None:
            xmlResult.set("uid", xmlElement.get("uid"))
        shutil.copy(
            INPUT_DIR / "items" / xmlElement.get("filename"),
            OUTPUT_DIR / "images/"
        )
        xmlResult.set("filename", xmlElement.get("filename"))

    elif xmlElement.tag == "div3":
        xmlResult = etree.Element("EOAsubsection")
        xmlResult.set("order", str(intObjectNumber))
        intObjectNumber += 1
        xmlResult.append(xmlElement.find("head"))
        for xmlChild in xmlElement.iterchildren():
            xmlResult.append(djangoParseObject(xmlChild))

    elif xmlElement.tag == "div4":
        xmlResult = etree.Element("EOAsubsubsection")
        xmlResult.set("order", str(intObjectNumber))
        intObjectNumber += 1
        xmlResult.append(xmlElement.find("head"))
        for xmlChild in xmlElement.iterchildren():
            xmlResult.append(djangoParseObject(xmlChild))

    elif xmlElement.tag == "epigraph":
        xmlResult = etree.Element("EOAparagraph")
        xmlResult.set("class", "epigraph")
        xmlResult.set("order", str(intObjectNumber))
        intObjectNumber += 1
        first_element = True
        for child in list(xmlElement):
            if child.tag == "p":
                child.tag = "tagtobestripped"
                linebreak = etree.Element("br")
                xmlResult.append(linebreak)
                if not first_element:
                    paragraphbreak = etree.Element("br")
                    xmlResult.append(paragraphbreak)
                xmlResult.append(deepcopy(child))
            elif child.tag == "EOAverse":
                if not first_element:
                    paragraphbreak = etree.Element("br")
                    xmlResult.append(paragraphbreak)
                verse_result = treat_verselines(child)
                xmlResult.append(verse_result)
            first_element = False

    elif xmlElement.tag == "EOAverse":
        xmlResult = etree.Element("EOAparagraph")
        if xmlElement.get("class") is not None:
            xmlResult.set("class", xmlElement.get("class"))
        xmlResult.set("style", "verse")
        xmlResult.set("order", str(intObjectNumber))
        intObjectNumber += 1
        # Verse lines are copied one by one with <br/> between them; the
        # <p> wrappers are stripped at the end.
        xml_verselines = xmlElement.findall("p")
        xmlResult.append(deepcopy(xml_verselines[0]))
        for xml_verseline in xml_verselines[1:]:
            linebreak = etree.Element("br")
            xmlResult.append(linebreak)
            copied_line = deepcopy(xml_verseline)
            xmlResult.append(copied_line)
        etree.strip_tags(xmlResult, "p")

    elif xmlElement.tag == "head" and xmlElement.get("style") == "boxhead":
        xmlElement.tag = "b"
        del xmlElement.attrib["style"]
        wrapping_paragraph = etree.Element("EOAparagraph")
        wrapping_paragraph.set("style", "box")
        libeoaconvert.wrap_into_element(wrapping_paragraph, xmlElement)
        wrapping_paragraph.set("order", str(intObjectNumber))
        intObjectNumber += 1
        xmlResult = wrapping_paragraph

    elif xmlElement.tag == "p" and xmlElement.get("class") == "divider":
        xmlElement.tag = "EOAparagraph"
        xmlElement.set("order", str(intObjectNumber))
        intObjectNumber += 1
        xmlResult = xmlElement

    elif xmlElement.tag == "EOAtocentry":
        # throw them out for the time being
        xmlResult = etree.Element("temp")

    elif xmlElement.tag == "pagebreak":
        # throw them out for the time being
        xmlResult = etree.Element("temp")

    else:
        # Everything else is treated as an ordinary paragraph.
        if len(xmlElement) == 0 and not xmlElement.text:
            logging.debug("Removing empty paragraph")
            xmlResult = etree.Element("temp")
        else:
            xmlElement.tag = "EOAparagraph"
            logging.debug(f"The beginning of this paragraph is: '{libeoaconvert.gettext(xmlElement)[:40]}…'")
            # A rend="quoted" attribute stays on the element as it is.
            xmlElement.set("order", str(intObjectNumber))
            intObjectNumber += 1
            xmlResult = xmlElement

    # Attributes requested by the caller (used for list items).
    if indent == True:  # noqa: E712 -- kept as an equality test on purpose
        xmlResult.set("indent", "True")
    if listtype is not None:
        xmlResult.set("listtype", listtype)
    if listnumber != 0:
        xmlResult.set("listnumber", listnumber)
    if uid is not None:
        xmlResult.set("id", uid)
    return xmlResult
# def djangoParseObject ends here
def _make_index_link(xml_hit):
    """Return an EOAindexlink element pointing at the location of one index hit."""
    xml_link = etree.Element("EOAindexlink")
    xml_link.set("chapterorder", xml_hit.get("chapterorder"))
    xml_link.set("elementorder", xml_hit.get("elementorder"))
    if xml_hit.get("bold") is not None:
        xml_link.set("bold", "True")
    return xml_link


def make_index(index_hits, index_type):
    """Build the printable index tree from a list of index hits.

    Hits are grouped by their ``main`` attribute (falling back to
    ``display`` when ``main`` is missing or empty), sorted
    case-insensitively and bundled into one ``EOAindexsection`` per
    initial character.  Hits carrying a ``secondary`` attribute become
    ``EOAindexsubentry`` children of their main entry.

    Args:
        index_hits: iterable of elements with the attributes ``main``,
            ``display``, ``secondary``, ``chapterorder``, ``elementorder``
            and optionally ``bold``.
        index_type: ``"regular"`` yields an ``EOAprintindex`` root, any
            other value yields ``EOAprint<index_type>index``.

    Returns:
        The root element of the index.  It has no children when there
        are no usable hits.
    """
    dictIndex = {}
    for xmlEOAindex in index_hits:
        strMainEntry = xmlEOAindex.get("main")
        str_display_entry = xmlEOAindex.get("display")
        if not strMainEntry:
            if not str_display_entry:
                # Nothing to sort or display by; such a hit cannot be indexed.
                logging.warning("Skipping index hit that has neither main entry nor display string.")
                continue
            strMainEntry = str_display_entry
            logging.warning("Index found without main entry, only display string. Using display string for sorting.")
        # Create the bookkeeping structure the first time an entry is seen
        dictEntry = dictIndex.setdefault(
            strMainEntry,
            {"display_string": "", "listMainentries": [], "dictSubentries": {}},
        )
        # Store the display string; fall back to the sort key
        if str_display_entry is not None:
            dictEntry["display_string"] = str_display_entry
        else:
            dictEntry["display_string"] = strMainEntry
        strSubEntry = xmlEOAindex.get("secondary")
        if strSubEntry is None:
            # Plain hit on the main entry
            dictEntry["listMainentries"].append(xmlEOAindex)
        else:
            # Hit on a subentry, collected on the second level
            dictEntry["dictSubentries"].setdefault(strSubEntry, []).append(xmlEOAindex)

    # Sort the main index
    listSortedKeys = sorted(dictIndex.keys(), key=str.lower)
    if index_type == "regular":
        new_index_element = "EOAprintindex"
    else:
        new_index_element = "EOAprint%sindex" % index_type
    # Create new and empty tree for the index
    xmlEOAprintindex = etree.Element(new_index_element)
    xmlEOAindexsection = None
    listFirstChars = []
    for strSortedKey in listSortedKeys:
        strFirstChar = strSortedKey[0].upper()
        if strFirstChar not in listFirstChars:
            logging.debug("Beginning a new letter: %s." % strFirstChar)
            listFirstChars.append(strFirstChar)
            # Flush the section of the previous letter before opening a new one
            if xmlEOAindexsection is not None:
                xmlEOAprintindex.append(xmlEOAindexsection)
            xmlEOAindexsection = etree.Element("EOAindexsection")
            xmlEOAindexsection.set("Character", strFirstChar)
        # beginning a new entry
        dictEntry = dictIndex[strSortedKey]
        xmlEOAindexentry = etree.Element("EOAindexentry")
        xmlEOAindexentry.set("main", strSortedKey)
        xmlEOAindexentry.set("display", dictEntry["display_string"])
        logging.debug("Index entry: %s." % strSortedKey)
        for xmlMainelement in dictEntry["listMainentries"]:
            logging.info("%s:%s", xmlMainelement.get("chapterorder"), xmlMainelement.get("elementorder"))
            xmlEOAindexentry.append(_make_index_link(xmlMainelement))
        # If there are any subentries, process them now
        if dictEntry["dictSubentries"]:
            logging.debug("Processing Subentries")
            for strSortedSubKey in sorted(dictEntry["dictSubentries"]):
                xmlEOAindexsubentry = etree.Element("EOAindexsubentry")
                xmlEOAindexsubentry.set("secondary", strSortedSubKey)
                # Insert the links that point to the subentry
                for xmlSubElement in dictEntry["dictSubentries"][strSortedSubKey]:
                    xmlEOAindexsubentry.append(_make_index_link(xmlSubElement))
                    logging.debug(xmlSubElement.get("secondary"))
                xmlEOAindexentry.append(xmlEOAindexsubentry)
        xmlEOAindexsection.append(xmlEOAindexentry)
    # The last section is still pending; there is none when the index is empty
    if xmlEOAindexsection is not None:
        xmlEOAprintindex.append(xmlEOAindexsection)
    return xmlEOAprintindex
# def make_index ends here
def djangoParseHeadline(xmlElement):
    """Split the EOAauthor element of a headline into one element per author.

    The text of the first ``EOAauthor`` found below ``xmlElement`` is
    split at commas and at the conjunctions ``and`` / ``und``.  The
    original element is removed and one new ``EOAauthor`` child per
    non-empty name is appended to ``xmlElement``.

    Args:
        xmlElement: the headline element; it is modified in place.

    Returns:
        The same ``xmlElement``, unchanged when it contains no EOAauthor.
    """
    xmlAuthors = xmlElement.find(".//EOAauthor")
    if xmlAuthors is None:
        return xmlElement
    # An empty <EOAauthor/> has text None, which re.sub cannot handle
    strAuthors = xmlAuthors.text or ""
    xmlElement.remove(xmlAuthors)
    # Normalize the conjunctions to commas so one split is enough
    strAuthors = re.sub(r"(, and | and | und )", ",", strAuthors)
    # Trim both sides of every name; fragments may be empty (trailing comma)
    listAuthors = [strName.strip() for strName in strAuthors.split(",")]
    logging.debug(listAuthors)
    for strAuthor in listAuthors:
        if not strAuthor:
            continue
        xmlAuthor = etree.Element("EOAauthor")
        xmlAuthor.text = strAuthor
        xmlElement.append(xmlAuthor)
    return xmlElement
# def djangoParseHeadline ends here
def check_publication_cfg(configuration_file):
    """Log every expected entry that is absent from the publication config.

    The file is read with ``configparser``; a parsing error is logged
    and the check continues with whatever could be read.  For each
    expected option of the sections ``Technical``, ``General`` and
    ``Authors`` that cannot be looked up, an error is logged.

    This function is adapted from the publicationimport script.

    Args:
        configuration_file: path of the configuration file to check.

    Returns:
        None.  Problems are only reported through the logging module.
    """
    logging.debug("Checking configuration file %s.", configuration_file)
    parsed = configparser.ConfigParser()
    try:
        parsed.read(configuration_file)
    except configparser.ParsingError as parse_problem:
        logging.error(parse_problem)

    # Expected options, grouped by the section they have to appear in
    expected = {
        "Technical": [
            "Serie", "Number", "Title", "Subtitle", "PublicationDate",
            "Language", "License", "ISBN", "Price", "Shoplink",
        ],
        "General": [
            "BriefDescription", "DetailedDescription", "Submitter",
            "EditorialCoordination", "Copyediting", "Translator", "Dedication",
        ],
        "Authors": [
            "Author1", "Author2", "Author3", "Author4", "Author5", "Zusatz",
        ],
    }
    for section_name, option_names in expected.items():
        for option_name in option_names:
            try:
                # The lookup itself is the check; the value is not needed
                parsed[section_name][option_name]
            except KeyError:
                logging.error("%s is missing in configuration.", option_name)
    return None
# def check_publication_cfg ends here
def treat_verselines(verse_element):
    """Dissolve verse lines into lines separated by linebreak milestones.

    Every direct ``p`` child of ``verse_element`` is retagged in place
    to ``tagtobestripped``.  Copies of these lines are collected in a new
    ``tagtobestripped`` element, with a ``br`` element between
    consecutive lines.

    Args:
        verse_element: element whose ``p`` children are the verse lines.

    Returns:
        A new ``tagtobestripped`` element holding the copied lines; it
        is empty when ``verse_element`` has no ``p`` children.
    """
    xml_result = etree.Element("tagtobestripped")
    xml_verselines = verse_element.findall("p")
    for xml_verseline in xml_verselines:
        xml_verseline.tag = "tagtobestripped"
    for position, xml_verseline in enumerate(xml_verselines):
        # Linebreaks go between the lines, not in front of the first one
        if position > 0:
            xml_result.append(etree.Element("br"))
        xml_result.append(deepcopy(xml_verseline))
    return xml_result
# def treat_verselines ends here
def bring_footnote_down_django(footnote, fragment, footnote_number, object_number, unique_id, destination):
    """Replace an inline footnote by a link and append its content to ``destination``.

    Captures reusable behavior from the existing code; potentially, some
    of the old code could be replaced by calls to this helper.

    Usage::

        intObjectNumber = bring_footnote_down_django(
            xmlFootnote, "fn" + str(intFootnoteNumber), str(intFootnoteNumber),
            intObjectNumber, tmpStrUID, xmlResult)

    The next free object number is returned instead of mutating the
    global counter.

    Args:
        footnote: the footnote element inside the running text.  It is
            passed to ``replace_footnote_with_sup`` and then receives the
            class ``footnote`` and an ``a`` child linking to ``fragment``.
        fragment: target of the link, without the leading ``#``.
        footnote_number: label of the footnote, as a string.
        object_number: running order number assigned to the new
            ``EOAfootnote`` element.
        unique_id: value for the ``id`` attribute of the ``EOAfootnote``.
        destination: element the new ``EOAfootnote`` is appended to.

    Returns:
        ``object_number + 1``, the next free order number.

    Raises:
        ValueError: if no ancestor of ``footnote`` carries an ``order``
            attribute, so the footnote cannot be anchored.
    """
    # Keep the content before the footnote element is rewritten
    kids = list(footnote)
    footnote_text = footnote.text or ""
    replace_footnote_with_sup(footnote)
    footnote.set("class", "footnote")
    anchor = etree.Element("a")
    anchor.set("href", "#" + fragment)
    anchor.text = footnote_number
    footnote.append(anchor)
    foot = etree.Element("EOAfootnote")
    foot.set("order", str(object_number))
    object_number += 1
    foot.set("number", footnote_number)
    # The footnote is anchored at the closest enclosing element that has an order
    anchor_number = next(
        (
            parent.get("order")
            for parent in footnote.iterancestors()
            if parent.get("order") is not None
        ),
        None,
    )
    if anchor_number is None:
        raise ValueError("Footnote %s has no ancestor with an order attribute." % unique_id)
    foot.set("anchor", anchor_number)
    foot.set("id", unique_id)
    foot.text = footnote_text
    for kid in kids:
        if kid.tag == "EOAequationnonumber":
            # Same locations the old-style footnote handling uses for equation images
            shutil.copy(
                PUBLICATION_DIR / "items" / kid.get("filename"),
                OUTPUT_DIR / "images/",
            )
        foot.append(kid)
    destination.append(foot)
    return object_number
# def bring_footnote_down_django ends here
############################### | |
# End of function definitions # | |
############################### | |
# Iterate over chapters, sections, subsections, and subsubsections and
# put everything on one level below an EOAchapter element.
intChapterNumber = 1
listPartIDs = []
for xmlChapter in xmlChapters:
    # Running order of the objects within this chapter
    intObjectNumber = 1
    # Process Chapter Title
    xmlEOAchapter = etree.Element("EOAchapter")
    xmlEOAchapter.set("type", "regular")
    xmlLanguage = xmlChapter.get("language")
    if xmlLanguage is not None:
        # KT changing this after separating the big script
        strLanguage = xmlLanguage
    else:
        strLanguage = "english"
    xmlEOAchapter.set("language", strLanguage)
    xmlEOAchapter.set("order", str(intChapterNumber))
    if xmlChapter.get("rend") != "nonumber":
        xmlEOAchapter.set("id", xmlChapter.get("id"))
    xmlChapterHeadline = xmlChapter.find(".//head")
    if xmlChapter.get("id") in dictChapters:
        xmlEOAchapter.set("number", dictChapters[xmlChapter.get("id")])
    else:
        xmlEOAchapter.set("number", "")
    logging.info("-----------------------------------------------------")
    logging.info(libeoaconvert.gettext(xmlChapterHeadline))
    xmlEOAchapter.append(djangoParseHeadline(xmlChapterHeadline))
    # Deal with EOAauthor
    if xmlChapter.find(".//EOAauthor") is not None:
        xmlEOAchapter.append(xmlChapter.find(".//EOAauthor"))
    # Attach enclosing Part to Chapter, see django structure for this purpose.
    # Only the first chapter of a part receives the part headline.
    if xmlChapter.getparent().tag == "div0":
        if xmlChapter.getparent().get("id") not in listPartIDs:
            listPartIDs.append(xmlChapter.getparent().get("id"))
            xmlPartHeadline = xmlChapter.getparent().find("head")
            xmlPartHeadline.tag = "EOAparthtml"
            xmlEOAchapter.append(xmlPartHeadline)
    # Append Chapter to xmlEOAdocument
    xmlEOAdocument.append(xmlEOAchapter)
    # iterate over children of Chapter
    for xmlChapterChild in xmlChapter.iterchildren():
        if xmlChapterChild.tag == "div2":
            # Process Section Title
            xmlEOAsection = etree.Element("EOAsection")
            xmlEOAsection.set("order", str(intObjectNumber))
            if xmlChapterChild.get("rend") != "nonumber":
                xmlEOAsection.set("id", xmlChapterChild.get("id"))
                xmlEOAsection.set("number", dictSections[xmlChapterChild.get("id")])
            intObjectNumber += 1
            # Look the headline up in the section itself, like the deeper
            # levels do, instead of taking the first head left in the chapter
            xmlHead = xmlChapterChild.find(".//head")
            logging.debug("Section '%s'" % libeoaconvert.gettext(xmlHead))
            xmlEOAsection.append(djangoParseHeadline(xmlHead))
            xmlEOAchapter.append(xmlEOAsection)
            # Iterate over Children of Section
            for xmlSectionChild in xmlChapterChild.iterchildren():
                if xmlSectionChild.tag == "div3":
                    # Process Subsection Title
                    xmlEOAsubsection = etree.Element("EOAsubsection")
                    xmlEOAsubsection.set("order", str(intObjectNumber))
                    if xmlSectionChild.get("rend") != "nonumber":
                        xmlEOAsubsection.set("id", xmlSectionChild.get("id"))
                        xmlEOAsubsection.set("number", dictSections[xmlSectionChild.get("id")])
                    intObjectNumber += 1
                    xmlHead = xmlSectionChild.find(".//head")
                    logging.debug("Subsection '%s'" % libeoaconvert.gettext(xmlHead))
                    xmlEOAsubsection.append(djangoParseHeadline(xmlHead))
                    xmlEOAchapter.append(xmlEOAsubsection)
                    # Iterate over children of Subsection
                    for xmlSubsectionChild in xmlSectionChild.iterchildren():
                        if xmlSubsectionChild.tag == "div4":
                            # Process Subsubsection Title
                            xmlEOAsubsubsection = etree.Element("EOAsubsubsection")
                            xmlEOAsubsubsection.set("order", str(intObjectNumber))
                            intObjectNumber += 1
                            xmlHead = xmlSubsectionChild.find(".//head")
                            logging.debug(libeoaconvert.gettext(xmlHead))
                            xmlEOAsubsubsection.append(djangoParseHeadline(xmlHead))
                            xmlEOAchapter.append(xmlEOAsubsubsection)
                            # Iterate over children of Subsubsection
                            for xmlSubsubsectionChild in xmlSubsectionChild.iterchildren():
                                xmlEOAchapter.append(djangoParseObject(xmlSubsubsectionChild))
                        else:
                            xmlEOAchapter.append(djangoParseObject(xmlSubsectionChild))
                elif xmlSectionChild.tag == "div4":
                    # Subsubsection directly below a section
                    xmlEOAsubsubsection = etree.Element("EOAsubsubsection")
                    xmlEOAsubsubsection.set("order", str(intObjectNumber))
                    intObjectNumber += 1
                    xmlHead = xmlSectionChild.find(".//head")
                    xmlEOAsubsubsection.append(djangoParseHeadline(xmlHead))
                    xmlEOAchapter.append(xmlEOAsubsubsection)
                    # Iterate over children of Subsubsection
                    for xmlSubsubsectionChild in xmlSectionChild.iterchildren():
                        if xmlSubsubsectionChild.tag == "div5":
                            logging.debug("jubel")
                            # although it's div5, promote it to subsubsection
                            xmlEOAparasection = etree.Element("EOAsubsubsection")
                            xmlEOAparasection.set("order", str(intObjectNumber))
                            intObjectNumber += 1
                            xmlHead = xmlSubsubsectionChild.find(".//head")
                            logging.debug(libeoaconvert.gettext(xmlHead))
                            xmlEOAparasection.append(djangoParseHeadline(xmlHead))
                            xmlEOAchapter.append(xmlEOAparasection)
                            for xmlParasectionChild in xmlSubsubsectionChild.iterchildren():
                                xmlEOAchapter.append(djangoParseObject(xmlParasectionChild))
                        else:
                            xmlEOAchapter.append(djangoParseObject(xmlSubsubsectionChild))
                else:
                    xmlEOAchapter.append(djangoParseObject(xmlSectionChild))
        else:
            xmlEOAchapter.append(djangoParseObject(xmlChapterChild))
    intChapterNumber += 1
libeoaconvert.debug_xml_here(
    xmlTree,
    "afterchapter",
    DEBUG_DIR
)
logging.info("----------------------------------------------")
logging.info("Processing Facsimile Parts")
# objectify is needed below to strip the annotations of the downloaded
# transcriptions; it is imported here so this section does not depend on
# an import elsewhere in the file.
from lxml import objectify
listModes = ["text", "textPollux", "xml"]
strBasicURL = "http://mpdl-system.mpiwg-berlin.mpg.de/mpdl/interface/page-fragment.xql?document="
parserECHO = etree.XMLParser()
xmlParts = xmlTree.findall("//div0")
intFacNumber = 1
for xmlPart in xmlParts:
    intObjectNumber = 1
    intFacPartNumber = 1
    # Only parts that contain facsimiles are of interest here
    if xmlPart.find(".//EOAfacsimilepart") is None:
        continue
    xmlEOAfacsimilepart = etree.Element("EOAfacsimilepart")
    # Facsimile parts continue the numbering of the chapters
    xmlEOAfacsimilepart.set("order", str(intChapterNumber))
    xmlEOAfacsimileparthead = xmlPart.find(".//head")
    for xmlChild in xmlEOAfacsimileparthead:
        if xmlChild.tag == "hi":
            xmlChild.tag = "em"
            # The attribute may be absent; nothing to remove in that case
            xmlChild.attrib.pop("rend", None)
    xmlEOAfacsimilepart.append(xmlEOAfacsimileparthead)
    intChapterNumber += 1
    xmlEOAdocument.append(xmlEOAfacsimilepart)
    xmlFacsimilepages = xmlPart.findall(".//EOAfacsimilepage")
    intFacPageNumber = 1
    for xmlFacsimilepage in xmlFacsimilepages:
        strImageFile = xmlFacsimilepage.find(".//file").text
        strLabel = xmlFacsimilepage.find(".//label").text
        strPagenumber = xmlFacsimilepage.find(".//pagenumber").text or ""
        xmlEOAfacsimilepage = etree.Element("EOAfacsimilepage")
        xmlEOAfacsimilepage.set("order", str(intObjectNumber))
        # TODO: deal with a (missing) file suffix here and convert files if necessary
        strImageFile = strImageFile.rstrip("\n")
        # The directory name becomes a prefix of the flattened file name
        strImageFileDir = os.path.dirname(strImageFile)
        strImageFileDir = re.sub("/", "", strImageFileDir)
        strImageFileName = os.path.basename(strImageFile)
        shutil.copy(
            PUBLICATION_DIR / strImageFile,
            OUTPUT_DIR / "images" / (strImageFileDir + strImageFileName)
        )
        intObjectNumber += 1
        # Download transcription for this Page
        fulltext_string = xmlFacsimilepage.find(".//fulltext").text
        if fulltext_string is not None:
            logging.debug(f"Found a link to full text: {fulltext_string}")
            if fulltext_string.find(",") == -1:
                # TODO: full text linked in the document is not processed yet
                logging.info("Fulltext is linked in the document.")
            else:
                # Format is "<document>,<page number>"
                strFacsimileURL = re.split(",", fulltext_string)[0]
                strFacsimilePage = re.split(",", fulltext_string)[1]
                for strMode in listModes:
                    strURL = strBasicURL + strFacsimileURL + "&pn=" + strFacsimilePage + "&mode=" + strMode
                    logging.debug("Processing Facsimile : " + strURL)
                    xmlECHOtree = etree.parse(strURL, parserECHO)
                    # Remove ECHO-namespaces
                    objectify.deannotate(xmlECHOtree, xsi_nil=True)
                    etree.cleanup_namespaces(xmlECHOtree)
                    xmlDivs = xmlECHOtree.findall(".//div")
                    for xmlDiv in xmlDivs:
                        if xmlDiv.get("class") == "pageContent":
                            # Create new EOA-Element
                            xmlEOAfacsimileelement = etree.Element("EOAfacsimileelement")
                            xmlEOAfacsimileelement.set("type", strMode)
                            # Fix Images in the <div>-Element
                            xmlImages = xmlDiv.findall(".//img")
                            intFacImgNumber = 1
                            for xmlImage in xmlImages:
                                strImageSrc = xmlImage.get("src")
                                strFacImageName = "facsupplements_" + str(intFacNumber) + "_" + str(intFacPageNumber) + "_" + str(intFacImgNumber) + ".jpg"
                                strCommand = "{cmd} {src} -o {dst}".format(
                                    cmd = curl,
                                    src = strImageSrc,
                                    dst = OUTPUT_DIR / "images" / strFacImageName
                                )
                                listArguments = shlex.split(strCommand)
                                try:
                                    exeShell = subprocess.check_output(listArguments, shell=False, universal_newlines=True)
                                except (subprocess.CalledProcessError, OSError) as err:
                                    # Best effort: an image that cannot be fetched is dropped
                                    # (the temp tag is stripped further down)
                                    logging.warning("Could not download %s: %s", strImageSrc, err)
                                    xmlImage.tag = "temp"
                                else:
                                    # Point the img element to the downloaded file
                                    xmlImage.set("src", strFacImageName)
                                intFacImgNumber += 1
                            xmlEOAfacsimileelement.append(xmlDiv)
                            xmlEOAfacsimilepage.append(xmlEOAfacsimileelement)
        intFacPageNumber += 1
        xmlEOAfacsimilepage.set("file", strImageFileDir + strImageFileName)
        xmlEOAfacsimilepage.set("label", str(strLabel))
        xmlEOAfacsimilepage.set("pagenumber", str(strPagenumber))
        xmlEOAfacsimilepart.append(xmlEOAfacsimilepage)
    # Count the facsimile parts so the downloaded image names stay unique
    intFacNumber += 1
etree.strip_tags(xmlDjangoTree, "temp")
# Move the footnotes of every chapter into a footnote section appended to
# the end of that chapter and leave a numbered link at the original place.
logging.info("----------------------------------------------")
logging.info("Processing and linking Footnotes for django")
xmlEOAchapters = xmlEOAdocument.findall(".//EOAchapter")
# debug_chapters(xmlEOAchapters)
# Headings for the footnote section; looked up below with the result of
# libeoaconvert.two_letter_language()
translation_xml = etree.parse( str( TRANSLATION_FILE ) )
dictLangFootnotes = translation_xml.find("//entry[@name='footnotes']").attrib
for xmlEOAchapter in xmlEOAchapters:
    groupings = libeoaconvert.get_bigfoot_data(xmlEOAchapter)
    has_old = 0 != len(xmlEOAchapter.findall(".//note[@place='Inline']"))
    has_new = 0 != len(
        [ # flatten
            note
            for grouping, notes in groupings
            for note in notes
        ]
    )
    # XOR falls through, AND is an error (that should have already been thrown during the epub phase), and NOR skips to the next chapter
    if has_old:
        if has_new:
            raise FootnoteError("This chapter contains both old-style footnotes and new-style footnotes")
    else:
        if not has_new:
            continue
    # Find out running order of last item the chapter
    # For each footnote, first correct the EOAequationnonumber in <p>
    # Then process the child elements of each footnote and attach them to the new footnote
    # If necessary, check whether a paragraph is indented and wrap it in a blockquote
    xmlElement = xmlEOAchapter[(len(xmlEOAchapter)-1)]
    logging.debug(etree.tostring(xmlElement))
    # The footnote section continues the numbering after the last element
    intObjectNumber = (int(xmlElement.get("order")) + 1)
    intFootnoteNumber = 1
    # Container for the section; its temp tag is stripped at the end
    xmlResult = etree.Element("temp")
    xmlEOAsection = etree.Element("EOAsection")
    xmlEOAsection.set("order", str(intObjectNumber))
    intObjectNumber += 1
    xmlHead = etree.Element("head")
    xmlHead.text = dictLangFootnotes[libeoaconvert.two_letter_language(xmlEOAchapter.get("language"))]
    xmlEOAsection.append(xmlHead)
    xmlResult.append(xmlEOAsection)
    for grouping, notes in groupings:
        for index, note in enumerate(notes):
            # do for the new-style notes what the old code did for the other footnotes
            fntext = str(index+1)
            if "lower-latin" == grouping:
                fntext = alph_footnote_index(index)
            unique_id = "fn%s" % fntext
            intObjectNumber = bring_footnote_down_django(note, unique_id, fntext, intObjectNumber, unique_id, xmlResult)
    # Old-style footnotes are numbered consecutively per chapter
    intFootnoteNumber = 1
    xmlFootnotes = xmlEOAchapter.findall(".//note[@place='Inline']")
    for xmlFootnote in xmlFootnotes:
        # Save content, tail and id; clear() below wipes all of them
        xmlFootnoteContent = xmlFootnote.getchildren()
        strFootnoteText = xmlFootnote.text or ""
        tmpTail = xmlFootnote.tail
        tmpStrUID = xmlFootnote.get("id")
        logging.debug(f"Looking at footnote {tmpStrUID}.")
        xmlFootnote.clear()
        xmlFootnote.tail = tmpTail
        # The note in the running text becomes a superscript link
        xmlFootnote.tag = "sup"
        xmlFootnote.set("class", "footnote")
        xmlFootnoteLink = etree.Element("a")
        xmlFootnoteLink.set("href", "#fn" + str(intFootnoteNumber))
        xmlFootnoteLink.text = str(intFootnoteNumber)
        xmlFootnote.append(xmlFootnoteLink)
        xmlEOAfootnote = etree.Element("EOAfootnote")
        xmlEOAfootnote.set("order", str(intObjectNumber))
        intObjectNumber += 1
        xmlEOAfootnote.set("number", str(intFootnoteNumber))
        # The anchor is the order of the closest ancestor that has one.
        # NOTE(review): if no ancestor has an order, strFootnoteAnchorNumber
        # keeps the value of a previous footnote or is undefined - confirm
        # that this cannot happen.
        for xmlParent in xmlFootnote.iterancestors():
            if xmlParent.get("order") is not None:
                strFootnoteAnchorNumber = xmlParent.get("order")
                break
        xmlEOAfootnote.set("anchor", strFootnoteAnchorNumber)
        xmlEOAfootnote.set("id", tmpStrUID)
        xmlEOAfootnote.text = strFootnoteText
        for xmlElement in xmlFootnoteContent:
            if xmlElement.tag != "p":
                surrounding_p = etree.fromstring("""<p></p>""")
                if xmlElement.tag == "EOAequationnonumber":
                    # Equation images are copied next to the other images
                    shutil.copy(
                        PUBLICATION_DIR / "items" / xmlElement.get("filename"),
                        OUTPUT_DIR / "images/"
                    )
                elif xmlElement.tag == "EOAverse":
                    # Turn the verse into a span; the tail is put aside
                    # while the element is serialized
                    verse_tail = xmlElement.tail
                    xmlElement.tail = ""
                    xmlElement.tag = "span"
                    xmlElement.set("style", "verse")
                    versecontent_bytes = etree.tostring(xmlElement)
                    versecontent_string = versecontent_bytes.decode("utf-8")
                    xml_verselines = versecontent_string.split("\n")
                    logging.debug("Removing surrounding EOAverse tags")
                    xml_result_string = xml_verselines[0]
                    for xml_verseline in xml_verselines[1:]:
                        xml_result_string += f"<br/>{xml_verseline}"
                    # NOTE(review): verse_paragraph is not used below in this
                    # section; the retagged xmlElement itself is what gets appended
                    verse_paragraph = etree.fromstring(xml_result_string)
                    xmlElement.tail = verse_tail
                    surrounding_p.append(xmlElement)
                elif xmlElement.tag == "span":
                    surrounding_p.append(xmlElement)
                elif xmlElement.tag == "EOAfigurenonumber":
                    # The figure is replaced by an img element
                    surrounding_p = etree.fromstring("""<img/>""")
                    strImageFileString = xmlElement.find(".//file").text
                    strImageFileString = strImageFileString.rstrip("\n")
                    # The directory name becomes a prefix of the flattened file name
                    strImageFileDir = os.path.dirname(strImageFileString)
                    strImageFileDir = re.sub("/", "", strImageFileDir)
                    strImageFileName = os.path.basename(strImageFileString)
                    strImageFileNamewoSuffix = os.path.splitext(strImageFileName)[0]
                    shutil.copy(
                        PUBLICATION_DIR / strImageFileString,
                        OUTPUT_DIR / "images" / "embedded" / (strImageFileDir + strImageFileName)
                    )
                    surrounding_p.set("src", strImageFileDir + strImageFileName)
                    surrounding_p.set("width", xmlElement.find(".//width").text + "%;")
                    xmlElement = surrounding_p
            else:
                logging.debug("Footnote paragraph")
            # Appending moves the element into the footnote
            xmlEOAfootnote.append(xmlElement)
        xmlResult.append(xmlEOAfootnote)
        intFootnoteNumber += 1
    xmlEOAchapter.append(xmlResult)
# Remove temp-Tag
etree.strip_tags(xmlDjangoTree, "temp")
# Convert the remaining EOA specific inline elements of every chapter into
# HTML elements and copy the image files they refer to.
logging.info("----------------------------------------------")
logging.info("Processing various Elements")
for xmlEOAchapter in xmlEOAchapters:
    # Italics
    xmlEmphasized = xmlEOAchapter.findall(".//hi")
    for xmlEmph in xmlEmphasized:
        if xmlEmph.get("rend") == "it":
            xmlEmph.tag = "em"
            del xmlEmph.attrib["rend"]
    # Hyperlinks are formatted in the language of the chapter they belong
    # to, not in the language of whichever chapter was processed last
    xmlHyperlinks = xmlEOAchapter.findall(".//xref")
    for xmlHyperlink in xmlHyperlinks:
        libeoaconvert.format_hyperlinks_django_epub(xmlHyperlink, xmlEOAchapter.get("language"))
    # Convert bold text
    xmlBolds = xmlEOAchapter.findall(".//EOAbold")
    for xmlBold in xmlBolds:
        if xmlBold.get("rend") == "bold":
            xmlBold.tag = "b"
            del xmlBold.attrib["rend"]
    # Convert EOAup to <sup>
    xmlUps = xmlEOAchapter.findall(".//EOAup")
    for xmlUp in xmlUps:
        xmlUp.tag = "sup"
    # Convert EOAdown to <sub>
    xmlDowns = xmlEOAchapter.findall(".//EOAdown")
    for xmlDown in xmlDowns:
        xmlDown.tag = "sub"
    # Convert EOAst to <span>
    xmlStrikeouts = xmlEOAchapter.findall(".//EOAst")
    for xmlStrikeout in xmlStrikeouts:
        xmlStrikeout.tag = "span"
        xmlStrikeout.set("style", "text-decoration: line-through;")
    # Convert letter-spacing into something nice
    xmlLetterspaceds = xmlEOAchapter.findall(".//EOAls")
    for xmlLetterspaced in xmlLetterspaceds:
        xmlLetterspaced.tag = "span"
        xmlLetterspaced.set("style", "letter-spacing: 0.5em;")
    # Convert small caps
    xmlCaps = xmlEOAchapter.findall(".//EOAcaps")
    for xmlCap in xmlCaps:
        xmlCap.tag = "span"
        xmlCap.set("style", "font-variant:small-caps;")
    # Convert EOAineq into appropriate IMG-Tags
    xmlInlineEquations = xmlEOAchapter.findall(".//EOAineq")
    for xmlInlineEquation in xmlInlineEquations:
        xmlInlineEquation.tag = "img"
        xmlInlineEquation.set("class", "EOAineq")
        xmlInlineEquation.set("alt", xmlInlineEquation.get("TeX"))
        shutil.copy(
            INPUT_DIR / "items" / xmlInlineEquation.get("src"),
            OUTPUT_DIR / "images" / xmlInlineEquation.get("src")
        )
    # Convert EOAchem into appropriate IMG-Tags
    xml_inline_chems = xmlEOAchapter.findall(".//EOAchem")
    for xml_inline_chem in xml_inline_chems:
        xml_inline_chem.tag = "img"
        xml_inline_chem.set("class", "EOAineq")
        xml_inline_chem.set("alt", xml_inline_chem.get("TeX"))
        shutil.copy(
            INPUT_DIR / "items" / xml_inline_chem.get("src"),
            OUTPUT_DIR / "images" / xml_inline_chem.get("src")
        )
    # Convert EOAinline into appropriate IMG-Tags
    xmlInlineElements = xmlEOAchapter.findall(".//EOAinline")
    for xmlInlineElement in xmlInlineElements:
        xmlInlineElement.tag = "img"
        xmlInlineElement.set("class", "eoainlineimage")
        xmlInlineElement.set("alt", "")
        # The element text is the path of the image file
        strInlineElementFilePath = xmlInlineElement.text
        strInlineElementFileName = os.path.basename(strInlineElementFilePath)
        strInlineElementDirName = os.path.dirname(strInlineElementFilePath)
        # Only the innermost directory name is used as prefix of the new file name
        strInlineElementSubDirName = os.path.dirname(strInlineElementFilePath).split(os.path.sep)[-1]
        xmlInlineElement.text = None
        xmlInlineElement.set("src", strInlineElementSubDirName + strInlineElementFileName)
        strNewImagePath = OUTPUT_DIR / "images/embedded" / (strInlineElementSubDirName + strInlineElementFileName)
        logging.debug(f"{strInlineElementDirName} is dirname, {strInlineElementFileName} is filename/basepath")
        logging.debug(f"""copy from {PUBLICATION_DIR / strInlineElementDirName / strInlineElementFileName} to {strNewImagePath}""")
        shutil.copy(
            PUBLICATION_DIR / strInlineElementDirName / strInlineElementFileName,
            strNewImagePath
        )
        # Shrink the copied image in place; the paths are passed as separate
        # arguments so that spaces in them do not split the command
        listArguments = shlex.split(GM_PATH) + ["convert", str(strNewImagePath), "-resize", "20x20", str(strNewImagePath)]
        subprocess.check_output(listArguments, shell=False)
    # Change EOAcitenumeric into a span to create appropriate link
    xmlEOAcitenumerics = xmlEOAchapter.findall(".//EOAcitenumeric")
    for xmlEOAcitenumeric in xmlEOAcitenumerics:
        xmlEOAcitenumeric.tag = "span"
        xmlEOAcitenumeric.set("class", "citation")
        xmlEOAcitenumeric.set("rel", "popover")
    # Change EOAciteauthoryear into a span to create appropriate link
    xmlEOAciteauthoryears = xmlEOAchapter.findall(".//EOAciteauthoryear")
    for xmlEOAciteauthoryear in xmlEOAciteauthoryears:
        xmlEOAciteauthoryear.tag = "span"
        xmlEOAciteauthoryear.set("class", "citation")
        xmlEOAciteauthoryear.set("rel", "popover")
    # Change EOAciteyear into a span to create appropriate link
    xmlEOAciteyears = xmlEOAchapter.findall(".//EOAciteyear")
    for xmlEOAciteyear in xmlEOAciteyears:
        xmlEOAciteyear.tag = "span"
        xmlEOAciteyear.set("class", "citation")
        xmlEOAciteyear.set("rel", "popover")
    # Change EOAcitemanual into a span to create appropriate link
    xmlEOAcitemanuals = xmlEOAchapter.findall(".//EOAcitemanual")
    for xmlEOAcitemanual in xmlEOAcitemanuals:
        xmlEOAcitemanual.tag = "span"
        xmlEOAcitemanual.set("class", "citation")
        xmlEOAcitemanual.set("rel", "popover")
logging.info("----------------------------------------------") | |
logging.info("Processing Cross References") | |
# Substitute references with their targets (with links).
#
# Every <EOAref> is rewritten in place into an <a> element whose text is the
# number of the referenced object (taken from the dict* lookup tables) and
# whose href has the form "../<chapter order>/index.html#<object order>".
for xmlEOAchapter in xmlEOAchapters:
    # For hyperimage collages: an <originalcontents> element that directly
    # follows an <EOAref> is moved into that reference, so that it is still
    # available when the reference is rewritten below.
    originalcontents = xmlEOAchapter.findall(".//originalcontents")
    if not originalcontents:
        # findall() returns a (possibly empty) list, never None.
        logging.debug("No originalcontents elements found.")
    for originalcontent in originalcontents:
        previous_element = originalcontent.getprevious()
        if originalcontent.getparent().tag == "EOAref":
            continue
        # getprevious() returns None when there is no preceding sibling.
        if previous_element is None or previous_element.tag != "EOAref":
            logging.error("Found a stray originalcontents element.")
            continue
        # The tail may be None; normalise it so it can be concatenated.
        oc_tail = originalcontent.tail or ""
        originalcontent.tail = ""
        previous_element.append(originalcontent)
        if previous_element.tail is not None:
            logging.warning("Appending the old tail of EOAref")
            previous_element.tail += oc_tail
        else:
            previous_element.tail = oc_tail

    xmlReferences = xmlEOAchapter.findall(".//EOAref")
    for xmlReference in xmlReferences:
        strResult = "!!! Cross Reference !!!"
        strChapterOrder = ""
        strObjectOrder = ""
        # NOTE(review): every EOAref is assumed to carry a <Label> and a
        # <ref> child; a missing one raises AttributeError here.
        xmlReferenceLabel = xmlReference.find("Label")
        xmlReferenceLabelText = xmlReferenceLabel.text
        xmlReferenceRef = xmlReference.find("ref")
        xmlReferenceRefTarget = xmlReferenceRef.get("target")

        # The lookups below are independent of each other; when several
        # tables know the target, the last matching table wins.
        if xmlReferenceLabelText in dictEquations:
            # Grab number from dictionary
            strResult = dictEquations[xmlReferenceLabelText]
            # Go through all equations and find the corresponding equation
            xmlEOAequations = xmlEOAdocument.findall(".//EOAequation")
            for xmlEOAequation in xmlEOAequations:
                tmpReferenceLabelText = xmlEOAequation.get("label")
                if xmlReferenceLabelText == tmpReferenceLabelText:
                    logging.debug("Successfully found link to array formula: %s" % strResult)
                    for xmlParent in xmlEOAequation.iterancestors():
                        if xmlParent.tag == "EOAchapter":
                            strChapterOrder = xmlParent.get("order")
                            strObjectOrder = xmlEOAequation.get("order")
        if xmlReferenceRefTarget in dictEquations:
            # Grab number from dictionary
            strResult = dictEquations[xmlReferenceRefTarget]
            # Go through all equations and find the corresponding equation
            xmlEOAequations = xmlEOAdocument.findall(".//EOAequation")
            for xmlEOAequation in xmlEOAequations:
                tmpReferenceRefTarget = xmlEOAequation.get("uid")
                if xmlReferenceRefTarget == tmpReferenceRefTarget:
                    logging.debug("Successfully found link to normal formula: %s" % strResult)
                    for xmlParent in xmlEOAequation.iterancestors():
                        if xmlParent.tag == "EOAchapter":
                            strChapterOrder = xmlParent.get("order")
                            strObjectOrder = xmlEOAequation.get("order")
        if xmlReferenceRefTarget in dictLists:
            logging.debug("Found link to list.")
            strResult = dictLists[xmlReferenceRefTarget]
            xmlEOAlistitem = xmlEOAdocument.xpath("//EOAchapter/*[contains(@id, $targetuid)]", targetuid = xmlReferenceRefTarget)[0]
            for xmlParent in xmlEOAlistitem.iterancestors():
                if xmlParent.tag == "EOAchapter":
                    strChapterOrder = xmlParent.get("order")
                    strObjectOrder = xmlEOAlistitem.get("order")
        if xmlReferenceRefTarget in dictChapters:
            logging.debug("Found link to chapter.")
            strResult = dictChapters[xmlReferenceRefTarget]
            # A distinct loop variable keeps the outer xmlEOAchapter intact.
            for xmlTargetChapter in xmlEOAdocument.findall(".//EOAchapter"):
                if xmlTargetChapter.get("id") == xmlReferenceRefTarget:
                    logging.debug("Successfully handled link to a chapter: %s" % strResult)
                    strObjectOrder = "top"
                    strChapterOrder = xmlTargetChapter.get("order")
        if xmlReferenceRefTarget in dictTheorems:
            logging.debug("Found link to ein Theorem")
            strResult = dictTheorems[xmlReferenceRefTarget]
            for xmlEOAtheorem in xmlEOAdocument.findall(".//EOAtheorem"):
                if xmlEOAtheorem.get("uid") == xmlReferenceRefTarget:
                    logging.debug("Successfully handled link to a theorem: %s " % strResult)
                    for xmlParent in xmlEOAtheorem.iterancestors():
                        if xmlParent.tag == "EOAchapter":
                            strObjectOrder = xmlEOAtheorem.get("order")
                            strChapterOrder = xmlParent.get("order")
        if xmlReferenceRefTarget in dictSections:
            logging.debug("Found link to section")
            strResult = dictSections[xmlReferenceRefTarget]
            xmlEOAsections = xmlEOAdocument.findall(".//EOAsection")
            for xmlEOAsection in xmlEOAsections:
                tmpReferenceRefTarget = xmlEOAsection.get("id")
                if xmlReferenceRefTarget == tmpReferenceRefTarget:
                    logging.debug("Successfully handled link to section: %s " % strResult)
                    for xmlParent in xmlEOAsection.iterancestors():
                        if xmlParent.tag == "EOAchapter":
                            strChapterOrder = xmlParent.get("order")
                            strObjectOrder = xmlEOAsection.get("order")
            xmlEOAsubsections = xmlEOAdocument.findall(".//EOAsubsection")
            for xmlEOAsubsection in xmlEOAsubsections:
                tmpReferenceRefTarget = xmlEOAsubsection.get("id")
                if xmlReferenceRefTarget == tmpReferenceRefTarget:
                    logging.debug("Successfully handled link to subsection %s: " % strResult)
                    for xmlParent in xmlEOAsubsection.iterancestors():
                        if xmlParent.tag == "EOAchapter":
                            strChapterOrder = xmlParent.get("order")
                            strObjectOrder = xmlEOAsubsection.get("order")
        if xmlReferenceRefTarget in dictFigures:
            logging.debug("Found link to figure")
            strResult = dictFigures[xmlReferenceRefTarget]
            xmlEOAfigures = xmlEOAdocument.findall(".//EOAfigure")
            for xmlEOAfigure in xmlEOAfigures:
                tmpReferenceRefTarget = xmlEOAfigure.get("id")
                if xmlReferenceRefTarget == tmpReferenceRefTarget:
                    logging.debug("Successfully handled link to figure: %s" % strResult)
                    for xmlParent in xmlEOAfigure.iterancestors():
                        if xmlParent.tag == "EOAchapter":
                            strChapterOrder = xmlParent.get("order")
                            strObjectOrder = xmlEOAfigure.get("order")
        if xmlReferenceRefTarget in dictFootnotes:
            logging.debug("Found link to footnote")
            strResult = dictFootnotes[xmlReferenceRefTarget]
            xmlEOAfootnotes = xmlEOAdocument.findall(".//EOAfootnote")
            for xmlEOAfootnote in xmlEOAfootnotes:
                tmpReferenceRefTarget = xmlEOAfootnote.get("id")
                if xmlReferenceRefTarget == tmpReferenceRefTarget:
                    logging.debug("Successfully handled link to footnote: %s" % strResult)
                    for xmlParent in xmlEOAfootnote.iterancestors():
                        if xmlParent.tag == "EOAchapter":
                            strChapterOrder = xmlParent.get("order")
                            strObjectOrder = xmlEOAfootnote.get("order")
        if xmlReferenceRefTarget in dictTables:
            logging.debug("Found link to table")
            strResult = dictTables[xmlReferenceRefTarget]
            xmlEOAtables = xmlEOAdocument.findall(".//EOAtable")
            for xmlEOAtable in xmlEOAtables:
                # Tables are matched via their label, not via an id/uid.
                tmpReferenceRefTarget = xmlEOAtable.get("label")
                if xmlReferenceLabelText == tmpReferenceRefTarget:
                    logging.debug("Successfully handled link to table: %s" % strResult)
                    for xmlParent in xmlEOAtable.iterancestors():
                        if xmlParent.tag == "EOAchapter":
                            strChapterOrder = xmlParent.get("order")
                            strObjectOrder = xmlEOAtable.get("order")

        # Save everything that has to survive clear(), which wipes text,
        # tail, attributes and children of the reference.
        tmpTail = xmlReference.tail or ""
        originalcontents = xmlReference.find("originalcontents")
        reference_type = xmlReference.get("type")
        ref_is_collage = reference_type == "collage"
        ref_is_text = reference_type == "text"
        if ref_is_text:
            # The text may be None for an empty element.
            reference_text = (xmlReference.text or "").strip()
        xmlReference.clear()

        if originalcontents is not None:
            logging.info("Found originalcontents")
            xmlReference.append(originalcontents)
        elif ref_is_text:
            xmlReference.text = reference_text
        else:
            xmlReference.text = strResult
        xmlReference.tail = tmpTail
        xmlReference.tag = "a"

        # hyperimage
        if xmlReferenceRef.get("data-hilayer"):
            xmlReference.set("data-hilayer", xmlReferenceRef.get("data-hilayer"))
        if xmlReference.text:
            logging.debug(xmlReference.text)
        if xmlReferenceRef.get("hitarget"):
            xmlReference.set("class", "HILink")
            href_string = "#" + xmlReferenceRef.get("hitarget")
        elif strObjectOrder:
            href_string = "../" + strChapterOrder + "/index.html#" + strObjectOrder
        else:
            # No target was resolved; the placeholder keeps the broken link
            # visible in the output.
            href_string = "strChapterOrder missing"
            logging.warning("strObjectOrder is missing!")
        xmlReference.set("href", href_string)
        if ref_is_collage:
            xmlReference.set("type", "collage")
logging.info("----------------------------------------------")
logging.info("Processing Page References")
# Replace every <EOApageref> by the page number stored in dictPagelabels.
# References that point to a facsimile page are turned into links.
#
# The facsimile pages are not modified by this loop, so they are collected
# once instead of once per reference.
xmlEOAfacsimilepages = xmlEOAdocument.findall(".//EOAfacsimilepage")
for xmlEOAchapter in xmlEOAchapters:
    xmlPageReferences = xmlEOAchapter.findall(".//EOApageref")
    for xmlReference in xmlPageReferences:
        # Reset the placeholder for every reference; otherwise an unresolved
        # reference would silently show the page of the previous one.
        strResult = "!!! Page Reference !!!"
        xmlReferenceLabel = xmlReference.find("Label")
        xmlReferenceLabelText = xmlReferenceLabel.text
        xmlReferenceRef = xmlReference.find("ref")
        xmlReferenceRefTarget = xmlReferenceRef.get("target")
        if xmlReferenceLabelText in dictPagelabels:
            logging.debug("Found link to page: %s" % xmlReferenceLabelText)
            strResult = dictPagelabels[xmlReferenceLabelText]
        else:
            logging.warning("Page reference not fully implemented yet, see https://github.molgen.mpg.de/EditionOpenAccess/EOASkripts/issues/52")
        xmlReference.text = strResult
        # Iterate over a snapshot because the children are removed in the loop.
        for xmlChild in list(xmlReference):
            xmlReference.remove(xmlChild)
        # Check if EOApageref points to a facsimile page.
        # If yes, make a href to the facsimile.
        for xmlEOAfacsimilepage in xmlEOAfacsimilepages:
            if xmlEOAfacsimilepage.get("label") == xmlReferenceLabelText:
                logging.debug("Found cross reference to facsimile.")
                xmlReference.tag = "a"
                strPartOrder = xmlEOAfacsimilepage.getparent().get("order")
                strFacsimileOrder = xmlEOAfacsimilepage.get("order")
                logging.debug(strFacsimileOrder)
                xmlReference.set("href", "../" + strPartOrder + "/" + strFacsimileOrder + ".html")
logging.info("----------------------------------------------")
logging.info("Normalizing Index Entries")
# The text of an index element follows the makeindex convention
#     main[@display][!secondary[@display]][|addition]
# It is split up here and stored in attributes of the (then empty) element.
for xmlEOAchapter in xmlEOAchapters:
    xml_EOA_indices = xmlEOAchapter.xpath(".//EOAindex | .//EOAindexperson | .//EOAindexlocation")
    for xmlEOAindex in xml_EOA_indices:
        # Using the gettext function here, because of subelements
        strEOAindextext = libeoaconvert.gettext(xmlEOAindex)
        strEOAindextext = strEOAindextext.replace("\n", " ")
        # Iterate over a snapshot because the children are removed in the loop.
        for sub_element in list(xmlEOAindex):
            xmlEOAindex.remove(sub_element)
        xmlEOAindex.text = None

        listFirstPart = strEOAindextext.split("|")
        tmpEntry = listFirstPart[0]
        listSecondPart = tmpEntry.split("!")
        strMainEntry = listSecondPart[0]
        # Check if a sortkey is present via @
        listSortKey = strMainEntry.split("@")
        if len(listSortKey) == 2:
            xmlEOAindex.set("main", listSortKey[0])
            xmlEOAindex.set("display", listSortKey[1])
        else:
            xmlEOAindex.set("main", strMainEntry)

        if len(listSecondPart) > 1:
            strSecondPart = listSecondPart[1]
            listSecondarySortkey = strSecondPart.split("@")
            if len(listSecondarySortkey) == 2:
                xmlEOAindex.set("secondary", listSecondarySortkey[0])
                xmlEOAindex.set("secondarydisplay", listSecondarySortkey[1])
            else:
                xmlEOAindex.set("secondary", strSecondPart)

        if len(listFirstPart) > 1:
            strAddition = listFirstPart[1]
            if strAddition == "textbf":
                xmlEOAindex.set("bold", "true")
            if strAddition.startswith("seealso"):
                # Strip only the leading keyword; the target itself may
                # contain the same letters.
                tmpAddition = strAddition[len("seealso"):]
                xmlEOAindex.set("seealso", tmpAddition)
                # Entries containing seealso are omitted for the time being
                xmlEOAindex.tag = "temp"
            elif strAddition.startswith("see"):
                tmpAddition = strAddition[len("see"):]
                xmlEOAindex.set("see", tmpAddition)
                # Entries containing see are omitted for the time being
                xmlEOAindex.tag = "temp"

        # Figure out parent chapter number and parent element order
        for xmlParent in xmlEOAindex.iterancestors():
            parent_order = xmlParent.get("order")
            if parent_order is None:
                continue
            if xmlParent.tag == "EOAchapter":
                xmlEOAindex.set("chapterorder", parent_order)
            else:
                xmlEOAindex.set("elementorder", parent_order)
# Drop the entries that were marked as "temp" above.
etree.strip_tags(xmlDjangoTree, "temp")
logging.info("----------------------------------------------")
logging.info("Removing Duplicate Index Entries")
# Within one direct child of a chapter, every combination of main entry and
# subentry is kept only once. Duplicates are renamed to "temp" so that they
# get stripped during the final clean-up.
# NOTE(review): entries of the regular, person and location index share one
# table here, so equal main entries of different index types count as
# duplicates of each other - confirm that this is intended.
for xmlEOAchapter in xmlEOAchapters:
    for xmlChild in xmlEOAchapter.iterchildren():
        # Maps a main entry to the list of subentries already seen for it.
        dictEntries = {}
        xml_EOA_indices = xmlChild.xpath(".//EOAindex | .//EOAindexperson | .//EOAindexlocation")
        for xmlEOAindex in xml_EOA_indices:
            strEntry = xmlEOAindex.get("main")
            strSubentry = xmlEOAindex.get("secondary")
            if strEntry not in dictEntries:
                # First occurrence: register its subentry as well, otherwise
                # the first repetition of that subentry would go unnoticed.
                dictEntries[strEntry] = [] if strSubentry is None else [strSubentry]
                continue
            if strSubentry is None or strSubentry in dictEntries[strEntry]:
                # The see/seealso attributes live on the index element
                # itself, not on the enclosing chapter child.
                if (xmlEOAindex.get("see") is None) and (xmlEOAindex.get("seealso") is None):
                    xmlEOAindex.tag = "temp"
            else:
                dictEntries[strEntry].append(strSubentry)
logging.info("----------------------------------------------")
logging.info("Removing Index Entries in Footnotes")
# Index elements located somewhere below a footnote are renamed to "temp";
# elements with that tag are stripped during the final clean-up.
for xmlEOAchapter in xmlEOAchapters:
    for xmlChild in xmlEOAchapter.iterchildren():
        dictEntries = {}
        xml_EOA_indices = xmlChild.xpath(".//EOAindex | .//EOAindexperson | .//EOAindexlocation")
        for xmlEOAindex in xml_EOA_indices:
            # Restrict the ancestor walk to footnote elements.
            for xmlParent in xmlEOAindex.iterancestors("EOAfootnote"):
                xmlEOAindex.tag = "temp"
                logging.debug("Found index in footnote")
logging.info("----------------------------------------------")
logging.info("Creating paragraph links")
# A paragraph with a corresp attribute ("#<xml:id>") points to another
# paragraph. The link to that paragraph is stored in an href attribute.
paragraphs_with_corresp = xmlDjangoTree.xpath("//EOAparagraph[@corresp]")
for pc in paragraphs_with_corresp:
    # get order of target and the chapter order to create the
    # hyperlink pick this up on publicationimport and extend the model
    # for a field, probably containing the html snippet for the URL
    corresponding_attribute = pc.get("corresp")[1:]  # drop the leading "#"
    # Pass the id as an XPath variable so that quotes inside the id cannot
    # break the expression.
    corresponding_paragraph = xmlDjangoTree.xpath(
        "//EOAparagraph[@xml:id=$target_id]",
        target_id=corresponding_attribute,
    )
    if len(corresponding_paragraph) == 0:
        logging.error("There seems to be no corresponding xml:id for %s. Exiting." % corresponding_attribute)
        sys.exit(1)
    elif len(corresponding_paragraph) > 1:
        logging.error("The xml:id %s has been assigned more than once. This is not allowed. Exiting." % corresponding_paragraph[0].attrib["{http://www.w3.org/XML/1998/namespace}id"])
        sys.exit(1)

    eoa_id_element = corresponding_paragraph[0]
    paragraph_order = eoa_id_element.get("order")
    # Reset for every paragraph so that no value from a previous iteration
    # leaks into this link.
    chapter_order = None
    for xml_parent in eoa_id_element.iterancestors():
        if xml_parent.tag == "EOAchapter":
            chapter_order = xml_parent.get("order")
    if chapter_order is None:
        logging.warning("Could not determine the chapter of paragraph %s, no link created." % corresponding_attribute)
        continue
    href_text = f"../{chapter_order}/index.html#{paragraph_order}"
    pc.set("href", href_text)

# The helper attributes are removed only after all links have been resolved,
# because the xml:id values are needed as link targets above.
for pc in paragraphs_with_corresp:
    etree.strip_attributes(pc, "corresp", "{http://www.w3.org/XML/1998/namespace}id")
logging.info("----------------------------------------------")
logging.info("Sorting and Creating Regular Index")
# Build the regular index from all remaining EOAindex elements and attach it
# to the document if the publication requests one via <EOAprintindex/>.
xml_regular_EOAindices = xmlDjangoTree.findall(".//EOAindex")
# Stays None when there is nothing to index, so the append below can be
# guarded instead of failing on an undefined name.
xml_eoa_print_regular_index = None
if xml_regular_EOAindices:
    logging.debug("Sorting %s entries for regular index." % str(len(xml_regular_EOAindices)))
    xml_eoa_print_regular_index = make_index(xml_regular_EOAindices, index_type = "regular")

libeoaconvert.debug_xml_here(
    xmlDjangoTree,
    "djangotree",
    DEBUG_DIR
)
libeoaconvert.debug_xml_here(
    xmlEOAdocument,
    "xmleoadocument",
    DEBUG_DIR
)
libeoaconvert.debug_xml_here(
    xmlTree,
    "xmltree",
    DEBUG_DIR
)

# If EOAprintindex is found, append xml_eoa_print_regular_index to xmlEOAdocument
xmlPrintindex = xmlTree.find(".//EOAprintindex")
if xmlPrintindex is not None:
    # Remove <p><EOAprintindex/></p> from xmlDjangoTree
    logging.info("found an index")
    xmlPrintindex.tag = "temp"
    xmlPrintindex.getparent().tag = "temp"
    if xml_eoa_print_regular_index is not None:
        xmlEOAdocument.append(xml_eoa_print_regular_index)
    else:
        logging.warning("A regular index was requested, but there are no index entries.")
else:
    logging.info("found no index")
logging.info("----------------------------------------------")
logging.info("Sorting and Creating Person Index")
# Build the person index from all remaining EOAindexperson elements and
# attach it to the document if <EOAprintpersonindex/> is present.
xml_person_EOAindices = xmlDjangoTree.findall(".//EOAindexperson")
# Stays None when there is nothing to index, so the append below can be
# guarded instead of failing on an undefined name.
xml_eoa_print_person_index = None
if xml_person_EOAindices:
    xml_eoa_print_person_index = make_index(xml_person_EOAindices, index_type = "person")

# If EOAprintpersonindex is found, append xml_eoa_print_person_index to xmlEOAdocument
xmlPrintindex = xmlTree.find(".//EOAprintpersonindex")
if xmlPrintindex is not None:
    # Remove <p><EOAprintpersonindex/></p> from xmlDjangoTree
    xmlPrintindex.tag = "temp"
    xmlPrintindex.getparent().tag = "temp"
    if xml_eoa_print_person_index is not None:
        xmlEOAdocument.append(xml_eoa_print_person_index)
    else:
        logging.warning("A person index was requested, but there are no index entries.")
# doing the same for location index
logging.info("----------------------------------------------")
logging.info("Sorting and Creating Location Index")
# Build the location index from all remaining EOAindexlocation elements and
# attach it to the document if <EOAprintlocationindex/> is present.
xml_location_EOAindices = xmlDjangoTree.findall(".//EOAindexlocation")
# Stays None when there is nothing to index, so the append below can be
# guarded instead of failing on an undefined name.
xml_eoa_print_location_index = None
if xml_location_EOAindices:
    xml_eoa_print_location_index = make_index(xml_location_EOAindices, index_type = "location")

# If EOAprintlocationindex is found, append xml_eoa_print_location_index to xmlEOAdocument
xmlPrintindex = xmlTree.find(".//EOAprintlocationindex")
if xmlPrintindex is not None:
    # Remove <p><EOAprintlocationindex/></p> from xmlDjangoTree
    xmlPrintindex.tag = "temp"
    xmlPrintindex.getparent().tag = "temp"
    if xml_eoa_print_location_index is not None:
        xmlEOAdocument.append(xml_eoa_print_location_index)
    else:
        logging.warning("A location index was requested, but there are no index entries.")
############################################################################
# Cleaning up                                                              #
############################################################################
# TODO: Delete the unnecessary attributes such as id
# TODO: Delete the unnecessary tags such as EOAlabel

# Links nested inside the original contents of a collage link are marked as
# "temp" so that they are unwrapped together with the other helper tags.
for nested_link in xmlDjangoTree.xpath(".//a[@type='collage']/originalcontents/a"):
    nested_link.tag = "temp"

# Helper tags: the tag itself is dropped, text and children are kept.
tags_to_unwrap = (
    "temp",
    "citetext",
    "EOAprintbibliography",
    "originalcontents",
    "tagtobestripped",
)
etree.strip_tags(xmlDjangoTree, *tags_to_unwrap)

# Helper elements: removed together with their content, the tail is kept.
elements_to_drop = ("citekey", "elementtoberemoved")
etree.strip_elements(xmlDjangoTree, *elements_to_drop, with_tail=False)

# Attributes that are dropped from the output.
attributes_to_drop = (
    "id-text",
    "id",
    "noindent",
    "type",
    "label",
    "spacebefore",
)  # , "rend")
etree.strip_attributes(xmlDjangoTree, *attributes_to_drop)
############################################################################
# Save xmlDjangoTree                                                       #
############################################################################
# tostring(..., encoding="unicode") returns a str without XML declaration,
# so the file is written explicitly as UTF-8 instead of relying on the
# platform default encoding. The context manager closes the file even if
# writing fails.
tmpResult = etree.tostring(xmlDjangoTree, pretty_print=True, encoding="unicode")
with open(OUTPUT_DIR / "Django.xml", "w", encoding="utf-8") as tmpFile:
    tmpFile.write(tmpResult)
logging.debug(f"Wrote {OUTPUT_DIR}/Django.xml.")

# Optionally validate the publication configuration of the input directory.
if args.checkpublicationcfg:
    check_publication_cfg(INPUT_DIR / "publication.cfg")