Skip to content
Permalink
master
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
executable file 2120 lines (1940 sloc) 100 KB
#!/usr/bin/env python3
# -*- coding: utf-8; mode: python -*-
# Time-stamp: <2021-10-26 14:09:17 (kthoden)>
"""
Create an XML file that can be inserted into the Django database
of an EOAv1 installation.
Input file is a customized DocBook XML that has been generated either
with eoatex2imxml or tei2imxml.
"""
from utils.load_config import load_config, exec_command, check_executable
import utils.libeoaconvert as libeoaconvert
import pickle
import os
import sys
import re
import shutil
import shlex
import subprocess
import argparse
import configparser
import logging
from copy import deepcopy
from lxml import etree
from pathlib import Path
import time
# Locations derived from where this script itself lives.
SCRIPT_PATH = Path(__file__)
SCRIPT_NAME = SCRIPT_PATH.name
BASE_DIR = SCRIPT_PATH.resolve().parent
# Default working directories: the INPUT_DIR / OUTPUT_DIR environment
# variables win over the relative fallbacks.
DEFAULT_INPUT_DIR = Path(os.environ.get('INPUT_DIR', './input'))
DEFAULT_OUTPUT_DIR = Path(os.environ.get('OUTPUT_DIR', './output'))
# ------------------------------------------------------------------ #
# Command-line interface                                             #
# ------------------------------------------------------------------ #
parser = argparse.ArgumentParser(
    formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
# Alternative configuration file; defaults to the one shipped next to
# this script.
parser.add_argument(
    "-c",
    "--config",
    dest="CONFIG_FILE",
    metavar="CONFIGURATION",
    type=Path,
    default=BASE_DIR / "config" / "eoaconvert.cfg",
    help="Name of configuration file",
)
# Verbosity, handed on unchanged to load_config().
parser.add_argument(
    "--log-level",
    default="INFO",
    help="log level: choose between DEBUG, INFO, WARNING, ERROR, CRITICAL",
)
parser.add_argument(
    "-p",
    "--checkpublicationcfg",
    action="store_true",
    help="Check the publication.cfg for completeness.",
)
# Both directories are optional; when omitted they are derived from the
# publication directory name further down.
parser.add_argument(
    "-i",
    "--input-dir",
    type=Path,
    help=f"directory containing some intermediate xml created by previous steps. default: {DEFAULT_OUTPUT_DIR}/PUBLICATION_NAME/imxml",
)
parser.add_argument(
    "-o",
    "--output-dir",
    type=Path,
    help=f"output directory. default: {DEFAULT_OUTPUT_DIR}/PUBLICATION_NAME/django",
)
# The only positional argument.
parser.add_argument(
    "PUBLICATION_DIR",
    type=Path,
    help="directory containing the publication (including resources like pictures, etc.)",
)
args = parser.parse_args()
# External programs, looked up on PATH.
GM_PATH = "gm"
PDFCROP_EXEC = "pdfcrop"  # shipped with the texlive distribution

# Working directories. Unless given explicitly on the command line, input
# and output both live below DEFAULT_OUTPUT_DIR/<stem of publication dir>.
PUBLICATION_DIR = args.PUBLICATION_DIR
if args.input_dir is None:
    INPUT_DIR = DEFAULT_OUTPUT_DIR / PUBLICATION_DIR.resolve().stem / "imxml"
else:
    INPUT_DIR = args.input_dir
if args.output_dir is None:
    OUTPUT_DIR = (DEFAULT_OUTPUT_DIR / PUBLICATION_DIR.resolve().stem) / "django"
else:
    OUTPUT_DIR = args.output_dir

# Everything else hangs off the output directory.
LOG_DIR = OUTPUT_DIR / "log"
LOG_FILE = (LOG_DIR / SCRIPT_NAME).with_suffix(".log")
TEMP_DIR = OUTPUT_DIR / "tmp_files"
DEBUG_DIR = OUTPUT_DIR / "debug"

config_file = args.CONFIG_FILE
print("The configfile is %s." % config_file)
# Read the configuration; load_config() also receives the log level and
# the log file location.
CONFIG = load_config(config_file, args.log_level, LOG_FILE)

# Auxiliary files named in the configuration are relative to the script.
TRANSLATION_FILE = BASE_DIR / CONFIG['Auxiliaries']['TRANSLATIONS']

# Preparation: log the search path, verify the external tools and make
# sure the scratch directories exist.
logging.debug("PATH: {}".format(os.environ['PATH']))
for _required_tool in (GM_PATH, PDFCROP_EXEC):
    check_executable(_required_tool)
for _scratch_dir in (TEMP_DIR, DEBUG_DIR):
    if not _scratch_dir.exists():
        os.makedirs(_scratch_dir)
# Check for folder and necessary files.
#
# publication.cfg and Cover.jpg are taken from the input directory on
# every run; the script stops if either one is not available there.
# The "missing" notice is only emitted when the django directory really
# has no publication.cfg yet (it used to be logged unconditionally).
if not os.path.exists(OUTPUT_DIR / "publication.cfg"):
    logging.info("The publication.cfg file is missing in django directory.")
if os.path.exists(INPUT_DIR / "publication.cfg"):
    shutil.copy(INPUT_DIR / "publication.cfg", OUTPUT_DIR)
    logging.info(f"Copied from {INPUT_DIR}.")
else:
    logging.error(f"Found no publication.cfg in {INPUT_DIR}. Exiting")
    sys.exit(1)

if os.path.exists(INPUT_DIR / "Cover.jpg"):
    shutil.copy(INPUT_DIR / "Cover.jpg", OUTPUT_DIR / "Cover.jpg")
    logging.info("Copied cover image from input directory.")
else:
    logging.error("No coverfile found. You can create a temporary one with the mkimage.py script")
    sys.exit(1)
# ------------------------------------------------------------------ #
# Data handed over by the first conversion step                      #
# ------------------------------------------------------------------ #
# The numbering tables were pickled by the previous step; the file is
# read from this pipeline's own working directory.
with open(INPUT_DIR / "tmp_files" / 'data.pickle', 'rb') as f:
    data = pickle.load(f)

# One dictionary per kind of numbered object.
(
    dictChapters,
    dictEquations,
    dictLists,
    dictTheorems,
    dictSections,
    dictFigures,
    dictFootnotes,
    dictTables,
    dictPagelabels,
) = (
    data[key]
    for key in (
        "chapterdict",
        "eqdict",
        "listdict",
        "theoremdict",
        "secdict",
        "figdict",
        "fndict",
        "tabdict",
        "pagelabeldict",
    )
)

if not os.path.exists(DEBUG_DIR):
    os.mkdir(DEBUG_DIR)

# Parse the intermediate XML and keep a snapshot of the untouched tree.
xmlTree = etree.parse(str(INPUT_DIR / "IntermediateXMLFile.xml"))
libeoaconvert.debug_xml_here(xmlTree, "fresh", DEBUG_DIR)
print("""
############################################################################
# Convert tralics-XML to Django Data Structure #
############################################################################
""")

# Output folders; "images" has to exist before "images/embedded".
for _output_subdir in (
    OUTPUT_DIR / "images",
    OUTPUT_DIR / "images" / "embedded",
    OUTPUT_DIR / "files",
):
    if not os.path.exists(_output_subdir):
        os.mkdir(_output_subdir)

# Create empty xmlTree
xmlEOAdocument = etree.Element("EOAdocument")
xmlDjangoTree = etree.ElementTree(xmlEOAdocument)

# Drop the "noindent" attributes and unwrap the helper <temp> elements.
etree.strip_attributes(xmlTree, "noindent")
etree.strip_tags(xmlTree, "temp")
libeoaconvert.debug_xml_here(xmlTree, "afterstriptags", DEBUG_DIR)

# Write the temporary XML main tree. Serialise first, then write inside a
# context manager so the handle is closed even if the write fails.
ergebnis = etree.tostring(xmlTree, pretty_print=True, encoding="unicode")
with open(TEMP_DIR / "Devel_django.xml", "w") as ergebnisdatei:
    ergebnisdatei.write(ergebnis)

# Find all Chapters from the original tralics XML
xmlChapters = xmlTree.findall("//div1")
def replace_footnote_with_sup(note):
    """
    Turn a footnote element into an empty ``sup`` element in place.

    Text, children and attributes of ``note`` are discarded; the text that
    follows the element (its tail) is kept. Nothing is returned.
    """
    # clear() wipes the tail together with everything else, so remember it.
    preserved_tail = note.tail
    note.clear()
    note.tail, note.tag = preserved_tail, "sup"
# def replace_footnote_with_sup ends here
def alph_footnote_index(fndex):
    """
    Return the lowercase Latin label for a zero-based footnote index.

    Labels continue past 'z' with 'aa', 'ab', ... so that more than 26
    footnotes are supported.

    Raises ValueError for a negative index (this used to recurse without
    end).

    >>> alph_footnote_index(0)
    'a'
    >>> alph_footnote_index(1)
    'b'
    >>> alph_footnote_index(24)
    'y'
    >>> alph_footnote_index(25)
    'z'
    >>> alph_footnote_index(26)
    'aa'
    >>> alph_footnote_index(27)
    'ab'
    >>> alph_footnote_index(702)
    'aaa'
    """
    if fndex < 0:
        raise ValueError("footnote index must not be negative: %r" % (fndex,))
    alphabet = "abcdefghijklmnopqrstuvwxyz"
    letters = []
    while True:
        fndex, remainder = divmod(fndex, len(alphabet))
        letters.append(alphabet[remainder])
        if not fndex:
            break
        # There is no "zero" letter: every further position starts at 'a',
        # so shift the carried value down by one before the next round.
        fndex -= 1
    # Letters were collected from the last position to the first.
    return "".join(reversed(letters))
# def alph_footnote_index ends here
def debug_chapters(xmlEOAchapters):
    """Write individual chapters to files.

    Each element of ``xmlEOAchapters`` is pretty-printed into
    ``DEBUG_DIR/debug-chapter-NN.xml``, with NN counting from 01 in
    iteration order. Nothing is returned.
    """
    for chap_num, chapter in enumerate(xmlEOAchapters, start=1):
        tmp_filename = DEBUG_DIR / ("debug-chapter-%02d.xml" % chap_num)
        # Serialise before opening, so a serialisation error does not
        # leave an empty file behind; the context manager closes the
        # handle even if the write fails.
        tmp_result = etree.tostring(chapter, pretty_print=True, encoding="unicode")
        with open(tmp_filename, "w") as tmp_file:
            tmp_file.write(tmp_result)
# def debug_chapters ends here
def djangoParseObject(xmlElement, indent=False, listtype=None, listnumber=0, uid=None):
# Convert one node of the intermediate XML into the element that is handed
# to the Django import, and return that element.
#
# The function dispatches on the tag (and, for tables and equations, on the
# descendants) of xmlElement. Most branches build a new element; several
# wrap their output in a helper element named "temp". Many branches move
# children out of xmlElement or retag it, i.e. the input tree is modified.
#
# Parameters:
#   xmlElement  node to convert
#   indent      when True, the result gets the attribute indent="True"
#   listtype    when not None, stored as attribute "listtype"
#   listnumber  when not 0, stored as attribute "listnumber"
#   uid         when not None, stored as attribute "id"
#
# Every emitted object receives an "order" attribute taken from the
# module-level counter intObjectNumber, which is incremented afterwards.
# NOTE(review): intObjectNumber is not assigned anywhere in this part of
# the file; it has to be initialised before the first call.
#
# NOTE(review): the nesting of the statements below could not be recovered
# from this copy of the file (indentation was lost); the comments describe
# each branch without relying on a particular nesting.
# Get Dictionaries of Numbers via Global Variables
# (only dictFigures and dictTables are read below; intObjectNumber is the
# only name that is rebound and therefore really needs "global")
global dictChapters
global dictFigures
global dictEquations
global dictSections
global dictFootnotes
global dictPagelabels
global dictTables
global dictLists
global intObjectNumber
# Check what kind of Element we have and change the data
# (comments and processing instructions have a non-string tag and are
# handled by the "SPECIAL" case near the end)
if isinstance(xmlElement.tag, str):
# --- two-column transcription ---------------------------------------
if xmlElement.tag == "EOAtranscripted":
xmlResult = etree.Element("temp")
xmlEOATranscription = etree.Element("EOAtranscription")
xmlEOATranscription.set("order", str(intObjectNumber))
intObjectNumber += 1
# both column headers are moved over with their <p> wrappers removed
xmlLeftheader = xmlElement.find(".//Leftheader")
etree.strip_tags(xmlLeftheader, "p")
xmlEOATranscription.append(xmlLeftheader)
xmlRightheader = xmlElement.find(".//Rightheader")
etree.strip_tags(xmlRightheader, "p")
xmlEOATranscription.append(xmlRightheader)
xmlTranscriptedtext = xmlElement.find(".//EOAtranscriptedtext")
# change \n\n into </p><p> and pagebreak into </p><pagebreak><p> to create some valid markup
# (the two substitutions are commented out; the text is only serialised
# here and parsed again below, which yields a detached copy)
strTranscriptedtext = etree.tostring(xmlTranscriptedtext, encoding="unicode")
#strTranscriptedtext = re.sub (r"\n\n", "</p><p>", str(strTranscriptedtext))
#strTranscriptedtext = re.sub (r"<p><pagebreak/></p>", "<pagebreak/>", strTranscriptedtext)
xmlLeftColumn = etree.Element("EOAtranscriptionleft")
xmlRightColumn = etree.Element("EOAtranscriptionright")
boolRightColumn = False
xmlTemp = etree.XML(str(strTranscriptedtext))
# Children seen before the first <pagebreak> go to the left column,
# children after it to the right column; the pagebreak itself is dropped.
# NOTE(review): the loop variable reuses the name of the parameter
# xmlElement, so the parameter is rebound from here on.
for xmlElement in xmlTemp.iterchildren():
if xmlElement.tag == "pagebreak":
boolRightColumn = True
continue
if boolRightColumn == False:
xmlLeftColumn.append(xmlElement)
if boolRightColumn == True:
xmlRightColumn.append(xmlElement)
xmlEOATranscription.append(xmlLeftColumn)
xmlEOATranscription.append(xmlRightColumn)
# Convert Images within the transcription
logging.debug("EOAfigurenonumber")
xmlFigures = xmlEOATranscription.findall(".//EOAfigurenonumber")
logging.debug(xmlFigures)
# findall() returns a list, so this test is always true
if xmlFigures is not None:
for xmlFigure in xmlFigures:
# example 'images/1.jpg'
strImageFileString = xmlFigure.find(".//file").text
strImageFileString = strImageFileString.rstrip("\n")
# target file name = directory part with the slashes removed + base name
strImageFileDir = os.path.dirname(strImageFileString)
strImageFileDir = re.sub("/", "", strImageFileDir)
strImageFileName = os.path.basename(strImageFileString)
# (strImageFileNamewoSuffix is not used further in this branch)
strImageFileNamewoSuffix = os.path.splitext(strImageFileName)[0]
# GraphicsMagick call that writes a copy of at most 250x250 into
# images/embedded; after shlex.split the geometry reads "250x250>",
# which presumably means "shrink only" (see GraphicsMagick geometry docs).
# NOTE(review): the paths are interpolated unquoted, so a path containing
# whitespace would be split into several arguments by shlex.split().
strCommand = "{cmd} convert {arg1} -resize 250x250\\> {arg2}".format(
cmd = GM_PATH,
arg1 = PUBLICATION_DIR / strImageFileString,
arg2 = OUTPUT_DIR / "images/embedded" / (strImageFileDir + strImageFileName),
)
listArguments = shlex.split(strCommand)
subprocess.check_output(listArguments, shell=False)
# NOTE(review): the tail is saved here but not written back after clear()
tmpStrTail = xmlFigure.tail
# the figure element is reused as <img src="..." alt="">
xmlFigure.clear()
xmlFigure.tag = "img"
xmlFigure.set("src", strImageFileDir + strImageFileName)
xmlFigure.set("alt", "")
xmlResult.append(xmlEOATranscription)
# --- letter head ----------------------------------------------------
# Recipient, Archive, Additional and Pages are moved into a new
# EOAletterhead element.
elif xmlElement.tag == "EOAletterhead":
xmlResult = etree.Element("temp")
xmlEOAletterhead = etree.Element("EOAletterhead")
xmlEOAletterrecipient = xmlElement.find(".//Recipient")
xmlEOAletterhead.append(xmlEOAletterrecipient)
xmlEOAletterarchive = xmlElement.find(".//Archive")
xmlEOAletterhead.append(xmlEOAletterarchive)
xmlEOAletteradditional = xmlElement.find(".//Additional")
xmlEOAletterhead.append(xmlEOAletteradditional)
xmlEOAletterpages = xmlElement.find(".//Pages")
xmlEOAletterhead.append(xmlEOAletterpages)
xmlEOAletterhead.set("order", str(intObjectNumber))
intObjectNumber += 1
xmlResult.append(xmlEOAletterhead)
# --- figure without number ------------------------------------------
elif xmlElement.tag == "EOAfigurenonumber":
xmlResult = etree.Element("temp")
xmlEOAfigure = etree.Element("EOAfigurenonumber")
# Copy Image
# (stored in OUTPUT_DIR/images as directory-without-slashes + base name)
strImageFileString = xmlElement.find(".//file").text
strImageFileString = strImageFileString.rstrip("\n")
strImageFileDir = os.path.dirname(strImageFileString)
strImageFileDir = re.sub("/", "", strImageFileDir)
strImageFileName = os.path.basename(strImageFileString)
strImageFileNamewoSuffix = os.path.splitext(strImageFileName)[0]
shutil.copy(
PUBLICATION_DIR / strImageFileString,
OUTPUT_DIR / "images" / (strImageFileDir + strImageFileName)
)
# an existing style attribute is carried over unchanged
style_attribute = xmlElement.get("style")
if style_attribute is not None:
xmlEOAfigure.set("style", style_attribute)
xmlEOAfigure.set("file", strImageFileDir + strImageFileName)
xmlEOAfigure.set("width", xmlElement.find(".//width").text + "px;")
xmlEOAfigure.set("order", str(intObjectNumber))
intObjectNumber += 1
xmlResult.append(xmlEOAfigure)
# --- numbered figure ------------------------------------------------
elif xmlElement.tag == "EOAfigure":
# values of the "type" attribute that mark hyperimage figures
hi_figure_types = ["hitrue", "hionly", "hionlycollage", "hionlysub"]
xmlResult = etree.Element("temp")
# Create basic Element EOAfigure
xmlEOAfigure = etree.Element("EOAfigure")
figure_type = xmlElement.get("type")
strImageFileString = xmlElement.find(".//file").text
strImageFileString = strImageFileString.rstrip("\n")
strImageFileDir = os.path.dirname(strImageFileString)
strImageFileDir = re.sub("/", "", strImageFileDir)
strImageFileName = os.path.basename(strImageFileString)
logging.debug("This is figure %s", strImageFileName)
strImageFileNamewoSuffix = os.path.splitext(strImageFileName)[0]
# Copy Image
# (the "hionly*" types are only referenced, not copied)
if figure_type in ["hionly", "hionlycollage", "hionlysub"]:
logging.debug(f"Found hyperimage figure ({figure_type}), no need to copy them.")
xmlEOAfigure.set("file", strImageFileDir + strImageFileName)
pass
else:
shutil.copy(
PUBLICATION_DIR / strImageFileString,
OUTPUT_DIR / "images" / (strImageFileDir + strImageFileName)
)
logging.debug("Django figure %s." % strImageFileName)
# yellow
# PDF figures are passed to libeoaconvert.sanitizeImage() and are then
# referenced with a .png extension; the return value is not used here.
if os.path.splitext(strImageFileName)[1].lower() == ".pdf":
logging.debug(f"""Found a PDF file: {OUTPUT_DIR / "images" / (strImageFileDir + strImageFileName)}""")
strImageFilepath = libeoaconvert.sanitizeImage(
OUTPUT_DIR / "images" / (strImageFileDir + strImageFileName),
TEMP_DIR,
# os.getcwd() + "/CONVERT/django/images/" + strImageFileDir + strImageFileName,
GM_PATH,
PDFCROP_EXEC
)
xmlEOAfigure.set("file", strImageFileDir + strImageFileName.replace(".pdf", ".png"))
logging.debug("The filename is %s" % xmlEOAfigure.get("file"))
else:
xmlEOAfigure.set("file", strImageFileDir + strImageFileName)
# every hyperimage type carries its "hielement" attribute along
if figure_type in hi_figure_types:
xmlEOAfigure.set("hielement", xmlElement.get("hielement"))
if figure_type in ["hionly", "hionlycollage", "hionlysub"]:
logging.debug(f"Found hyperimage figure ({figure_type}), no need for caption and size information.")
# dictFigures maps the id of the figure's anchor to its printed number
strFigureNumber = dictFigures[xmlElement.find(".//anchor").get("id")]
xmlEOAfigure.set("number", strFigureNumber)
else:
xmlEOAfigure.set("width", xmlElement.find(".//width").text + "px;")
xmlEOAfigure.append(xmlElement.find(".//caption"))
# Insert visual Number and uid
strFigureNumber = dictFigures[xmlElement.find(".//anchor").get("id")]
xmlEOAfigure.set("number", strFigureNumber)
strFigureUID = xmlElement.find(".//anchor").get("id")
xmlEOAfigure.set("id", strFigureUID)
xmlEOAfigure.set("order", str(intObjectNumber))
xmlResult.append(xmlEOAfigure)
intObjectNumber += 1
# --- table ----------------------------------------------------------
# The test looks at descendants, so it matches any element that
# contains an EOAtable, not only the EOAtable element itself.
elif xmlElement.findall(".//EOAtable"):
xmlResult = etree.Element("EOAtable")
xmlRawTable = xmlElement.find(".//table")
xmlResult.set("order", str(intObjectNumber))
intObjectNumber += 1
xmlResult.append(xmlRawTable)
# Copy Number, Label and Caption
# (a caption whose text is "nonumber" marks an unnumbered table)
if xmlElement.find(".//EOAtablecaption").text != "nonumber":
xmlResult.append(xmlElement.find(".//EOAtablecaption"))
xmlResult.set("label", xmlElement.find(".//EOAtablelabel").text)
# table_id is not used below; the number is looked up via table_label
table_id = xmlRawTable.get("id")
table_label = xmlRawTable.get("id")
xmlResult.set("number", dictTables[table_label])
xmlResult.set("id", xmlRawTable.get("id"))
else:
# NOTE(review): the attribute is set on the input element, not on
# xmlResult - confirm that this is intended
xmlElement.set("numbering", "false")
#if xmlElement.find(".//EOAtablelabel").text is not None:
# Transform width of Columns
# The column specification consists of entries starting with L, R or C
# and ending in "cm" (the pattern also accepts "mm"); "|" separators are
# removed first.
strColumnString = xmlElement.find(".//EOAtablecolumns").text
strColumnString = re.sub(r"\|", "", strColumnString)
reMatchObjects = re.findall(r'([L|R|C].*?[c|m]m)', strColumnString)
intTableWidth = 0
# Both lists start with a None placeholder so that the 1-based column
# counter used further down can index them directly.
listColumnAlignments = [None]
listColumnWidths = [None]
# (intNumberOfColumns is counted but not read again in this function)
intNumberOfColumns = 0
for strColumnDefinition in reMatchObjects:
# rstrip("cm") removes any trailing "c"/"m" characters
strColumnDefinition = strColumnDefinition.rstrip("cm")
logging.info(strColumnDefinition)
strColumnAlignment = strColumnDefinition[0]
if strColumnAlignment == "L":
strColumnAlignment = "left"
if strColumnAlignment == "C":
strColumnAlignment = "center"
if strColumnAlignment == "R":
strColumnAlignment = "right"
listColumnAlignments.append(strColumnAlignment)
# numeric width times 75, truncated to int (presumably cm -> pixels -
# TODO confirm). NOTE(review): a width given in mm would be scaled with
# the same factor.
intColumnWidth = int(float(strColumnDefinition.lstrip("LRC")) * 75)
listColumnWidths.append(intColumnWidth)
intTableWidth += intColumnWidth
intNumberOfColumns += 1
xmlRawTable.set("width", str(intTableWidth))
# Figure out and deal with the Header
# A <tableheader> marker inside a cell is removed (its tail becomes the
# text of the cell); the first row becomes <tr> and its cells <th>.
xmlHeader = xmlRawTable.find(".//row/cell/tableheader")
if xmlHeader is not None:
xmlHeader.text = ""
xmlHeader.getparent().text = xmlHeader.tail
xmlHeader.getparent().remove(xmlHeader)
xmlFirstRow = xmlRawTable.find(".//row")
xmlFirstRow.tag = "tr"
xmlFirstRowCells = xmlFirstRow.findall(".//cell")
for xmlFirstRowCell in xmlFirstRowCells:
xmlFirstRowCell.tag = "th"
# Now Deal with the rest of the rows
# (rows and cells that were already renamed are no longer found here)
xmlTableRows = xmlRawTable.findall(".//row")
for xmlTableRow in xmlTableRows:
xmlTableCells = xmlTableRow.findall(".//cell")
intCurrentColumn = 1
for xmlTableCell in xmlTableCells:
xmlTableCell.tag = "td"
xmlTableCell.set("align",listColumnAlignments[intCurrentColumn])
# the width is written without a unit
xmlTableCell.set("style","width: " + str(listColumnWidths[intCurrentColumn]) + ";")
# Deal with multicolumn
if xmlTableCell.get("cols") is not None:
xmlTableCell.set("colspan", xmlTableCell.get("cols"))
if intCurrentColumn > len(xmlTableCells):
intCurrentColumn = 1
# Deal with multicolumn again, increase intCurrentColumn by the columns being spanned
elif xmlTableCell.get("cols") is not None:
intCurrentColumn = intCurrentColumn + int(xmlTableCell.get("cols"))
del xmlTableCell.attrib["cols"]
else:
intCurrentColumn += 1
# deal with multirow
if xmlTableCell.get("rowspan") is not None:
cellchildren = xmlTableCell.getchildren()
# <figure> children of a cell become <img>; the image file is copied to
# OUTPUT_DIR/images and PDFs are passed to sanitizeImage() as above
for child in cellchildren:
if child.tag == "figure":
child.tag = "img"
imagepath = f"{child.get('file')}.{child.get('extension')}"
logging.debug(f"{imagepath}")
strImageFileDir = os.path.dirname(imagepath)
strImageFileDir = re.sub("/", "", strImageFileDir)
strImageFileName = os.path.basename(imagepath)
logging.debug(f"{strImageFileDir} and {strImageFileName}")
shutil.copy(
PUBLICATION_DIR / imagepath,
OUTPUT_DIR / "images" / (strImageFileDir + strImageFileName)
)
if child.get('extension') == "pdf":
strImageFilepath = libeoaconvert.sanitizeImage(
OUTPUT_DIR / "images" / (strImageFileDir + strImageFileName),
TEMP_DIR, GM_PATH, PDFCROP_EXEC
)
child.set("src", f"{strImageFileDir + strImageFileName}".replace(".pdf", ".png"))
else:
# NOTE(review): only the directory part is written to "src" here, while
# the PDF case uses directory + file name - looks like the file name is
# missing; confirm before changing
child.set("src", strImageFileDir)
child.set("width", f"{str(listColumnWidths[intCurrentColumn])}px")
del child.attrib["rend"]
del child.attrib["file"]
del child.attrib["extension"]
xmlTableRow.tag = "tr"
xmlTableRow.set("valign", "top")
# --- ordered and simple lists ---------------------------------------
# Lists are flattened: the children of every item are converted by a
# recursive call with indent=True and collected in one "temp" element.
# The first child of an item additionally receives the list type and the
# item label (and, for ordered lists, the item id).
# NOTE(review): the path "..//item" (two leading dots) is unusual;
# ".//item" may have been intended - confirm.
elif xmlElement.tag == "list" and xmlElement.get('type') != 'description':
xmlResult = etree.Element("temp")
if xmlElement.get('type') == 'ordered':
# Change first item into EOAlistfirstitem
xmlFirstItem = xmlElement.find("..//item")
xmlFirstItemElement = xmlFirstItem.getchildren()[0]
xmlResult.append(djangoParseObject(xmlFirstItemElement,indent=True, listtype="ordered", listnumber=xmlFirstItem.get("label"), uid=xmlFirstItem.get("id")))
# Process Child Elements which are Part of this item
# (presumably the first child has been moved into xmlResult by the
# append above, so only the remaining children are seen here - verify)
if len(xmlFirstItem.getchildren()) >= 1:
for xmlChild in xmlFirstItem.iterchildren():
xmlResult.append(djangoParseObject(xmlChild,indent=True))
xmlFirstItem.getparent().remove(xmlFirstItem)
# Process remaining items in this list
# (tmpIntNumber is incremented but never read)
tmpIntNumber = 2
for xmlItem in xmlElement.iterchildren():
xmlItemElement = xmlItem.getchildren()[0]
xmlResult.append(djangoParseObject(xmlItemElement,indent=True,listtype="ordered",listnumber=xmlItem.get("label"), uid=xmlItem.get("id")))
tmpIntNumber += 1
if len(xmlItem.getchildren()) >= 1:
for xmlChild in xmlItem.iterchildren():
xmlResult.append(djangoParseObject(xmlChild, indent=True))
xmlItem.getparent().remove(xmlItem)
if xmlElement.get('type') == 'simple':
# the tag of the first child tells plain items from label/item pairs
xml_first_child = xmlElement.getchildren()[0]
if xml_first_child.tag == 'item':
logging.debug("a simple list with no special items")
# Change first item into EOAlistfirstitem
xmlFirstItem = xmlElement.find("..//item")
xmlFirstItemElement = xmlFirstItem.getchildren()[0]
xmlResult.append(djangoParseObject(xmlFirstItemElement,indent=True,listtype="unordered", listnumber="-"))
# Process Child Elements which are Part of this item
if len(xmlFirstItem.getchildren()) >= 1:
logging.debug("len xmlFirstItem.getchildren is greater or equal 1")
for xmlChild in xmlFirstItem.iterchildren():
xmlResult.append(djangoParseObject(xmlChild,indent=True))
xmlFirstItem.getparent().remove(xmlFirstItem)
for xmlItem in xmlElement.iterchildren():
xmlItemElement = xmlItem.getchildren()[0]
xmlResult.append(djangoParseObject(xmlItemElement,indent=True))
if len(xmlItem.getchildren()) >= 1:
for xmlChild in xmlItem.iterchildren():
xmlResult.append(djangoParseObject(xmlChild,indent=True))
xmlItem.getparent().remove(xmlItem)
#####################
# Work in progress  #
#####################
elif xml_first_child.tag == 'label':
logging.debug("a simple list with named items")
# Change first item into EOAlistfirstitem
xmlFirstItem = xmlElement.find("..//item")
xmlFirstItemElement = xmlFirstItem.getchildren()[0]
logging.debug(xmlFirstItemElement.text)
# debugging
logging.debug(etree.tostring(xmlFirstItemElement))
# end of debugging
# the text of the first <label> is used as the "listnumber" of the first item
xml_first_label = xmlElement.find("..//label")
listnumber_text = xml_first_label.text
xmlResult.append(djangoParseObject(xmlFirstItemElement,indent=True,listtype="unordered custom", listnumber=listnumber_text))
logging.debug("The length of the children of the first item: %s." % len(xmlFirstItem.getchildren()))
# Process Child Elements which are Part of this item
if len(xmlFirstItem.getchildren()) >= 1:
logging.debug("len xmlFirstItem.getchildren is greater or equal 1")
for xmlChild in xmlFirstItem.iterchildren():
xmlResult.append(djangoParseObject(xmlChild,indent=True))
xmlFirstItem.getparent().remove(xmlFirstItem)
xml_first_label.getparent().remove(xml_first_label)
# the remaining labels and items are paired up by position
all_the_labels = xmlElement.findall("label")
all_the_items = xmlElement.findall("item")
logging.debug("itemlength %s." % len(all_the_items))
logging.debug("labellength %s." % len(all_the_labels))
for listlabel, listitem in zip(all_the_labels, all_the_items):
logging.debug("listitem text %s." % listitem.text)
logging.debug("listlabel text %s." % listlabel.text)
# only the first child of each of these items is converted
xml_item_element = listitem.getchildren()[0]
xmlResult.append(djangoParseObject(xml_item_element, indent=True, listnumber=listlabel.text))
listlabel.getparent().remove(listlabel)
listitem.getparent().remove(listitem)
# for xmlItem in xmlElement.iterchildren():
# print("So many items have we: ", len(xmlItem))
# xmlItemElement = xmlItem.getchildren()[0]
# xmlResult.append(djangoParseObject(xmlItemElement,indent=True))
# if len(xmlItem.getchildren()) >= 1:
# for xmlChild in xmlItem.iterchildren():
# xmlResult.append(djangoParseObject(xmlChild,indent=True))
# xmlItem.getparent().remove(xmlItem)
###########################
# End of work in progress #
###########################
# --- description list -----------------------------------------------
# The children are consumed as pairs (label, item) until the list is
# empty. Each pair yields one EOAdescription holding the label (retagged
# "description") and the first child of the item, or an empty <p>.
elif xmlElement.tag == "list" and xmlElement.get('type') == 'description':
logging.debug("A description")
xmlResult = etree.Element("temp")
while len(xmlElement.getchildren()) != 0:
xmlDescription = etree.Element("EOAdescription")
xmlDescription.set("order", str(intObjectNumber))
xmlLabel = xmlElement.getchildren()[0]
# strip one trailing ":" from the end of the label
# NOTE(review): .tail / .text may be None here, in which case
# .endswith() raises AttributeError
label_children = xmlLabel.getchildren()
if label_children:
last_child = label_children[-1]
if last_child.tail.endswith(":"):
last_child.tail = last_child.tail[:-1]
else:
if xmlLabel.text.endswith(":"):
xmlLabel.text = xmlLabel.text[:-1]
xmlItem = xmlElement.getchildren()[1]
if len(xmlItem.getchildren()) > 0:
xmlContent = xmlItem.getchildren()[0]
else:
xmlContent = etree.Element("p")
xmlLabel.tag = "description"
xmlDescription.append(xmlLabel)
xmlDescription.append(xmlContent)
xmlResult.append(xmlDescription)
intObjectNumber += 1
# children still left in the item are converted as indented objects
if len(xmlItem.getchildren()) > 0:
for xmlChild in xmlItem.iterchildren():
xmlResult.append(djangoParseObject(xmlChild,indent=True))
xmlItem.getparent().remove(xmlItem)
# --- theorem --------------------------------------------------------
# The head and the first <p> descendant are moved into an EOAtheorem;
# "id-text" supplies the number and "id" the uid.
elif xmlElement.tag == "theorem":
xmlTheoremHead = xmlElement.find(".//head")
xmlTheoremText = xmlElement.find(".//p")
strTheoremNumber = xmlElement.get("id-text")
strTheoremID = xmlElement.get("id")
xmlResult = etree.Element("EOAtheorem")
xmlResult.append(xmlTheoremHead)
xmlResult.append(xmlTheoremText)
xmlResult.set("order", str(intObjectNumber))
xmlResult.set("number", strTheoremNumber)
xmlResult.set("uid", strTheoremID)
intObjectNumber += 1
# --- equations ------------------------------------------------------
# All equation branches copy the file named by the "filename" attribute
# from INPUT_DIR/items into OUTPUT_DIR/images and emit EOAequation
# elements carrying order, number, filename and TeX.
# Numbered equation array: one EOAequation per contained EOAequation.
elif xmlElement.findall(".//EOAequationarray"):
xmlResult = etree.Element("temp")
for xmlEquation in xmlElement.findall(".//EOAequation"):
xmlEOAequation = etree.Element("EOAequation")
xmlEOAequation.set("order", str(intObjectNumber))
intObjectNumber += 1
xmlEOAequation.set("number", xmlEquation.get("number"))
xmlEOAequation.set("filename", xmlEquation.get("filename"))
if xmlEquation.get("label") is not None:
xmlEOAequation.set("label", xmlEquation.get("label"))
shutil.copy(
INPUT_DIR / "items" /xmlEquation.get("filename"),
OUTPUT_DIR / "images/"
)
# shutil.copy(os.getcwd() + "/items/" + xmlEquation.get("filename"), os.getcwd() + "/CONVERT/django/images/")
xmlEOAequation.set("TeX", xmlEquation.get("TeX"))
# (the label was already copied a few lines above; this repeats it)
if xmlEquation.get("label") is not None:
xmlEOAequation.set("label", xmlEquation.get("label"))
xmlResult.append(xmlEOAequation)
# Unnumbered equation array: filename and TeX are read from the
# EOAequationarraynonumber elements themselves; the number is empty.
elif xmlElement.findall(".//EOAequationarraynonumber"):
xmlResult = etree.Element("temp")
for xmlEquation in xmlElement.findall(".//EOAequationarraynonumber"):
xmlEOAequation = etree.Element("EOAequation")
xmlEOAequation.set("order", str(intObjectNumber))
intObjectNumber += 1
xmlEOAequation.set("number", "")
xmlEOAequation.set("filename", xmlEquation.get("filename"))
shutil.copy(
INPUT_DIR / "items" / xmlEquation.get("filename"),
OUTPUT_DIR / "images/"
)
# shutil.copy(os.getcwd() + "/items/" + xmlEquation.get("filename"), os.getcwd() + "/CONVERT/django/images/")
xmlEOAequation.set("TeX", xmlEquation.get("TeX"))
xmlResult.append(xmlEOAequation)
elif xmlElement.tag == "EOAequationnonumber":
# Process one unnumbered equation which is not encapsulated
xmlResult = etree.Element("EOAequation")
xmlResult.set("order", str(intObjectNumber))
intObjectNumber += 1
xmlResult.set("filename", xmlElement.get("filename"))
xmlResult.set("TeX", xmlElement.get("TeX"))
shutil.copy(
INPUT_DIR / "items" / xmlElement.get("filename"),
OUTPUT_DIR / "images/"
)
# shutil.copy(os.getcwd() + "/items/" + xmlElement.get("filename"), os.getcwd() + "/CONVERT/django/images/")
xmlResult.set("number", "")
elif xmlElement.findall(".//EOAequation"):
# Process various Equations which may be encapsulated within <p>
xmlEquations = xmlElement.findall(".//EOAequation")
xmlResult = etree.Element("temp")
for xmlEquation in xmlEquations:
# Create basic Element EOAequation
xmlEOAequation = etree.Element("EOAequation")
xmlEOAequation.set("order", str(intObjectNumber))
intObjectNumber += 1
xmlEOAequation.set("number", xmlEquation.get("number"))
xmlEOAequation.set("TeX", xmlEquation.get("TeX"))
if xmlEquation.get("uid") is not None:
xmlEOAequation.set("uid", xmlEquation.get("uid"))
shutil.copy(
INPUT_DIR / "items" / xmlEquation.get("filename"),
OUTPUT_DIR / "images/"
)
# shutil.copy(os.getcwd() + "/items/" + xmlEquation.get("filename"), os.getcwd() + "/CONVERT/django/images/")
xmlEOAequation.set("filename", xmlEquation.get("filename"))
xmlResult.append(xmlEOAequation)
elif xmlElement.tag == "EOAequation":
# Process one EOAequation which is not encapsulated
xmlResult = etree.Element("EOAequation")
xmlResult.set("order", str(intObjectNumber))
intObjectNumber += 1
xmlResult.set("number", xmlElement.get("number"))
xmlResult.set("TeX", xmlElement.get("TeX"))
if xmlElement.get("uid") is not None:
xmlResult.set("uid", xmlElement.get("uid"))
shutil.copy(
INPUT_DIR / "items" / xmlElement.get("filename"),
OUTPUT_DIR / "images/"
)
# shutil.copy(os.getcwd() + "/items/" + xmlElement.get("filename"), os.getcwd() + "/CONVERT/django/images/")
xmlResult.set("filename", xmlElement.get("filename"))
# --- subsection / subsubsection -------------------------------------
# The head is moved over first; the children are then converted
# recursively and appended.
elif xmlElement.tag == "div3":
xmlResult = etree.Element("EOAsubsection")
xmlResult.set("order", str(intObjectNumber))
intObjectNumber += 1
xmlResult.append(xmlElement.find("head"))
for xmlChild in xmlElement.iterchildren():
xmlResult.append(djangoParseObject(xmlChild))
elif xmlElement.tag == "div4":
xmlResult = etree.Element("EOAsubsubsection")
xmlResult.set("order", str(intObjectNumber))
intObjectNumber += 1
xmlResult.append(xmlElement.find("head"))
for xmlChild in xmlElement.iterchildren():
xmlResult.append(djangoParseObject(xmlChild))
# --- epigraph -------------------------------------------------------
# Becomes a single EOAparagraph of class "epigraph". <p> children are
# retagged "tagtobestripped" and copied in, separated by <br/> elements;
# EOAverse children are converted by treat_verselines(), which is
# defined outside this part of the file.
elif xmlElement.tag == "epigraph":
xmlResult = etree.Element("EOAparagraph")
xmlResult.set("class", "epigraph")
xmlResult.set("order", str(intObjectNumber))
intObjectNumber += 1
x_children = xmlElement.getchildren()
first_element = True
for child in x_children:
if child.tag == "p":
child.tag = "tagtobestripped"
linebreak = etree.Element("br")
xmlResult.append(linebreak)
if not first_element:
paragraphbreak = etree.Element("br")
xmlResult.append(paragraphbreak)
xmlResult.append(deepcopy(child))
elif child.tag == "EOAverse":
if not first_element:
paragraphbreak = etree.Element("br")
xmlResult.append(paragraphbreak)
verse_result = treat_verselines(child)
xmlResult.append(verse_result)
first_element = False
# --- verse ----------------------------------------------------------
# The <p> children are the verse lines: they are copied into one
# EOAparagraph (style "verse") with a <br/> between lines, and the <p>
# wrappers are stripped afterwards.
elif xmlElement.tag == "EOAverse":
xmlResult = etree.Element("EOAparagraph")
if xmlElement.get("class") is not None:
xmlResult.set("class", xmlElement.get("class"))
xmlResult.set("style", "verse")
xmlResult.set("order", str(intObjectNumber))
intObjectNumber += 1
xml_verselines = xmlElement.findall("p")
xmlResult.append(deepcopy(xml_verselines[0]))
for xml_verseline in xml_verselines[1:]:
linebreak = etree.Element("br")
xmlResult.append(linebreak)
copied_line = deepcopy(xml_verseline)
xmlResult.append(copied_line)
etree.strip_tags(xmlResult, "p")
# --- box heading ----------------------------------------------------
# The element becomes <b> (its style attribute is removed) and is
# wrapped via libeoaconvert.wrap_into_element() with an EOAparagraph of
# style "box".
elif xmlElement.get("style") == "boxhead":
xmlElement.tag = "b"
del xmlElement.attrib["style"]
wrapping_paragraph = etree.Element("EOAparagraph")
wrapping_paragraph.set("style", "box")
libeoaconvert.wrap_into_element(wrapping_paragraph, xmlElement)
wrapping_paragraph.set("order", str(intObjectNumber))
intObjectNumber += 1
xmlResult = wrapping_paragraph
# --- divider paragraph: retagged in place ---------------------------
elif xmlElement.tag == "p" and xmlElement.get("class") == "divider":
xmlElement.tag = "EOAparagraph"
xmlElement.set("order", str(intObjectNumber))
intObjectNumber += 1
xmlResult = xmlElement
elif xmlElement.tag == "EOAtocentry":
# throw them out for the time being
xmlResult = etree.Element("temp")
elif xmlElement.tag == "pagebreak":
# throw them out for the time being
xmlResult = etree.Element("temp")
# --- everything else is treated as a paragraph ----------------------
else:
# an element without children and without text yields an empty "temp"
if xmlElement.getchildren() == [] and not xmlElement.text:
logging.debug(f"Removing empty paragraph")
xmlResult = etree.Element("temp")
else:
# otherwise the element itself is retagged and returned
xmlElement.tag = "EOAparagraph"
logging.debug(f"The beginning of this paragraph is: '{libeoaconvert.gettext(xmlElement)[:40]}…'")
# re-sets rend="quoted" when it already has that value (no change)
quoted_paragraph = xmlElement.get("rend")
if quoted_paragraph is not None and quoted_paragraph == "quoted":
xmlElement.set("rend", "quoted")
xmlElement.set("order", str(intObjectNumber))
intObjectNumber += 1
xmlResult = xmlElement
else:
# non-string tag (see the isinstance test above): logged and passed through
logging.info("SPECIAL: %s - %s" % (xmlElement, xmlElement.text))
xmlResult = xmlElement
# Attributes requested by the caller are stamped on whatever was built.
# The values are passed to set() unchanged.
if indent==True:
xmlResult.set("indent", "True")
if listtype != None:
xmlResult.set("listtype", listtype)
if listnumber != 0:
xmlResult.set("listnumber", listnumber)
if uid != None:
xmlResult.set("id", uid)
return xmlResult
# def djangoParseObject ends here
def make_index(index_hits, index_type):
    """Build the printable index tree from a collection of index hits.

    Each hit is an element that is read through the attributes ``main``
    (sort key), ``display`` (string shown to the reader), ``secondary``
    (optional sub-entry), ``chapterorder``, ``elementorder`` and ``bold``.

    Parameters:
        index_hits: iterable of index hit elements.
        index_type: ``"regular"`` yields an ``EOAprintindex`` root element,
            any other value yields ``EOAprint<index_type>index``.

    Returns:
        The root element.  It contains one ``EOAindexsection`` per first
        character; each section holds ``EOAindexentry`` elements with
        ``EOAindexlink`` children and optional ``EOAindexsubentry`` children.
        The root is empty when there are no usable hits.
    """
    dictIndex = {}
    for xmlEOAindex in index_hits:
        str_display_entry = xmlEOAindex.get("display")
        # A missing ``main`` attribute is handled like an empty one.
        strMainEntry = xmlEOAindex.get("main") or ""
        if len(strMainEntry) == 0:
            if not str_display_entry:
                # Nothing to sort by and nothing to show: such a hit cannot
                # be placed in the index at all.
                logging.warning("Index hit without main entry and without display string. Skipping it.")
                continue
            strMainEntry = str_display_entry
            logging.warning("Index found without main entry, only display string. Using display string for sorting.")
        # If strMainEntry not in Index, then create new index element
        if strMainEntry not in dictIndex:
            dictIndex[strMainEntry] = {
                # the first hit of an entry decides about the display string
                "display_string": str_display_entry if str_display_entry is not None else strMainEntry,
                "listMainentries": [],
                "dictSubentries": {},
            }
        strSubEntry = xmlEOAindex.get("secondary")
        if strSubEntry is None:
            # entry has no subentry: it is a hit for the main entry itself
            dictIndex[strMainEntry]["listMainentries"].append(xmlEOAindex)
        else:
            # entry has a subentry: collect it on the second level
            dictIndex[strMainEntry]["dictSubentries"].setdefault(strSubEntry, []).append(xmlEOAindex)
    # Sort the main index (case-insensitively)
    listSortedKeys = sorted(dictIndex.keys(), key=str.lower)
    if index_type == "regular":
        new_index_element = "EOAprintindex"
    else:
        new_index_element = "EOAprint%sindex" % index_type
    # Create new and empty xmlTree for xmlEOAindex
    xmlEOAprintindex = etree.Element(new_index_element)
    xmlEOAindexsection = None
    listFirstChars = []
    for strSortedKey in listSortedKeys:
        strFirstChar = strSortedKey[0].upper()
        if strFirstChar not in listFirstChars:
            logging.debug("Beginning a new letter: %s." % strFirstChar)
            listFirstChars.append(strFirstChar)
            # close the previous letter before opening the next one
            if xmlEOAindexsection is not None:
                xmlEOAprintindex.append(xmlEOAindexsection)
            xmlEOAindexsection = etree.Element("EOAindexsection")
            xmlEOAindexsection.set("Character", strFirstChar)
        # beginning a new entry
        xmlEOAindexentry = etree.Element("EOAindexentry")
        xmlEOAindexentry.set("main", strSortedKey)
        xmlEOAindexentry.set("display", dictIndex[strSortedKey]["display_string"])
        logging.debug("Index entry: %s." % strSortedKey)
        for xmlMainelement in dictIndex[strSortedKey]["listMainentries"]:
            logging.info("%s:%s", xmlMainelement.get("chapterorder"), xmlMainelement.get("elementorder"))
            xmlEOAindexlink = etree.Element("EOAindexlink")
            xmlEOAindexlink.set("chapterorder", xmlMainelement.get("chapterorder"))
            xmlEOAindexlink.set("elementorder", xmlMainelement.get("elementorder"))
            if xmlMainelement.get("bold") is not None:
                xmlEOAindexlink.set("bold", "True")
            xmlEOAindexentry.append(xmlEOAindexlink)
        # If there are any subentries, process them now
        dictSubentries = dictIndex[strSortedKey]["dictSubentries"]
        if len(dictSubentries) > 0:
            logging.debug("Processing Subentries")
            for strSortedSubKey in sorted(dictSubentries):
                xmlEOAindexsubentry = etree.Element("EOAindexsubentry")
                xmlEOAindexsubentry.set("secondary", strSortedSubKey)
                for xmlSubElement in dictSubentries[strSortedSubKey]:
                    # insert the links pointing to the sub-entry
                    xmlEOAindexlink = etree.Element("EOAindexlink")
                    xmlEOAindexlink.set("chapterorder", xmlSubElement.get("chapterorder"))
                    xmlEOAindexlink.set("elementorder", xmlSubElement.get("elementorder"))
                    xmlEOAindexsubentry.append(xmlEOAindexlink)
                    if xmlSubElement.get("bold") is not None:
                        xmlEOAindexlink.set("bold", "True")
                    logging.debug(xmlSubElement.get("secondary"))
                xmlEOAindexentry.append(xmlEOAindexsubentry)
        xmlEOAindexsection.append(xmlEOAindexentry)
    # The last letter is still open; there is none if the index is empty.
    if xmlEOAindexsection is not None:
        xmlEOAprintindex.append(xmlEOAindexsection)
    return xmlEOAprintindex
# def make_index ends here
def djangoParseHeadline(xmlElement):
    """Split the ``EOAauthor`` child of a headline into one element per author.

    The first ``EOAauthor`` element found below ``xmlElement`` is removed.
    Its text is split at commas and at the conjunctions ``" and "``,
    ``", and "`` and ``" und "``; for every non-empty name a new
    ``EOAauthor`` element is appended to ``xmlElement``.

    Parameters:
        xmlElement: the headline element; it is modified in place.

    Returns:
        The same ``xmlElement``.
    """
    # Parse EOAauthor and append it to the Chapter Information
    xmlAuthors = xmlElement.find(".//EOAauthor")
    if xmlAuthors is None:
        return xmlElement
    # an empty <EOAauthor/> has no text at all
    strAuthors = xmlAuthors.text or ""
    xmlElement.remove(xmlAuthors)
    strAuthors = re.sub("(, and | and | und )", ",", strAuthors)
    listAuthors = strAuthors.split(",")
    logging.debug(listAuthors)
    for strAuthor in listAuthors:
        # Remove whitespace before and after the author string
        strAuthor = strAuthor.strip()
        if not strAuthor:
            # e.g. produced by a trailing comma; not an author
            continue
        xmlAuthor = etree.Element("EOAauthor")
        xmlAuthor.text = strAuthor
        xmlElement.append(xmlAuthor)
    return xmlElement
# def djangoParseHeadline ends here
def check_publication_cfg(configuration_file):
    """Check the configuration file before uploading

    This function is adapted from the publicationimport script.

    Every expected item of the sections ``Technical``, ``General`` and
    ``Authors`` is looked up; each item that cannot be found is reported
    through ``logging.error``.  A file that cannot be read at all is
    reported once instead of listing every item as missing.

    Parameters:
        configuration_file: path of the publication configuration file.

    Returns:
        None.  Problems are only logged.
    """
    logging.debug("Checking configuration file %s.", configuration_file)
    config = configparser.ConfigParser()
    try:
        files_read = config.read(configuration_file)
    except configparser.ParsingError as err:
        # The completeness check below still runs on whatever was parsed.
        logging.error(err)
    else:
        # ConfigParser.read() silently skips files it cannot open and
        # returns the list of files it actually parsed.
        if not files_read:
            logging.error("Configuration file %s could not be read.", configuration_file)
            return
    technical_items = ["Serie", "Number", "Title", "Subtitle", "PublicationDate", "Language", "License", "ISBN", "Price", "Shoplink"]
    general_items = ["BriefDescription", "DetailedDescription", "Submitter", "EditorialCoordination", "Copyediting", "Translator", "Dedication"]
    authors_items = ["Author1", "Author2", "Author3", "Author4", "Author5", "Zusatz"]
    categories = {"Technical" : technical_items, "General" : general_items, "Authors" : authors_items}
    for cat, items in categories.items():
        for item in items:
            try:
                config[cat][item]
            except KeyError:
                # raised for a missing section as well as a missing item
                logging.error("%s is missing in configuration.", item)
    return
# def check_publication_cfg ends here
def treat_verselines(verse_element):
    """Dissolve verselines to lines with linebreak milestones

    Every direct ``p`` child of ``verse_element`` is renamed to
    ``tagtobestripped`` (in place).  Copies of these lines are collected in
    a new ``tagtobestripped`` element, separated by ``br`` elements.

    Parameters:
        verse_element: element whose direct ``p`` children are the lines.

    Returns:
        The new wrapper element.  It has no children when the verse
        contains no lines.
    """
    xml_result = etree.Element("tagtobestripped")
    xml_verselines = verse_element.findall("p")
    for position, xml_verseline in enumerate(xml_verselines):
        xml_verseline.tag = "tagtobestripped"
        if position > 0:
            # a linebreak goes between two lines, never before the first
            xml_result.append(etree.Element("br"))
        xml_result.append(deepcopy(xml_verseline))
    return xml_result
# def treat_verselines ends here
def bring_footnote_down_django(footnote, fragment, footnote_number, object_number, unique_id, destination):
    """
    Turn an inline note into a link and move its content to ``destination``.

    The note element itself is rewritten in place into the footnote mark
    (an ``a`` element linking to ``#<fragment>``), while its text and
    children are moved into a new ``EOAfootnote`` element that is appended
    to ``destination``.

    usage: intObjectNumber = bring_footnote_down_django(xmlFootnote, "fn"+str(intFootnoteNumber), str(intFootnoteNumber), intObjectNumber, tmpStrUID, xmlResult)

    Parameters:
        footnote: the note element inside the running text.
        fragment: link target without the leading ``#``.
        footnote_number: number (or letter) shown for the footnote, a string.
        object_number: running order number to assign to the new footnote.
        unique_id: value for the ``id`` attribute of the new footnote.
        destination: element that receives the new ``EOAfootnote``.

    Returns:
        ``object_number + 1``; the caller stores it back because the
        function does not touch the global counter itself.

    Raises:
        StopIteration: if no ancestor of ``footnote`` has an ``order``
        attribute.
    """
    # Remember the content first; it has to survive the rewrite of the note.
    original_children = list(footnote.getchildren())
    original_text = footnote.text or ""

    # replace_footnote_with_sup is defined elsewhere in this file;
    # presumably it empties the note and turns it into a <sup> -- TODO confirm
    replace_footnote_with_sup(footnote)
    footnote.set("class", "footnote")
    link = etree.Element("a")
    link.set("href", "#" + fragment)
    link.text = footnote_number
    footnote.append(link)

    new_footnote = etree.Element("EOAfootnote")
    new_footnote.set("order", str(object_number))
    next_object_number = object_number + 1
    new_footnote.set("number", footnote_number)

    # The footnote is anchored at the closest ancestor carrying an order.
    ancestor_orders = (
        ancestor.get("order")
        for ancestor in footnote.iterancestors()
        if ancestor.get("order") is not None
    )
    new_footnote.set("anchor", next(ancestor_orders))
    new_footnote.set("id", unique_id)
    new_footnote.text = original_text

    for child in original_children:
        if child.tag == "EOAequationnonumber":
            # equation images are copied relative to the working directory
            working_dir = os.getcwd()
            shutil.copy(
                "%s/items/%s" % (working_dir, child.get("filename")),
                "%s/images/" % working_dir,
            )
        new_footnote.append(child)

    destination.append(new_footnote)
    return next_object_number
# def bring_footnote_down_django ends here
###############################
# End of function definitions #
###############################
# Iterate over Chapters, Sections, Subsections, and Subsubsections and
# put all on one level: EOAchapter
#
# Every chapter becomes one EOAchapter element below xmlEOAdocument.  The
# headlines of the nested divisions (div2 ... div5) and all other content
# are appended to that element as siblings.  The running number
# intObjectNumber restarts at 1 for every chapter.
intChapterNumber = 1
listPartIDs = []
for xmlChapter in xmlChapters:
    intObjectNumber = 1
    # Process Chapter Title
    xmlEOAchapter = etree.Element("EOAchapter")
    xmlEOAchapter.set("type","regular")
    xmlLanguage = xmlChapter.get("language")
    if xmlLanguage is not None:
        # KT changing this after separating the big script
        strLanguage = xmlLanguage #or "english"
    else:
        strLanguage = "english"
    xmlEOAchapter.set("language", strLanguage)
    xmlEOAchapter.set("order", str(intChapterNumber))
    if xmlChapter.get("rend") != "nonumber":
        xmlEOAchapter.set("id", xmlChapter.get("id"))
    xmlChapterHeadline = xmlChapter.find(".//head")
    if xmlChapter.get("id") in dictChapters:
        xmlEOAchapter.set("number", dictChapters[xmlChapter.get("id")])
    else:
        xmlEOAchapter.set("number", "")
    logging.info("-----------------------------------------------------")
    logging.info(libeoaconvert.gettext(xmlChapterHeadline))
    xmlEOAchapter.append(djangoParseHeadline(xmlChapterHeadline))
    # Deal with EOAauthor
    if xmlChapter.find(".//EOAauthor") is not None:
        xmlEOAchapter.append(xmlChapter.find(".//EOAauthor"))
    # Attach enclosing Part to Chapter, see django structure for this purpose.
    # Only the first chapter of a part receives the part headline.
    if xmlChapter.getparent().tag == "div0":
        if xmlChapter.getparent().get("id") not in listPartIDs:
            listPartIDs.append(xmlChapter.getparent().get("id"))
            xmlPartHeadline = xmlChapter.getparent().find("head")
            xmlPartHeadline.tag = "EOAparthtml"
            xmlEOAchapter.append(xmlPartHeadline)
    # Append Chapter to xmlEOAdocument
    xmlEOAdocument.append(xmlEOAchapter)
    # iterate over children of Chapter
    for xmlChapterChild in xmlChapter.iterchildren():
        if xmlChapterChild.tag == "div2":
            # Process Section Title
            xmlEOAsection = etree.Element("EOAsection")
            xmlEOAsection.set("order", str(intObjectNumber))
            if xmlChapterChild.get("rend") != "nonumber":
                xmlEOAsection.set("id", xmlChapterChild.get("id"))
                xmlEOAsection.set("number", dictSections[xmlChapterChild.get("id")])
            intObjectNumber += 1
            # Take the headline of this very section, like all deeper
            # levels do.  Searching the whole chapter would return whatever
            # head happens to come first in the chapter.
            xmlHead = xmlChapterChild.find(".//head")
            logging.debug("Section '%s'" % libeoaconvert.gettext(xmlHead))
            xmlEOAsection.append(djangoParseHeadline(xmlHead))
            xmlEOAchapter.append(xmlEOAsection)
            # Iterate over Children of Section
            for xmlSectionChild in xmlChapterChild.iterchildren():
                if xmlSectionChild.tag == "div3":
                    # Process Subsection Title
                    xmlEOAsubsection = etree.Element("EOAsubsection")
                    xmlEOAsubsection.set("order", str(intObjectNumber))
                    if xmlSectionChild.get("rend") != "nonumber":
                        xmlEOAsubsection.set("id", xmlSectionChild.get("id"))
                        xmlEOAsubsection.set("number", dictSections[xmlSectionChild.get("id")])
                    intObjectNumber += 1
                    xmlHead = xmlSectionChild.find(".//head")
                    logging.debug("Subsection '%s'" % libeoaconvert.gettext(xmlHead))
                    xmlEOAsubsection.append(djangoParseHeadline(xmlHead))
                    xmlEOAchapter.append(xmlEOAsubsection)
                    # Iterate over children of Subsection
                    for xmlSubsectionChild in xmlSectionChild.iterchildren():
                        if xmlSubsectionChild.tag == "div4":
                            # Process Subsubsection Title
                            xmlEOAsubsubsection = etree.Element("EOAsubsubsection")
                            xmlEOAsubsubsection.set("order", str(intObjectNumber))
                            intObjectNumber += 1
                            xmlHead = xmlSubsectionChild.find(".//head")
                            logging.debug(libeoaconvert.gettext(xmlHead))
                            xmlEOAsubsubsection.append(djangoParseHeadline(xmlHead))
                            xmlEOAchapter.append(xmlEOAsubsubsection)
                            # Iterate over children of Subsubsection
                            for xmlSubsubsectionChild in xmlSubsectionChild.iterchildren():
                                xmlEOAchapter.append(djangoParseObject(xmlSubsubsectionChild))
                        else:
                            xmlEOAchapter.append(djangoParseObject(xmlSubsectionChild))
                elif xmlSectionChild.tag == "div4":
                    # A subsubsection directly below a section.
                    # Process Subsubsection Title
                    xmlEOAsubsubsection = etree.Element("EOAsubsubsection")
                    xmlEOAsubsubsection.set("order", str(intObjectNumber))
                    intObjectNumber += 1
                    xmlHead = xmlSectionChild.find(".//head")
                    xmlEOAsubsubsection.append(djangoParseHeadline(xmlHead))
                    xmlEOAchapter.append(xmlEOAsubsubsection)
                    # Iterate over children of Subsubsection
                    for xmlSubsubsectionChild in xmlSectionChild.iterchildren():
                        if xmlSubsubsectionChild.tag == "div5":
                            logging.debug("jubel")
                            # although it's div5, promote it to subsubsection
                            xmlEOAparasection = etree.Element("EOAsubsubsection")
                            # xmlEOAparasection = etree.Element("EOAparasection")
                            xmlEOAparasection.set("order", str(intObjectNumber))
                            intObjectNumber += 1
                            xmlHead = xmlSubsubsectionChild.find(".//head")
                            logging.debug(libeoaconvert.gettext(xmlHead))
                            xmlEOAparasection.append(djangoParseHeadline(xmlHead))
                            xmlEOAchapter.append(xmlEOAparasection)
                            for xmlParasectionChild in xmlSubsubsectionChild.iterchildren():
                                xmlEOAchapter.append(djangoParseObject(xmlParasectionChild))
                        else:
                            xmlEOAchapter.append(djangoParseObject(xmlSubsubsectionChild))
                else:
                    xmlEOAchapter.append(djangoParseObject(xmlSectionChild))
        else:
            xmlEOAchapter.append(djangoParseObject(xmlChapterChild))
    intChapterNumber += 1
libeoaconvert.debug_xml_here(
    xmlTree,
    "afterchapter",
    DEBUG_DIR
)
logging.info("----------------------------------------------")
logging.info("Processing Facsimile Parts")
# Every part (div0) that contains facsimile pages becomes an
# EOAfacsimilepart below xmlEOAdocument.  The page images are copied to
# OUTPUT_DIR/images; where a page names a remote transcription, that
# transcription is downloaded in all modes of listModes.

# objectify is needed for deannotate() below and is not among the imports
# at the top of this file.
from lxml import objectify
listModes = ["text", "textPollux", "xml"]
strBasicURL = "http://mpdl-system.mpiwg-berlin.mpg.de/mpdl/interface/page-fragment.xql?document="
parserECHO = etree.XMLParser()
xmlParts = xmlTree.findall("//div0")
intFacNumber = 1
for xmlPart in xmlParts:
    intObjectNumber = 1
    intFacPartNumber = 1
    if xmlPart.find(".//EOAfacsimilepart") is None:
        continue
    xmlEOAfacsimilepart = etree.Element("EOAfacsimilepart")
    # facsimile parts continue the numbering of the chapters
    xmlEOAfacsimilepart.set("order", str(intChapterNumber))
    xmlEOAfacsimileparthead = xmlPart.find(".//head")
    for xmlChild in xmlEOAfacsimileparthead:
        if xmlChild.tag == "hi":
            xmlChild.tag = "em"
            del xmlChild.attrib["rend"]
    xmlEOAfacsimilepart.append(xmlEOAfacsimileparthead)
    intChapterNumber += 1
    xmlEOAdocument.append(xmlEOAfacsimilepart)
    xmlFacsimilepages = xmlPart.findall(".//EOAfacsimilepage")
    intFacPageNumber = 1
    for xmlFacsimilepage in xmlFacsimilepages:
        strImageFile = xmlFacsimilepage.find(".//file").text
        strLabel = xmlFacsimilepage.find(".//label").text
        strPagenumber = xmlFacsimilepage.find(".//pagenumber").text or ""
        xmlEOAfacsimilepage = etree.Element("EOAfacsimilepage")
        xmlEOAfacsimilepage.set("order", str(intObjectNumber))
        # TODO: somehow deal with a (missing) file suffix here, and convert files if necessary
        strImageFile = strImageFile.rstrip("\n")
        # directory (slashes removed) and file name together form the flat
        # name of the copied image
        strImageFileDir = os.path.dirname(strImageFile)
        strImageFileDir = re.sub("/", "", strImageFileDir)
        strImageFileName = os.path.basename(strImageFile)
        shutil.copy(
            PUBLICATION_DIR / strImageFile,
            OUTPUT_DIR / "images" / (strImageFileDir + strImageFileName)
        )
        intObjectNumber += 1
        # Download transcription for this Page
        fulltext_string = xmlFacsimilepage.find(".//fulltext").text
        if fulltext_string is not None:
            logging.debug(f"Found a link to full text: {fulltext_string}")
            if fulltext_string.find(",") == -1:
                logging.info("Fulltext is linked in the document.")
                # TODO: continue here, this case is not handled yet
            else:
                # "<document>,<page>" names a page of a remote document
                strFacsimileURL = re.split(",", fulltext_string)[0]
                strFacsimilePage = re.split(",", fulltext_string)[1]
                for strMode in listModes:
                    strURL = strBasicURL + strFacsimileURL + "&pn=" + strFacsimilePage + "&mode=" + strMode
                    logging.debug("Processing Facsimile : " + strURL)
                    xmlECHOtree = etree.parse(strURL, parserECHO)
                    # Remove ECHO-namespaces
                    objectify.deannotate(xmlECHOtree, xsi_nil=True)
                    etree.cleanup_namespaces(xmlECHOtree)
                    xmlDivs = xmlECHOtree.findall(".//div")
                    for xmlDiv in xmlDivs:
                        if xmlDiv.get("class") == "pageContent":
                            # Create new EOA-Element
                            xmlEOAfacsimileelement = etree.Element("EOAfacsimileelement")
                            xmlEOAfacsimileelement.set("type", strMode)
                            # Fix Images in the <div>-Element
                            xmlImages = xmlDiv.findall(".//img")
                            intFacImgNumber = 1
                            for xmlImage in xmlImages:
                                strImageSrc = xmlImage.get("src")
                                strFacImgName = "facsupplements_" + str(intFacNumber) + "_" + str(intFacPageNumber) + "_" + str(intFacImgNumber) + ".jpg"
                                strCommand = "{cmd} {src} -o {dst}".format(
                                    cmd = curl,
                                    src = strImageSrc,
                                    dst = OUTPUT_DIR / "images" / strFacImgName
                                )
                                listArguments = shlex.split(strCommand)
                                try:
                                    exeShell = subprocess.check_output(listArguments, shell=False, universal_newlines=True)
                                except (subprocess.SubprocessError, OSError, ValueError) as err:
                                    # Best effort: an image that cannot be
                                    # downloaded is dropped (its temp tag is
                                    # stripped below), but not silently.
                                    logging.warning("Could not download facsimile image %s: %s", strImageSrc, err)
                                    xmlImage.tag = "temp"
                                else:
                                    # point the img element to the local copy
                                    xmlImage.set("src", strFacImgName)
                                intFacImgNumber += 1
                            xmlEOAfacsimileelement.append(xmlDiv)
                            xmlEOAfacsimilepage.append(xmlEOAfacsimileelement)
        intFacPageNumber += 1
        xmlEOAfacsimilepage.set("file", strImageFileDir + strImageFileName)
        xmlEOAfacsimilepage.set("label", str(strLabel))
        xmlEOAfacsimilepage.set("pagenumber", str(strPagenumber))
        xmlEOAfacsimilepart.append(xmlEOAfacsimilepage)
    # count the facsimile parts; the number is part of the image file names
    intFacNumber += 1
etree.strip_tags(xmlDjangoTree, "temp")
logging.info("----------------------------------------------")
logging.info("Processing and linking Footnotes for django")
# For every chapter with footnotes, a section with a localised headline is
# appended to the chapter.  Each footnote in the running text is replaced
# by a numbered link, and its content is moved into an EOAfootnote element
# below that section.  Everything is collected in a "temp" element whose
# tag is stripped at the very end.
xmlEOAchapters = xmlEOAdocument.findall(".//EOAchapter")
# debug_chapters(xmlEOAchapters)
translation_xml = etree.parse( str( TRANSLATION_FILE ) )
dictLangFootnotes = translation_xml.find("//entry[@name='footnotes']").attrib
for xmlEOAchapter in xmlEOAchapters:
    groupings = libeoaconvert.get_bigfoot_data(xmlEOAchapter)
    has_old = 0 != len(xmlEOAchapter.findall(".//note[@place='Inline']"))
    has_new = 0 != len(
        [ # flatten
            note
            for grouping, notes in groupings
            for note in notes
        ]
    )
    # XOR falls through, AND is an error (that should have already been thrown during the epub phase), and NOR skips to the next chapter
    if has_old:
        if has_new:
            raise FootnoteError("This chapter contains both old-style footnotes and new-style footnotes")
    else:
        if not has_new:
            continue
    # Find out running order of last item the chapter
    # For each footnote, first fix the EOAequationnonumber in <p>
    # Then work through the child elements of each footnote and attach them to the new footnote
    # Possibly check whether a paragraph is indented; if so, wrap it in a blockquote
    xmlElement = xmlEOAchapter[(len(xmlEOAchapter)-1)]
    logging.debug(etree.tostring(xmlElement))
    intObjectNumber = (int(xmlElement.get("order")) + 1)
    intFootnoteNumber = 1
    xmlResult = etree.Element("temp")
    xmlEOAsection = etree.Element("EOAsection")
    xmlEOAsection.set("order", str(intObjectNumber))
    intObjectNumber += 1
    xmlHead = etree.Element("head")
    xmlHead.text = dictLangFootnotes[libeoaconvert.two_letter_language(xmlEOAchapter.get("language"))]
    xmlEOAsection.append(xmlHead)
    xmlResult.append(xmlEOAsection)
    for grouping, notes in groupings:
        for index, note in enumerate(notes):
            # do for the new-style notes what the old code did for the other footnotes
            fntext = str(index+1)
            if "lower-latin" == grouping:
                fntext = alph_footnote_index(index)
            unique_id = "fn%s" % fntext
            intObjectNumber = bring_footnote_down_django(note, unique_id, fntext, intObjectNumber, unique_id, xmlResult)
    intFootnoteNumber = 1
    xmlFootnotes = xmlEOAchapter.findall(".//note[@place='Inline']")
    for xmlFootnote in xmlFootnotes:
        tmpStrUID = xmlFootnote.get("id")
        logging.debug(f"Looking at footnote {tmpStrUID}.")
        # index entries inside a footnote are thrown away, their tail text is kept
        xml_EOA_indices = xmlFootnote.xpath(".//EOAindex | .//EOAindexperson | .//EOAindexlocation")
        for xmlEOAindex in xml_EOA_indices:
            logging.debug("Removing index entry in footnote.")
            xmlEOAindex.tag = "elementtoberemoved"
        etree.strip_elements(xmlFootnote, "elementtoberemoved", with_tail=False)
        # remember the content, then turn the note itself into the
        # superscript link (clear() also drops the tail, so restore it)
        xmlFootnoteContent = list(xmlFootnote)
        strFootnoteText = xmlFootnote.text or ""
        tmpTail = xmlFootnote.tail
        xmlFootnote.clear()
        xmlFootnote.tail = tmpTail
        xmlFootnote.tag = "sup"
        xmlFootnote.set("class", "footnote")
        xmlFootnoteLink = etree.Element("a")
        xmlFootnoteLink.set("href", "#fn" + str(intFootnoteNumber))
        xmlFootnoteLink.text = str(intFootnoteNumber)
        xmlFootnote.append(xmlFootnoteLink)
        xmlEOAfootnote = etree.Element("EOAfootnote")
        xmlEOAfootnote.set("order", str(intObjectNumber))
        intObjectNumber += 1
        xmlEOAfootnote.set("number", str(intFootnoteNumber))
        # the footnote is anchored at the closest ancestor that has an order
        for xmlParent in xmlFootnote.iterancestors():
            if xmlParent.get("order") is not None:
                strFootnoteAnchorNumber = xmlParent.get("order")
                break
        xmlEOAfootnote.set("anchor", strFootnoteAnchorNumber)
        xmlEOAfootnote.set("id", tmpStrUID)
        xmlEOAfootnote.text = strFootnoteText
        for xmlElement in xmlFootnoteContent:
            if xmlElement.tag == "p":
                logging.debug("Footnote paragraph")
            elif xmlElement.tag == "EOAequationnonumber":
                # The equation element itself stays in the footnote (like
                # in bring_footnote_down_django); only its image is copied.
                shutil.copy(
                    PUBLICATION_DIR / "items" / xmlElement.get("filename"),
                    OUTPUT_DIR / "images/"
                )
            elif xmlElement.tag == "EOAverse":
                # a verse becomes a span inside a paragraph of its own
                xmlElement.tag = "span"
                xmlElement.set("style", "verse")
                logging.debug("Removing surrounding EOAverse tags")
                surrounding_p = etree.fromstring("""<p></p>""")
                surrounding_p.append(xmlElement)
                xmlElement = surrounding_p
            elif xmlElement.tag == "span":
                # a bare span gets a paragraph around it
                surrounding_p = etree.fromstring("""<p></p>""")
                surrounding_p.append(xmlElement)
                xmlElement = surrounding_p
            elif xmlElement.tag == "EOAfigurenonumber":
                # the figure is replaced by an img pointing to the copied file
                xmlFootnoteImage = etree.fromstring("""<img/>""")
                strImageFileString = xmlElement.find(".//file").text
                strImageFileString = strImageFileString.rstrip("\n")
                strImageFileDir = os.path.dirname(strImageFileString)
                strImageFileDir = re.sub("/", "", strImageFileDir)
                strImageFileName = os.path.basename(strImageFileString)
                strImageFileNamewoSuffix = os.path.splitext(strImageFileName)[0]
                shutil.copy(
                    PUBLICATION_DIR / strImageFileString,
                    OUTPUT_DIR / "images" / "embedded" / (strImageFileDir + strImageFileName)
                )
                xmlFootnoteImage.set("src", strImageFileDir + strImageFileName)
                xmlFootnoteImage.set("width", xmlElement.find(".//width").text + "%;")
                xmlElement = xmlFootnoteImage
            # any other element is moved into the footnote unchanged
            xmlEOAfootnote.append(xmlElement)
        xmlResult.append(xmlEOAfootnote)
        # the link title shows the beginning of the footnote as plain text
        footnote_as_plain_text = libeoaconvert.remove_all_tags(xmlEOAfootnote)
        if len(footnote_as_plain_text) > 200:
            maybe_truncated_footnote_as_plain_text = footnote_as_plain_text[:200] + "…"
        else:
            maybe_truncated_footnote_as_plain_text = footnote_as_plain_text
        xmlFootnoteLink.set("title", maybe_truncated_footnote_as_plain_text)
        intFootnoteNumber += 1
    xmlEOAchapter.append(xmlResult)
# Remove temp-Tag
etree.strip_tags(xmlDjangoTree, "temp")
logging.info("----------------------------------------------")
logging.info("Processing various Elements")
# Rename the remaining EOA inline elements of every chapter to the HTML
# elements used on the django side and copy the images they refer to.
for xmlEOAchapter in xmlEOAchapters:
    # Convert hi elements according to their rend attribute; other rend
    # values are left untouched
    xmlEmphasized = xmlEOAchapter.findall(".//hi")
    for xmlEmph in xmlEmphasized:
        rend_attribute = xmlEmph.get("rend")
        if rend_attribute == "it":
            xmlEmph.tag = "em"
            del xmlEmph.attrib["rend"]
        elif rend_attribute == "bold":
            xmlEmph.tag = "b"
            del xmlEmph.attrib["rend"]
        elif rend_attribute == "red":
            xmlEmph.tag = "span"
            xmlEmph.set("class", "red")
            del xmlEmph.attrib["rend"]
    # Format hyperlinks with the language of the chapter they are in.  The
    # global strLanguage still holds the language of the last chapter
    # processed by the chapter loop and must not be used here.
    xmlHyperlinks = xmlEOAchapter.findall(".//xref")
    for xmlHyperlink in xmlHyperlinks:
        libeoaconvert.format_hyperlinks_django_epub(xmlHyperlink, xmlEOAchapter.get("language"))
    # Convert EOAup to <sup>
    xmlUps = xmlEOAchapter.findall(".//EOAup")
    for xmlUp in xmlUps:
        xmlUp.tag = "sup"
    # Convert EOAdown to <sub>
    xmlDowns = xmlEOAchapter.findall(".//EOAdown")
    for xmlDown in xmlDowns:
        xmlDown.tag = "sub"
    # Convert EOAst to <span>
    xmlStrikeouts = xmlEOAchapter.findall(".//EOAst")
    for xmlStrikeout in xmlStrikeouts:
        xmlStrikeout.tag = "span"
        xmlStrikeout.set("style", "text-decoration: line-through;")
    # Convert letter-spacing into something nice
    xmlLetterspaceds = xmlEOAchapter.findall(".//EOAls")
    for xmlLetterspaced in xmlLetterspaceds:
        xmlLetterspaced.tag = "span"
        xmlLetterspaced.set("style", "letter-spacing: 0.5em;")
    # Convert EOAcaps into small caps
    xmlCaps = xmlEOAchapter.findall(".//EOAcaps")
    for xmlCap in xmlCaps:
        xmlCap.tag = "span"
        xmlCap.set("style", "font-variant:small-caps;")
    # Convert EOAineq into appropriate IMG-Tags
    xmlInlineEquations = xmlEOAchapter.findall(".//EOAineq")
    for xmlInlineEquation in xmlInlineEquations:
        xmlInlineEquation.tag = "img"
        xmlInlineEquation.set("class", "EOAineq")
        xmlInlineEquation.set("alt", xmlInlineEquation.get("TeX"))
        shutil.copy(
            INPUT_DIR / "items" / xmlInlineEquation.get("src"),
            OUTPUT_DIR / "images" / xmlInlineEquation.get("src")
        )
    # Convert EOAchem into appropriate IMG-Tags
    xml_inline_chems = xmlEOAchapter.findall(".//EOAchem")
    for xml_inline_chem in xml_inline_chems:
        xml_inline_chem.tag = "img"
        xml_inline_chem.set("class", "EOAineq")
        xml_inline_chem.set("alt", xml_inline_chem.get("TeX"))
        shutil.copy(
            INPUT_DIR / "items" / xml_inline_chem.get("src"),
            OUTPUT_DIR / "images" / xml_inline_chem.get("src")
        )
    # Convert EOAinline into appropriate IMG-Tags
    xmlInlineElements = xmlEOAchapter.findall(".//EOAinline")
    for xmlInlineElement in xmlInlineElements:
        xmlInlineElement.tag = "img"
        xmlInlineElement.set("alt", "")
        xmlInlineElement.set("class", "eoainlineimage")
        # the text of the element is the path of the image file
        strInlineElementFilePath = xmlInlineElement.text
        strInlineElementFileName = os.path.basename(strInlineElementFilePath)
        strInlineElementDirName = os.path.dirname(strInlineElementFilePath)
        # only the innermost directory name becomes part of the new file name
        strInlineElementSubDirName = os.path.dirname(strInlineElementFilePath).split(os.path.sep)[-1]
        xmlInlineElement.text = None
        if os.path.splitext(strInlineElementFileName)[1].lower() == ".pdf":
            logging.debug(f"""Found a PDF file: {PUBLICATION_DIR / strInlineElementDirName / strInlineElementFileName}""")
            strImageFilepath = libeoaconvert.sanitizeImage(
                PUBLICATION_DIR / strInlineElementDirName / strInlineElementFileName,
                TEMP_DIR,
                GM_PATH,
                PDFCROP_EXEC,
                margin=False
            )
            # NOTE(review): the check above is case-insensitive but this
            # replacement is not, so "X.PDF" keeps its name -- confirm what
            # file name sanitizeImage produces before changing it
            strInlineElementFileName = strInlineElementFileName.replace(".pdf", ".png")
        xmlInlineElement.set("src", strInlineElementSubDirName + strInlineElementFileName)
        logging.debug(f"{strInlineElementDirName} is dirname, {strInlineElementFileName} is filename/basepath")
        logging.debug(f"""copy from {PUBLICATION_DIR / strInlineElementDirName / strInlineElementFileName} to {OUTPUT_DIR / "images/embedded" / (strInlineElementDirName + strInlineElementFileName)}""")
        shutil.copy(
            PUBLICATION_DIR / strInlineElementDirName / strInlineElementFileName,
            OUTPUT_DIR / "images/embedded" / (strInlineElementSubDirName + strInlineElementFileName)
        )
        strNewImagePath = OUTPUT_DIR / "images/embedded" / (strInlineElementSubDirName + strInlineElementFileName)
        # strCommand = GM_PATH + " convert " + str(strNewImagePath) + " -resize 20x20 " + str(strNewImagePath)
        # listArguments = shlex.split(strCommand)
        # subprocess.check_output(listArguments, shell=False)
    # Change EOAcitenumeric into a span to create appropriate link
    xmlEOAcitenumerics = xmlEOAchapter.findall(".//EOAcitenumeric")
    for xmlEOAcitenumeric in xmlEOAcitenumerics:
        xmlEOAcitenumeric.tag = "span"
        xmlEOAcitenumeric.set("class", "citation")
        xmlEOAcitenumeric.set("rel", "popover")
    # Change EOAciteauthoryear into a span to create appropriate link
    xmlEOAciteauthoryears = xmlEOAchapter.findall(".//EOAciteauthoryear")
    for xmlEOAciteauthoryear in xmlEOAciteauthoryears:
        xmlEOAciteauthoryear.tag = "span"
        xmlEOAciteauthoryear.set("class", "citation")
        xmlEOAciteauthoryear.set("rel", "popover")
    # Change EOAciteyear into a span to create appropriate link
    xmlEOAciteyears = xmlEOAchapter.findall(".//EOAciteyear")
    for xmlEOAciteyear in xmlEOAciteyears:
        xmlEOAciteyear.tag = "span"
        xmlEOAciteyear.set("class", "citation")
        xmlEOAciteyear.set("rel", "popover")
    # Change EOAcitemanual into a span to create appropriate link
    xmlEOAcitemanuals = xmlEOAchapter.findall(".//EOAcitemanual")
    for xmlEOAcitemanual in xmlEOAcitemanuals:
        xmlEOAcitemanual.tag = "span"
        xmlEOAcitemanual.set("class", "citation")
        xmlEOAcitemanual.set("rel", "popover")
logging.info("----------------------------------------------")
logging.info("Processing Cross References")
libeoaconvert.debug_xml_here(
xmlDjangoTree,
"beforecrossreference",
DEBUG_DIR
)
failed_ids = []
# Substitute References with their targets (wit links)
for xmlEOAchapter in xmlEOAchapters:
# for hyperimage collages
originalcontents = xmlEOAchapter.findall(".//originalcontents")
if originalcontents is not None:
for originalcontent in originalcontents:
previous_element = originalcontent.getprevious()
if originalcontent.getparent().tag == "EOAref":
pass
elif previous_element.tag != "EOAref":
logging.error("Found a stray originalcontents element.")
else:
oc_tail = originalcontent.tail
originalcontent.tail = ""
previous_element.append(originalcontent)
if previous_element.tail is not None:
logging.warning("Appending the old tail of EOAref")
previous_element.tail += oc_tail
else:
previous_element.tail = oc_tail
else:
logging.info("No originalcontents elements found.")
xmlReferences = xmlEOAchapter.findall(".//EOAref")
for xmlReference in xmlReferences:
strResult = "!!! Cross Reference !!!"
strChapterOrder = ""
strObjectOrder = ""
ref_is_text = False
ref_is_collage = False
reference_type = xmlReference.get("type")
originalcontents = xmlReference.find("originalcontents")
xmlReferenceLabel = xmlReference.find("Label")
xmlReferenceLabelText = xmlReferenceLabel.text
xmlReferenceRef = xmlReference.find("ref")
xmlReferenceRefTarget = xmlReferenceRef.get("target")
if xmlReferenceLabelText in dictEquations:
# Grab Number from Dictionary
strResult = dictEquations[xmlReferenceLabelText]
# Go through all equations and find the corresponding Equation
xmlEOAequations = xmlEOAdocument.findall(".//EOAequation")
for xmlEOAequation in xmlEOAequations:
tmpReferenceLabelText = xmlEOAequation.get("label")
if xmlReferenceLabelText == tmpReferenceLabelText:
logging.debug("Successfully found link to array formula: %s" % strResult)
for xmlParent in xmlEOAequation.iterancestors():
if xmlParent.tag == "EOAchapter":
strChapterOrder = xmlParent.get("order")
strObjectOrder = xmlEOAequation.get("order")
elif xmlReferenceRefTarget in dictEquations:
# Grab Number from Dictionary
strResult = dictEquations[xmlReferenceRefTarget]
# Go through all equations and find the corresponding Equation
xmlEOAequations = xmlEOAdocument.findall(".//EOAequation")
for xmlEOAequation in xmlEOAequations:
tmpReferenceRefTarget = xmlEOAequation.get("uid")
if xmlReferenceRefTarget == tmpReferenceRefTarget:
logging.debug("Successfully found link to normal formula: %s" % strResult)
for xmlParent in xmlEOAequation.iterancestors():
if xmlParent.tag == "EOAchapter":
strChapterOrder = xmlParent.get("order")
strObjectOrder = xmlEOAequation.get("order")
elif xmlReferenceRefTarget in dictLists:
logging.debug("Found link to list.")
strResult = dictLists[xmlReferenceRefTarget]
xmlEOAlistitem = xmlEOAdocument.xpath("//EOAchapter/*[contains(@id, $targetuid)]", targetuid = xmlReferenceRefTarget)[0]
for xmlParent in xmlEOAlistitem.iterancestors():
if xmlParent.tag == "EOAchapter":
strChapterOrder = xmlParent.get("order")
strObjectOrder = xmlEOAlistitem.get("order")
elif xmlReferenceRefTarget in dictChapters:
logging.debug("Found link to chapter.")
strResult = dictChapters[xmlReferenceRefTarget]
xmlEOAchapter = xmlEOAdocument.xpath(f".//EOAchapter[@id='{xmlReferenceRefTarget}']")
if len(xmlEOAchapter) == 0:
logging.warning("There seems to be no corresponding id for %s." % xmlReferenceRefTarget)
# if uid is the one from the anchor after the head
# element, that anchor element has been removed by now
# and we need to find the corresponding element by
# string comparison in the dictionary
same_sr = [i for i in dictChapters if dictChapters[i] == strResult]
same_sr.remove(xmlReferenceRefTarget)
if len(same_sr) == 0:
logging.error("id cannot be found.")
elif len(same_sr) > 1:
logging.error("id is ambiguous.")
else:
logging.info(f"Using {same_sr[0]} instead.")
right_chapter = xmlEOAdocument.xpath(f".//EOAchapter[@id='{same_sr[0]}']")[0]
elif len(xmlEOAchapter) > 1:
logging.error("The xml:id %s is assigned more than once. This is not allowed. Exiting." % xmlReferenceLabelText)
sys.exit(2)
else:
right_chapter = xmlEOAchapter[0]
strChapterOrder = right_chapter.get("order")
strObjectOrder = "top"
elif xmlReferenceRefTarget in dictTheorems:
logging.debug("Found link to ein Theorem")
strResult = dictTheorems[xmlReferenceRefTarget]
for xmlEOAtheorem in xmlEOAdocument.findall(".//EOAtheorem"):
if xmlEOAtheorem.get("uid") == xmlReferenceRefTarget:
logging.debug("Successfully handled link to a theorem: %s " % strResult)
for xmlParent in xmlEOAtheorem.iterancestors():
if xmlParent.tag == "EOAchapter":
strObjectOrder = xmlEOAtheorem.get("order")
strChapterOrder = xmlParent.get("order")
elif xmlReferenceRefTarget in dictSections:
logging.debug("Found link to section")
strResult = dictSections[xmlReferenceRefTarget]
xmlEOAsection = xmlEOAdocument.xpath(f".//EOAsection[@id='{xmlReferenceRefTarget}']")
if len(xmlEOAsection) == 0:
logging.warning("There seems to be no corresponding id for %s." % xmlReferenceRefTarget)
# see explanation at dictChapters
same_sr = [i for i in dictSections if dictSections[i] == strResult]
same_sr.remove(xmlReferenceRefTarget)
if len(same_sr) == 0:
logging.error("id cannot be found.")
elif len(same_sr) > 1:
logging.error("id is ambiguous.")
else:
logging.info(f"Using {same_sr[0]} instead.")
right_section = xmlEOAdocument.xpath(f".//EOAsection[@id='{same_sr[0]}']")[0]
elif len(xmlEOAsection) > 1:
logging.error("The xml:id %s is assigned more than once. This is not allowed. Exiting." % xmlReferenceLabelText)
sys.exit(2)
else:
right_section = xmlEOAsection[0]
for xmlParent in right_section.iterancestors():
if xmlParent.tag == "EOAchapter":
strChapterOrder = xmlParent.get("order")
strObjectOrder = right_section.get("order")
xmlEOAsubsections = xmlEOAdocument.findall(".//EOAsubsection")
for xmlEOAsubsection in xmlEOAsubsections:
tmpReferenceRefTarget = xmlEOAsubsection.get("id")
if xmlReferenceRefTarget == tmpReferenceRefTarget:
logging.debug("Successfully handled link to subsection %s: " % strResult)
for xmlParent in xmlEOAsubsection.iterancestors():
if xmlParent.tag == "EOAchapter":
strChapterOrder = xmlParent.get("order")
strObjectOrder = xmlEOAsubsection.get("order")
elif xmlReferenceRefTarget in dictFigures:
logging.debug("Found link to figure")
strResult = dictFigures[xmlReferenceRefTarget]
xmlEOAfigures = xmlEOAdocument.findall(".//EOAfigure")
for xmlEOAfigure in xmlEOAfigures:
tmpReferenceRefTarget = xmlEOAfigure.get("id")
if xmlReferenceRefTarget == tmpReferenceRefTarget:
logging.debug("Successfully handled link to figure: %s" % strResult)
for xmlParent in xmlEOAfigure.iterancestors():
if xmlParent.tag == "EOAchapter":
strChapterOrder = xmlParent.get("order")
strObjectOrder = xmlEOAfigure.get("order")
elif xmlReferenceRefTarget in dictFootnotes:
logging.debug("Found link to footnote")
strResult = dictFootnotes[xmlReferenceRefTarget]
xmlEOAfootnotes = xmlEOAdocument.findall(".//EOAfootnote")
for xmlEOAfootnote in xmlEOAfootnotes:
tmpReferenceRefTarget = xmlEOAfootnote.get("id")
if xmlReferenceRefTarget == tmpReferenceRefTarget:
logging.debug("Successfully handled link to footnote: %s" % strResult)
for xmlParent in xmlEOAfootnote.iterancestors():
if xmlParent.tag == "EOAchapter":
strChapterOrder = xmlParent.get("order")
strObjectOrder = xmlEOAfootnote.get("order")
elif xmlReferenceRefTarget in dictTables:
logging.debug("Found link to table")
strResult = dictTables[xmlReferenceRefTarget]
xmlEOAtables = xmlEOAdocument.findall(".//EOAtable")
for xmlEOAtable in xmlEOAtables:
tmpReferenceRefTarget = xmlEOAtable.get("label")
if xmlReferenceLabelText == tmpReferenceRefTarget:
logging.debug("Successfully handled link to table: %s" % strResult)
for xmlParent in xmlEOAtable.iterancestors():
if xmlParent.tag == "EOAchapter":
strChapterOrder = xmlParent.get("order")
strObjectOrder = xmlEOAtable.get("order")
else:
logging.debug("Found this other reference")
if reference_type == "collage":
logging.info(f"{xmlReferenceRefTarget} is a collage")
ref_is_collage = True
elif reference_type == "text":
logging.debug(f"{xmlReferenceRefTarget} is a text link")
ref_is_text = True
xmlReferenceRef = xmlReference.find("ref")
xmlReferenceRefTarget = xmlReferenceRef.get("target")
xmlReferenceLabel = xmlReference.find("Label")
xmlReferenceLabelText = xmlReferenceLabel.text
pararef = xmlDjangoTree.xpath("//*[@id='%s']" % xmlReferenceRefTarget)
if len(pararef) == 0:
logging.warning("There seems to be no corresponding xml:id for %s." % xmlReferenceRefTarget)
failed_ids.append(f"{xmlReferenceRefTarget} ({xmlReferenceLabelText})\n")
elif len(pararef) > 1:
logging.error("The xml:id %s is assigned more than once. This is not allowed. Exiting." % xmlReferenceLabelText)
sys.exit(2)
else:
for xmlParent in pararef[0].iterancestors():
if xmlParent.tag == "EOAchapter":
strChapterOrder = xmlParent.get("order")
for xmlParent in pararef[0].iterancestors():
if xmlParent.tag == "EOAparagraph":
strObjectOrder = xmlParent.get("order")
all_children = list(xmlReference)
text_has_children = all_children[:-2]
if text_has_children:
reference_text = xmlReference.text
textref_innards = list()
for xml_child in text_has_children:
textref_innards.append(xml_child)
else:
reference_text = xmlReference.text.strip()
else:
guessref = xmlDjangoTree.xpath("//*[@id='%s']" % xmlReferenceRefTarget)
if len(guessref) == 0:
logging.warning("There seems to be no corresponding xml:id for %s." % xmlReferenceLabelText)
failed_ids.append(xmlReferenceLabelText + "\n")
elif len(guessref) > 1:
logging.error("The xml:id %s is assigned more than once. This is not allowed. Exiting." % xmlReferenceLabelText)
sys.exit(2)
else:
for xmlParent in guessref[0].iterancestors():
if xmlParent.tag == "EOAparagraph":
strObjectOrder = xmlParent.get("order")
id_container = xmlParent.xpath("preceding-sibling::EOAsection[1]")[0]
section_id = id_container.get("id")
strResult = dictSections[section_id]
if xmlParent.tag == "EOAchapter":
strChapterOrder = xmlParent.get("order")
tmpTail = xmlReference.tail or ""
xmlReference.clear()
if originalcontents is not None:
logging.info("Found originalcontents")
xmlReference.append(originalcontents)
elif ref_is_text:
xmlReference.text = reference_text
if text_has_children:
for item in reversed(textref_innards):
xmlReference.insert(0, item)
else:
xmlReference.text = strResult
xmlReference.tail = tmpTail
xmlReference.tag = "a"
# hyperimage
if xmlReferenceRef.get("data-hilayer"):
xmlReference.set("data-hilayer", xmlReferenceRef.get("data-hilayer"))
elif xmlReferenceRef.get("hitarget"):
xmlReference.set("class", "HILink")
href_string = "#" + xmlReferenceRef.get("hitarget")
if strObjectOrder:
href_string = "../" + strChapterOrder + "/index.html#" + strObjectOrder
else:
href_string = "strChapterOrder missing"
logging.warning("strObjectOrder is missing!")
xmlReference.set("href", href_string)
if ref_is_collage:
xmlReference.set("type", "collage")
else:
pass
logging.info("----------------------------------------------")
logging.info("Processing Page References")
# Resolve every EOApageref: its Label/ref children are replaced by the page
# label text, and references to a facsimile page are turned into links.

# The facsimile pages are not modified by the loops below, so one lookup
# is enough.
xmlEOAfacsimilepages = xmlEOAdocument.findall(".//EOAfacsimilepage")

for xmlEOAchapter in xmlEOAchapters:
    xmlPageReferences = xmlEOAchapter.findall(".//EOApageref")
    for xmlReference in xmlPageReferences:
        # Start from the placeholder for every single reference; otherwise
        # an unresolved label would silently inherit the page label that was
        # found for the previous reference.
        strResult = "!!! Page Reference !!!"
        xmlReferenceLabel = xmlReference.find("Label")
        xmlReferenceLabelText = xmlReferenceLabel.text
        xmlReferenceRef = xmlReference.find("ref")
        xmlReferenceRefTarget = xmlReferenceRef.get("target")
        if xmlReferenceLabelText in dictPagelabels:
            logging.debug("Found link to page: %s" % xmlReferenceLabelText)
            strResult = dictPagelabels[xmlReferenceLabelText]
        else:
            logging.warning("Page reference not fully implemented yet, see https://github.molgen.mpg.de/EditionOpenAccess/EOASkripts/issues/52")
        xmlReference.text = strResult
        # Iterate over a snapshot: the children are removed while looping.
        for xmlChild in list(xmlReference):
            xmlReference.remove(xmlChild)
        # Check, if EOApageref points to a Facsimile-Page
        # If yes, make a href to the facsimile
        for xmlEOAfacsimilepage in xmlEOAfacsimilepages:
            if xmlEOAfacsimilepage.get("label") == xmlReferenceLabelText:
                logging.debug("Found cross reference to facsimile.")
                xmlReference.tag = "a"
                strPartOrder = xmlEOAfacsimilepage.getparent().get("order")
                strFacsimileOrder = xmlEOAfacsimilepage.get("order")
                logging.debug(strFacsimileOrder)
                xmlReference.set("href", "../" + strPartOrder + "/" + strFacsimileOrder + ".html")
logging.info("----------------------------------------------")
logging.info("Normalizing Index Entries")
# Move the textual content of every index element into attributes.
# The text is split at "|" into the entry proper and an optional addition,
# the entry is split at "!" into main and secondary part, and each part may
# carry a second value after "@":
#   main / display, secondary / secondarydisplay, bold, see, seealso
for xmlEOAchapter in xmlEOAchapters:
    xml_EOA_indices = xmlEOAchapter.xpath(".//EOAindex | .//EOAindexperson | .//EOAindexlocation")
    for xmlEOAindex in xml_EOA_indices:
        # Using the gettext function here, because of subelements
        strEOAindextext = libeoaconvert.gettext(xmlEOAindex).replace("\n", " ")

        # The element is emptied; everything of interest ends up in attributes.
        for sub_element in list(xmlEOAindex):
            xmlEOAindex.remove(sub_element)
        xmlEOAindex.text = None

        listFirstPart = strEOAindextext.split("|")
        listSecondPart = listFirstPart[0].split("!")
        strMainEntry = listSecondPart[0]

        # Check if a second value is present via @
        listSortKey = strMainEntry.split("@")
        if len(listSortKey) == 2:
            xmlEOAindex.set("main", listSortKey[0])
            xmlEOAindex.set("display", listSortKey[1])
        else:
            xmlEOAindex.set("main", strMainEntry)

        if len(listSecondPart) > 1:
            strSecondPart = listSecondPart[1]
            listSecondarySortkey = strSecondPart.split("@")
            if len(listSecondarySortkey) == 2:
                xmlEOAindex.set("secondary", listSecondarySortkey[0])
                xmlEOAindex.set("secondarydisplay", listSecondarySortkey[1])
            else:
                xmlEOAindex.set("secondary", strSecondPart)

        if len(listFirstPart) > 1:
            strAddition = listFirstPart[1]
            if strAddition == "textbf":
                xmlEOAindex.set("bold", "true")
            # Only the leading keyword is cut off. Removing every occurrence
            # of "see" would mangle targets that contain the word themselves.
            if strAddition.startswith("seealso"):
                xmlEOAindex.set("seealso", strAddition[len("seealso"):])
                # Entries containing seealso are omitted for the time being
                xmlEOAindex.tag = "temp"
            elif strAddition.startswith("see"):
                xmlEOAindex.set("see", strAddition[len("see"):])
                # Entries containing see are omitted for the time being
                xmlEOAindex.tag = "temp"

        # Figure out parent chapter number and parent Element order.
        # Every ancestor carrying an "order" attribute overwrites the value
        # set by a nearer one.
        for xmlParent in xmlEOAindex.iterancestors():
            strParentOrder = xmlParent.get("order")
            if strParentOrder is None:
                continue
            if xmlParent.tag == "EOAchapter":
                xmlEOAindex.set("chapterorder", strParentOrder)
            else:
                xmlEOAindex.set("elementorder", strParentOrder)

# Drop the tags of the entries that were marked as "temp" above.
etree.strip_tags(xmlDjangoTree, "temp")
logging.info("----------------------------------------------")
logging.info("Removing Duplicate Index Entries")
# Within each direct child of a chapter, mark repeated index entries as
# "temp". An entry counts as repeated when its main key has been seen before
# in the same child and it either has no secondary key or its secondary key
# has been seen for that main key as well.
for xmlEOAchapter in xmlEOAchapters:
    for xmlChild in xmlEOAchapter.iterchildren():
        # main key -> list of secondary keys seen so far in this child
        dictEntries = {}
        xml_EOA_indices = xmlChild.xpath(".//EOAindex | .//EOAindexperson | .//EOAindexlocation")
        for xmlEOAindex in xml_EOA_indices:
            strEntry = xmlEOAindex.get("main")
            strSubentry = xmlEOAindex.get("secondary")
            if strEntry not in dictEntries:
                # Remember the sub-entry of the first occurrence as well,
                # otherwise its first duplicate would go unnoticed.
                dictEntries[strEntry] = [] if strSubentry is None else [strSubentry]
                continue
            if strSubentry is None or strSubentry in dictEntries[strEntry]:
                # NOTE(review): this inspects the attributes of the chapter
                # child, not of the index element itself - confirm that this
                # is intended.
                if (xmlChild.get("see") is None) and (xmlChild.get("seealso") is None):
                    xmlEOAindex.tag = "temp"
            else:
                dictEntries[strEntry].append(strSubentry)
logging.info("----------------------------------------------")
logging.info("Creating paragraph links")
# Every EOAparagraph with a corresp attribute receives an href pointing to
# the paragraph whose xml:id is named there (the attribute value without its
# first character).
paragraphs_with_corresp = xmlDjangoTree.xpath("//EOAparagraph[@corresp]")
for pc in paragraphs_with_corresp:
    # get order of target and the chapter order to create the
    # hyperlink pick this up on publicationimport and extend the model
    # for a field, probably containing the html snippet for the URL
    corresponding_attribute = pc.get("corresp")[1:]
    # Hand the id over as an XPath variable so that quotes inside the value
    # cannot break the expression.
    corresponding_paragraph = xmlDjangoTree.xpath(
        "//EOAparagraph[@xml:id=$target_id]",
        target_id=corresponding_attribute,
    )
    if len(corresponding_paragraph) == 0:
        logging.error("There is no corresponding xml:id for %s. Exiting." % corresponding_attribute)
        sys.exit(1)
    if len(corresponding_paragraph) > 1:
        logging.error("The xml:id %s has been assigned more than once. This is not allowed. Exiting." % corresponding_attribute)
        sys.exit(1)

    eoa_id_element = corresponding_paragraph[0]
    paragraph_order = eoa_id_element.get("order")
    # Reset for every paragraph, so that a target outside of any chapter
    # cannot pick up the chapter of the previous one.
    chapter_order = None
    for xml_parent in eoa_id_element.iterancestors():
        if xml_parent.tag == "EOAchapter":
            chapter_order = xml_parent.get("order")
    if chapter_order is None:
        logging.warning("No chapter found for xml:id %s, not creating a paragraph link." % corresponding_attribute)
        continue
    href_text = f"../{chapter_order}/index.html#{paragraph_order}"
    pc.set("href", href_text)

# The xml:id values are still needed as link targets while the loop above
# runs, so the helper attributes are removed in a second pass.
for pc in paragraphs_with_corresp:
    etree.strip_attributes(pc, "corresp", "{http://www.w3.org/XML/1998/namespace}id")
logging.info("----------------------------------------------")
logging.info("Sorting and Creating Regular Index")
# Relative path: an absolute "//..." path makes lxml emit a FutureWarning and
# is rewritten to ".//..." internally anyway.
xml_regular_EOAindices = xmlDjangoTree.findall(".//EOAindex")
# Stays None when the publication has no regular index entries.
xml_eoa_print_regular_index = None
if xml_regular_EOAindices:
    logging.debug("Sorting %s entries for regular index." % str(len(xml_regular_EOAindices)))
    xml_eoa_print_regular_index = make_index(xml_regular_EOAindices, index_type="regular")

libeoaconvert.debug_xml_here(
    xmlDjangoTree,
    "djangotree",
    DEBUG_DIR
)
libeoaconvert.debug_xml_here(
    xmlEOAdocument,
    "xmleoadocument",
    DEBUG_DIR
)
libeoaconvert.debug_xml_here(
    xmlTree,
    "xmltree",
    DEBUG_DIR
)

# If EOAprintindex is found, append xml_eoa_print_regular_index to xmlEOAdocument
xmlPrintindex = xmlTree.find(".//EOAprintindex")
# Attach the index only when one was actually built; a print marker without
# any entries would otherwise refer to an index that does not exist.
if xmlPrintindex is not None and xml_eoa_print_regular_index is not None:
    logging.info("found an index")
    # Mark <p><EOAprintindex/></p> as "temp"; tags of that name are stripped
    # from xmlDjangoTree during the final clean-up.
    xmlPrintindex.tag = "temp"
    xmlPrintindex.getparent().tag = "temp"
    xmlEOAdocument.append(xml_eoa_print_regular_index)
else:
    logging.info("found no index")
logging.info("----------------------------------------------")
logging.info("Sorting and Creating Person Index")
# Relative path: an absolute "//..." path makes lxml emit a FutureWarning and
# is rewritten to ".//..." internally anyway.
xml_person_EOAindices = xmlDjangoTree.findall(".//EOAindexperson")
# Stays None when the publication has no person index entries.
xml_eoa_print_person_index = None
if xml_person_EOAindices:
    xml_eoa_print_person_index = make_index(xml_person_EOAindices, index_type="person")

# If EOAprintpersonindex is found, append xml_eoa_print_person_index to xmlEOAdocument
xmlPrintindex = xmlTree.find(".//EOAprintpersonindex")
# Attach the index only when one was actually built; a print marker without
# any entries would otherwise refer to an index that does not exist.
if xmlPrintindex is not None and xml_eoa_print_person_index is not None:
    # Mark <p><EOAprintpersonindex/></p> as "temp"; tags of that name are
    # stripped from xmlDjangoTree during the final clean-up.
    xmlPrintindex.tag = "temp"
    xmlPrintindex.getparent().tag = "temp"
    xmlEOAdocument.append(xml_eoa_print_person_index)
# Build the location index and attach it where the print marker asks for it.
logging.info("----------------------------------------------")
logging.info("Sorting and Creating Location Index")
# Relative path: an absolute "//..." path makes lxml emit a FutureWarning and
# is rewritten to ".//..." internally anyway.
xml_location_EOAindices = xmlDjangoTree.findall(".//EOAindexlocation")
# Stays None when the publication has no location index entries.
xml_eoa_print_location_index = None
if xml_location_EOAindices:
    xml_eoa_print_location_index = make_index(xml_location_EOAindices, index_type="location")

# If EOAprintlocationindex is found, append xml_eoa_print_location_index to xmlEOAdocument
xmlPrintindex = xmlTree.find(".//EOAprintlocationindex")
# Attach the index only when one was actually built; a print marker without
# any entries would otherwise refer to an index that does not exist.
if xmlPrintindex is not None and xml_eoa_print_location_index is not None:
    # Mark <p><EOAprintlocationindex/></p> as "temp"; tags of that name are
    # stripped from xmlDjangoTree during the final clean-up.
    xmlPrintindex.tag = "temp"
    xmlPrintindex.getparent().tag = "temp"
    xmlEOAdocument.append(xml_eoa_print_location_index)
############################################################################
#                               Cleaning up                                #
############################################################################
# TODO: delete the unnecessary attributes such as id
# TODO: delete the unnecessary tags such as EOAlabel

# Anchors nested in the original contents of a collage link only lose
# their tag.
for collage_anchor in xmlDjangoTree.xpath(".//a[@type='collage']/originalcontents/a"):
    collage_anchor.tag = "temp"

# anchor and b elements without any child node are marked for stripping.
for empty_element in xmlDjangoTree.xpath(".//anchor[not(node())] | .//b[not(node())]"):
    empty_element.tag = "tagtobestripped"

# Tags that are dropped while their content is kept.
tags_to_unwrap = (
    "temp",
    "citetext",
    "EOAprintbibliography",
    "originalcontents",
    "tagtobestripped",
)
# Elements that are dropped together with their content; the tail text is
# kept (with_tail=False).
elements_to_drop = ("citekey", "elementtoberemoved")
# Helper attributes that are removed from the whole tree.
attributes_to_drop = (
    "id-text",
    "id",
    "noindent",
    "type",
    "label",
    "spacebefore",
)  # , "rend")

etree.strip_tags(xmlDjangoTree, *tags_to_unwrap)
etree.strip_elements(xmlDjangoTree, *elements_to_drop, with_tail=False)
etree.strip_attributes(xmlDjangoTree, *attributes_to_drop)
############################################################################
#                            Save xmlDjangoTree                            #
############################################################################
# The tree is serialised to a str without an XML declaration, so the file is
# written as UTF-8 explicitly instead of relying on the locale's default
# encoding. The context manager closes the file even if writing fails.
tmpResult = etree.tostring(xmlDjangoTree, pretty_print=True, encoding="unicode")
with open(OUTPUT_DIR / "Django.xml", "w", encoding="utf-8") as tmpFile:
    tmpFile.write(tmpResult)
logging.debug(f"Wrote {OUTPUT_DIR}/Django.xml.")

# Report the ids that could not be resolved: first in order of appearance,
# then sorted and without duplicates. The collected entries already end in a
# newline.
if failed_ids:
    cleaned_failures = sorted(set(failed_ids))
    with open(OUTPUT_DIR / "debug" / "failed_ids.txt", "w", encoding="utf-8") as tmpFile:
        tmpFile.write(f"Missing IDs by appearance ({len(failed_ids)} in total):\n")
        tmpFile.writelines(failed_ids)
        tmpFile.write(f"\nMissing IDs sorted and uniqued ({len(cleaned_failures)} in total):\n")
        tmpFile.writelines(cleaned_failures)
    logging.debug(f"Some ids could not be referenced. Check {OUTPUT_DIR}/debug/failed_ids.txt.")
# Run the check of publication.cfg only when it was requested on the
# command line (-p / --checkpublicationcfg).
if args.checkpublicationcfg:
    publication_cfg_path = INPUT_DIR / "publication.cfg"
    check_publication_cfg(publication_cfg_path)