Skip to content

Commit

Permalink
Merge branch 'master' of github.molgen.mpg.de:EditionOpenAccess/EOASk…
Browse files Browse the repository at this point in the history
…ripts
  • Loading branch information
EsGeh authored and EsGeh committed Nov 7, 2019
2 parents 01cfd75 + dbc6cfc commit 868cc6e
Show file tree
Hide file tree
Showing 6 changed files with 133 additions and 40 deletions.
2 changes: 1 addition & 1 deletion data/aux/bibliography4ht.tex
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@
\printfield{volume}%
\iffieldundef{number}
{}
{\mkbibparens{\printfield{number}}}%
{\printfield[parens]{number}}
\setunit{\addcomma\space}%
\printfield{eid}%
\setunit{\addspace}%
Expand Down
11 changes: 11 additions & 0 deletions doc/datapickle.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,17 @@

The file `data.pickle` is created during a run of `eoatex2imxml.py` or `fix_tei.py` and primarily assigns numbers to elements. For example, the thirteenth figure in the first (numbered) chapter, which carries the id `uid17`, is assigned the human-readable reference `1.13`.

The original list of stored items is:
- chapterdict
- figdict
- eqdict
- fndict
- listdict
- pagelabeldict
- secdict
- tabdict
- theoremdict

## eoatex2imxml.py
In the classic variant, the file contains these fields:

Expand Down
32 changes: 12 additions & 20 deletions idassigner.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,29 +46,21 @@ def assign_ids(chapter_tree, elements):
sections = chapter_tree.xpath("//t:div[@type='section']", namespaces=NS_MAP)
section_id_counter = 1
for section in sections:
if section.get("n") == "nonumber":
logging.info("Leaving out unnumbered section.")
pass
else:
section_id = "{}_sec{:02d}".format(chapter_id, section_id_counter)
libeoaconvert.assign_xml_id(section, section_id)
section_id_counter += 1
section_id = "{}_sec{:02d}".format(chapter_id, section_id_counter)
libeoaconvert.assign_xml_id(section, section_id)
section_id_counter += 1

if "sections" in elements:
subsections = chapter_tree.xpath("//t:div[@type='subsection']", namespaces=NS_MAP)
subsection_id_counter = 1
for subsection in subsections:
if subsection.get("n") == "nonumber":
logging.info("Leaving out unnumbered subsection.")
pass
else:
section_element = subsection.getparent()
section_id = section_element.attrib["{http://www.w3.org/XML/1998/namespace}id"]
logging.debug("Found a subsection in section %s", section_id)
rest, section_number = section_id.split("_sec")
subsection_id = "{}_subsec{}-{:02d}".format(chapter_id, section_number, subsection_id_counter)
libeoaconvert.assign_xml_id(subsection, subsection_id)
subsection_id_counter += 1
section_element = subsection.getparent()
section_id = section_element.attrib["{http://www.w3.org/XML/1998/namespace}id"]
logging.debug("Found a subsection in section %s", section_id)
rest, section_number = section_id.split("_sec")
subsection_id = "{}_subsec{}-{:02d}".format(chapter_id, section_number, subsection_id_counter)
libeoaconvert.assign_xml_id(subsection, subsection_id)
subsection_id_counter += 1

if "figures" in elements:
figures = chapter_tree.xpath("//t:figure", namespaces=NS_MAP)
Expand Down Expand Up @@ -119,13 +111,13 @@ def main():
print(selected_chapters)
chapters = []
for xml_chapter in selected_chapters:
chapter = xml_tree.xpath(f"//t:div[@xml:id='{xml_chapter}' and not(@n='nonumber')]", namespaces=NS_MAP)[0]
chapter = xml_tree.xpath(f"//t:div[@xml:id='{xml_chapter}'", namespaces=NS_MAP)[0]
copied_chapter = deepcopy(chapter)
assign_ids(copied_chapter, elements=list_of_elements)
chapter.addprevious(copied_chapter)
chapter.tag = "elementtobestripped"
else:
chapters = xml_tree.xpath("//t:div[@type='chapter' and not(@n='nonumber')]", namespaces=NS_MAP)
chapters = xml_tree.xpath("//t:div[@type='chapter']", namespaces=NS_MAP)
logging.debug("Found %s chapters.", len(chapters))
# in this iteration, a copy is made of each chapter and fitted
# with ids, the original chapter is being discarded
Expand Down
17 changes: 10 additions & 7 deletions imxml2django.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python3
# -*- coding: utf-8; mode: python -*-
# Time-stamp: <2019-07-31 14:46:18 (kthoden)>
# Time-stamp: <2019-08-06 15:05:17 (kthoden)>

"""
Create an XML file that can be inserted into the Django database
Expand Down Expand Up @@ -416,7 +416,7 @@ def djangoParseObject(xmlElement, indent=False, listtype=None, listnumber=0, uid
else:
xmlEOAfigure.set("file", strImageFileDir + strImageFileName)

if figure_type == "hionly" or figure_type == "hionlycollage":
if figure_type == "hionly":# or figure_type == "hionlycollage":
logging.debug(f"Found hyperimage figure ({figure_type}), no need for caption and size information.")
pass
else:
Expand All @@ -426,12 +426,15 @@ def djangoParseObject(xmlElement, indent=False, listtype=None, listnumber=0, uid
xmlResult.append(xmlEOAfigure)
intObjectNumber += 1
# Insert visual Number and uid
strFigureNumber = dictFigures[xmlElement.find(".//anchor").get("id")]
xmlEOAfigure.set("number", strFigureNumber)
strFigureUID = xmlElement.find(".//anchor").get("id")
xmlEOAfigure.set("id", strFigureUID)
if figure_type == "hionlycollage" or figure_type == "hionlysub":
pass
else:
strFigureNumber = dictFigures[xmlElement.find(".//anchor").get("id")]
xmlEOAfigure.set("number", strFigureNumber)
strFigureUID = xmlElement.find(".//anchor").get("id")
xmlEOAfigure.set("id", strFigureUID)

hi_figure_types = ["hitrue", "hionly", "hionlycollage"]
hi_figure_types = ["hitrue", "hionly", "hionlycollage"]#, "hionlysub"]

if figure_type in hi_figure_types:
xmlEOAfigure.set("hielement", xmlElement.get("hielement"))
Expand Down
87 changes: 85 additions & 2 deletions imxml2tei.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,93 @@
"""

import argparse
import sys
import configparser
from pathlib import Path
from lxml import etree
import utils.libeoaconvert as libeoaconvert

# Directory that contains this script; used below to locate the default
# config file.
BASE_DIR = Path( __file__ ).resolve().parent
# Path of this script as invoked (not resolved).
SCRIPT_PATH = Path( __file__ )
# File name of this script without suffix; used further down to name the
# log file.
SCRIPT_NAME = SCRIPT_PATH.stem

#####################
# Parsing arguments #
#####################

# ArgumentDefaultsHelpFormatter appends each option's default value to its
# help text in the --help output.
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
parser.add_argument(
"-c", "--config",
default = BASE_DIR / "config" / "eoaconvert.cfg",
help="Name of config file"
)
# NOTE(review): the help text says "logfile", but the value is used as a
# directory: the log file path is built further down as
# <log-dir>/<SCRIPT_NAME>.log.
parser.add_argument(
"-l", "--log-dir",
default = Path("output/logs"),
# default = Path("logs", SCRIPT_NAME).with_suffix(".log"),
help="logfile"
)
# NOTE(review): the value is not validated here (no `choices`); it is passed
# on to load_config unchanged.
parser.add_argument(
"--log-level",
default = "INFO",
help="log level: choose between DEBUG, INFO, WARNING, ERROR, CRITICAL"
)
# NOTE(review): the help text says "without suffix", yet the default carries
# ".xml". The output file name is derived with with_suffix(".xml") further
# down, which replaces an existing suffix - confirm which form is intended.
parser.add_argument(
"-f", "--filename",
default = "IntermediateXMLFile.xml",
help="Name of intermediate XML file (without suffix!)."
)
parser.add_argument(
"-o", "--output-dir",
default = "./output/tei",
help="where to dump all output files"
)
parser.add_argument(
"-i", "--input-dir",
default = "./output/imxml",
help="location of intermediate XML file"
)

# Parsed at module level, i.e. as soon as this file is executed or imported.
args = parser.parse_args()

# Path of the configuration file as given on the command line (or its default).
CONFIG_FILE = args.config

print("The configfile is %s." % CONFIG_FILE)

# NOTE(review): load_config is not imported in the import block visible above
# (argparse, sys, configparser, pathlib.Path, lxml.etree, utils.libeoaconvert).
# Confirm where it comes from; otherwise this call raises NameError.
# It receives the config path, the log level and the log file path
# <log-dir>/<SCRIPT_NAME>.log; presumably it also sets up logging - verify.
CONFIG = load_config(
CONFIG_FILE,
args.log_level,
(Path(args.log_dir) / SCRIPT_NAME) . with_suffix( ".log" ),
# args.log_file,
)

############################
# Paths:
############################
# Command line values wrapped as Path objects.
INPUT_DIR = Path( args.input_dir )
INPUT_PATH = Path( args.filename )
OUTPUT_DIR = Path( args.output_dir )
LOG_DIR = Path( args.log_dir )

# Sub-directories of the output directory.
TEMP_DIR = OUTPUT_DIR / "tmp_files"
DEBUG_DIR = OUTPUT_DIR / "debug"

# where to output the xml file:
# Only the file name part of --filename is used; its suffix is replaced by
# ".xml". With the defaults this is output/tei/IntermediateXMLFile.xml.
XML_FILE = (OUTPUT_DIR / INPUT_PATH.name) .with_suffix( ".xml" )

##################################
# Setting up various directories #
##################################

# Create the output directories if they do not exist yet.
# pathlib is used instead of os.path/os.mkdir because `os` is not imported in
# this file's visible import block (which would raise NameError here).
# parents=True also creates a missing parent such as "./output", which a
# plain mkdir would refuse to do; exist_ok=True keeps reruns harmless.
for directory in (OUTPUT_DIR, TEMP_DIR, DEBUG_DIR):
    directory.mkdir(parents=True, exist_ok=True)

# citations need a little more work: especially citedRange
# so do landscape figures, no way to distinguish them!
Expand Down Expand Up @@ -417,10 +501,9 @@ def main():
back_part = etree.SubElement(tei_body, "back")
tei_body.insert(1, tei_body_xml.getroot())

outfile = 'CONVERT/TEI.xml'
output_string = etree.tostring(tei_root, xml_declaration=True, pretty_print=True, encoding="UTF-8", doctype= '<?xml-model href="eoa_tei.rnc" type="application/relax-ng-compact-syntax"?>\n<?xml-stylesheet type="text/css" href="tei.css" ?>')

with open(outfile, 'w') as output_file:
with open(XML_FILE, 'w') as output_file:
output_file.write(output_string.decode("utf-8"))
# def main ends here

Expand Down
24 changes: 14 additions & 10 deletions tei2imxml.py
Original file line number Diff line number Diff line change
Expand Up @@ -449,10 +449,11 @@ def hi_lookup_code(nd, hitrue_xml_id):
def get_hitarget(nd, teitarget):
    """Find out corresponding hyperimage id for hyperimage link.

    Args:
        nd: mapping from a TEI target to a mapping that holds the
            hyperimage id under the key ``"hiid"``.
        teitarget: key to look up in ``nd``.

    Returns:
        The value stored at ``nd[teitarget]["hiid"]``.

    If the lookup raises ``KeyError``, an error is logged and the program
    exits with status 1.
    """

    # EAFP: a missing key raises KeyError directly, so a truthiness test on
    # nd[teitarget] could never reach an error branch for unknown targets.
    try:
        hi_target = nd[teitarget]["hiid"]
    except KeyError:
        logging.error("Could not find hi code %s. Exiting", teitarget)
        sys.exit(1)

    return hi_target
# def get_hitarget ends here
Expand Down Expand Up @@ -786,7 +787,7 @@ def handle_refs_default(ref):
figure.set("id", "anotheruid")

# the anchor element is used to determine whether a figure gets an id and can be numbered
if figure_type == "hionlycollage":
if figure_type == "hionlycollage" or figure_type == "hionlysub":
logging.debug("No anchor element for collages.")

else:
Expand Down Expand Up @@ -900,16 +901,19 @@ def handle_refs_default(ref):
#########
eoa_lists = xml_tree.xpath("//t:body//t:list", namespaces=NS_MAP)
for eoalist in eoa_lists:
items = eoalist.findall("t:item", namespaces=NS_MAP)
for listitem in items:
listitem.tag = "p"
libeoaconvert.wrap_into_element(etree.Element("item"), listitem)
if eoalist.get("type") == "ordered":
pass
for listitem in items:
new_item_element = listitem.getparent()
new_item_element.set("id-text", f"{str(items.index(listitem) + 1)}")
new_item_element.set("label", f"{str(items.index(listitem) + 1)}.")
if eoalist.get("type") == "unordered":
pass
eoalist.set("type", "simple")
if eoalist.get("type") == "gloss":
eoalist.set("type", "description")
items = eoalist.findall("t:item", namespaces=NS_MAP)
for listitem in items:
listitem.tag = "p"
libeoaconvert.wrap_into_element(etree.Element("item"), listitem)

##############
# References #
Expand Down

0 comments on commit 868cc6e

Please sign in to comment.