From 4369492710f0c7c0fb931182a5aed49ed3f9f16b Mon Sep 17 00:00:00 2001 From: kthoden Date: Fri, 2 Aug 2019 16:26:33 +0200 Subject: [PATCH 1/9] Exception handling --- tei2imxml.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tei2imxml.py b/tei2imxml.py index 778ccb5..42783c8 100755 --- a/tei2imxml.py +++ b/tei2imxml.py @@ -449,10 +449,11 @@ def hi_lookup_code(nd, hitrue_xml_id): def get_hitarget(nd, teitarget): """Find out corresponding hyperimage id for hyperimage link""" - if nd[teitarget]: + try: hi_target = nd[teitarget]["hiid"] - else: - logging.error("Could not find hi code %s", teitarget) + except KeyError: + logging.error("Could not find hi code %s. Exiting", teitarget) + sys.exit(1) return hi_target # def get_hitarget ends here From 17d29ad181d4177bf7a4120a876114c703ce1203 Mon Sep 17 00:00:00 2001 From: kthoden Date: Fri, 2 Aug 2019 16:26:56 +0200 Subject: [PATCH 2/9] collage handling --- imxml2django.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/imxml2django.py b/imxml2django.py index 0b82359..29258b7 100755 --- a/imxml2django.py +++ b/imxml2django.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # -*- coding: utf-8; mode: python -*- -# Time-stamp: <2019-07-31 14:46:18 (kthoden)> +# Time-stamp: <2019-08-02 16:25:56 (kthoden)> """ Create an XML file that can be inserted into the Django database @@ -416,7 +416,7 @@ def djangoParseObject(xmlElement, indent=False, listtype=None, listnumber=0, uid else: xmlEOAfigure.set("file", strImageFileDir + strImageFileName) - if figure_type == "hionly" or figure_type == "hionlycollage": + if figure_type == "hionly":# or figure_type == "hionlycollage": logging.debug(f"Found hyperimage figure ({figure_type}), no need for caption and size information.") pass else: @@ -426,10 +426,11 @@ def djangoParseObject(xmlElement, indent=False, listtype=None, listnumber=0, uid xmlResult.append(xmlEOAfigure) intObjectNumber += 1 # Insert visual Number and uid - strFigureNumber = dictFigures[xmlElement.find(".//anchor").get("id")] - xmlEOAfigure.set("number", strFigureNumber) - strFigureUID = xmlElement.find(".//anchor").get("id") - xmlEOAfigure.set("id", strFigureUID) + if figure_type != "hionlycollage": + strFigureNumber = dictFigures[xmlElement.find(".//anchor").get("id")] + xmlEOAfigure.set("number", strFigureNumber) + strFigureUID = xmlElement.find(".//anchor").get("id") + xmlEOAfigure.set("id", strFigureUID) hi_figure_types = ["hitrue", "hionly", "hionlycollage"] From c28dc8d183db81cb8175c2acbb8cd1439d52d0cf Mon Sep 17 00:00:00 2001 From: kthoden Date: Mon, 2 Sep 2019 16:28:50 +0200 Subject: [PATCH 3/9] Additional information --- doc/datapickle.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/doc/datapickle.md b/doc/datapickle.md index d03ed0a..119c644 100644 --- a/doc/datapickle.md +++ b/doc/datapickle.md @@ -2,6 +2,17 @@ The file data.pickle is created during a run of `eoatex2imxml.py` or `fix_tei.py` and primarily assigns numbers to elements. For example, the thirteenth figure in the first (numbered) chapter, that carries the id `uid17` is assigned the human readable reference `1.13`. +The original list of stored items is +- chapterdict +- figdict +- eqdict +- fndict +- listdict +- pagelabeldict +- secdict +- tabdict +- theoremdict + ## eoatex2imxml.py In the classic variant, the file contains these fields: From def54087c9bfab222f2648bb3ffcf6f2df888752 Mon Sep 17 00:00:00 2001 From: kthoden Date: Wed, 18 Sep 2019 10:46:05 +0200 Subject: [PATCH 4/9] Corrected rendering thanks to Martin Sievers --- data/aux/bibliography4ht.tex | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/aux/bibliography4ht.tex b/data/aux/bibliography4ht.tex index 5ba07b3..91f7607 100644 --- a/data/aux/bibliography4ht.tex +++ b/data/aux/bibliography4ht.tex @@ -42,7 +42,7 @@ \printfield{volume}% \iffieldundef{number} {} - {\mkbibparens{\printfield{number}}}% + {\printfield[parens]{number}} \setunit{\addcomma\space}% \printfield{eid}% \setunit{\addspace}% From e28da2c6edd94861c988cd1d778108b0678815bb Mon Sep 17 00:00:00 2001 From: kthoden Date: Wed, 18 Sep 2019 10:47:13 +0200 Subject: [PATCH 5/9] Lists --- tei2imxml.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/tei2imxml.py b/tei2imxml.py index 42783c8..72e9eef 100755 --- a/tei2imxml.py +++ b/tei2imxml.py @@ -901,16 +901,19 @@ def handle_refs_default(ref): ######### eoa_lists = xml_tree.xpath("//t:body//t:list", namespaces=NS_MAP) for eoalist in eoa_lists: + items = eoalist.findall("t:item", namespaces=NS_MAP) + for listitem in items: + listitem.tag = "p" + libeoaconvert.wrap_into_element(etree.Element("item"), listitem) if eoalist.get("type") == "ordered": - pass + for listitem in items: + new_item_element = listitem.getparent() + new_item_element.set("id-text", f"{str(items.index(listitem) + 1)}") + new_item_element.set("label", f"{str(items.index(listitem) + 1)}.") if eoalist.get("type") == "unordered": - pass + eoalist.set("type", "simple") if eoalist.get("type") == "gloss": eoalist.set("type", "description") - items = eoalist.findall("t:item", namespaces=NS_MAP) - for listitem in items: - listitem.tag = "p" - libeoaconvert.wrap_into_element(etree.Element("item"), listitem) ############## # References # From 3ca1343596920bcef199d8ee1ecbe022fce5c716 Mon Sep 17 00:00:00 2001 From: kthoden Date: Wed, 18 Sep 2019 10:47:26 +0200 Subject: [PATCH 6/9] Hyperimage exceptions --- tei2imxml.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tei2imxml.py b/tei2imxml.py index 72e9eef..c1b6334 100755 --- a/tei2imxml.py +++ b/tei2imxml.py @@ -787,7 +787,7 @@ def handle_refs_default(ref): figure.set("id", "anotheruid") # the anchor element is used to determine whether a figure gets an id and can be numbered - if figure_type == "hionlycollage": + if figure_type == "hionlycollage" or figure_type == "hionlysub": logging.debug("No anchor element for collages.") else: From 51813866055dcc17f14fa587935172a8fb394e5e Mon Sep 17 00:00:00 2001 From: kthoden Date: Wed, 18 Sep 2019 10:47:58 +0200 Subject: [PATCH 7/9] Some more Hyperimage rules --- imxml2django.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/imxml2django.py b/imxml2django.py index 29258b7..84e3323 100755 --- a/imxml2django.py +++ b/imxml2django.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # -*- coding: utf-8; mode: python -*- -# Time-stamp: <2019-08-02 16:25:56 (kthoden)> +# Time-stamp: <2019-08-06 15:05:17 (kthoden)> """ Create an XML file that can be inserted into the Django database @@ -426,13 +426,15 @@ def djangoParseObject(xmlElement, indent=False, listtype=None, listnumber=0, uid xmlResult.append(xmlEOAfigure) intObjectNumber += 1 # Insert visual Number and uid - if figure_type != "hionlycollage": + if figure_type == "hionlycollage" or figure_type == "hionlysub": + pass + else: strFigureNumber = dictFigures[xmlElement.find(".//anchor").get("id")] xmlEOAfigure.set("number", strFigureNumber) strFigureUID = xmlElement.find(".//anchor").get("id") xmlEOAfigure.set("id", strFigureUID) - hi_figure_types = ["hitrue", "hionly", "hionlycollage"] + hi_figure_types = ["hitrue", "hionly", "hionlycollage"]#, "hionlysub"] if figure_type in hi_figure_types: xmlEOAfigure.set("hielement", xmlElement.get("hielement")) From fb6af94f52ba7f45a0718fd13213713ed760b38b Mon Sep 17 00:00:00 2001 From: kthoden Date: Wed, 6 Nov 2019 15:28:16 +0100 Subject: [PATCH 8/9] Inserting common functionalities --- imxml2tei.py | 87 ++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 85 insertions(+), 2 deletions(-) diff --git a/imxml2tei.py b/imxml2tei.py index c029395..b98aed8 100755 --- a/imxml2tei.py +++ b/imxml2tei.py @@ -7,9 +7,93 @@ """ +import argparse import sys import configparser +from pathlib import Path from lxml import etree +import utils.libeoaconvert as libeoaconvert + +BASE_DIR = Path( __file__ ).resolve().parent +SCRIPT_PATH = Path( __file__ ) +SCRIPT_NAME = SCRIPT_PATH.stem + +##################### +# Parsing arguments # +##################### + +parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter +) +parser.add_argument( + "-c", "--config", + default = BASE_DIR / "config" / "eoaconvert.cfg", + help="Name of config file" +) +parser.add_argument( + "-l", "--log-dir", + default = Path("output/logs"), + # default = Path("logs", SCRIPT_NAME).with_suffix(".log"), + help="logfile" +) +parser.add_argument( + "--log-level", + default = "INFO", + help="log level: choose between DEBUG, INFO, WARNING, ERROR, CRITICAL" +) +parser.add_argument( + "-f", "--filename", + default = "IntermediateXMLFile.xml", + help="Name of intermediate XML file (without suffix!)." +) +parser.add_argument( + "-o", "--output-dir", + default = "./output/tei", + help="where to dump all output files" +) +parser.add_argument( + "-i", "--input-dir", + default = "./output/imxml", + help="location of intermediate XML file" +) + +args = parser.parse_args() + +CONFIG_FILE = args.config + +print("The configfile is %s." % CONFIG_FILE) + +CONFIG = load_config( + CONFIG_FILE, + args.log_level, + (Path(args.log_dir) / SCRIPT_NAME) . with_suffix( ".log" ), + # args.log_file, +) + +############################ +# Paths: +############################ +INPUT_DIR = Path( args.input_dir ) +INPUT_PATH = Path( args.filename ) +OUTPUT_DIR = Path( args.output_dir ) +LOG_DIR = Path( args.log_dir ) + +TEMP_DIR = OUTPUT_DIR / "tmp_files" +DEBUG_DIR = OUTPUT_DIR / "debug" + +# where to output the xml file: +XML_FILE = (OUTPUT_DIR / INPUT_PATH.name) .with_suffix( ".xml" ) + +################################## +# Setting up various directories # +################################## + +if not os.path.exists(OUTPUT_DIR): + os.mkdir( OUTPUT_DIR ) +if not os.path.exists(TEMP_DIR): + os.mkdir( TEMP_DIR ) +if not os.path.exists( DEBUG_DIR ): + os.mkdir( DEBUG_DIR ) # citations need a little more work: especially citedRange # so do landscape figures, no way to distinguish them! @@ -417,10 +501,9 @@ def main(): back_part = etree.SubElement(tei_body, "back") tei_body.insert(1, tei_body_xml.getroot()) - outfile = 'CONVERT/TEI.xml' output_string = etree.tostring(tei_root, xml_declaration=True, pretty_print=True, encoding="UTF-8", doctype= '\n') - with open(outfile, 'w') as output_file: + with open(XML_FILE, 'w') as output_file: output_file.write(output_string.decode("utf-8")) # def main ends here From c02135d027dba92bc6dca9f3dff743345b49ecea Mon Sep 17 00:00:00 2001 From: kthoden Date: Wed, 6 Nov 2019 15:28:59 +0100 Subject: [PATCH 9/9] Insert id, irrespective of nonumber status --- idassigner.py | 32 ++++++++++++-------------------- 1 file changed, 12 insertions(+), 20 deletions(-) diff --git a/idassigner.py b/idassigner.py index 9dec660..2dea957 100644 --- a/idassigner.py +++ b/idassigner.py @@ -46,29 +46,21 @@ def assign_ids(chapter_tree, elements): sections = chapter_tree.xpath("//t:div[@type='section']", namespaces=NS_MAP) section_id_counter = 1 for section in sections: - if section.get("n") == "nonumber": - logging.info("Leaving out unnumbered section.") - pass - else: - section_id = "{}_sec{:02d}".format(chapter_id, section_id_counter) - libeoaconvert.assign_xml_id(section, section_id) - section_id_counter += 1 + section_id = "{}_sec{:02d}".format(chapter_id, section_id_counter) + libeoaconvert.assign_xml_id(section, section_id) + section_id_counter += 1 if "sections" in elements: subsections = chapter_tree.xpath("//t:div[@type='subsection']", namespaces=NS_MAP) subsection_id_counter = 1 for subsection in subsections: - if subsection.get("n") == "nonumber": - logging.info("Leaving out unnumbered subsection.") - pass - else: - section_element = subsection.getparent() - section_id = section_element.attrib["{http://www.w3.org/XML/1998/namespace}id"] - logging.debug("Found a subsection in section %s", section_id) - rest, section_number = section_id.split("_sec") - subsection_id = "{}_subsec{}-{:02d}".format(chapter_id, section_number, subsection_id_counter) - libeoaconvert.assign_xml_id(subsection, subsection_id) - subsection_id_counter += 1 + section_element = subsection.getparent() + section_id = section_element.attrib["{http://www.w3.org/XML/1998/namespace}id"] + logging.debug("Found a subsection in section %s", section_id) + rest, section_number = section_id.split("_sec") + subsection_id = "{}_subsec{}-{:02d}".format(chapter_id, section_number, subsection_id_counter) + libeoaconvert.assign_xml_id(subsection, subsection_id) + subsection_id_counter += 1 if "figures" in elements: figures = chapter_tree.xpath("//t:figure", namespaces=NS_MAP) @@ -119,13 +111,13 @@ def main(): print(selected_chapters) chapters = [] for xml_chapter in selected_chapters: - chapter = xml_tree.xpath(f"//t:div[@xml:id='{xml_chapter}' and not(@n='nonumber')]", namespaces=NS_MAP)[0] + chapter = xml_tree.xpath(f"//t:div[@xml:id='{xml_chapter}'", namespaces=NS_MAP)[0] copied_chapter = deepcopy(chapter) assign_ids(copied_chapter, elements=list_of_elements) chapter.addprevious(copied_chapter) chapter.tag = "elementtobestripped" else: - chapters = xml_tree.xpath("//t:div[@type='chapter' and not(@n='nonumber')]", namespaces=NS_MAP) + chapters = xml_tree.xpath("//t:div[@type='chapter']", namespaces=NS_MAP) logging.debug("Found %s chapters.", len(chapters)) # in this iteration, a copy is made of each chapter and fitted # with ids, the original chapter is being discarded