Merge branch 'master' of github.molgen.mpg.de:EditionOpenAccess/EOASkripts
EditionOpenAccess · Nov 7, 2019 · 868cc6e · 868cc6e
2 parents 01cfd75 + dbc6cfc
commit 868cc6e
Show file tree

Hide file tree

Showing 6 changed files with 133 additions and 40 deletions.
diff --git a/data/aux/bibliography4ht.tex b/data/aux/bibliography4ht.tex
@@ -42,7 +42,7 @@
   \printfield{volume}%
   \iffieldundef{number}
      {}
-      {\mkbibparens{\printfield{number}}}%
+     {\printfield[parens]{number}}
   \setunit{\addcomma\space}%
   \printfield{eid}%
   \setunit{\addspace}%

diff --git a/doc/datapickle.md b/doc/datapickle.md
@@ -2,6 +2,17 @@
 
 The file data.pickle is created during a run of `eoatex2imxml.py` or `fix_tei.py` and primarily assigns numbers to elements. For example, the thirteenth figure in the first (numbered) chapter, that carries the id `uid17` is assigned the human readable reference `1.13`.
 
+The original list of stored items is:
+- chapterdict
+- figdict
+- eqdict
+- fndict
+- listdict
+- pagelabeldict
+- secdict
+- tabdict
+- theoremdict
+
 ## eoatex2imxml.py
 In the classic variant, the file contains these fields:
 

diff --git a/idassigner.py b/idassigner.py
@@ -46,29 +46,21 @@ def assign_ids(chapter_tree, elements):
         sections = chapter_tree.xpath("//t:div[@type='section']", namespaces=NS_MAP)
         section_id_counter = 1
         for section in sections:
-            if section.get("n") == "nonumber":
-                logging.info("Leaving out unnumbered section.")
-                pass
-            else:
-                section_id = "{}_sec{:02d}".format(chapter_id, section_id_counter)
-                libeoaconvert.assign_xml_id(section, section_id)
-                section_id_counter += 1
+            section_id = "{}_sec{:02d}".format(chapter_id, section_id_counter)
+            libeoaconvert.assign_xml_id(section, section_id)
+            section_id_counter += 1
 
     if "sections" in elements:
         subsections = chapter_tree.xpath("//t:div[@type='subsection']", namespaces=NS_MAP)
         subsection_id_counter = 1
         for subsection in subsections:
-            if subsection.get("n") == "nonumber":
-                logging.info("Leaving out unnumbered subsection.")
-                pass
-            else:
-                section_element = subsection.getparent()
-                section_id = section_element.attrib["{http://www.w3.org/XML/1998/namespace}id"]
-                logging.debug("Found a subsection in section %s", section_id)
-                rest, section_number = section_id.split("_sec")
-                subsection_id = "{}_subsec{}-{:02d}".format(chapter_id, section_number, subsection_id_counter)
-                libeoaconvert.assign_xml_id(subsection, subsection_id)
-                subsection_id_counter += 1
+            section_element = subsection.getparent()
+            section_id = section_element.attrib["{http://www.w3.org/XML/1998/namespace}id"]
+            logging.debug("Found a subsection in section %s", section_id)
+            rest, section_number = section_id.split("_sec")
+            subsection_id = "{}_subsec{}-{:02d}".format(chapter_id, section_number, subsection_id_counter)
+            libeoaconvert.assign_xml_id(subsection, subsection_id)
+            subsection_id_counter += 1
 
     if "figures" in elements:
         figures = chapter_tree.xpath("//t:figure", namespaces=NS_MAP)
@@ -119,13 +111,13 @@ def main():
         print(selected_chapters)
         chapters = []
         for xml_chapter in selected_chapters:
-            chapter = xml_tree.xpath(f"//t:div[@xml:id='{xml_chapter}' and not(@n='nonumber')]", namespaces=NS_MAP)[0]
+            chapter = xml_tree.xpath(f"//t:div[@xml:id='{xml_chapter}'", namespaces=NS_MAP)[0]
             copied_chapter = deepcopy(chapter)
             assign_ids(copied_chapter, elements=list_of_elements)
             chapter.addprevious(copied_chapter)
             chapter.tag = "elementtobestripped"
     else:
-        chapters = xml_tree.xpath("//t:div[@type='chapter' and not(@n='nonumber')]", namespaces=NS_MAP)
+        chapters = xml_tree.xpath("//t:div[@type='chapter']", namespaces=NS_MAP)
         logging.debug("Found %s chapters.", len(chapters))
         # in this iteration, a copy is made of each chapter and fitted
         # with ids, the original chapter is being discarded

diff --git a/imxml2django.py b/imxml2django.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8; mode: python -*-
-# Time-stamp: <2019-07-31 14:46:18 (kthoden)>
+# Time-stamp: <2019-08-06 15:05:17 (kthoden)>
 
 """
 Create an XML file that can be inserted into the Django database
@@ -416,7 +416,7 @@ def djangoParseObject(xmlElement, indent=False, listtype=None, listnumber=0, uid
                 else:
                     xmlEOAfigure.set("file", strImageFileDir + strImageFileName)
 
-            if figure_type == "hionly" or figure_type == "hionlycollage":
+            if figure_type == "hionly":# or figure_type == "hionlycollage":
                 logging.debug(f"Found hyperimage figure ({figure_type}), no need for caption and size information.")
                 pass
             else:
@@ -426,12 +426,15 @@ def djangoParseObject(xmlElement, indent=False, listtype=None, listnumber=0, uid
                 xmlResult.append(xmlEOAfigure)
                 intObjectNumber += 1
                 # Insert visual Number and uid
-                strFigureNumber = dictFigures[xmlElement.find(".//anchor").get("id")]
-                xmlEOAfigure.set("number", strFigureNumber)
-                strFigureUID = xmlElement.find(".//anchor").get("id")
-                xmlEOAfigure.set("id", strFigureUID)
+                if figure_type == "hionlycollage" or figure_type == "hionlysub":
+                    pass
+                else:
+                    strFigureNumber = dictFigures[xmlElement.find(".//anchor").get("id")]
+                    xmlEOAfigure.set("number", strFigureNumber)
+                    strFigureUID = xmlElement.find(".//anchor").get("id")
+                    xmlEOAfigure.set("id", strFigureUID)
 
-            hi_figure_types = ["hitrue", "hionly", "hionlycollage"]
+            hi_figure_types = ["hitrue", "hionly", "hionlycollage"]#, "hionlysub"]
 
             if figure_type in hi_figure_types:
                 xmlEOAfigure.set("hielement", xmlElement.get("hielement"))

diff --git a/imxml2tei.py b/imxml2tei.py
@@ -7,9 +7,93 @@
 
 """
 
+import argparse
 import sys
 import configparser
+from pathlib import Path
 from lxml import etree
+import utils.libeoaconvert as libeoaconvert
+
+BASE_DIR = Path( __file__ ).resolve().parent
+SCRIPT_PATH = Path( __file__ )
+SCRIPT_NAME = SCRIPT_PATH.stem
+
+#####################
+# Parsing arguments #
+#####################
+
+parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+)
+parser.add_argument(
+        "-c", "--config",
+        default = BASE_DIR / "config" / "eoaconvert.cfg",
+        help="Name of config file"
+)
+parser.add_argument(
+        "-l", "--log-dir",
+        default = Path("output/logs"),
+        # default = Path("logs", SCRIPT_NAME).with_suffix(".log"),
+        help="logfile"
+)
+parser.add_argument(
+        "--log-level",
+        default = "INFO",
+        help="log level: choose between DEBUG, INFO, WARNING, ERROR, CRITICAL"
+)
+parser.add_argument(
+        "-f", "--filename",
+        default = "IntermediateXMLFile.xml",
+        help="Name of intermediate XML file (without suffix!)."
+)
+parser.add_argument(
+        "-o", "--output-dir",
+        default = "./output/tei",
+        help="where to dump all output files"
+)
+parser.add_argument(
+        "-i", "--input-dir",
+        default = "./output/imxml",
+        help="location of intermediate XML file"
+)
+
+args = parser.parse_args()
+
+CONFIG_FILE = args.config
+
+print("The configfile is %s." % CONFIG_FILE)
+
+CONFIG = load_config(
+        CONFIG_FILE,
+        args.log_level,
+        (Path(args.log_dir) / SCRIPT_NAME) . with_suffix( ".log" ),
+        # args.log_file,
+)
+
+############################
+# Paths:
+############################
+INPUT_DIR = Path( args.input_dir )
+INPUT_PATH = Path( args.filename )
+OUTPUT_DIR = Path( args.output_dir )
+LOG_DIR = Path( args.log_dir )
+
+TEMP_DIR = OUTPUT_DIR / "tmp_files"
+DEBUG_DIR = OUTPUT_DIR / "debug"
+
+# where to output the xml file:
+XML_FILE = (OUTPUT_DIR / INPUT_PATH.name) .with_suffix( ".xml" )
+
+##################################
+# Setting up various directories #
+##################################
+
+if not os.path.exists(OUTPUT_DIR):
+    os.mkdir( OUTPUT_DIR )
+if not os.path.exists(TEMP_DIR):
+    os.mkdir( TEMP_DIR )
+if not os.path.exists( DEBUG_DIR ):
+    os.mkdir( DEBUG_DIR  )
 
 # citations need a little more work: especially citedRange
 # so do landscape figures, no way to distinguish them!
@@ -417,10 +501,9 @@ def main():
     back_part = etree.SubElement(tei_body, "back")
     tei_body.insert(1, tei_body_xml.getroot())
 
-    outfile = 'CONVERT/TEI.xml'
     output_string = etree.tostring(tei_root, xml_declaration=True, pretty_print=True, encoding="UTF-8", doctype= '<?xml-model href="eoa_tei.rnc" type="application/relax-ng-compact-syntax"?>\n<?xml-stylesheet type="text/css" href="tei.css" ?>')
 
-    with open(outfile, 'w') as output_file:
+    with open(XML_FILE, 'w') as output_file:
         output_file.write(output_string.decode("utf-8"))
 # def main ends here
 

diff --git a/tei2imxml.py b/tei2imxml.py
@@ -449,10 +449,11 @@ def hi_lookup_code(nd, hitrue_xml_id):
 def get_hitarget(nd, teitarget):
     """Find out corresponding hyperimage id for hyperimage link"""
 
-    if nd[teitarget]:
+    try:
         hi_target = nd[teitarget]["hiid"]
-    else:
-        logging.error("Could not find hi code %s", teitarget)
+    except KeyError:
+        logging.error("Could not find hi code %s. Exiting", teitarget)
+        sys.exit(1)
 
     return hi_target
 # def get_hitarget ends here
@@ -786,7 +787,7 @@ def handle_refs_default(ref):
             figure.set("id", "anotheruid")
 
             # the anchor element is used to determine whether a figure gets an id and can be numbered
-            if figure_type == "hionlycollage":
+            if figure_type == "hionlycollage" or figure_type == "hionlysub":
                 logging.debug("No anchor element for collages.")
 
             else:
@@ -900,16 +901,19 @@ def handle_refs_default(ref):
     #########
     eoa_lists = xml_tree.xpath("//t:body//t:list", namespaces=NS_MAP)
     for eoalist in eoa_lists:
+        items = eoalist.findall("t:item", namespaces=NS_MAP)
+        for listitem in items:
+            listitem.tag = "p"
+            libeoaconvert.wrap_into_element(etree.Element("item"), listitem)
         if eoalist.get("type") == "ordered":
-            pass
+            for listitem in items:
+                new_item_element = listitem.getparent()
+                new_item_element.set("id-text", f"{str(items.index(listitem) + 1)}")
+                new_item_element.set("label", f"{str(items.index(listitem) + 1)}.")
         if eoalist.get("type") == "unordered":
-            pass
+            eoalist.set("type", "simple")
         if eoalist.get("type") == "gloss":
             eoalist.set("type", "description")
-            items = eoalist.findall("t:item", namespaces=NS_MAP)
-            for listitem in items:
-                listitem.tag = "p"
-                libeoaconvert.wrap_into_element(etree.Element("item"), listitem)
 
     ##############
     # References #