#!/usr/bin/env python3
# -*- coding: utf-8; mode: python -*-

"""
Assign xml:id attributes to elements of a TEI file.

The script reads a TEI XML file, walks over every chapter
(``div[@type='chapter']``) and hands bottom-of-page notes, sections,
subsections, figures and tables to ``libeoaconvert.assign_xml_id``
together with an id derived from the chapter's own xml:id.  The result
is written next to the input file, with ``-withids`` inserted before
the file extension.
"""

__version__ = "1.0"
__date__ = "20181127"
__author__ = "kthoden@mpiwg-berlin.mpg.de"

import argparse
import logging
import os.path
from copy import deepcopy
from datetime import datetime

from lxml import etree

import libeoaconvert

logging.basicConfig(level=logging.DEBUG, format=' %(asctime)s - %(levelname)s - %(message)s')

ns_tei = "http://www.tei-c.org/ns/1.0"
NS_MAP = {"t": ns_tei, "xml": "http://www.w3.org/XML/1998/namespace"}

# Clark-notation name of the xml:id attribute, as lxml expects it.
XML_ID = "{http://www.w3.org/XML/1998/namespace}id"


def _number_in_order(elements, id_template, chapter_id, set_n=False):
    """Hand every element a running id built from ``id_template``.

    ``id_template`` is a ``str.format`` template that receives the
    chapter id and the 1-based position of the element.  With
    ``set_n`` the position is additionally stored in the ``n``
    attribute before the id is assigned.
    """
    for number, element in enumerate(elements, start=1):
        if set_n:
            element.set("n", str(number))
        libeoaconvert.assign_xml_id(element, id_template.format(chapter_id, number))
# def _number_in_order ends here


def assign_ids(chapter_tree):
    """Assign xml:ids to various elements of one chapter.

    ``chapter_tree`` is the chapter element; it must carry an xml:id,
    otherwise a ``KeyError`` is raised.  The element is modified in
    place and nothing is returned.

    Id patterns (``<chapter>`` is the chapter's xml:id):

    - notes with ``place='bottom'``: ``<chapter>_ftnNNN`` (``n`` is set, too)
    - numbered sections: ``<chapter>_secNN``
    - numbered subsections: ``<chapter>_subsec<section number>-NN``
    - figures: ``<chapter>_figNN``
    - tables: ``<chapter>_tabNN``

    Sections and subsections with ``n="nonumber"`` are left untouched
    and do not consume a number.
    """

    chapter_id = chapter_tree.attrib[XML_ID]
    logging.debug("The id of this chapter is %s", chapter_id)

    # All queries are relative (".//") so that only descendants of this
    # chapter are matched, even if the element is still part of a
    # larger document.
    footnotes = chapter_tree.xpath(".//t:note[@place='bottom']", namespaces=NS_MAP)
    _number_in_order(footnotes, "{}_ftn{:03d}", chapter_id, set_n=True)

    sections = chapter_tree.xpath(".//t:div[@type='section']", namespaces=NS_MAP)
    section_id_counter = 1
    for section in sections:
        if section.get("n") == "nonumber":
            logging.info("Leaving out unnumbered section.")
            continue
        section_id = "{}_sec{:02d}".format(chapter_id, section_id_counter)
        libeoaconvert.assign_xml_id(section, section_id)
        section_id_counter += 1

    subsections = chapter_tree.xpath(".//t:div[@type='subsection']", namespaces=NS_MAP)
    # NOTE: this counter runs through the whole chapter; it is not
    # reset when a new section starts.
    subsection_id_counter = 1
    for subsection in subsections:
        if subsection.get("n") == "nonumber":
            logging.info("Leaving out unnumbered subsection.")
            continue

        section_element = subsection.getparent()
        section_id = None if section_element is None else section_element.get(XML_ID)
        # rpartition keeps working when the chapter id itself contains "_sec".
        _, marker, section_number = (section_id or "").rpartition("_sec")
        if not marker:
            # The parent is an unnumbered section or carries an id that
            # does not follow the "<chapter>_secNN" pattern, so there is
            # no section number to build the subsection id from.
            logging.warning(
                "Subsection without a numbered parent section (parent id: %s), "
                "leaving it without an id.", section_id)
            continue

        logging.debug("Found a subsection in section %s", section_id)
        subsection_id = "{}_subsec{}-{:02d}".format(chapter_id, section_number, subsection_id_counter)
        libeoaconvert.assign_xml_id(subsection, subsection_id)
        subsection_id_counter += 1

    figures = chapter_tree.xpath(".//t:figure", namespaces=NS_MAP)
    _number_in_order(figures, "{}_fig{:02d}", chapter_id)

    tables = chapter_tree.xpath(".//t:table", namespaces=NS_MAP)
    _number_in_order(tables, "{}_tab{:02d}", chapter_id)
# def assign_ids ends here


def main():
    """Parse the command line, assign the ids and write the new file."""

    parser = argparse.ArgumentParser()
    parser.add_argument("teifile", help="TEI file")
    args = parser.parse_args()

    xml_tree = etree.parse(args.teifile)

    chapters = xml_tree.xpath("//t:div[@type='chapter']", namespaces=NS_MAP)

    logging.debug("Found %s chapters.", len(chapters))

    # in this iteration, a copy is made of each chapter and fitted
    # with ids, the original chapter is being discarded
    for chapter in chapters:
        copied_chapter = deepcopy(chapter)
        assign_ids(copied_chapter)
        chapter.addprevious(copied_chapter)
        # mark the original for removal in one sweep after the loop
        chapter.tag = "elementtobestripped"

    etree.strip_elements(xml_tree, "elementtobestripped")
    libeoaconvert.write_appinfo(xml_tree, "id_assigner", __version__, "idassign", "Assign XML IDs to elements", datetime.now().strftime("%Y-%m-%d"))

    # Insert the marker in front of the extension only.  A plain
    # str.replace(".xml", ...) would touch every ".xml" in the path and
    # would leave the name unchanged (overwriting the input file) when
    # the file does not end in ".xml".
    base, extension = os.path.splitext(args.teifile)
    output = "{}-withids{}".format(base, extension)
    xml_tree.write(output, pretty_print=True, xml_declaration=True, encoding="utf-8")
    logging.info("Wrote %s.", output)
# def main ends here


if __name__ == '__main__':
    main()
# finis