diff --git a/src/idassigner.py b/src/idassigner.py
index 19dde35..0e0fead 100644
--- a/src/idassigner.py
+++ b/src/idassigner.py
@@ -2,83 +2,140 @@
 # -*- coding: utf-8; mode: python -*-
 
 """
-Assign xml:ids to various elements fo TEI file
+Assign xml:ids to various elements of TEI file
+
+Unsuccessful exit codes:
+9: Found a chapter without id. Prior to running idassigner all chapters must have ids.
+10: User enters a chapter id that does not exist.
+11: User wants to assign an id to an element that is not supported.
+12: An element is referenced by ref but would get a new id. This requires user intervention.
 """
 
-__version__ = "1.0"
-__date__ = "20181127"
+__version__ = "1.5"
+__date__ = "20210218"
 __author__ = "kthoden@mpiwg-berlin.mpg.de"
 
-import utils.libeoaconvert as libeoaconvert
 import argparse
 import logging
 import sys
 import shutil
 from datetime import datetime
-from lxml import etree
 from copy import deepcopy
+from lxml import etree
+import utils.libeoaconvert as libeoaconvert
 
 logging.basicConfig(level=logging.DEBUG, format=' %(asctime)s - %(levelname)s - %(message)s')
 
-ns_tei = "http://www.tei-c.org/ns/1.0"
-NS_MAP = {"t": ns_tei, "xml": "http://www.w3.org/XML/1998/namespace"}
+NS_TEI = "http://www.tei-c.org/ns/1.0"
+NS_MAP = {"t": NS_TEI, "xml": "http://www.w3.org/XML/1998/namespace"}
+
+
+def create_ref_hash(xml_tree):
+    """Create a list of all used references inside the document"""
+
+    refs = xml_tree.xpath("//t:ref[@type='text-hi' or @type='number-hionly' \
+    or @type='number-hi' or @type='hionlycollage' or @type='number' or @type='page' \
+    or @type='text']", namespaces=NS_MAP)
+
+    return refs
+# def create_ref_hash ends here
+
+
+def check_references(target_id, references):
+    """Return True if target_id equals the target attribute of any reference."""
 
-def assign_ids(chapter_tree, elements):
+    return any(reference.get("target") == target_id for reference in references)
+# def check_references ends here
+
+
+def interrupt_assignment(identifier):
+    """Terminate the program with an error, because an id is about to be
+    overridden that is referenced"""
+
+    logging.error("Found an element that is referenced: ")
+    logging.error(identifier)
+    logging.error("""Before continuing you should take a note of what is referenced and
+    modify the reference after assignment. Exiting.""")
+    sys.exit(12)
+# def interrupt_assignment ends here
+
+
+def guard_referenced_id(element, new_id, references):
+    """Exit with code 12 if assigning new_id would orphan a reference to element.
+
+    Elements without an xml:id, or whose xml:id already equals new_id, pass.
+    """
+
+    existing_id = libeoaconvert.get_xml_id(element)
+    if existing_id is None or existing_id == new_id:
+        return
+
+    # targets are compared in '#id' form, so prefix the bare xml:id with '#'
+    target = f"#{existing_id}"
+    if check_references(target, references):
+        interrupt_assignment(target)
+# def guard_referenced_id ends here
+
+
+def assign_ids(chapter_tree, references, elements):
     """Assign xml:ids to various elements"""
 
     try:
-        chapter_id = (chapter_tree.attrib["{http://www.w3.org/XML/1998/namespace}id"])
+        chapter_id = chapter_tree.attrib["{http://www.w3.org/XML/1998/namespace}id"]
     except KeyError:
         logging.error("Chapter has no id. Exiting")
-        sys.exit()
+        sys.exit(9)
     logging.debug("The id of this chapter is %s", chapter_id)
 
     if "footnotes" in elements:
         footnotes = chapter_tree.xpath("//t:note[@place='bottom']", namespaces=NS_MAP)
         footnote_id_counter = 1
         for note in footnotes:
             note_id = "{}_ftn{:03d}".format(chapter_id, footnote_id_counter)
+            guard_referenced_id(note, note_id, references)
             note.set("n", str(footnote_id_counter))
             libeoaconvert.assign_xml_id(note, note_id)
             footnote_id_counter += 1
 
     if "sections" in elements:
         sections = chapter_tree.xpath("//t:div[@type='section']", namespaces=NS_MAP)
         section_id_counter = 1
         for section in sections:
             section_id = "{}_sec{:02d}".format(chapter_id, section_id_counter)
+            guard_referenced_id(section, section_id, references)
             libeoaconvert.assign_xml_id(section, section_id)
             section_id_counter += 1
 
     if "sections" in elements:
         subsections = chapter_tree.xpath("//t:div[@type='subsection']", namespaces=NS_MAP)
         subsection_id_counter = 1
         for subsection in subsections:
             section_element = subsection.getparent()
             section_id = section_element.attrib["{http://www.w3.org/XML/1998/namespace}id"]
             logging.debug("Found a subsection in section %s", section_id)
             rest, section_number = section_id.split("_sec")
             subsection_id = "{}_subsec{}-{:02d}".format(chapter_id, section_number, subsection_id_counter)
+            guard_referenced_id(subsection, subsection_id, references)
             libeoaconvert.assign_xml_id(subsection, subsection_id)
             subsection_id_counter += 1
 
     if "figures" in elements:
         figures = chapter_tree.xpath("//t:figure", namespaces=NS_MAP)
         figure_id_counter = 1
         for figure in figures:
             figure_id = "{}_fig{:02d}".format(chapter_id, figure_id_counter)
+            guard_referenced_id(figure, figure_id, references)
             libeoaconvert.assign_xml_id(figure, figure_id)
             figure_id_counter += 1
 
     if "tables" in elements:
         tables = chapter_tree.xpath("//t:table", namespaces=NS_MAP)
         table_id_counter = 1
         for table in tables:
             table_id = "{}_tab{:02d}".format(chapter_id, table_id_counter)
+            guard_referenced_id(table, table_id, references)
             libeoaconvert.assign_xml_id(table, table_id)
             table_id_counter += 1
-
-    return
 # def assign_ids ends here
 
 def main():
@@ -88,22 +145,26 @@ def main():
 
     parser = argparse.ArgumentParser()
     parser.add_argument("-f", "--teifile", help="TEI file")
-    parser.add_argument("-c", "--chapter", nargs='*', help="Modify only chapters with following ids.")
-    parser.add_argument("-e", "--element", nargs='*', help=f"List of elements to assign ids to, separated by spaces. Available elements: {', '.join(list_of_elements)}")
+    parser.add_argument("-c", "--chapter", nargs='*', help="""Modify only chapters with
+    following ids.""")
+    parser.add_argument("-e", "--element", nargs='*', help=f"""List of elements to assign
+    ids to, separated by spaces. Available elements: {', '.join(list_of_elements)}""")
 
     args = parser.parse_args()
     xml_tree = etree.parse(args.teifile)
 
+    references_in_xml = create_ref_hash(xml_tree)
+
     elements = args.element
-    if elements:
+    if not elements:
+        logging.debug(f"Assigning ids to all elements: {', '.join(list_of_elements)}")
+    else:
         for element in elements:
             if element not in list_of_elements:
-                logging.error(f"{element} is not a valid element. Exiting.")
-                sys.exit(1)
+                logging.error("%s is not a valid element. Exiting.", element)
+                sys.exit(11)
         list_of_elements = elements
         logging.debug(f"Assigning ids selected elements: {', '.join(list_of_elements)}")
-    else:
-        logging.debug(f"Assigning ids to all elements: {', '.join(list_of_elements)}")
 
     selected_chapters = args.chapter
 
@@ -115,19 +176,20 @@ def main():
                 chapter = xml_tree.xpath(f"//t:div[@xml:id='{xml_chapter}']", namespaces=NS_MAP)[0]
             except IndexError:
                 logging.error("Found no chapter with id '%s'. Exiting.", xml_chapter)
-                sys.exit(2)
+                sys.exit(10)
             copied_chapter = deepcopy(chapter)
-            assign_ids(copied_chapter, elements=list_of_elements)
+            assign_ids(copied_chapter, references_in_xml, elements=list_of_elements)
             chapter.addprevious(copied_chapter)
             chapter.tag = "elementtobestripped"
     else:
-        chapters = xml_tree.xpath("//t:div[@type='chapter' or @type='indexchapter' or @type='chapteroriginal' or @type='chaptertranslation']", namespaces=NS_MAP)
+        chapters = xml_tree.xpath("//t:div[@type='chapter' or @type='indexchapter' or \
+        @type='chapteroriginal' or @type='chaptertranslation']", namespaces=NS_MAP)
         logging.debug("Found %s chapters.", len(chapters))
         # in this iteration, a copy is made of each chapter and fitted
         # with ids, the original chapter is being discarded
         for chapter in chapters:
             copied_chapter = deepcopy(chapter)
-            assign_ids(copied_chapter, elements=list_of_elements)
+            assign_ids(copied_chapter, references_in_xml, elements=list_of_elements)
             chapter.addprevious(copied_chapter)
             chapter.tag = "elementtobestripped"
 
@@ -143,7 +205,7 @@ def main():
     logging.info(f"Made backup copy of {args.teifile} to {old_file_name}.")
     output = args.teifile
     xml_tree.write(output, pretty_print=True, xml_declaration=True, encoding="utf-8")
-    logging.info("Wrote %s." % output)
+    logging.info("Wrote %s.", output)
 # def main ends here
 
 if __name__ == '__main__':
diff --git a/src/utils/libeoaconvert.py b/src/utils/libeoaconvert.py
index 52fa834..b508d4c 100644
--- a/src/utils/libeoaconvert.py
+++ b/src/utils/libeoaconvert.py
@@ -414,6 +414,20 @@ def assign_xml_id(element, identifier):
     return
 # def assign_xml_id ends here
 
+
+def get_xml_id(element):
+    """Return the xml:id of an element, or None if the element has none"""
+
+    try:
+        element_id = element.attrib["{http://www.w3.org/XML/1998/namespace}id"]
+    except KeyError:
+        logging.warning("Element %s has no xml:id attribute.", element.tag)
+        element_id = None
+
+    return element_id
+# def get_xml_id ends here
+
+
 def write_appinfo(ident, version, xmlid, text, date):
     """Log the change of a TEI document in the appinfo element"""
 