diff --git a/idassigner.py b/idassigner.py index f2fc439..9dec660 100644 --- a/idassigner.py +++ b/idassigner.py @@ -13,6 +13,7 @@ import argparse import logging import sys +import shutil from datetime import datetime from lxml import etree from copy import deepcopy @@ -22,7 +23,7 @@ ns_tei = "http://www.tei-c.org/ns/1.0" NS_MAP = {"t": ns_tei, "xml": "http://www.w3.org/XML/1998/namespace"} -def assign_ids(chapter_tree): +def assign_ids(chapter_tree, elements=("footnotes", "sections", "subsections", "figures", "tables")): """Assign xml:ids to various elements""" try: @@ -32,53 +33,58 @@ def assign_ids(chapter_tree): sys.exit() logging.debug("The id of this chapter is %s", chapter_id) - footnotes = chapter_tree.xpath("//t:note[@place='bottom']", namespaces=NS_MAP) - footnote_id_counter = 1 - for note in footnotes: - note_id = "{}_ftn{:03d}".format(chapter_id, footnote_id_counter) - note.set("n", str(footnote_id_counter)) - libeoaconvert.assign_xml_id(note, note_id) - footnote_id_counter += 1 - - sections = chapter_tree.xpath("//t:div[@type='section']", namespaces=NS_MAP) - section_id_counter = 1 - for section in sections: - if section.get("n") == "nonumber": - logging.info("Leaving out unnumbered section.") - pass - else: - section_id = "{}_sec{:02d}".format(chapter_id, section_id_counter) - libeoaconvert.assign_xml_id(section, section_id) - section_id_counter += 1 - - subsections = chapter_tree.xpath("//t:div[@type='subsection']", namespaces=NS_MAP) - subsection_id_counter = 1 - for subsection in subsections: - if subsection.get("n") == "nonumber": - logging.info("Leaving out unnumbered subsection.") - pass - else: - section_element = subsection.getparent() - section_id = section_element.attrib["{http://www.w3.org/XML/1998/namespace}id"] - logging.debug("Found a subsection in section %s", section_id) - rest, section_number = section_id.split("_sec") - subsection_id = "{}_subsec{}-{:02d}".format(chapter_id, section_number, subsection_id_counter) - libeoaconvert.assign_xml_id(subsection, subsection_id) - subsection_id_counter 
+= 1 - - figures = chapter_tree.xpath("//t:figure", namespaces=NS_MAP) - figure_id_counter = 1 - for figure in figures: - figure_id = "{}_fig{:02d}".format(chapter_id, figure_id_counter) - libeoaconvert.assign_xml_id(figure, figure_id) - figure_id_counter += 1 - - tables = chapter_tree.xpath("//t:table", namespaces=NS_MAP) - table_id_counter = 1 - for table in tables: - table_id = "{}_tab{:02d}".format(chapter_id, table_id_counter) - libeoaconvert.assign_xml_id(table, table_id) - table_id_counter += 1 + if "footnotes" in elements: + footnotes = chapter_tree.xpath("//t:note[@place='bottom']", namespaces=NS_MAP) + footnote_id_counter = 1 + for note in footnotes: + note_id = "{}_ftn{:03d}".format(chapter_id, footnote_id_counter) + note.set("n", str(footnote_id_counter)) + libeoaconvert.assign_xml_id(note, note_id) + footnote_id_counter += 1 + + if "sections" in elements: + sections = chapter_tree.xpath("//t:div[@type='section']", namespaces=NS_MAP) + section_id_counter = 1 + for section in sections: + if section.get("n") == "nonumber": + logging.info("Leaving out unnumbered section.") + pass + else: + section_id = "{}_sec{:02d}".format(chapter_id, section_id_counter) + libeoaconvert.assign_xml_id(section, section_id) + section_id_counter += 1 + + if "subsections" in elements: + subsections = chapter_tree.xpath("//t:div[@type='subsection']", namespaces=NS_MAP) + subsection_id_counter = 1 + for subsection in subsections: + if subsection.get("n") == "nonumber": + logging.info("Leaving out unnumbered subsection.") + pass + else: + section_element = subsection.getparent() + section_id = section_element.attrib["{http://www.w3.org/XML/1998/namespace}id"] + logging.debug("Found a subsection in section %s", section_id) + rest, section_number = section_id.split("_sec") + subsection_id = "{}_subsec{}-{:02d}".format(chapter_id, section_number, subsection_id_counter) + libeoaconvert.assign_xml_id(subsection, subsection_id) + subsection_id_counter += 1 + + if "figures" in elements: + 
figures = chapter_tree.xpath("//t:figure", namespaces=NS_MAP) + figure_id_counter = 1 + for figure in figures: + figure_id = "{}_fig{:02d}".format(chapter_id, figure_id_counter) + libeoaconvert.assign_xml_id(figure, figure_id) + figure_id_counter += 1 + + if "tables" in elements: + tables = chapter_tree.xpath("//t:table", namespaces=NS_MAP) + table_id_counter = 1 + for table in tables: + table_id = "{}_tab{:02d}".format(chapter_id, table_id_counter) + libeoaconvert.assign_xml_id(table, table_id) + table_id_counter += 1 return # def assign_ids ends here @@ -86,23 +92,48 @@ def assign_ids(chapter_tree): def main(): """The main bit""" + list_of_elements = ["footnotes", "sections", "subsections", "figures", "tables"] + parser = argparse.ArgumentParser() - parser.add_argument("teifile", help="TEI file") + parser.add_argument("-f", "--teifile", required=True, help="TEI file") + parser.add_argument("-c", "--chapter", nargs='*', help="Modify only chapters with following ids.") + parser.add_argument("-e", "--element", nargs='*', help=f"List of elements to assign ids to, separated by spaces. Available elements: {', '.join(list_of_elements)}") args = parser.parse_args() xml_tree = etree.parse(args.teifile) - chapters = xml_tree.xpath("//t:div[@type='chapter' and not(@n='nonumber')]", namespaces=NS_MAP) - - logging.debug("Found %s chapters.", len(chapters)) - - # in this iteration, a copy is made of each chapter and fitted - # with ids, the original chapter is being discarded - for chapter in chapters: - copied_chapter = deepcopy(chapter) - assign_ids(copied_chapter) - chapter.addprevious(copied_chapter) - chapter.tag = "elementtobestripped" + elements = args.element + if elements: + for element in elements: + if element not in list_of_elements: + logging.error(f"{element} is not a valid element. 
Exiting.") + sys.exit(1) + list_of_elements = elements + logging.debug(f"Assigning ids to selected elements: {', '.join(list_of_elements)}") + else: + logging.debug(f"Assigning ids to all elements: {', '.join(list_of_elements)}") + + selected_chapters = args.chapter + + if selected_chapters: + logging.debug("Selected chapters: %s", selected_chapters) + chapters = [] + for xml_chapter in selected_chapters: + chapter = xml_tree.xpath(f"//t:div[@xml:id='{xml_chapter}' and not(@n='nonumber')]", namespaces=NS_MAP)[0] + copied_chapter = deepcopy(chapter) + assign_ids(copied_chapter, elements=list_of_elements) + chapter.addprevious(copied_chapter) + chapter.tag = "elementtobestripped" + else: + chapters = xml_tree.xpath("//t:div[@type='chapter' and not(@n='nonumber')]", namespaces=NS_MAP) + logging.debug("Found %s chapters.", len(chapters)) + # in this iteration, a copy is made of each chapter and fitted + # with ids, the original chapter is being discarded + for chapter in chapters: + copied_chapter = deepcopy(chapter) + assign_ids(copied_chapter, elements=list_of_elements) + chapter.addprevious(copied_chapter) + chapter.tag = "elementtobestripped" etree.strip_elements(xml_tree, "elementtobestripped") @@ -110,7 +141,10 @@ def main(): appinfo = libeoaconvert.get_appinfo("id_assigner", __version__, "idassign", "Assign XML IDs to elements", datetime.now().strftime("%Y-%m-%d")) appinfo_element.insert(0, appinfo) - output = args.teifile.replace(".xml", "-withids.xml") + old_file_name = args.teifile.replace(".xml", "-old-ids.xml") + shutil.copy(args.teifile, old_file_name) + logging.info(f"Made backup copy of {args.teifile} to {old_file_name}.") + output = args.teifile xml_tree.write(output, pretty_print=True, xml_declaration=True, encoding="utf-8") logging.info("Wrote %s." % output) # def main ends here