Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
EOASkripts/src/idassigner.py
Go to file
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
239 lines (200 sloc)
9.52 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# -*- coding: utf-8; mode: python -*- | |
""" | |
Assign xml:ids to various elements of TEI file | |
Unsuccessful exit codes: | |
9: Found a chapter without id. Prior to running idassigner all chapters must have ids. | |
10: User enters a chapter id that does not exist. | |
11: User wants to assign an id to an element that is not supported. | |
12: An element is referenced by ref but would get a new id. This requires user intervention. | |
""" | |
__version__ = "1.5" | |
__date__ = "20210218" | |
__author__ = "kthoden@mpiwg-berlin.mpg.de" | |
import argparse | |
import logging | |
import sys | |
import shutil | |
from datetime import datetime | |
from copy import deepcopy | |
from lxml import etree | |
import utils.libeoaconvert as libeoaconvert | |
# Namespace of TEI elements and the prefix map handed to every XPath query.
NS_TEI = "http://www.tei-c.org/ns/1.0"
NS_MAP = {
    "t": NS_TEI,
    "xml": "http://www.w3.org/XML/1998/namespace",
}

# Log everything from DEBUG upwards, prefixed with timestamp and level.
logging.basicConfig(
    level=logging.DEBUG,
    format=' %(asctime)s - %(levelname)s - %(message)s',
)
def create_ref_hash(xml_tree):
    """Collect the cross-reference elements used in the document.

    Runs an XPath query on ``xml_tree`` for TEI ``ref`` elements whose
    ``type`` attribute is one of ``text-hi``, ``number-hionly``,
    ``number-hi``, ``hionlycollage``, ``number``, ``page`` or ``text``
    and returns the query result unchanged.
    """
    ref_query = (
        "//t:ref[@type='text-hi' or @type='number-hionly' "
        "or @type='number-hi' or @type='hionlycollage' "
        "or @type='number' or @type='page' "
        "or @type='text']"
    )
    return xml_tree.xpath(ref_query, namespaces=NS_MAP)
# def create_ref_hash ends here
def check_references(target_id, references):
    """Tell whether at least one reference points at ``target_id``.

    Each item of ``references`` is asked for its ``target`` value via
    ``.get("target")`` and that value is compared with ``target_id``.
    Returns ``True`` on a match, ``False`` otherwise (also for an empty
    collection of references).
    """
    return any(ref.get("target") == target_id for ref in references)
# def check_references ends here
def interrupt_assignment(identifier):
    """Stop the program because a referenced id is about to be replaced.

    Logs ``identifier`` together with a hint for the user at ERROR level
    and then terminates the process with exit code 12.
    """
    messages = (
        "Found an element that is referenced: ",
        identifier,
        "Before continuing you should take a note of what is referenced and\n"
        "modify the reference after assignment. Exiting.",
    )
    for message in messages:
        logging.error(message)
    sys.exit(12)
# def interrupt_assignment ends here
def _assign_checked_id(element, new_id, references):
    """Give ``element`` the xml:id ``new_id`` unless its current id is referenced.

    An element without an id receives ``new_id`` directly. If it already
    has an id and one of ``references`` points at it, the program is
    terminated through ``interrupt_assignment`` (exit code 12) instead of
    silently breaking that reference.
    """
    # NOTE(review): assumes libeoaconvert.get_xml_id returns None when the
    # element carries no xml:id -- confirm against utils/libeoaconvert.
    existing_id = libeoaconvert.get_xml_id(element)
    if existing_id is not None:
        # check_references compares against the refs' "target" values, and
        # those are matched in the "#<id>" form.
        pointer = f"#{existing_id}"
        if check_references(pointer, references):
            interrupt_assignment(pointer)
    libeoaconvert.assign_xml_id(element, new_id)
# def _assign_checked_id ends here


def _number_elements(chapter_tree, query, id_template, chapter_id, references):
    """Assign running ids to all elements matching ``query``, in query order.

    ``id_template`` is a ``str.format`` template that receives the chapter
    id and the 1-based position of the element. Returns the matched
    elements paired with their position so callers can do extra work.
    """
    matches = chapter_tree.xpath(query, namespaces=NS_MAP)
    numbered = list(enumerate(matches, start=1))
    for number, element in numbered:
        _assign_checked_id(element, id_template.format(chapter_id, number), references)
    return numbered
# def _number_elements ends here


def assign_ids(chapter_tree, references, elements):
    """Assign xml:ids to the selected kinds of elements of one chapter.

    Args:
        chapter_tree: the chapter element. It must carry an xml:id, which
            becomes the prefix of every id assigned here.
        references: the ref elements of the document (see
            ``create_ref_hash``); an existing id that one of them targets
            is never overwritten.
        elements: collection of kinds to process; recognised values are
            ``"footnotes"``, ``"sections"``, ``"subsections"``,
            ``"figures"`` and ``"tables"``.

    Generated ids:
        footnotes    ``<chapter>_ftnNNN`` (the note also gets ``n="<number>"``)
        sections     ``<chapter>_secNN``
        subsections  ``<chapter>_subsec<section number>-NN``
        figures      ``<chapter>_figNN``
        tables       ``<chapter>_tabNN``

    Raises:
        SystemExit: code 9 if the chapter has no xml:id, code 12 if an
            element whose id is referenced would get a new id.
        ValueError: if the parent of a subsection has no id containing
            ``_sec`` from which the section number could be taken.

    The XPath queries start with ``//`` and therefore search the whole
    tree ``chapter_tree`` belongs to. ``main`` passes a deep copy of each
    chapter -- presumably so that the search stays confined to that
    chapter; TODO confirm before calling this on an attached element.
    """
    try:
        chapter_id = chapter_tree.attrib["{http://www.w3.org/XML/1998/namespace}id"]
    except KeyError:
        logging.error("Chapter has no id. Exiting")
        sys.exit(9)
    logging.debug("The id of this chapter is %s", chapter_id)

    if "footnotes" in elements:
        footnotes = _number_elements(chapter_tree, "//t:note[@place='bottom']",
                                     "{}_ftn{:03d}", chapter_id, references)
        for number, note in footnotes:
            note.set("n", str(number))

    if "sections" in elements:
        _number_elements(chapter_tree, "//t:div[@type='section']",
                         "{}_sec{:02d}", chapter_id, references)

    # Subsections are a selectable kind of their own. They are handled after
    # the sections because their ids embed the number of the parent section.
    if "subsections" in elements:
        subsections = chapter_tree.xpath("//t:div[@type='subsection']", namespaces=NS_MAP)
        # The counter runs through the whole chapter, it is not restarted
        # for every section.
        for number, subsection in enumerate(subsections, start=1):
            section_id = libeoaconvert.get_xml_id(subsection.getparent())
            if section_id is None or "_sec" not in section_id:
                raise ValueError(
                    "Parent of a subsection has no id of the form "
                    f"'<chapter>_sec<NN>': {section_id!r}"
                )
            section_number = section_id.rpartition("_sec")[2]
            logging.debug("Found a subsection in section %s", section_id)
            new_subsection_id = "{}_subsec{}-{:02d}".format(chapter_id, section_number,
                                                            number)
            _assign_checked_id(subsection, new_subsection_id, references)

    if "figures" in elements:
        _number_elements(chapter_tree, "//t:figure",
                         "{}_fig{:02d}", chapter_id, references)

    if "tables" in elements:
        _number_elements(chapter_tree, "//t:table",
                         "{}_tab{:02d}", chapter_id, references)
# def assign_ids ends here
def _replace_with_numbered_copy(chapter, references, elements):
    """Insert an id-assigned copy of ``chapter`` and mark the original for removal.

    The copy is processed by ``assign_ids`` and placed directly before the
    original, whose tag is then changed to ``elementtobestripped`` so that
    ``main`` can strip all originals in one go afterwards.
    """
    copied_chapter = deepcopy(chapter)
    assign_ids(copied_chapter, references, elements=elements)
    chapter.addprevious(copied_chapter)
    chapter.tag = "elementtobestripped"
# def _replace_with_numbered_copy ends here


def main():
    """Parse the command line, assign the ids and rewrite the TEI file.

    Steps: parse the TEI file, validate the requested element kinds
    (exit code 11 on an unknown one), process either the chapters named
    with ``--chapter`` (exit code 10 if one does not exist) or all
    chapter-like divs, record the run in ``encodingDesc/appInfo``, copy
    the untouched input to ``<name>-old-ids<suffix>`` and finally
    overwrite the input file with the modified tree.
    """
    # Only needed for building the backup file name below.
    from pathlib import Path

    list_of_elements = ["footnotes", "sections", "subsections", "figures", "tables"]

    parser = argparse.ArgumentParser()
    # Without a file there is nothing to do, so let argparse report it.
    parser.add_argument("-f", "--teifile", required=True, help="TEI file")
    parser.add_argument("-c", "--chapter", nargs='*',
                        help="Modify only chapters with\nfollowing ids.")
    parser.add_argument("-e", "--element", nargs='*',
                        help=("List of elements to assign\n"
                              "ids to, separated by spaces. Available elements: "
                              f"{', '.join(list_of_elements)}"))
    args = parser.parse_args()

    xml_tree = etree.parse(args.teifile)
    references_in_xml = create_ref_hash(xml_tree)

    elements = args.element
    if not elements:
        logging.debug("Assigning ids to all elements: %s", ', '.join(list_of_elements))
    else:
        for element in elements:
            if element not in list_of_elements:
                logging.error("%s is not a valid element. Exiting.", element)
                sys.exit(11)
        list_of_elements = elements
        logging.debug("Assigning ids selected elements: %s", ', '.join(list_of_elements))

    selected_chapters = args.chapter
    if selected_chapters:
        logging.debug(selected_chapters)
        for xml_chapter in selected_chapters:
            # The id comes from the command line: hand it over as an XPath
            # variable so that quotes in it cannot break the expression.
            matches = xml_tree.xpath("//t:div[@xml:id=$chapter_id]",
                                     namespaces=NS_MAP, chapter_id=xml_chapter)
            if not matches:
                logging.error("Found no chapter with id '%s'. Exiting.", xml_chapter)
                sys.exit(10)
            _replace_with_numbered_copy(matches[0], references_in_xml, list_of_elements)
    else:
        chapters = xml_tree.xpath(
            "//t:div[@type='chapter' or @type='indexchapter' or "
            "@type='chapteroriginal' or @type='chaptertranslation']",
            namespaces=NS_MAP)
        logging.debug("Found %s chapters.", len(chapters))
        # in this iteration, a copy is made of each chapter and fitted
        # with ids, the original chapter is being discarded
        for chapter in chapters:
            _replace_with_numbered_copy(chapter, references_in_xml, list_of_elements)

    # Drop the originals that were retagged while being replaced.
    etree.strip_elements(xml_tree, "elementtobestripped")

    appinfo_element = xml_tree.xpath("//t:encodingDesc/t:appInfo", namespaces=NS_MAP)[0]
    appinfo = libeoaconvert.write_appinfo("id_assigner", __version__, "idassign",
                                          "Assign XML IDs to elements", datetime.now())
    appinfo_element.insert(0, appinfo)

    # Insert "-old-ids" in front of the real suffix only; this also yields a
    # distinct backup name for files that do not end in ".xml".
    tei_path = Path(args.teifile)
    old_file_name = str(tei_path.with_name(f"{tei_path.stem}-old-ids{tei_path.suffix}"))
    shutil.copy(args.teifile, old_file_name)
    logging.info("Made backup copy of %s to %s.", args.teifile, old_file_name)

    output = args.teifile
    xml_tree.write(output, pretty_print=True, xml_declaration=True, encoding="utf-8")
    logging.info("Wrote %s.", output)
# def main ends here
# Run the id assignment only when the file is executed as a script.
if __name__ == "__main__":
    main()
# finis