Skip to content
Permalink
master
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
#!/usr/bin/env python3
# -*- coding: utf-8; mode: python -*-
"""
Assign xml:ids to various elements of TEI file
Unsuccessful exit codes:
9: Found a chapter without id. Prior to running idassigner all chapters must have ids.
10: User enters a chapter id that does not exist.
11: User wants to assign an id to an element that is not supported.
12: An element is referenced by ref but would get a new id. This requires user intervention.
"""
__version__ = "1.5"
__date__ = "20210218"
__author__ = "kthoden@mpiwg-berlin.mpg.de"
import argparse
import logging
import sys
import shutil
from datetime import datetime
from copy import deepcopy
from lxml import etree
import utils.libeoaconvert as libeoaconvert
# Configure the root logger: show everything from DEBUG upwards, each record
# prefixed with its timestamp and level name.
logging.basicConfig(level=logging.DEBUG, format=' %(asctime)s - %(levelname)s - %(message)s')
# Namespace URI of TEI documents.
NS_TEI = "http://www.tei-c.org/ns/1.0"
# Prefix-to-URI map passed as the ``namespaces`` argument of the XPath queries
# below ("t" for TEI elements, "xml" for attributes such as xml:id).
NS_MAP = {"t": NS_TEI, "xml": "http://www.w3.org/XML/1998/namespace"}
def create_ref_hash(xml_tree):
    """Collect the cross-reference elements used inside the document.

    Runs one XPath query on ``xml_tree`` for every TEI ``ref`` element whose
    ``type`` attribute is ``text-hi``, ``number-hionly``, ``number-hi``,
    ``hionlycollage``, ``number``, ``page`` or ``text`` and returns the
    result of that query unchanged.
    """
    reference_query = (
        "//t:ref[@type='text-hi' or @type='number-hionly' "
        "or @type='number-hi' or @type='hionlycollage' or @type='number' or @type='page' "
        "or @type='text']"
    )
    return xml_tree.xpath(reference_query, namespaces=NS_MAP)
# def create_ref_hash ends here
def check_references(target_id, references):
    """Check if target id is in referenced items.

    Args:
        target_id: value compared for equality with the ``target`` attribute
            of each reference (the caller in this file passes the id with a
            leading ``#``).
        references: iterable of objects that offer ``get("target")``.

    Returns:
        True if at least one reference has a ``target`` equal to
        ``target_id``, otherwise False (also for an empty iterable).
    """
    # any() stops at the first match instead of walking the remaining references.
    return any(target_id == reference.get("target") for reference in references)
# def check_references ends here
def interrupt_assignment(identifier):
    """Abort the run because a referenced id is about to be overridden.

    Emits three error records (a headline, ``identifier`` itself and an
    instruction for the user) and then terminates the program with exit
    status 12.
    """
    advice = """Before continuing you should take a note of what is referenced and
modify the reference after assignment. Exiting."""
    for message in ("Found an element that is referenced: ", identifier, advice):
        logging.error(message)
    raise SystemExit(12)
# def interrupt_assignment ends here
def _replace_xml_id(element, new_id, references):
    """Give ``element`` the id ``new_id`` unless its current id is referenced.

    When the element already has an id and a reference targets ``#<id>``,
    interrupt_assignment() terminates the program (exit code 12) so that the
    reference is not silently broken.
    """
    # NOTE(review): assumes libeoaconvert.get_xml_id returns None for an
    # element without xml:id - confirm against the helper.
    current_id = libeoaconvert.get_xml_id(element)
    if current_id is not None:
        # References are matched against the id with a leading '#'.
        target = f"#{current_id}"
        if check_references(target, references):
            interrupt_assignment(target)
    libeoaconvert.assign_xml_id(element, new_id)


def _section_number(section_id):
    """Return the part of a section id that follows its last ``_sec``.

    Raises:
        ValueError: if ``section_id`` is missing or contains no ``_sec``.
    """
    # rpartition keeps working when the chapter id itself contains '_sec'.
    _, separator, number = (section_id or "").rpartition("_sec")
    if not separator:
        raise ValueError(
            "Cannot derive a section number from parent id {!r}".format(section_id))
    return number


def assign_ids(chapter_tree, references, elements):
    """Assign xml:ids to various elements of one chapter.

    Ids are built from the chapter's own xml:id and a running number:
    ``<chapter>_ftnNNN`` for bottom notes (which also get the number as
    ``n`` attribute), ``<chapter>_secNN`` for section divs,
    ``<chapter>_subsec<section number>-NN`` for subsection divs,
    ``<chapter>_figNN`` for figures and ``<chapter>_tabNN`` for tables.

    Args:
        chapter_tree: chapter element; it is modified in place and must
            carry an xml:id.
        references: reference elements handed to check_references().
        elements: container naming the element kinds to process; recognised
            names are "footnotes", "sections", "subsections", "figures"
            and "tables".

    Raises:
        SystemExit: with code 9 if the chapter has no xml:id, with code 12
            (via interrupt_assignment) if an element whose id would be
            replaced is referenced.
        ValueError: if the parent of a subsection has no id containing
            ``_sec``.
    """
    try:
        chapter_id = chapter_tree.attrib["{http://www.w3.org/XML/1998/namespace}id"]
    except KeyError:
        logging.error("Chapter has no id. Exiting")
        sys.exit(9)
    logging.debug("The id of this chapter is %s", chapter_id)

    # The queries are relative ('.//') so that only elements below
    # chapter_tree are touched, even when it is part of a larger document.
    if "footnotes" in elements:
        footnotes = chapter_tree.xpath(".//t:note[@place='bottom']", namespaces=NS_MAP)
        for number, note in enumerate(footnotes, start=1):
            _replace_xml_id(note, "{}_ftn{:03d}".format(chapter_id, number), references)
            # Every renumbered footnote also carries its running number.
            note.set("n", str(number))

    if "sections" in elements:
        sections = chapter_tree.xpath(".//t:div[@type='section']", namespaces=NS_MAP)
        for number, section in enumerate(sections, start=1):
            _replace_xml_id(section, "{}_sec{:02d}".format(chapter_id, number), references)

    if "subsections" in elements:
        subsections = chapter_tree.xpath(".//t:div[@type='subsection']", namespaces=NS_MAP)
        # The counter runs over the whole chapter; it is not restarted for
        # each section.
        for number, subsection in enumerate(subsections, start=1):
            section_id = libeoaconvert.get_xml_id(subsection.getparent())
            logging.debug("Found a subsection in section %s", section_id)
            new_subsection_id = "{}_subsec{}-{:02d}".format(
                chapter_id, _section_number(section_id), number)
            _replace_xml_id(subsection, new_subsection_id, references)

    if "figures" in elements:
        figures = chapter_tree.xpath(".//t:figure", namespaces=NS_MAP)
        for number, figure in enumerate(figures, start=1):
            _replace_xml_id(figure, "{}_fig{:02d}".format(chapter_id, number), references)

    if "tables" in elements:
        tables = chapter_tree.xpath(".//t:table", namespaces=NS_MAP)
        for number, table in enumerate(tables, start=1):
            _replace_xml_id(table, "{}_tab{:02d}".format(chapter_id, number), references)
# def assign_ids ends here
def _assign_ids_in_copy(chapter, references, elements):
    """Put an id-fitted deep copy of ``chapter`` in front of the original.

    The original element is renamed to ``elementtobestripped`` so that main()
    can remove it from the tree afterwards.
    """
    copied_chapter = deepcopy(chapter)
    assign_ids(copied_chapter, references, elements=elements)
    chapter.addprevious(copied_chapter)
    chapter.tag = "elementtobestripped"


def main():
    """Parse the command line, assign the ids and rewrite the TEI file.

    The input file is copied to ``<name>-old-ids<extension>`` before it is
    overwritten with the modified tree.

    Raises:
        SystemExit: with code 10 if a chapter id given with ``--chapter``
            does not exist, with code 11 if ``--element`` names an
            unsupported element; assign_ids() may exit with 9 or 12.
    """
    import os  # local import: only needed to build the backup file name

    list_of_elements = ["footnotes", "sections", "subsections", "figures", "tables"]
    parser = argparse.ArgumentParser()
    # Without a file there is nothing to do, so let argparse report it.
    parser.add_argument("-f", "--teifile", required=True, help="TEI file")
    parser.add_argument("-c", "--chapter", nargs='*',
                        help="Modify only chapters with following ids.")
    parser.add_argument("-e", "--element", nargs='*',
                        help="List of elements to assign ids to, separated by spaces. "
                             f"Available elements: {', '.join(list_of_elements)}")
    args = parser.parse_args()

    xml_tree = etree.parse(args.teifile)
    references_in_xml = create_ref_hash(xml_tree)

    elements = args.element
    if not elements:
        logging.debug("Assigning ids to all elements: %s", ', '.join(list_of_elements))
    else:
        # Validated by hand rather than with argparse 'choices' to keep the
        # dedicated exit code 11.
        for element in elements:
            if element not in list_of_elements:
                logging.error("%s is not a valid element. Exiting.", element)
                sys.exit(11)
        list_of_elements = elements
        logging.debug("Assigning ids selected elements: %s", ', '.join(list_of_elements))

    selected_chapters = args.chapter
    if selected_chapters:
        logging.debug(selected_chapters)
        for xml_chapter in selected_chapters:
            # The id is passed as an XPath variable so that quotes in the
            # user input cannot break or alter the expression.
            matches = xml_tree.xpath("//t:div[@xml:id=$chapter_id]", namespaces=NS_MAP,
                                     chapter_id=xml_chapter)
            if not matches:
                logging.error("Found no chapter with id '%s'. Exiting.", xml_chapter)
                sys.exit(10)
            _assign_ids_in_copy(matches[0], references_in_xml, list_of_elements)
    else:
        chapters = xml_tree.xpath("//t:div[@type='chapter' or @type='indexchapter' or "
                                  "@type='chapteroriginal' or @type='chaptertranslation']",
                                  namespaces=NS_MAP)
        logging.debug("Found %s chapters.", len(chapters))
        # in this iteration, a copy is made of each chapter and fitted
        # with ids, the original chapter is being discarded
        for chapter in chapters:
            _assign_ids_in_copy(chapter, references_in_xml, list_of_elements)
    etree.strip_elements(xml_tree, "elementtobestripped")

    appinfo_element = xml_tree.xpath("//t:encodingDesc/t:appInfo", namespaces=NS_MAP)[0]
    appinfo = libeoaconvert.write_appinfo("id_assigner", __version__, "idassign",
                                          "Assign XML IDs to elements", datetime.now())
    appinfo_element.insert(0, appinfo)

    # Only the extension is split off: str.replace would rewrite every ".xml"
    # in the path and would yield the source name itself for other extensions.
    file_root, file_extension = os.path.splitext(args.teifile)
    old_file_name = f"{file_root}-old-ids{file_extension}"
    shutil.copy(args.teifile, old_file_name)
    logging.info("Made backup copy of %s to %s.", args.teifile, old_file_name)

    output = args.teifile
    xml_tree.write(output, pretty_print=True, xml_declaration=True, encoding="utf-8")
    logging.info("Wrote %s.", output)
# def main ends here
# Run the id assignment only when the file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
# finis