Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
Update idassigner script, close #42
  • Loading branch information
kthoden committed Feb 18, 2021
1 parent bf3ebb0 commit f576301
Show file tree
Hide file tree
Showing 2 changed files with 140 additions and 38 deletions.
164 changes: 126 additions & 38 deletions src/idassigner.py
Expand Up @@ -2,83 +2,166 @@
# -*- coding: utf-8; mode: python -*-

"""
Assign xml:ids to various elements fo TEI file
Assign xml:ids to various elements of TEI file
Unsuccessful exit codes:
9: Found a chapter without id. Prior to running idassigner all chapters must have ids.
10: User enters a chapter id that does not exist.
11: User wants to assign an id to an element that is not supported.
12: An element is referenced by ref but would get a new id. This requires user intervention.
"""

__version__ = "1.0"
__date__ = "20181127"
__version__ = "1.5"
__date__ = "20210218"
__author__ = "kthoden@mpiwg-berlin.mpg.de"

import utils.libeoaconvert as libeoaconvert
import argparse
import logging
import sys
import shutil
from datetime import datetime
from lxml import etree
from copy import deepcopy
from lxml import etree
import utils.libeoaconvert as libeoaconvert

logging.basicConfig(level=logging.DEBUG, format=' %(asctime)s - %(levelname)s - %(message)s')

ns_tei = "http://www.tei-c.org/ns/1.0"
NS_MAP = {"t": ns_tei, "xml": "http://www.w3.org/XML/1998/namespace"}
NS_TEI = "http://www.tei-c.org/ns/1.0"
NS_MAP = {"t": NS_TEI, "xml": "http://www.w3.org/XML/1998/namespace"}


def create_ref_hash(xml_tree):
    """Collect every cross-reference element used in the document.

    Runs an XPath query on ``xml_tree`` for TEI ``ref`` elements whose
    ``type`` attribute is one of the supported reference kinds and returns
    the query result unchanged.
    """

    ref_query = (
        "//t:ref[@type='text-hi' or @type='number-hionly' "
        "or @type='number-hi' or @type='hionlycollage' or @type='number' or @type='page' "
        "or @type='text']"
    )

    return xml_tree.xpath(ref_query, namespaces=NS_MAP)
# def create_ref_hash ends here


def check_references(target_id, references):
    """Check if target id is in referenced items.

    Args:
        target_id: The value to look for, compared verbatim against each
            reference's ``target`` attribute (callers in this file pass
            the id prefixed with ``#``).
        references: Iterable of objects offering ``get("target")``.

    Returns:
        True if at least one reference points at ``target_id``, else False.
    """

    # any() stops at the first hit instead of scanning the remaining
    # references once a match has already been found.
    return any(reference.get("target") == target_id for reference in references)
# def check_references ends here


def interrupt_assignment(identifier):
    """Abort the run because a referenced id is about to be replaced.

    Logs the offending identifier together with a hint for the user and
    then terminates the process with exit code 12.
    """

    advice = (
        "Before continuing you should take a note of what is referenced and\n"
        "modify the reference after assignment. Exiting."
    )
    for message in ("Found an element that is referenced: ", identifier, advice):
        logging.error(message)
    sys.exit(12)
# def interrupt_assignment ends here


def _reassign_xml_id(element, new_id, references):
    """Give ``element`` the xml:id ``new_id`` unless that breaks a reference.

    If the element already carries a different xml:id and that id is the
    target of one of ``references``, the program is terminated through
    ``interrupt_assignment`` (exit code 12) so the user can fix the
    reference by hand. Otherwise the new id is assigned.
    """
    existing_id = libeoaconvert.get_xml_id(element)
    # get_xml_id returns None for elements without xml:id, so test the raw
    # value before building the "#..." form that reference targets are
    # compared against. An id that stays the same cannot break a reference.
    if existing_id is not None and existing_id != new_id:
        target = f"#{existing_id}"
        if check_references(target, references):
            interrupt_assignment(target)
    libeoaconvert.assign_xml_id(element, new_id)
# def _reassign_xml_id ends here


def assign_ids(chapter_tree, references, elements):
    """Assign xml:ids to various elements

    Args:
        chapter_tree: The chapter element whose descendants get new ids.
            It must carry an xml:id itself.
        references: The ref elements of the document, as returned by
            ``create_ref_hash``; used to detect ids that are referenced.
        elements: Collection of element kinds to process. Recognised
            values are "footnotes", "sections", "figures" and "tables".

    Exits with code 9 if the chapter has no xml:id and with code 12 (via
    ``interrupt_assignment``) if a referenced element would get a new id.
    """

    try:
        chapter_id = chapter_tree.attrib["{http://www.w3.org/XML/1998/namespace}id"]
    except KeyError:
        logging.error("Chapter has no id. Exiting")
        sys.exit(9)
    logging.debug("The id of this chapter is %s", chapter_id)

    if "footnotes" in elements:
        footnotes = chapter_tree.xpath("//t:note[@place='bottom']", namespaces=NS_MAP)
        for number, note in enumerate(footnotes, start=1):
            _reassign_xml_id(note, "{}_ftn{:03d}".format(chapter_id, number), references)
            # Number every footnote, whether or not it had an id before.
            note.set("n", str(number))

    if "sections" in elements:
        sections = chapter_tree.xpath("//t:div[@type='section']", namespaces=NS_MAP)
        for number, section in enumerate(sections, start=1):
            _reassign_xml_id(section, "{}_sec{:02d}".format(chapter_id, number), references)

        # Subsections are handled after the sections because their ids are
        # derived from the id of the enclosing section.
        subsections = chapter_tree.xpath("//t:div[@type='subsection']", namespaces=NS_MAP)
        for number, subsection in enumerate(subsections, start=1):
            section_id = libeoaconvert.get_xml_id(subsection.getparent())
            logging.debug("Found a subsection in section %s", section_id)
            # NOTE(review): assumes the parent is a section div that got an
            # id of the form <chapter>_secNN in the loop above - confirm.
            _, section_number = section_id.split("_sec")
            new_subsection_id = "{}_subsec{}-{:02d}".format(chapter_id, section_number,
                                                            number)
            _reassign_xml_id(subsection, new_subsection_id, references)

    if "figures" in elements:
        figures = chapter_tree.xpath("//t:figure", namespaces=NS_MAP)
        for number, figure in enumerate(figures, start=1):
            _reassign_xml_id(figure, "{}_fig{:02d}".format(chapter_id, number), references)

    if "tables" in elements:
        tables = chapter_tree.xpath("//t:table", namespaces=NS_MAP)
        for number, table in enumerate(tables, start=1):
            _reassign_xml_id(table, "{}_tab{:02d}".format(chapter_id, number), references)
# def assign_ids ends here

def main():
Expand All @@ -88,22 +171,26 @@ def main():

parser = argparse.ArgumentParser()
parser.add_argument("-f", "--teifile", help="TEI file")
parser.add_argument("-c", "--chapter", nargs='*', help="Modify only chapters with following ids.")
parser.add_argument("-e", "--element", nargs='*', help=f"List of elements to assign ids to, separated by spaces. Available elements: {', '.join(list_of_elements)}")
parser.add_argument("-c", "--chapter", nargs='*', help="""Modify only chapters with
following ids.""")
parser.add_argument("-e", "--element", nargs='*', help=f"""List of elements to assign
ids to, separated by spaces. Available elements: {', '.join(list_of_elements)}""")
args = parser.parse_args()

xml_tree = etree.parse(args.teifile)

references_in_xml = create_ref_hash(xml_tree)

elements = args.element
if elements:
if not elements:
logging.debug(f"Assigning ids to all elements: {', '.join(list_of_elements)}")
else:
for element in elements:
if element not in list_of_elements:
logging.error(f"{element} is not a valid element. Exiting.")
sys.exit(1)
logging.error("%s is not a valid element. Exiting.", element)
sys.exit(11)
list_of_elements = elements
logging.debug(f"Assigning ids selected elements: {', '.join(list_of_elements)}")
else:
logging.debug(f"Assigning ids to all elements: {', '.join(list_of_elements)}")

selected_chapters = args.chapter

Expand All @@ -115,19 +202,20 @@ def main():
chapter = xml_tree.xpath(f"//t:div[@xml:id='{xml_chapter}']", namespaces=NS_MAP)[0]
except IndexError:
logging.error("Found no chapter with id '%s'. Exiting.", xml_chapter)
sys.exit(2)
sys.exit(10)
copied_chapter = deepcopy(chapter)
assign_ids(copied_chapter, elements=list_of_elements)
assign_ids(copied_chapter, references_in_xml, elements=list_of_elements)
chapter.addprevious(copied_chapter)
chapter.tag = "elementtobestripped"
else:
chapters = xml_tree.xpath("//t:div[@type='chapter' or @type='indexchapter' or @type='chapteroriginal' or @type='chaptertranslation']", namespaces=NS_MAP)
chapters = xml_tree.xpath("//t:div[@type='chapter' or @type='indexchapter' or \
@type='chapteroriginal' or @type='chaptertranslation']", namespaces=NS_MAP)
logging.debug("Found %s chapters.", len(chapters))
# in this iteration, a copy is made of each chapter and fitted
# with ids, the original chapter is being discarded
for chapter in chapters:
copied_chapter = deepcopy(chapter)
assign_ids(copied_chapter, elements=list_of_elements)
assign_ids(copied_chapter, references_in_xml, elements=list_of_elements)
chapter.addprevious(copied_chapter)
chapter.tag = "elementtobestripped"

Expand All @@ -143,7 +231,7 @@ def main():
logging.info(f"Made backup copy of {args.teifile} to {old_file_name}.")
output = args.teifile
xml_tree.write(output, pretty_print=True, xml_declaration=True, encoding="utf-8")
logging.info("Wrote %s." % output)
logging.info("Wrote %s.", output)
# def main ends here

if __name__ == '__main__':
Expand Down
14 changes: 14 additions & 0 deletions src/utils/libeoaconvert.py
Expand Up @@ -414,6 +414,20 @@ def assign_xml_id(element, identifier):
return
# def assign_xml_id ends here


def get_xml_id(element):
    """Return the xml:id of ``element``.

    When the element has no xml:id attribute a warning naming the
    element's tag is logged and None is returned.
    """

    xml_id_key = "{http://www.w3.org/XML/1998/namespace}id"

    if xml_id_key in element.attrib:
        return element.attrib[xml_id_key]

    logging.warning("Element %s has no xml:id attribute.", element.tag)
    return None
# def get_xml_id ends here


def write_appinfo(ident, version, xmlid, text, date):
"""Log the change of a TEI document in the appinfo element"""

Expand Down

0 comments on commit f576301

Please sign in to comment.