Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
Made elements and chapters configurable
  • Loading branch information
kthoden committed Jul 24, 2019
1 parent bfa5311 commit d59a04f
Showing 1 changed file with 95 additions and 61 deletions.
156 changes: 95 additions & 61 deletions idassigner.py
Expand Up @@ -13,6 +13,7 @@
import argparse
import logging
import sys
import shutil
from datetime import datetime
from lxml import etree
from copy import deepcopy
Expand All @@ -22,7 +23,7 @@
ns_tei = "http://www.tei-c.org/ns/1.0"
NS_MAP = {"t": ns_tei, "xml": "http://www.w3.org/XML/1998/namespace"}

def assign_ids(chapter_tree):
def assign_ids(chapter_tree, elements):
"""Assign xml:ids to various elements"""

try:
Expand All @@ -32,85 +33,118 @@ def assign_ids(chapter_tree):
sys.exit()
logging.debug("The id of this chapter is %s", chapter_id)

footnotes = chapter_tree.xpath("//t:note[@place='bottom']", namespaces=NS_MAP)
footnote_id_counter = 1
for note in footnotes:
note_id = "{}_ftn{:03d}".format(chapter_id, footnote_id_counter)
note.set("n", str(footnote_id_counter))
libeoaconvert.assign_xml_id(note, note_id)
footnote_id_counter += 1

sections = chapter_tree.xpath("//t:div[@type='section']", namespaces=NS_MAP)
section_id_counter = 1
for section in sections:
if section.get("n") == "nonumber":
logging.info("Leaving out unnumbered section.")
pass
else:
section_id = "{}_sec{:02d}".format(chapter_id, section_id_counter)
libeoaconvert.assign_xml_id(section, section_id)
section_id_counter += 1

subsections = chapter_tree.xpath("//t:div[@type='subsection']", namespaces=NS_MAP)
subsection_id_counter = 1
for subsection in subsections:
if subsection.get("n") == "nonumber":
logging.info("Leaving out unnumbered subsection.")
pass
else:
section_element = subsection.getparent()
section_id = section_element.attrib["{http://www.w3.org/XML/1998/namespace}id"]
logging.debug("Found a subsection in section %s", section_id)
rest, section_number = section_id.split("_sec")
subsection_id = "{}_subsec{}-{:02d}".format(chapter_id, section_number, subsection_id_counter)
libeoaconvert.assign_xml_id(subsection, subsection_id)
subsection_id_counter += 1

figures = chapter_tree.xpath("//t:figure", namespaces=NS_MAP)
figure_id_counter = 1
for figure in figures:
figure_id = "{}_fig{:02d}".format(chapter_id, figure_id_counter)
libeoaconvert.assign_xml_id(figure, figure_id)
figure_id_counter += 1

tables = chapter_tree.xpath("//t:table", namespaces=NS_MAP)
table_id_counter = 1
for table in tables:
table_id = "{}_tab{:02d}".format(chapter_id, table_id_counter)
libeoaconvert.assign_xml_id(table, table_id)
table_id_counter += 1
if "footnotes" in elements:
footnotes = chapter_tree.xpath("//t:note[@place='bottom']", namespaces=NS_MAP)
footnote_id_counter = 1
for note in footnotes:
note_id = "{}_ftn{:03d}".format(chapter_id, footnote_id_counter)
note.set("n", str(footnote_id_counter))
libeoaconvert.assign_xml_id(note, note_id)
footnote_id_counter += 1

if "sections" in elements:
sections = chapter_tree.xpath("//t:div[@type='section']", namespaces=NS_MAP)
section_id_counter = 1
for section in sections:
if section.get("n") == "nonumber":
logging.info("Leaving out unnumbered section.")
pass
else:
section_id = "{}_sec{:02d}".format(chapter_id, section_id_counter)
libeoaconvert.assign_xml_id(section, section_id)
section_id_counter += 1

if "sections" in elements:
subsections = chapter_tree.xpath("//t:div[@type='subsection']", namespaces=NS_MAP)
subsection_id_counter = 1
for subsection in subsections:
if subsection.get("n") == "nonumber":
logging.info("Leaving out unnumbered subsection.")
pass
else:
section_element = subsection.getparent()
section_id = section_element.attrib["{http://www.w3.org/XML/1998/namespace}id"]
logging.debug("Found a subsection in section %s", section_id)
rest, section_number = section_id.split("_sec")
subsection_id = "{}_subsec{}-{:02d}".format(chapter_id, section_number, subsection_id_counter)
libeoaconvert.assign_xml_id(subsection, subsection_id)
subsection_id_counter += 1

if "figures" in elements:
figures = chapter_tree.xpath("//t:figure", namespaces=NS_MAP)
figure_id_counter = 1
for figure in figures:
figure_id = "{}_fig{:02d}".format(chapter_id, figure_id_counter)
libeoaconvert.assign_xml_id(figure, figure_id)
figure_id_counter += 1

if "tables" in elements:
tables = chapter_tree.xpath("//t:table", namespaces=NS_MAP)
table_id_counter = 1
for table in tables:
table_id = "{}_tab{:02d}".format(chapter_id, table_id_counter)
libeoaconvert.assign_xml_id(table, table_id)
table_id_counter += 1

return
# def assign_ids ends here

def main():
    """Assign xml:ids to elements of a TEI file and rewrite it in place.

    Command line options:
      -f/--teifile  path of the TEI file to process (required).
      -c/--chapter  xml:ids of the chapters to process; all numbered
                    chapters are processed when omitted or empty.
      -e/--element  kinds of elements to assign ids to; all available
                    kinds are used when omitted or empty.

    Each selected chapter is deep-copied, the copy is passed to
    assign_ids() and inserted before the original, and the original is
    stripped from the tree.  A backup copy of the input file is made
    before the input file is overwritten.

    Exits with status 1 when an unknown element kind or chapter id is
    given, or when the document has no encodingDesc/appInfo element.
    """

    available_elements = ["footnotes", "sections", "subsections", "figures", "tables"]

    parser = argparse.ArgumentParser()
    # The file is read unconditionally below, so the option is mandatory;
    # argparse then reports a missing file name instead of etree failing
    # on None.
    parser.add_argument("-f", "--teifile", required=True, help="TEI file")
    parser.add_argument("-c", "--chapter", nargs='*', help="Modify only chapters with following ids.")
    parser.add_argument("-e", "--element", nargs='*', help=f"List of elements to assign ids to, separated by spaces. Available elements: {', '.join(available_elements)}")
    args = parser.parse_args()

    # Validate the requested element kinds before touching the file.
    if args.element:
        for element in args.element:
            if element not in available_elements:
                logging.error("%s is not a valid element. Exiting.", element)
                sys.exit(1)
        selected_elements = args.element
        logging.debug("Assigning ids to selected elements: %s", ", ".join(selected_elements))
    else:
        selected_elements = available_elements
        logging.debug("Assigning ids to all elements: %s", ", ".join(selected_elements))

    xml_tree = etree.parse(args.teifile)

    if args.chapter:
        logging.debug("Restricting to chapters: %s", ", ".join(args.chapter))
        chapters = []
        # dict.fromkeys drops repeated ids while keeping the given order,
        # so that no chapter is copied twice.
        for chapter_id in dict.fromkeys(args.chapter):
            # The id is passed as an XPath variable rather than pasted
            # into the expression, so quotes in it cannot break the query.
            matches = xml_tree.xpath(
                "//t:div[@xml:id=$chapter_id and not(@n='nonumber')]",
                namespaces=NS_MAP,
                chapter_id=chapter_id,
            )
            if not matches:
                logging.error("No numbered div with xml:id %s found. Exiting.", chapter_id)
                sys.exit(1)
            chapters.append(matches[0])
    else:
        chapters = xml_tree.xpath("//t:div[@type='chapter' and not(@n='nonumber')]", namespaces=NS_MAP)
    logging.debug("Found %s chapters.", len(chapters))

    # in this iteration, a copy is made of each chapter and fitted
    # with ids, the original chapter is being discarded
    for chapter in chapters:
        copied_chapter = deepcopy(chapter)
        assign_ids(copied_chapter, elements=selected_elements)
        chapter.addprevious(copied_chapter)
        chapter.tag = "elementtobestripped"

    etree.strip_elements(xml_tree, "elementtobestripped")

    appinfo_parents = xml_tree.xpath("//t:encodingDesc/t:appInfo", namespaces=NS_MAP)
    if not appinfo_parents:
        logging.error("No encodingDesc/appInfo element found in %s. Exiting.", args.teifile)
        sys.exit(1)
    # NOTE(review): get_appinfo presumably returns an element describing
    # this tool run; it is inserted as first child of appInfo.
    appinfo = libeoaconvert.get_appinfo("id_assigner", __version__, "idassign", "Assign XML IDs to elements", datetime.now().strftime("%Y-%m-%d"))
    appinfo_parents[0].insert(0, appinfo)

    # Derive the backup name from the suffix only; a file name without
    # ".xml" would otherwise yield a backup path identical to the source.
    if args.teifile.endswith(".xml"):
        old_file_name = args.teifile[:-len(".xml")] + "-old-ids.xml"
    else:
        old_file_name = args.teifile + "-old-ids"
    shutil.copy(args.teifile, old_file_name)
    logging.info("Made backup copy of %s to %s.", args.teifile, old_file_name)

    output = args.teifile
    xml_tree.write(output, pretty_print=True, xml_declaration=True, encoding="utf-8")
    logging.info("Wrote %s.", output)
# def main ends here
Expand Down

0 comments on commit d59a04f

Please sign in to comment.