Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
Script for assigning xml:ids
  • Loading branch information
Klaus Thoden committed Nov 28, 2018
1 parent 2c9b5d1 commit 467690e
Showing 1 changed file with 111 additions and 0 deletions.
111 changes: 111 additions & 0 deletions idassigner.py
@@ -0,0 +1,111 @@
#!/usr/bin/env python3
# -*- coding: utf-8; mode: python -*-

"""
Assign xml:ids to elements of a TEI file.

For every chapter (``div`` of type ``chapter``) the footnotes, sections,
subsections, figures and tables receive an ``xml:id`` derived from the
chapter's own ``xml:id``.  The result is written next to the input file
with ``-withids`` added to the file name.

Usage: idassigner.py TEIFILE
"""

__version__ = "1.0"
__date__ = "20181127"
__author__ = "kthoden@mpiwg-berlin.mpg.de"

import argparse
import logging
from datetime import datetime
from lxml import etree
from copy import deepcopy
import libeoaconvert

# Root logger: report everything from DEBUG upwards, with a timestamp.
logging.basicConfig(
    format=' %(asctime)s - %(levelname)s - %(message)s',
    level=logging.DEBUG,
)

# TEI namespace URI, and the prefix map passed to the XPath queries
# in this module ("t:" for TEI elements, "xml:" for xml:id and friends).
ns_tei = "http://www.tei-c.org/ns/1.0"
NS_MAP = dict(t=ns_tei, xml="http://www.w3.org/XML/1998/namespace")

def assign_ids(chapter_tree):
    """Assign xml:ids to footnotes, sections, subsections, figures and tables.

    Every generated id is prefixed with the chapter's own ``xml:id``:

    * bottom notes:  ``<chapter>_ftnNNN`` (the note's ``n`` attribute is
      also set to the running number)
    * sections:      ``<chapter>_secNN``
    * subsections:   ``<chapter>_subsec<section number>-NN``
    * figures:       ``<chapter>_figNN``
    * tables:        ``<chapter>_tabNN``

    Sections and subsections whose ``n`` attribute is ``nonumber`` are
    skipped and do not consume a number.  The ids themselves are attached
    through ``libeoaconvert.assign_xml_id`` (defined outside this file).

    Args:
        chapter_tree: lxml element of the chapter.  Only this element's
            descendants are touched; they are modified in place.

    Returns:
        None.

    Raises:
        KeyError: if the chapter, or the parent element of a numbered
            subsection, carries no ``xml:id`` attribute.
        ValueError: if the ``xml:id`` of a numbered subsection's parent
            does not contain ``_sec``.
    """
    xml_id_attr = "{http://www.w3.org/XML/1998/namespace}id"

    chapter_id = chapter_tree.attrib[xml_id_attr]
    logging.debug("The id of this chapter is %s", chapter_id)

    # All queries are relative (".//"): an absolute "//" path is evaluated
    # against the root of the document the element lives in and would pick
    # up elements of other chapters when the chapter is not detached.
    footnotes = chapter_tree.xpath(".//t:note[@place='bottom']", namespaces=NS_MAP)
    for number, note in enumerate(footnotes, start=1):
        note.set("n", str(number))
        libeoaconvert.assign_xml_id(note, "{}_ftn{:03d}".format(chapter_id, number))

    sections = chapter_tree.xpath(".//t:div[@type='section']", namespaces=NS_MAP)
    section_count = 0
    for section in sections:
        if section.get("n") == "nonumber":
            logging.info("Leaving out unnumbered section.")
            continue
        section_count += 1
        libeoaconvert.assign_xml_id(section, "{}_sec{:02d}".format(chapter_id, section_count))

    subsections = chapter_tree.xpath(".//t:div[@type='subsection']", namespaces=NS_MAP)
    # NOTE(review): this counter runs across the whole chapter and is not
    # reset for each section -- confirm that this is the intended numbering.
    subsection_count = 0
    for subsection in subsections:
        if subsection.get("n") == "nonumber":
            logging.info("Leaving out unnumbered subsection.")
            continue
        # The parent is expected to be a section that received its id in
        # the loop above; an unnumbered parent has none and raises KeyError.
        section_id = subsection.getparent().attrib[xml_id_attr]
        logging.debug("Found a subsection in section %s", section_id)
        # Split on the last "_sec" only, so that a chapter id which itself
        # contains "_sec" does not break the unpacking.
        _, section_number = section_id.rsplit("_sec", 1)
        subsection_count += 1
        subsection_id = "{}_subsec{}-{:02d}".format(chapter_id, section_number, subsection_count)
        libeoaconvert.assign_xml_id(subsection, subsection_id)

    figures = chapter_tree.xpath(".//t:figure", namespaces=NS_MAP)
    for number, figure in enumerate(figures, start=1):
        libeoaconvert.assign_xml_id(figure, "{}_fig{:02d}".format(chapter_id, number))

    tables = chapter_tree.xpath(".//t:table", namespaces=NS_MAP)
    for number, table in enumerate(tables, start=1):
        libeoaconvert.assign_xml_id(table, "{}_tab{:02d}".format(chapter_id, number))
# def assign_ids ends here

def main():
    """Read a TEI file, assign xml:ids in every chapter and write a copy.

    The TEI file is taken from the single positional command line
    argument.  The result is written to a new file: a trailing ``.xml``
    of the input name is replaced by ``-withids.xml``; if the name does
    not end in ``.xml`` the suffix is appended, so the input file is
    never overwritten.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("teifile", help="TEI file")
    args = parser.parse_args()

    xml_tree = etree.parse(args.teifile)

    chapters = xml_tree.xpath("//t:div[@type='chapter']", namespaces=NS_MAP)
    logging.debug("Found %s chapters.", len(chapters))

    # Each chapter is copied, the copy is fitted with ids and inserted
    # in front of the original; the original is renamed so that it can
    # be removed from the tree in one go afterwards.
    for chapter in chapters:
        copied_chapter = deepcopy(chapter)
        assign_ids(copied_chapter)
        chapter.addprevious(copied_chapter)
        chapter.tag = "elementtobestripped"

    etree.strip_elements(xml_tree, "elementtobestripped")
    libeoaconvert.write_appinfo(xml_tree, "id_assigner", __version__, "idassign", "Assign XML IDs to elements", datetime.now().strftime("%Y-%m-%d"))

    # Only touch the file extension: str.replace would rewrite every ".xml"
    # in the path and leave a name without ".xml" unchanged, which would
    # make the output overwrite the input.
    extension = ".xml"
    if args.teifile.endswith(extension):
        output = args.teifile[:-len(extension)] + "-withids.xml"
    else:
        output = args.teifile + "-withids.xml"

    xml_tree.write(output, pretty_print=True, xml_declaration=True, encoding="utf-8")
    logging.info("Wrote %s.", output)
# def main ends here

# Script entry point: do nothing when the module is merely imported.
if "__main__" == __name__:
    main()
# finis

0 comments on commit 467690e

Please sign in to comment.