Skip to content
Permalink
a79740a274
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
111 lines (90 sloc) 3.94 KB
#!/usr/bin/env python3
# -*- coding: utf-8; mode: python -*-
"""
Assign xml:ids to various elements of a TEI file
"""
__version__ = "1.0"
__date__ = "20181127"
__author__ = "kthoden@mpiwg-berlin.mpg.de"
import argparse
import logging
from datetime import datetime
from lxml import etree
from copy import deepcopy
import libeoaconvert
# Log everything from DEBUG upwards, prefixed with timestamp and level name.
logging.basicConfig(
    format=' %(asctime)s - %(levelname)s - %(message)s',
    level=logging.DEBUG,
)

# Namespace URI of TEI and the prefix map handed to every XPath query
# ("t" for TEI elements, "xml" for attributes such as xml:id).
ns_tei = "http://www.tei-c.org/ns/1.0"
NS_MAP = dict(t=ns_tei, xml="http://www.w3.org/XML/1998/namespace")
def assign_ids(chapter_tree):
    """Assign xml:ids to footnotes, sections, subsections, figures and tables.

    All ids are derived from the xml:id of the chapter element:

    * bottom notes:  ``<chapter>_ftnNNN`` (the running number is also
      stored in the ``n`` attribute of the note)
    * sections:      ``<chapter>_secNN``
    * subsections:   ``<chapter>_subsec<section suffix>-NN``
    * figures:       ``<chapter>_figNN``
    * tables:        ``<chapter>_tabNN``

    Sections and subsections whose ``n`` attribute is ``"nonumber"`` are
    skipped and do not consume a number.

    The element is modified in place through ``libeoaconvert.assign_xml_id``
    (presumably that helper sets the xml:id attribute -- confirm in
    libeoaconvert).

    :param chapter_tree: the chapter element; it must carry an xml:id.
    :raises KeyError: if the chapter element has no xml:id attribute.
    """
    xml_id_attr = "{http://www.w3.org/XML/1998/namespace}id"
    chapter_id = chapter_tree.attrib[xml_id_attr]
    logging.debug("The id of this chapter is %s", chapter_id)

    def find(path):
        """Run a namespace-aware XPath query relative to the chapter."""
        return chapter_tree.xpath(path, namespaces=NS_MAP)

    # The queries start with ".//" on purpose: in lxml a leading "//" is
    # evaluated from the root of the document the element lives in, which
    # would also match elements of sibling chapters whenever the chapter is
    # not the root of its own tree.
    for number, note in enumerate(find(".//t:note[@place='bottom']"), start=1):
        note.set("n", str(number))
        libeoaconvert.assign_xml_id(note, "{}_ftn{:03d}".format(chapter_id, number))

    section_counter = 0
    for section in find(".//t:div[@type='section']"):
        if section.get("n") == "nonumber":
            logging.info("Leaving out unnumbered section.")
            continue
        section_counter += 1
        libeoaconvert.assign_xml_id(section, "{}_sec{:02d}".format(chapter_id, section_counter))

    # NOTE(review): the subsection counter runs through the whole chapter and
    # is not reset for each section -- kept as in the original; confirm that
    # this is the intended numbering.
    subsection_counter = 0
    for subsection in find(".//t:div[@type='subsection']"):
        if subsection.get("n") == "nonumber":
            logging.info("Leaving out unnumbered subsection.")
            continue

        # The subsection id embeds the number of the enclosing section, which
        # is recovered from the id of the parent element.
        parent = subsection.getparent()
        section_id = parent.get(xml_id_attr) if parent is not None else None
        if section_id is None:
            # E.g. a numbered subsection inside an unnumbered section.
            logging.warning("Subsection has no parent with an xml:id, no id assigned.")
            continue
        logging.debug("Found a subsection in section %s", section_id)

        # rpartition splits at the last "_sec", so a chapter id that itself
        # contains "_sec" does not break the unpacking.
        _, marker, section_number = section_id.rpartition("_sec")
        if not marker:
            logging.warning("Parent id %s is not a section id, no id assigned to subsection.", section_id)
            continue

        subsection_counter += 1
        subsection_id = "{}_subsec{}-{:02d}".format(chapter_id, section_number, subsection_counter)
        libeoaconvert.assign_xml_id(subsection, subsection_id)

    # Figures and tables are numbered the same way, each with its own counter.
    for path, template in ((".//t:figure", "{}_fig{:02d}"), (".//t:table", "{}_tab{:02d}")):
        for number, element in enumerate(find(path), start=1):
            libeoaconvert.assign_xml_id(element, template.format(chapter_id, number))
# def assign_ids ends here
def main():
    """Read a TEI file, add xml:ids to its chapters and write a new file.

    The path of the TEI file is taken from the command line. The result is
    written next to the input with ``-withids`` inserted before the file
    extension (``book.xml`` becomes ``book-withids.xml``).
    """
    # Local import so that this function does not depend on a module-level
    # import that the file does not have.
    import os.path

    parser = argparse.ArgumentParser()
    parser.add_argument("teifile", help="TEI file")
    args = parser.parse_args()

    xml_tree = etree.parse(args.teifile)
    chapters = xml_tree.xpath("//t:div[@type='chapter']", namespaces=NS_MAP)
    logging.debug("Found %s chapters.", len(chapters))

    # A copy is made of each chapter and fitted with ids; the copy is
    # inserted in front of the original, which is renamed to a marker tag
    # so that it can be discarded afterwards.
    for chapter in chapters:
        copied_chapter = deepcopy(chapter)
        assign_ids(copied_chapter)
        chapter.addprevious(copied_chapter)
        chapter.tag = "elementtobestripped"

    # Remove all marked originals in one pass.
    etree.strip_elements(xml_tree, "elementtobestripped")

    libeoaconvert.write_appinfo(
        xml_tree,
        "id_assigner",
        __version__,
        "idassign",
        "Assign XML IDs to elements",
        datetime.now().strftime("%Y-%m-%d"),
    )

    # Only the extension of the file name is touched. A plain
    # str.replace(".xml", ...) would also rewrite ".xml" inside directory
    # names and, for a file not ending in ".xml", return the input path
    # unchanged so that the source file would be overwritten.
    root, extension = os.path.splitext(args.teifile)
    output = "{}-withids{}".format(root, extension)

    xml_tree.write(output, pretty_print=True, xml_declaration=True, encoding="utf-8")
    logging.info("Wrote %s.", output)
# def main ends here
# Run the conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
# finis