Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
EOASkripts/idassigner.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
120 lines (97 sloc)
4.23 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# -*- coding: utf-8; mode: python -*- | |
""" | |
Assign xml:ids to various elements of a TEI file
""" | |
__version__ = "1.0" | |
__date__ = "20181127" | |
__author__ = "kthoden@mpiwg-berlin.mpg.de" | |
import utils.libeoaconvert as libeoaconvert | |
import argparse | |
import logging | |
import sys | |
from datetime import datetime | |
from lxml import etree | |
from copy import deepcopy | |
# Root logger: report everything from DEBUG upwards, with timestamp and level.
logging.basicConfig(
    format=' %(asctime)s - %(levelname)s - %(message)s',
    level=logging.DEBUG,
)

# TEI namespace and the prefix map handed to the XPath calls in this file.
ns_tei = "http://www.tei-c.org/ns/1.0"
NS_MAP = dict(t=ns_tei, xml="http://www.w3.org/XML/1998/namespace")
def assign_ids(chapter_tree):
    """Assign xml:ids to the numbered elements of one chapter.

    Footnotes (``note`` elements with ``place='bottom'``), sections,
    subsections, figures and tables below ``chapter_tree`` get ids that
    are derived from the chapter's own xml:id, for example
    ``<chapter>_ftn001``, ``<chapter>_sec01``, ``<chapter>_subsec01-01``,
    ``<chapter>_fig01`` and ``<chapter>_tab01``. Footnotes additionally
    get their running number written to the ``n`` attribute.

    Sections and subsections carrying ``n="nonumber"`` are skipped and do
    not advance the counters. The subsection counter runs through the
    whole chapter, it is not reset for every section. A subsection whose
    parent element has no section id is skipped with a warning.

    The ids are set by ``libeoaconvert.assign_xml_id``; the tree is
    modified in place and nothing is returned.

    :param chapter_tree: lxml element of the chapter, must carry an xml:id
    :raises SystemExit: with exit status 1 if the chapter has no xml:id
    """
    xml_id_attribute = "{http://www.w3.org/XML/1998/namespace}id"

    chapter_id = chapter_tree.get(xml_id_attribute)
    if chapter_id is None:
        logging.error("Chapter has no id. Exiting")
        # A missing id is a failure, so do not leave with exit status 0.
        sys.exit(1)
    logging.debug("The id of this chapter is %s", chapter_id)

    # All expressions are relative (".//") so that only elements below
    # this chapter are matched, even when the chapter is still part of a
    # larger document. An absolute "//" would search the whole tree.
    footnotes = chapter_tree.xpath(".//t:note[@place='bottom']", namespaces=NS_MAP)
    for footnote_number, note in enumerate(footnotes, start=1):
        note_id = "{}_ftn{:03d}".format(chapter_id, footnote_number)
        note.set("n", str(footnote_number))
        libeoaconvert.assign_xml_id(note, note_id)

    sections = chapter_tree.xpath(".//t:div[@type='section']", namespaces=NS_MAP)
    section_number = 0
    for section in sections:
        if section.get("n") == "nonumber":
            logging.info("Leaving out unnumbered section.")
            continue
        section_number += 1
        section_id = "{}_sec{:02d}".format(chapter_id, section_number)
        libeoaconvert.assign_xml_id(section, section_id)

    subsections = chapter_tree.xpath(".//t:div[@type='subsection']", namespaces=NS_MAP)
    subsection_number = 0
    for subsection in subsections:
        if subsection.get("n") == "nonumber":
            logging.info("Leaving out unnumbered subsection.")
            continue
        # The subsection id embeds the number of the enclosing section,
        # which is read back from the id that was given to the parent
        # above (presumably stored as xml:id by assign_xml_id).
        parent_id = subsection.getparent().get(xml_id_attribute)
        if parent_id is None or "_sec" not in parent_id:
            logging.warning("Subsection is not inside a section with an id. Leaving it without id.")
            continue
        logging.debug("Found a subsection in section %s", parent_id)
        # Use the text after the LAST "_sec": a chapter id that itself
        # contains "_sec" must not confuse the parsing.
        parent_number = parent_id.rpartition("_sec")[2]
        subsection_number += 1
        subsection_id = "{}_subsec{}-{:02d}".format(chapter_id, parent_number, subsection_number)
        libeoaconvert.assign_xml_id(subsection, subsection_id)

    figures = chapter_tree.xpath(".//t:figure", namespaces=NS_MAP)
    for figure_number, figure in enumerate(figures, start=1):
        figure_id = "{}_fig{:02d}".format(chapter_id, figure_number)
        libeoaconvert.assign_xml_id(figure, figure_id)

    tables = chapter_tree.xpath(".//t:table", namespaces=NS_MAP)
    for table_number, table in enumerate(tables, start=1):
        table_id = "{}_tab{:02d}".format(chapter_id, table_number)
        libeoaconvert.assign_xml_id(table, table_id)
# def assign_ids ends here
def main():
    """Read a TEI file, assign xml:ids chapter by chapter, write the result.

    The path of the TEI file is the only command line argument. Every
    ``div`` of type ``chapter`` that is not marked ``n="nonumber"`` is
    handed to ``assign_ids``. An entry describing this run is inserted as
    first child of ``encodingDesc/appInfo`` and the tree is written next
    to the input file, with ``-withids`` inserted before the file
    extension (``book.xml`` becomes ``book-withids.xml``).

    :raises SystemExit: with exit status 1 if the file has no
        ``encodingDesc/appInfo`` element
    """
    # Local import: only needed here for building the output file name.
    import os

    parser = argparse.ArgumentParser()
    parser.add_argument("teifile", help="TEI file")
    args = parser.parse_args()

    xml_tree = etree.parse(args.teifile)
    chapters = xml_tree.xpath("//t:div[@type='chapter' and not(@n='nonumber')]", namespaces=NS_MAP)
    logging.debug("Found %s chapters.", len(chapters))

    # in this iteration, a copy is made of each chapter and fitted
    # with ids, the original chapter is being discarded
    for chapter in chapters:
        copied_chapter = deepcopy(chapter)
        assign_ids(copied_chapter)
        chapter.addprevious(copied_chapter)
        # mark the original so that it can be removed below
        chapter.tag = "elementtobestripped"
    etree.strip_elements(xml_tree, "elementtobestripped")

    appinfo_elements = xml_tree.xpath("//t:encodingDesc/t:appInfo", namespaces=NS_MAP)
    if not appinfo_elements:
        # Report the problem instead of failing with a bare IndexError.
        logging.error("Found no appInfo element in encodingDesc. Exiting")
        sys.exit(1)
    appinfo = libeoaconvert.get_appinfo(
        "id_assigner",
        __version__,
        "idassign",
        "Assign XML IDs to elements",
        datetime.now().strftime("%Y-%m-%d"),
    )
    appinfo_elements[0].insert(0, appinfo)

    # Split off the real extension instead of replacing every ".xml" in
    # the path. This also guarantees that the output name differs from
    # the input name, so the source file is never overwritten.
    stem, extension = os.path.splitext(args.teifile)
    output = "{}-withids{}".format(stem, extension)
    xml_tree.write(output, pretty_print=True, xml_declaration=True, encoding="utf-8")
    logging.info("Wrote %s.", output)
# def main ends here
# Start the id assignment only when this file is executed as a script.
if __name__ == "__main__":
    main()
# finis |