Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Browse files
Browse the repository at this point in the history
Script for assigning xml:ids
- Loading branch information
Klaus Thoden
committed
Nov 28, 2018
1 parent
2c9b5d1
commit 467690e
Showing
1 changed file
with
111 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,111 @@ | ||
#!/usr/bin/env python3 | ||
# -*- coding: utf-8; mode: python -*- | ||
|
||
""" | ||
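Assign xml:ids to the footnotes, sections, subsections, figures and tables of every chapter in a TEI file and write the result to a new file. | ||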
""" | ||
|
||
__version__ = "1.0" | ||
__date__ = "20181127" | ||
__author__ = "kthoden@mpiwg-berlin.mpg.de" | ||
|
||
import argparse | ||
import logging | ||
from datetime import datetime | ||
from lxml import etree | ||
from copy import deepcopy | ||
import libeoaconvert | ||
|
||
# Log everything from DEBUG upwards, each record prefixed with timestamp
# and level name.
logging.basicConfig(
    format=' %(asctime)s - %(levelname)s - %(message)s',
    level=logging.DEBUG,
)

# Prefix-to-URI mapping passed as ``namespaces=`` to the XPath queries in
# this script: "t" is the TEI namespace, "xml" the XML namespace.
ns_tei = "http://www.tei-c.org/ns/1.0"
NS_MAP = dict(t=ns_tei, xml="http://www.w3.org/XML/1998/namespace")
|
||
def assign_ids(chapter_tree):
    """Assign xml:ids to the numbered elements of one chapter.

    Footnotes (``note[@place='bottom']``), sections, subsections, figures
    and tables found below ``chapter_tree`` each get an id built from the
    chapter's own ``xml:id`` plus a running number:

    * footnotes:   ``<chapter>_ftnNNN`` (the number is also written to ``n``)
    * sections:    ``<chapter>_secNN``
    * subsections: ``<chapter>_subsec<section number>-NN``
    * figures:     ``<chapter>_figNN``
    * tables:      ``<chapter>_tabNN``

    Sections and subsections whose ``n`` attribute is ``"nonumber"`` are
    skipped and do not consume a number. The subsection counter runs
    through the whole chapter; it is not reset for each section.

    The ids are handed to ``libeoaconvert.assign_xml_id``, which is
    presumably what writes the ``xml:id`` attribute (the subsection step
    reads the section ids back from the tree) -- confirm in libeoaconvert.

    Args:
        chapter_tree: lxml element of the chapter. It must carry an
            ``xml:id`` attribute and is modified in place.

    Raises:
        KeyError: if ``chapter_tree`` has no ``xml:id`` attribute.
    """
    xml_id = "{http://www.w3.org/XML/1998/namespace}id"

    chapter_id = chapter_tree.attrib[xml_id]
    logging.debug("The id of this chapter is %s", chapter_id)

    # All queries are relative (".//"): an absolute "//" starts at the
    # document root and would reach into other chapters whenever the
    # element passed in is still attached to a larger document.
    footnotes = chapter_tree.xpath(".//t:note[@place='bottom']", namespaces=NS_MAP)
    for number, note in enumerate(footnotes, start=1):
        note.set("n", str(number))
        libeoaconvert.assign_xml_id(note, "{}_ftn{:03d}".format(chapter_id, number))

    section_number = 0
    for section in chapter_tree.xpath(".//t:div[@type='section']", namespaces=NS_MAP):
        if section.get("n") == "nonumber":
            logging.info("Leaving out unnumbered section.")
            continue
        section_number += 1
        libeoaconvert.assign_xml_id(section, "{}_sec{:02d}".format(chapter_id, section_number))

    subsection_number = 0
    for subsection in chapter_tree.xpath(".//t:div[@type='subsection']", namespaces=NS_MAP):
        if subsection.get("n") == "nonumber":
            logging.info("Leaving out unnumbered subsection.")
            continue

        # The subsection id embeds the number of the enclosing section,
        # recovered from the id that section received in the loop above.
        parent = subsection.getparent()
        section_id = parent.get(xml_id) if parent is not None else None
        if section_id is None or "_sec" not in section_id:
            # E.g. the parent is an unnumbered section or not a section at
            # all: there is no section number to build the id from.
            logging.warning(
                "Subsection is not inside a section with an id (parent id: %s); "
                "leaving it without xml:id.", section_id)
            continue
        logging.debug("Found a subsection in section %s", section_id)

        # Split at the LAST "_sec" so that a chapter id which itself
        # contains "_sec" cannot break the parsing.
        parent_number = section_id.rpartition("_sec")[2]
        subsection_number += 1
        subsection_id = "{}_subsec{}-{:02d}".format(chapter_id, parent_number, subsection_number)
        libeoaconvert.assign_xml_id(subsection, subsection_id)

    figures = chapter_tree.xpath(".//t:figure", namespaces=NS_MAP)
    for number, figure in enumerate(figures, start=1):
        libeoaconvert.assign_xml_id(figure, "{}_fig{:02d}".format(chapter_id, number))

    tables = chapter_tree.xpath(".//t:table", namespaces=NS_MAP)
    for number, table in enumerate(tables, start=1):
        libeoaconvert.assign_xml_id(table, "{}_tab{:02d}".format(chapter_id, number))
# def assign_ids ends here
|
||
def main():
    """Read a TEI file, assign xml:ids per chapter and write a new file.

    The single command line argument is the path of the TEI file. Every
    ``div[@type='chapter']`` in it is replaced by a copy that has been run
    through ``assign_ids``. The result is written next to the input with
    ``-withids`` inserted before the file extension (``book.xml`` becomes
    ``book-withids.xml``); the input file itself is never overwritten.
    """
    # Local import: only needed here to derive the output file name.
    import os.path

    parser = argparse.ArgumentParser()
    parser.add_argument("teifile", help="TEI file")
    args = parser.parse_args()

    xml_tree = etree.parse(args.teifile)

    chapters = xml_tree.xpath("//t:div[@type='chapter']", namespaces=NS_MAP)
    logging.debug("Found %s chapters.", len(chapters))

    # Each chapter is deep-copied, the detached copy is fitted with ids and
    # inserted in front of the original, and the original is renamed so
    # that it can be removed in one go afterwards.
    for chapter in chapters:
        copied_chapter = deepcopy(chapter)
        assign_ids(copied_chapter)
        chapter.addprevious(copied_chapter)
        chapter.tag = "elementtobestripped"

    etree.strip_elements(xml_tree, "elementtobestripped")

    # NOTE(review): presumably records this processing step in the file's
    # metadata -- confirm in libeoaconvert.write_appinfo.
    libeoaconvert.write_appinfo(
        xml_tree, "id_assigner", __version__, "idassign",
        "Assign XML IDs to elements", datetime.now().strftime("%Y-%m-%d"))

    # Insert the suffix before the extension only. A plain
    # str.replace(".xml", ...) would also rewrite ".xml" inside directory
    # names and, for an input without ".xml", return the input path
    # unchanged so that the source file got overwritten.
    root, extension = os.path.splitext(args.teifile)
    output = "{}-withids{}".format(root, extension)

    xml_tree.write(output, pretty_print=True, xml_declaration=True, encoding="utf-8")
    logging.info("Wrote %s.", output)
# def main ends here
|
||
# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
# finis