Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
Adapted fix_tei to current version of RNC schema
  • Loading branch information
Klaus Thoden committed Aug 30, 2018
1 parent 8ecac47 commit 817efbd
Showing 1 changed file with 117 additions and 36 deletions.
153 changes: 117 additions & 36 deletions fix_tei.py
Expand Up @@ -39,21 +39,18 @@
import argparse
import traceback
import libeoaconvert
import configparser

logging.basicConfig(level=logging.INFO, format=' %(asctime)s - %(levelname)s - %(message)s')

# do things like in latex2eoa: search and replace things by regex
# also, delete elements and attributes inserted by metypeset
# and, rename elements according to our schema

# treat
# assignment of identifiers

# Namespace URI used for TEI elements.
ns_tei = "http://www.tei-c.org/ns/1.0"
# Prefix map handed to the find()/xpath() calls in this file: "t:" = TEI.
NS_MAP = {"t" : ns_tei}

# Presumably the directory for temporary files. The string contains no "~",
# so os.path.expanduser() returns it unchanged, i.e. as a relative path.
TMP_DIR = os.path.expanduser("tmp_files")

# Fixed texts for the TEI header; they are looked up in the "Header" section.
# NOTE(review): ConfigParser.read() skips files it cannot open without
# raising, and this path is relative to the current working directory. A
# missing file would therefore only surface later, as an error raised by
# BOILERPLATES.get() -- confirm the script is always started from the
# directory that contains data/.
BOILERPLATES = configparser.ConfigParser()
BOILERPLATES.read("data/tei_boilerplate.cfg")

def get_place_in_xml_tree(element, tree):
"""Find out the position of an element in a tree.
Expand Down Expand Up @@ -82,6 +79,25 @@ def parse_bibtex(bibfile):
# return all_references
# def parse_bibtex ends here

def restore_xml_tags(text):
    """Turn escaped XML entities in a string back into literal characters.

    Handles ``&lt;``, ``&gt;``, ``&apos;`` and ``&amp;``; every other
    entity (for example ``&quot;``) is left untouched.

    Returns the converted string.
    """

    # "&amp;" has to come last: decoding it earlier would turn the doubly
    # escaped "&amp;lt;" into "&lt;" and then, wrongly, into "<".
    entity_table = (
        ("&lt;", "<"),
        ("&gt;", ">"),
        ("&apos;", "'"),
        ("&amp;", "&"),
    )

    for entity, literal in entity_table:
        text = text.replace(entity, literal)

    return text
# def restore_xml_tags ends here

def unescape(text):
"""Remove HTML or XML character references and entities from a text
string. Return a Unicode string.
Expand Down Expand Up @@ -271,10 +287,11 @@ def make_figure_elements(list_of_figures, figure_directory):
# def make_figure_elements ends here

def cleanup_xml(xml_tree):
"""Perform some cleaning on XML"""
"""Perform some cleaning on XML
# also, delete elements and attributes inserted by metypeset
# and, rename elements according to our schema
Also, delete elements and attributes inserted by metypeset and
rename elements according to our schema
"""

metypeset_attrib = xml_tree.findall("//t:*[@meTypesetSize]", namespaces=NS_MAP)
color_attrib = xml_tree.xpath("//t:hi[contains(@rend, 'color') or contains(@rend, 'background')]", namespaces=NS_MAP)
Expand Down Expand Up @@ -343,61 +360,121 @@ def fix_document_structure(xml_tree, highest_level):
subsection.set("type", "subsection")
for subsubsection in subsubsection_divs:
subsubsection.set("type", "subsubsection")

# section_divs = xml_tree.xpath("//t:body/t:div", namespaces=NS_MAP)
# subsection_divs = xml_tree.xpath("//t:body/t:div/t:div", namespaces=NS_MAP)
# subsubsection_divs = xml_tree.xpath("//t:body/t:div/t:div/t:div", namespaces=NS_MAP)

# for section in section_divs:
# section.set("type", "section")
# for subsection in subsection_divs:
# subsection.set("type", "subsection")
# for subsubsection in subsubsection_divs:
# subsubsection.set("type", "subsubsection")
# def fix_document_structure ends here

def fix_tei_header(xml_tree, bibfile_string):
"""Populate TEI header with mandatory data

Works through the title, edition, publication, series, source,
profile, encoding and revision parts of the header in that order.
Fixed texts are taken from the "Header" section of BOILERPLATES.

xml_tree -- element in which the header parts are searched; main()
passes the teiHeader element
bibfile_string -- value written to the target attribute of the
reference inside the bibliography database block

Returns xml_tree, which is modified in place.

NOTE(review): this listing seems to carry both the old and the new
lines of a diff (several elements are created twice in a row with
different values). Confirm which lines are live before relying on the
comments below.
"""

# Title statement: first titleStmt found and its TEI title child.
title_statement = xml_tree.xpath("//t:titleStmt", namespaces=NS_MAP)[0]
title_element = title_statement.find("t:title", namespaces=NS_MAP)
# NOTE(review): as written, the text assigned here makes the "is None"
# check further down always false, and level "s" is later overwritten
# by level "m" -- these lines look like the old side of the diff.
title_element.set("level", "s")
title_element.set("n", "20")
title_element.text = "Titel der Serie"

# Elements created in this function carry no namespace in their tag,
# unlike the t:-prefixed lookups. NOTE(review): check how that is
# serialised.
main_title = etree.Element("title", type="main")
main_title.text = "FotoObjekte"
title_statement.insert(0, main_title)
title_element.set("type", "main")
title_element.set("level", "m")
# Only fill in a default title when the element has no text at all.
if title_element.text is None:
title_element.text = "Insert title of publication here"
else:
pass

# series = etree.SubElement(title_statement, "title", level="s", n="20").text = "Studies"
# subtitle = etree.SubElement(title_statement, "title", level="sub").text = "Artikelsammlung"
# Edition statement: the date child is emptied and renamed so that it can
# be removed later (main() strips all "tagtobestripped" tags); the edition
# text then becomes "First published <current year> by <eoa_name>".
# NOTE(review): find() returns None when there is no t:date child, in
# which case clear() would fail -- verify the input always has one.
edition = xml_tree.xpath("//t:editionStmt/t:edition", namespaces=NS_MAP)[0]
edition_date = edition.find("t:date", namespaces=NS_MAP)
edition_date.clear()
edition_date.tag = "tagtobestripped"
edition.text = "First published {} by {}".format(datetime.now().strftime("%Y"), BOILERPLATES.get("Header","eoa_name"))

# Publication statement: a paragraph whose text is exactly "unknown" is
# emptied and marked for stripping.
# NOTE(review): same None caveat as above if there is no t:p child.
publication_statement = xml_tree.xpath("//t:publicationStmt", namespaces=NS_MAP)[0]
unknown_paragraph = publication_statement.find("t:p", namespaces=NS_MAP)
if unknown_paragraph.text == "unknown":
unknown_paragraph.clear()
unknown_paragraph.tag = "tagtobestripped"

# NOTE(review): the next two statements each append a publisher element;
# the first (plain text) looks like the old side of the diff.
etree.SubElement(publication_statement, "publisher").text = "Edition Open Access"
publisher_element = etree.SubElement(publication_statement, "publisher")
overall_org = etree.SubElement(publisher_element, "orgName", n="EOA", ref=BOILERPLATES.get("Header","eoa_url"))
overall_org.text = BOILERPLATES.get("Header","eoa_name")
publishing_org = etree.SubElement(publisher_element, "orgName", n="Press", ref=BOILERPLATES.get("Header","mprl_url"))
publishing_org.text = BOILERPLATES.get("Header","mprl_name")

# Publication date is today's date in YYYY-MM-DD form.
pub_date = etree.SubElement(publication_statement, "date", when=datetime.now().strftime("%Y-%m-%d"))
availability = etree.SubElement(publication_statement, "availability")
# NOTE(review): two licence elements are appended here, one hard-coded
# and one taken from the boilerplate file; the name "licence" ends up
# bound to the second.
licence = etree.SubElement(availability, "licence", target="https://creativecommons.org/licenses/by-nc-sa/3.0/de/deed.en")
licence.text = "by-nc-sa"
# licence_text = etree.SubElement(licence, "p").text = """Distributed under the Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Germany License."""
licence = etree.SubElement(availability, "licence", target=BOILERPLATES.get("Header","licence_url"))
licence.text = BOILERPLATES.get("Header","licence_text")

# series statement
series_stmt = etree.Element("seriesStmt")
# Chained assignment: the name on the left is bound to the string, not
# to the new element. title_element is therefore rebound to a str here.
# The same pattern is used in many lines below.
title_element = etree.SubElement(series_stmt, "title").text = "Series title"
resp_stmt = etree.SubElement(series_stmt, "respStmt")
resp_title = etree.SubElement(resp_stmt, "resp").text = "Series Editors"
resp_names = etree.SubElement(resp_stmt, "name", type="serieseditors")
resp_names.text = BOILERPLATES.get("Header","mprl_series_editors")
series_number = etree.SubElement(series_stmt, "idno", type="number").text = "number"
# The series statement goes one position after the publication statement
# (presumably get_place_in_xml_tree returns the child index -- confirm).
publication_stmt_parent = publication_statement.getparent()
series_stmt_insertion_point = get_place_in_xml_tree(publication_statement, publication_stmt_parent) + 1
publication_stmt_parent.insert(series_stmt_insertion_point, series_stmt)

# Source description: suggested citation plus a pointer to the
# bibliography database file.
# NOTE(review): "bibfile" is assigned twice; the first element
# (type="bibliography") stays in the tree without children, and the ref
# is attached to the second (type="bibdatabase").
source_desc = xml_tree.xpath("//t:sourceDesc", namespaces=NS_MAP)[0]
bibfile = etree.SubElement(source_desc, "ab", type="bibliography")
suggested_citation = etree.SubElement(source_desc, "ab", type="suggestedcitation").text = "Suggested Citation"
bibfile = etree.SubElement(source_desc, "ab", type="bibdatabase")
etree.SubElement(bibfile, "ref", type="monograph", target=bibfile_string)

# profile description
profile_desc = etree.Element("profileDesc")
brief_abstract = etree.SubElement(profile_desc, "abstract", n="BriefDescription")
brief_abstract_p = etree.SubElement(brief_abstract, "p").text = "Short abstract"
detailed_abstract = etree.SubElement(profile_desc, "abstract", n="DetailedDescription")
detailed_abstract_p = etree.SubElement(detailed_abstract, "p").text = "Long abstract"
additional_text = etree.SubElement(profile_desc, "abstract", n="additional")
additional_text_p = etree.SubElement(additional_text, "p").text = "Additional text"
textclass = etree.SubElement(profile_desc, "textClass")
keywords = etree.SubElement(textclass, "keywords")
list_keywords = etree.SubElement(keywords, "list")
keyword_item = etree.SubElement(list_keywords, "item").text = BOILERPLATES.get("Header","eoa_name")
langusage = etree.SubElement(profile_desc, "langUsage")
language = etree.SubElement(langusage, "language", ident="en").text = "English"
# A second language element with ident "principal language" and no text.
language = etree.SubElement(langusage, "language", ident="principal language")
# NOTE(review): profile_desc is inserted at index 1 here and again at
# index 2 further down; presumably only one of the two is live.
xml_tree.insert(1, profile_desc)

# Encoding description: project information paragraphs from the
# boilerplate file, placed as the first child.
encoding_desc = xml_tree.xpath("//t:encodingDesc", namespaces=NS_MAP)[0]

project_desc = etree.Element("projectDesc")
eoainfo_p1 = etree.SubElement(project_desc, "p", n="eoainfo").text = BOILERPLATES.get("Header","eoainfo_p1")
eoainfo_p2 = etree.SubElement(project_desc, "p", n="eoainfo").text = BOILERPLATES.get("Header","eoainfo_p2")
mprlinformation = etree.SubElement(project_desc, "p", n="mprlinformation").text = BOILERPLATES.get("Header","mprlinformation")
scientificboard = etree.SubElement(project_desc, "p", n="scientificboard").text = BOILERPLATES.get("Header","scientificboard")
eoadevteam = etree.SubElement(project_desc, "p", n="eoadevteam").text = BOILERPLATES.get("Header","eoadevteam")
encoding_desc.insert(0, project_desc)

xml_tree.insert(2, profile_desc)
# Record this script in appInfo as an application with xml:id "fixtei".
# __version__ is defined elsewhere in the file.
# NOTE(review): insert(-1, ...) presumably places the entry before the
# last existing child, as list.insert(-1, ...) would -- confirm.
appinfo = encoding_desc.find("t:appInfo", namespaces=NS_MAP)
fix_tei_info = etree.Element("application", ident="fix_tei", version=__version__)
fix_tei_info.attrib["{http://www.w3.org/XML/1998/namespace}id"] = "fixtei"
fix_tei_label = etree.SubElement(fix_tei_info, "label").text = "Fix TEI for EOA"
appinfo.insert(-1, fix_tei_info)

# Revision description: the existing listChange is emptied and marked for
# stripping; a new change entry dated today points at "#fixtei", the
# xml:id given to the application element above.
revision_desc = xml_tree.xpath("//t:revisionDesc", namespaces=NS_MAP)[0]
olderchanges = revision_desc.find("t:listChange", namespaces=NS_MAP)
olderchanges.clear()
olderchanges.tag = "tagtobestripped"

first_change = etree.SubElement(revision_desc, "change", when=datetime.now().strftime("%Y-%m-%d"), who="#fixtei")
first_change.text = "Fixed TEI created by oxgarage conversion"

return xml_tree
# def fix_tei_header ends here

def add_tei_frontpart():
    """Build a small ``front`` element for the TEI text.

    The element holds a cover figure (graphic plus caption) and a
    dedication division, both filled with fixed default texts.

    Returns the newly created ``front`` element.
    """

    front = etree.Element("front")

    # Cover: the image reference and its caption live inside one figure.
    figure = etree.SubElement(front, "figure", type="cover")
    etree.SubElement(figure, "graphic", url="images/Cover.jpg")
    caption = etree.SubElement(figure, "head")
    caption.text = "Cover caption"

    # Dedication: a division with a single anonymous block of text.
    dedication_div = etree.SubElement(front, "div", type="dedication")
    dedication_body = etree.SubElement(dedication_div, "ab")
    dedication_body.text = "Dedication text"

    return front
# def add_tei_frontpart ends here

def evaluate_report(report):
"""Print report of conversion."""

Expand Down Expand Up @@ -510,6 +587,10 @@ def main():
tei_header = xml_tree2.xpath("//t:teiHeader", namespaces=NS_MAP)
fix_tei_header(tei_header[0], str(args.bibfile))

tei_text = xml_tree2.xpath("/t:TEI/t:text", namespaces=NS_MAP)[0]
tei_front_part = add_tei_frontpart()
tei_text.insert(0, tei_front_part)

etree.strip_tags(xml_tree2, "tagtobestripped")

dictChapters = {}
Expand Down

0 comments on commit 817efbd

Please sign in to comment.