Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
EOASkripts/fix_tei.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
522 lines (410 sloc)
19.3 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# -*- coding: utf-8; mode: python -*- | |
__version__ = "1.0" | |
__date__ = "20180109" | |
__author__ = "kthoden@mpiwg-berlin.mpg.de" | |
import sys | |
import os | |
import re | |
import json | |
import logging | |
import shlex | |
import pickle | |
import subprocess | |
from lxml import etree | |
from datetime import datetime | |
import bibtexparser | |
import argparse | |
import traceback | |
import libeoaconvert | |
logging.basicConfig(level=logging.INFO, format=' %(asctime)s - %(levelname)s - %(message)s')

# Processing steps, modelled on latex2eoa:
#   - search and replace shorthand markup by regular expression
#   - delete elements and attributes inserted by metypeset
#   - rename elements according to our schema
#   - assign identifiers

# TEI namespace URI and the prefix map ("t") used for namespaced
# find/xpath queries.
ns_tei = "http://www.tei-c.org/ns/1.0"
NS_MAP = {"t" : ns_tei}
# Directory for intermediate files. The literal contains no "~", so
# expanduser() leaves it as a path relative to the working directory.
TMP_DIR = os.path.expanduser("tmp_files")
def parse_bibtex(bibfile):
    """Parse the bibtex file, return a dict.

    :param bibfile: path of the BibTeX database to read.
    :returns: the ``entries_dict`` of the database parsed by
        ``bibtexparser.load`` (presumably keyed by citekey; verify
        against the bibtexparser version in use).
    """
    with open(bibfile) as btf:
        btb = bibtexparser.load(btf)
    # The parsed database is returned directly; the former intermediate
    # copy was filled but never used.
    return btb.entries_dict
# def parse_bibtex ends here
def unescape(text):
    """Replace HTML/XML character references and named entities in
    ``text`` by the characters they stand for and return the result.

    Anything that looks like a reference but cannot be resolved (unknown
    entity name, malformed number) is left exactly as it was.

    With thanks to http://effbot.org/zone/re-sub.htm#unescape-html.
    Modified to work with Python3.
    """
    import re
    import html.entities

    def fixup(m):
        token = m.group(0)
        if token.startswith("&#"):
            # numeric character reference, hexadecimal or decimal
            try:
                if token.startswith("&#x"):
                    return chr(int(token[3:-1], 16))
                return chr(int(token[2:-1]))
            except ValueError:
                return token  # leave as is
        # named entity
        try:
            return chr(html.entities.name2codepoint[token[1:-1]])
        except KeyError:
            return token  # leave as is

    return re.sub(r"&#?\w+;", fixup, text)
# def unescape ends here
def convert_references(string):
    """Find reference markers (#) in the text.

    Every ``#label#`` shorthand is replaced by
    ``<ref><![CDATA[label]]></ref>``.

    :param string: the text to search.
    :returns: the text with all reference shorthands replaced.
    """
    references_pattern = re.compile(r"(#)(?P<reference>.+?)(#)")
    # One substitution pass replaces every match.  The named group carries
    # the label; group 1 is only the "#" delimiter.
    string, count = references_pattern.subn(r"<ref><![CDATA[\g<reference>]]></ref>", string)
    logging.info("Found %s references" % count)
    return string
# def convert_references ends here
def convert_citations(string):
    """Find citation shorthand using regex.

    Two shorthands are recognised, each with an optional page part after
    an exclamation mark:

    - ``§§citekey!pages§§`` becomes a year citation
      (``<ref type='inline' .../>``)
    - ``§citekey!pages§`` becomes an author/year citation

    Both are rewritten to the form

    <bibl>
      <ref target="#Kaulbach_1960"/>
      <citedRange from="320-322"/>
    </bibl>

    The page part is copied unparsed into the ``from`` attribute (empty
    if absent); splitting it into from/to happens in a later step.

    :param string: serialised XML text to search.
    :returns: a tuple of the modified string and the list of found
        citekeys (year citations first, then author/year citations).
    """
    # The section sign may arrive as a literal character or, when the XML
    # was serialised to ASCII, as a numeric character reference.
    sign = r"(?:§|&#167;|&#[xX][aA]7;)"
    body = r"(?P<citekey>.+?)(?:\!(?P<pages>.*?))?"
    year_citations_pattern = re.compile(sign + sign + body + sign + sign)
    authoryear_citation_pattern = re.compile(sign + body + sign)

    # Double-sign citations must be handled first, otherwise the
    # single-sign pattern would consume their delimiters.
    year_citekeys = [m.group("citekey") for m in year_citations_pattern.finditer(string)]
    logging.info("Found %s year citations." % len(year_citekeys))
    string = year_citations_pattern.sub(r"<bibl><ref type='inline' target='#\g<citekey>'/><citedRange from='\g<pages>'/></bibl>", string)

    authoryear_citekeys = [m.group("citekey") for m in authoryear_citation_pattern.finditer(string)]
    logging.info("Found %s author/year citations." % len(authoryear_citekeys))
    string = authoryear_citation_pattern.sub(r"<bibl><ref target='#\g<citekey>'/><citedRange from='\g<pages>'/></bibl>", string)

    return (string, year_citekeys + authoryear_citekeys)
# def convert_citations ends here
def parse_cited_range(list_of_xml_elements):
    """citedRange: split up parameters or remove element if attributes are empty.

    For each element, the ``from`` attribute of its ``citedRange`` child
    is split into word-like tokens:

    - empty value: the ``citedRange`` is renamed to ``tagtobestripped``
      and the attribute removed
    - one token: kept as ``from``
    - two tokens: ``from`` and ``to``
    - three tokens: first is ``from``, last is ``to`` (presumably the
      middle token is a separator; verify against real data)
    - anything else: the raw value is moved into the element text and
      reported

    Elements without a ``citedRange`` child, or whose ``citedRange`` has
    no ``from`` attribute, are left untouched.

    :param list_of_xml_elements: iterable of elements offering the
        ElementTree ``find``/``get``/``set`` API.
    :returns: list of the page strings that could not be split.
    """
    unsplittable_pageref = []
    for reference in list_of_xml_elements:
        cited_range = reference.find("t:citedRange", namespaces=NS_MAP)
        if cited_range is None:
            # nothing to split, e.g. a bibl not created from shorthand
            continue
        from_value = cited_range.get("from")
        if from_value is None:
            continue
        if len(from_value) == 0:
            cited_range.tag = "tagtobestripped"
            cited_range.attrib.pop("from")
            continue

        split_values = re.findall(r"[\w']+", from_value)
        if len(split_values) == 1:
            cited_range.set("from", split_values[0])
        elif len(split_values) == 2:
            cited_range.set("from", split_values[0])
            cited_range.set("to", split_values[1])
        elif len(split_values) == 3:
            cited_range.set("from", split_values[0])
            cited_range.set("to", split_values[2])
        else:
            logging.info("Splitting the page range produced unexpected result. Tried to split %s. Wrote to text field." % from_value)
            cited_range.text = from_value
            cited_range.attrib.pop("from")
            unsplittable_pageref.append(from_value)
    return unsplittable_pageref
# def parse_cited_range ends here
def validate_citations(used_citekeys, bibdata):
    """Report which of the used citekeys are missing from the database.

    Each missing key is logged. The returned list keeps the order (and
    any repetitions) of ``used_citekeys``.
    """
    known = bibdata.keys()
    missing = []
    for key in used_citekeys:
        if key in known:
            continue
        missing.append(key)
        logging.info("%s is not in the bibliographic database" % key)
    return missing
# def validate_citations ends here
def convert_figures(string):
    """Find figures shorthands.

    A shorthand is text enclosed in plus signs, for example::

        +Fig.1CarteDuCielPotsdam!Glass photographic plate from the Carte
        du Ciel survey, Potsdam Observatory+

    Each one is replaced by ``<graphic><![CDATA[...]]></graphic>`` with
    the enclosed text as content.

    :param string: the text to search.
    :returns: the text with all figure shorthands replaced.
    """
    # negative lookbehind assertion. Real + characters must be escaped by \
    graphic_pattern = re.compile(r"(?<!\\)\+(.*?)\+")
    # A single pass replaces every shorthand; subn also yields the count.
    string, count = graphic_pattern.subn(r"<graphic><![CDATA[\g<1>]]></graphic>", string)
    logging.info("Found %s figures" % count)
    return string
# def convert_figures ends here
def make_figure_elements(list_of_figures, figure_directory):
    """Construct the figure element.

    The text of each graphic element is expected to have the form
    ``image!caption`` or ``image!caption!hyperimage directions``. The
    parent of the graphic is renamed to ``figure``, the graphic gets
    ``scale`` and ``url`` attributes if the image is found in
    ``figure_directory`` (with or without file extension), the caption
    is parsed as XML into a ``head`` element, and hyperimage directions
    are appended as a processing instruction.

    Graphic elements without any text are left untouched.

    :param list_of_figures: graphic elements (lxml) to convert.
    :param figure_directory: directory that is listed for image files.
    :returns: list of the shorthand strings whose image was not found.
    """
    bad_images = []
    available_images_long = os.listdir(figure_directory)
    # names are accepted both with and without their file extension
    available_images = set(available_images_long)
    available_images.update(os.path.splitext(img)[0] for img in available_images_long)

    for graphic in list_of_figures:
        original_string = graphic.text
        if original_string is None:
            logging.info("Skipping a graphic element without figure shorthand text.")
            continue

        parent_tag = graphic.getparent()
        parent_tag.tag = "figure"
        # NOTE(review): lxml's clear() also resets attributes and tail
        # text of the graphic - confirm that nothing of it is needed.
        graphic.clear()

        parts = original_string.split("!")
        if len(parts) not in (2, 3):
            # report the shorthand itself; the element was cleared above
            logging.info("The figure string could not be split by '!': %s" % original_string)
            continue

        image_name = parts[0]
        if image_name in available_images:
            logging.info("Found %s in the text. Selected %s as corresponding image." % (image_name, image_name))
            graphic.set("scale", "50")
            graphic.set("url", "images/" + image_name)
        else:
            bad_images.append(original_string)

        # the caption may contain markup, hence it is parsed, not set as text
        head_element = etree.fromstring("<head>" + parts[1] + "</head>")
        parent_tag.insert(1, head_element)

        if len(parts) == 3:
            logging.info("This figure contains hyperimage directions")
            yenda_command = etree.ProcessingInstruction("hyperimage", "Hyperimage direction: %s" % parts[2])
            parent_tag.append(yenda_command)
    return bad_images
# def make_figure_elements ends here
def cleanup_xml(xml_tree):
    """Strip converter leftovers from the tree and return it.

    - removes every ``meTypesetSize`` attribute
    - removes the ``rend`` attribute of ``hi`` elements that mention
      'color' or 'background' in it
    - dissolves ``hi`` elements that have no attributes (left)
    - changes footnote placement from 'foot' to 'bottom'
    """
    sized_elements = xml_tree.findall("//t:*[@meTypesetSize]", namespaces=NS_MAP)
    coloured_elements = xml_tree.xpath("//t:hi[contains(@rend, 'color') or contains(@rend, 'background')]", namespaces=NS_MAP)
    logging.info("Found %s metypesets." % len(sized_elements))
    logging.info("Found %s colour attributes." % len(coloured_elements))

    for element in sized_elements:
        logging.info("number of attributes: %s" % len(element.attrib))
        element.attrib.pop("meTypesetSize")

    for element in coloured_elements:
        element.attrib.pop("rend")

    # hi elements that carry no attribute any more are only marked here;
    # the marked tags are dissolved in one go at the end
    for hi_element in xml_tree.findall("//t:hi", namespaces=NS_MAP):
        if len(hi_element.attrib) == 0:
            hi_element.tag = "tagtobestripped"

    for footnote in xml_tree.xpath("//t:note[@place='foot']", namespaces=NS_MAP):
        footnote.set("place", "bottom")

    etree.strip_tags(xml_tree, "tagtobestripped")
    return xml_tree
# def cleanup_xml ends here
def fix_document_structure(xml_tree, highest_level):
    """Label nested body divs with a ``type`` according to their depth.

    With ``highest_level`` "chapter" the nesting levels are named
    chapter, section, subsection, subsubsection; with "part" the same
    sequence is preceded by part. Any other value leaves the tree
    untouched.
    """
    # Unsure here, but maybe have a rule that one file is one chapter,
    # so the highest level would be sections
    level_names = {
        "chapter": ("chapter", "section", "subsection", "subsubsection"),
        "part": ("part", "chapter", "section", "subsection", "subsubsection"),
    }.get(highest_level)
    if level_names is None:
        return

    # collect the divs of every depth first, one more "/t:div" per level
    div_path = "//t:body"
    divs_by_level = []
    for level_name in level_names:
        div_path += "/t:div"
        divs_by_level.append((level_name, xml_tree.xpath(div_path, namespaces=NS_MAP)))

    for level_name, divs in divs_by_level:
        for div in divs:
            div.set("type", level_name)
# def fix_document_structure ends here
def fix_tei_header(xml_tree, bibfile_string):
    """Populate TEI header with mandatory data.

    Adds a main title and turns the existing title into a series title,
    replaces an "unknown" placeholder paragraph of the publication
    statement by publisher, date and licence, registers the bibliography
    file in the source description and inserts a ``profileDesc`` that
    declares English as language.

    :param xml_tree: lxml element to work on; ``profileDesc`` is inserted
        as its third child.
    :param bibfile_string: value for the ``target`` of the bibliography
        reference.
    :returns: the modified ``xml_tree``.
    :raises IndexError: if titleStmt, publicationStmt or sourceDesc is
        missing.
    """
    # NOTE(review): titles, series number, licence and language are
    # hard-coded values - confirm they fit the publication at hand.
    title_statement = xml_tree.xpath("//t:titleStmt", namespaces=NS_MAP)[0]
    title_element = title_statement.find("t:title", namespaces=NS_MAP)
    title_element.set("level", "s")
    title_element.set("n", "20")
    title_element.text = "Titel der Serie"
    main_title = etree.Element("title", type="main")
    main_title.text = "FotoObjekte"
    title_statement.insert(0, main_title)

    publication_statement = xml_tree.xpath("//t:publicationStmt", namespaces=NS_MAP)[0]
    unknown_paragraph = publication_statement.find("t:p", namespaces=NS_MAP)
    # a publication statement without paragraph has no placeholder to drop
    if unknown_paragraph is not None and unknown_paragraph.text == "unknown":
        unknown_paragraph.clear()
        # dissolved later by a strip_tags call on "tagtobestripped"
        unknown_paragraph.tag = "tagtobestripped"
    etree.SubElement(publication_statement, "publisher").text = "Edition Open Access"
    etree.SubElement(publication_statement, "date", when=datetime.now().strftime("%Y-%m-%d"))
    availability = etree.SubElement(publication_statement, "availability")
    licence = etree.SubElement(availability, "licence", target="https://creativecommons.org/licenses/by-nc-sa/3.0/de/deed.en")
    licence.text = "by-nc-sa"

    source_desc = xml_tree.xpath("//t:sourceDesc", namespaces=NS_MAP)[0]
    bibfile = etree.SubElement(source_desc, "ab", type="bibliography")
    etree.SubElement(bibfile, "ref", type="monograph", target=bibfile_string)

    profile_desc = etree.Element("profileDesc")
    langusage = etree.SubElement(profile_desc, "langUsage")
    etree.SubElement(langusage, "language", ident="en").text = "English"
    xml_tree.insert(2, profile_desc)
    return xml_tree
# def fix_tei_header ends here
def evaluate_report(report):
    """Print a summary of the conversion to standard output.

    Reads the keys ``bad_figures``, ``citekeys_not_in_bib`` and
    ``bad_pageref`` (lists) of ``report``; ``len_citekeys`` is read only
    if citekeys are missing.
    """
    indent = ' ' * 4
    heavy_rule = "=" * 60

    print(heavy_rule)
    print(indent, "Conversion report")
    print("-" * 60)

    figure_count = len(report["bad_figures"])
    if figure_count > 0:
        print("{} {} could not be linked to a file in the image directory:".format(figure_count, libeoaconvert.plural(len(report["bad_figures"]), "figure")))
        for entry in report["bad_figures"]:
            print(indent, entry)
    else:
        print("All figures were linked.")

    citekey_count = len(report["citekeys_not_in_bib"])
    if citekey_count > 0:
        print("{} of {} {} could not be found in the bibliography database:".format(citekey_count, report["len_citekeys"], libeoaconvert.plural(len(report["citekeys_not_in_bib"]), "citation")))
        for entry in report["citekeys_not_in_bib"]:
            print(indent, entry)
        print("\nThe missing citations were also stored in the pickle file and can be re-used by the create_tmpbib tool.\n")
    else:
        print("All citekeys were found in the bibliography database.")

    pageref_count = len(report["bad_pageref"])
    if pageref_count > 0:
        print("{} page {} could not be parsed into start and end value:".format(pageref_count, libeoaconvert.plural(len(report["bad_pageref"]), "reference")))
        for entry in report["bad_pageref"]:
            print(indent, entry)
    else:
        print("All page references could be parsed into discrete values.")

    print(heavy_rule)
# def evaluate_report ends here
def main():
    """The main bit.

    Command line entry point. Reads the TEI file, cleans it, converts
    citation and figure shorthands on the serialised XML, re-parses the
    result, builds figure elements and page ranges (unless ``--finalize``
    is given), completes the TEI header, stores citation data in a pickle
    file, assigns div types and writes ``<teifile>-out.xml``.

    Intermediate files are written to ``TMP_DIR``. The process exits with
    status 1 on an invalid ``--dochighestorder`` value or if the modified
    XML is not well-formed.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-d", "--dochighestorder", default='chapter', help="Specify which divider is at the highest level, possible values: part, chapter. Default is chapter.")
    parser.add_argument("-f", "--finalize", help="Finalize a publication.", action="store_true")
    parser.add_argument("teifile", help="Output from oxgarage/metypeset, an TEI XML file.")
    parser.add_argument("bibfile", help="The bibliography database of the publication.")
    parser.add_argument("figdir", help="The directory that contains the figures belonging to the publication.")
    args = parser.parse_args()

    highest_level = args.dochighestorder
    if highest_level not in ["chapter", "part"]:
        sys.stderr.write("Specify either 'chapter' or 'part' as highest level. Exiting")
        # signal the failure to the calling shell
        sys.exit(1)

    os.makedirs(TMP_DIR, exist_ok=True)

    with open(args.teifile, 'r') as xmlfile:
        xml_tree = etree.parse(xmlfile)

    report = {}

    ################
    # bibliography #
    ################
    bibdata = parse_bibtex(args.bibfile)

    xml_cleaned = cleanup_xml(xml_tree)

    # Intermediate files go directly into TMP_DIR, whatever directory
    # the input file was given with.
    tei_basename = os.path.basename(args.teifile)
    cleaned_path = os.path.join(TMP_DIR, tei_basename.replace(".xml", "-cleaned.xml"))
    xml_cleaned.write(cleaned_path, pretty_print=True, xml_declaration=True, encoding="utf-8")
    logging.info("Wrote %s." % cleaned_path)

    # first some modifications on a string object
    xml_string = etree.tostring(xml_cleaned).decode('utf-8')

    # the '#' sign is a bad choice!
    # mod_string = convert_references(xml_string)
    mod_string2, cited = convert_citations(xml_string)

    # citekeys are taken from serialised XML and may contain entities
    used_citekeys = [unescape(c) for c in cited]
    citekeys_not_in_bib = validate_citations(used_citekeys, bibdata)
    report["len_citekeys"] = len(used_citekeys)
    report["citekeys_not_in_bib"] = citekeys_not_in_bib

    mod_string3 = convert_figures(mod_string2)

    debug_output = os.path.join(TMP_DIR, tei_basename.replace(".xml", "-modified.xml"))
    with open(debug_output, "w") as debugfile:
        debugfile.write(mod_string3)
    logging.info("Wrote %s." % debug_output)

    # check for wellformedness, read again as xml
    try:
        xml_tree2 = etree.fromstring(mod_string3)
    except etree.XMLSyntaxError:
        print("\nXML syntax error when trying to parse modified tree. Dumped it to %s." % debug_output)
        print("-"*60)
        traceback.print_exc(file=sys.stdout)
        print("-"*60)
        sys.exit(1)

    if not args.finalize:
        all_figures = xml_tree2.xpath("//t:graphic", namespaces=NS_MAP)
        report["bad_figures"] = make_figure_elements(all_figures, args.figdir)

    all_references = xml_tree2.xpath("//t:bibl", namespaces=NS_MAP)
    if not args.finalize:
        report["bad_pageref"] = parse_cited_range(all_references)

    tei_header = xml_tree2.xpath("//t:teiHeader", namespaces=NS_MAP)
    fix_tei_header(tei_header[0], str(args.bibfile))

    etree.strip_tags(xml_tree2, "tagtobestripped")

    # Apart from the citation data all dictionaries are stored empty.
    data_to_pickle = {'citekey_not_in_bib' : citekeys_not_in_bib,
                      'citekeys' : used_citekeys,
                      'chapterdict' : {},
                      'eqdict' : {},
                      'listdict' : {},
                      'theoremdict' : {},
                      'figdict' : {},
                      'secdict' : {},
                      'fndict' : {},
                      'tabdict' : {},
                      'pagelabeldict' : {}}

    with open(os.path.join(TMP_DIR, 'data.pickle'), 'wb') as f:
        # Pickle the 'data' dictionary using the highest protocol available.
        pickle.dump(data_to_pickle, f, pickle.HIGHEST_PROTOCOL)

    fix_document_structure(xml_tree2, highest_level)

    # output
    output = args.teifile.replace(".xml", "-out.xml")
    tree = etree.ElementTree(xml_tree2)
    tree.write(output, pretty_print=True, xml_declaration=True, encoding="utf-8")
    logging.info("Wrote %s." % output)

    # the report relies on data that is only collected without --finalize
    if not args.finalize:
        evaluate_report(report)
# def main ends here
# Run the conversion only when the file is executed as a script.
if __name__ == '__main__':
    main()
# finis