Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
hyperimage_scripts/histandoff.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
553 lines (424 sloc)
19.1 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# -*- coding: utf-8; mode: python -*- | |
"""Extract hyperimage information from a Django XML file and write it to
files, so that it can be integrated into a TEI file.
"""
__version__ = "1.0" | |
__date__ = "20181120" | |
__author__ = "kthoden@mpiwg-berlin.mpg.de" | |
import argparse | |
import base64 | |
import logging | |
import sys | |
import json | |
import csv | |
import re | |
import os | |
from pathlib import Path | |
from lxml import etree | |
# Log everything from DEBUG upwards, prefixed with timestamp and level.
logging.basicConfig(
    level=logging.DEBUG,
    format=' %(asctime)s - %(levelname)s - %(message)s',
)

# TEI namespace URI, a prefix map for namespace-aware XPath queries, and
# the same URI in Clark notation ("{uri}") for qualified tag names.
ns_tei = "http://www.tei-c.org/ns/1.0"
NS_MAP = {"t": ns_tei}
TEI = "{" + ns_tei + "}"

# Base output directory: $OUTPUT_DIR when set, otherwise ./output.
DEFAULT_OUTPUT_DIR = Path(os.environ.get('OUTPUT_DIR', './output'))
def write_xml_result(div_list, xml_path):
    """Combine a minimal TEI header with the given divs and write an XML file.

    Args:
        div_list: iterable of lxml elements; each one is appended to the
            ``body`` of the generated document (appending moves the
            element out of any tree it currently belongs to).
        xml_path: path of the XML file to write.

    The tree is built without a namespace. After serialisation the file
    is read back and the TEI default namespace declaration is added to
    the root start tag textually.
    """
    xml_root = etree.Element("TEI")

    # Minimal header: title, publication statement, source description.
    tei_header = etree.SubElement(xml_root, "teiHeader")
    filedesc = etree.SubElement(tei_header, "fileDesc")
    titlestmt = etree.SubElement(filedesc, "titleStmt")
    etree.SubElement(titlestmt, "title").text = "Hyperimage code"
    pub_statement = etree.SubElement(filedesc, "publicationStmt")
    # NOTE(review): a <title> inside <publicationStmt> looks unusual for
    # TEI; kept as in the original output - confirm whether <p> was meant.
    etree.SubElement(pub_statement, "title").text = "Data to insert into TEI document"
    sourcedesc = etree.SubElement(filedesc, "sourceDesc")
    etree.SubElement(sourcedesc, "p").text = "Created by histandoff.py"

    tei_text = etree.SubElement(xml_root, "text")
    tei_body = etree.SubElement(tei_text, "body")
    for div in div_list:
        tei_body.append(div)

    etree.ElementTree(xml_root).write(
        str(xml_path), pretty_print=True, xml_declaration=True, encoding="utf-8"
    )
    logging.info(f"Wrote XML file: {xml_path}.")

    # Add the TEI default namespace to the root element (brute force
    # solution). The file was serialised as UTF-8, so it has to be read
    # and written back as UTF-8 regardless of the platform's locale.
    ns_string = ' xmlns="http://www.tei-c.org/ns/1.0"'
    with open(xml_path, 'r', encoding="utf-8") as textfile:
        xml_as_string = textfile.read()

    # Only the first occurrence is the root start tag; leave any literal
    # "<TEI>" further down (e.g. inside CDATA) untouched.
    with_namespace = xml_as_string.replace("<TEI>", f"<TEI{ns_string}>", 1)

    with open(xml_path, 'w', encoding="utf-8") as amended_textfile:
        amended_textfile.write(with_namespace)

    return
# def write_xml_result ends here
def create_xmlid(number):
    """Create a provisional xml id from a figure number.

    Args:
        number: the figure number as a string, e.g. ``"2.8"``.

    Returns:
        A string starting with ``please_adjust_``. If ``number`` has the
        form ``<chapter>.<figure>`` (one or two digits each), the rest is
        ``chapNN_figM-hi``; otherwise it is ``figid-`` followed by
        ``number`` with spaces and plus signs replaced by underscores.
    """
    # The dot is escaped so that only a literal "chapter.figure" number
    # matches, not any character between two digit groups.
    chapter_figure = re.fullmatch(r"([0-9]{1,2})\.([0-9]{1,2})", number)

    if chapter_figure:
        chapter, figure = chapter_figure.groups()
        # Replace the raw number by the generated id instead of
        # appending the id to it.
        xmlid = f"chap{int(chapter):02d}_fig{figure}-hi"
    else:
        xmlid = "figid-" + number.replace(" ", "_").replace("+", "_")

    return "please_adjust_" + xmlid
# def create_xmlid ends here
def get_tei_id(image_file, tei_tree):
    """Retrieve the xml:id of the TEI figure that shows ``image_file``.

    Args:
        image_file: file name of the image; it is looked up as
            ``images/<image_file>`` in the ``url`` attribute of
            ``figure/graphic`` elements.
        tei_tree: parsed TEI document offering an lxml-style ``xpath``.

    Returns:
        The ``xml:id`` of the parent ``figure`` element.

    Raises:
        SystemExit: if no graphic references the image, if a matching
            figure has no ``xml:id``, or if the image is referenced by
            figures with different ``xml:id`` values.
    """
    xml_id_attr = "{http://www.w3.org/XML/1998/namespace}id"

    # Pass the URL as an XPath variable instead of formatting it into
    # the expression, so file names containing quotes cannot break it.
    graphics = tei_tree.xpath(
        "//t:figure/t:graphic[@url=$url]",
        namespaces=NS_MAP,
        url=f"images/{image_file}",
    )

    if not graphics:
        logging.error(f"Found no graphic with filename {image_file}. Exiting!")
        sys.exit(1)

    if len(graphics) > 1:
        logging.warning(f"Image {image_file} is referenced not once, but {len(graphics)} times!")

    figure_ids = set()
    for graphic in graphics:
        figure = graphic.getparent()
        figure_id = figure.get(xml_id_attr)
        if figure_id is None:
            # Same failure mode as the other lookups, with a message
            # instead of a bare KeyError.
            logging.error(f"The figure containing {image_file} has no xml:id. Exiting!")
            sys.exit(1)
        figure_ids.add(figure_id)

    if len(figure_ids) > 1:
        logging.error(f"There is more than one xml:id for {image_file}. You should check the source. Exiting")
        sys.exit(1)

    return figure_ids.pop()
# def get_tei_id ends here
def create_copypaste_figures(list_of_elements, output_dir):
    """Create an XML file with ready-made elements for Hyperimage figures.

    Each element of ``list_of_elements`` is modified in place: it is
    renamed to ``figure``, gets ``type="hionly"`` and a provisional
    ``xml:id``, a ``graphic`` child is inserted, its ``caption`` child is
    renamed to ``head``, and the source-specific attributes are removed.
    The elements are moved under a new ``figures`` root, which is written
    to ``<output_dir>/xml_snippets/hi_figurescopypaste.xml``.

    Args:
        list_of_elements: lxml elements carrying ``number`` and ``file``
            attributes.
        output_dir: ``pathlib.Path`` of the output directory; the
            ``xml_snippets`` subdirectory must already exist.
    """
    xml_id_attr = "{http://www.w3.org/XML/1998/namespace}id"
    attributes_to_be_stripped = ["order", "file", "width", "height", "number", "hielement"]

    copypaste_tree = etree.Element("figures")

    for element in list_of_elements:
        hinumber = element.get("number")
        xmlid = create_xmlid(hinumber)

        element.tag = "figure"
        element.set("type", "hionly")
        element.set(xml_id_attr, xmlid)

        graphic = etree.Element("graphic", scale="50")
        # NOTE(review): this replaces every occurrence of "images" in the
        # file attribute, not only a leading one - confirm against data.
        graphic.set("url", element.get("file").replace("images", "images/"))

        caption = element.find("caption")
        if caption is None:
            # No caption to rename: still insert the graphic rather than
            # failing with an IndexError.
            logging.warning("Figure %s has no caption.", hinumber)
            element.insert(0, graphic)
        else:
            caption.tag = "head"
            caption.addprevious(graphic)

        copypaste_tree.append(element)

        for attribute in attributes_to_be_stripped:
            try:
                element.attrib.pop(attribute)
            except KeyError:
                logging.warning("Attribute %s not found.", attribute)

    # Derive the target from output_dir; XML_SNIPPETS_DIR only exists as
    # a local variable inside main() and is not visible here.
    output = output_dir / "xml_snippets" / "hi_figurescopypaste.xml"
    tree = etree.ElementTree(copypaste_tree)
    tree.write(str(output), pretty_print=True, xml_declaration=True, encoding="utf-8")
    logging.info("Wrote %s." % output)

    return
#def create_copypaste_figures ends here
def create_copypaste_layers(list_of_elements, id_lookup, output_dir):
    """Write ready-made ``ref`` elements for references to Hyperimage layers.

    Every element in ``list_of_elements`` is turned into a ``ref`` in
    place: its tail is cleared, ``type`` is set to ``text-hi``,
    ``select`` takes the value of ``data-hilayer`` and ``target`` points
    to the id found in ``id_lookup`` under the element's ``href`` without
    its first character. The elements are moved under a ``reflayers``
    root and written to ``<output_dir>/xml_snippets/hi_layerscopypaste.xml``.
    """
    root = etree.Element("reflayers")
    root.append(etree.Comment("""Please observe: if the text of the reference is the
number of an image, change the @type value to number-hi."""))

    for ref in list_of_elements:
        ref.tag = "ref"
        ref.tail = ""
        ref.set("type", "text-hi")
        ref.set("select", ref.get("data-hilayer"))

        # href is "#<id>"; the lookup key is the part after the first character.
        lookup_key = ref.get('href')[1:]
        ref.set("target", f"#{id_lookup[lookup_key]}")

        root.append(ref)

        # Drop the attributes that only made sense in the source markup.
        for name in ("href", "data-hilayer", "class"):
            if name in ref.attrib:
                del ref.attrib[name]
            else:
                logging.warning("Attribute %s not found.", name)

    output = output_dir / "xml_snippets" / "hi_layerscopypaste.xml"
    etree.ElementTree(root).write(
        str(output), pretty_print=True, xml_declaration=True, encoding="utf-8"
    )
    logging.info(f"Wrote {output}.")

    return
#def create_copypaste_layers ends here
def dump_json_file(json_string, filename):
    """Write ``json_string`` to ``filename`` as UTF-8 text.

    Args:
        json_string: the text to write.
        filename: path of the file to create or overwrite.
    """
    # Explicit encoding: without it, non-ASCII characters are written in
    # the platform's locale encoding or fail to encode at all.
    with open(filename, "w", encoding="utf-8") as json_file:
        json_file.write(json_string)

    logging.info("Wrote %s", filename)

    return
# def dump_json_file ends here
def decode_hielement(hielement_string, hinumber, output_dir):
    """Decode a base64 encoded hielement string to JSON text.

    The decoded text, stripped of newline and tab characters, is written
    to ``<output_dir>/json/<hinumber>.json`` and returned.
    """
    decoded = base64.decodebytes(hielement_string.encode()).decode("utf-8")

    # Remove every newline and tab in a single pass.
    json_string = decoded.translate({ord("\n"): None, ord("\t"): None})

    dump_json_file(json_string, output_dir / "json" / f"{hinumber}.json")

    return json_string
# def decode_hielement ends here
def get_json_info(json_decoded):
    """Return the Hyperimage id and the comma-joined layer names.

    Reads ``json_decoded["view"]["id"]`` and the keys of
    ``json_decoded["view"]["layers"]``.
    """
    view_id = json_decoded["view"]["id"]
    layer_names = json_decoded["view"]["layers"].keys()

    return view_id, ",".join(layer_names)
# def get_json_info ends here
def create_xml_hielements(django_tree, tei_tree, output_dir):
    """Build one ``div`` per hielement for the TEI XML master file.

    For every ``EOAfigure`` with a ``hielement`` attribute in
    ``django_tree``, the matching xml:id is looked up in ``tei_tree``,
    the hielement string is decoded (which also dumps it as JSON below
    ``output_dir``) and a ``div`` is assembled holding the layer names,
    the whitespace-normalised hielement string and the JSON as CDATA.

    Returns:
        A tuple ``(id_lookup, div_list)``: a dict mapping Hyperimage ids
        to xml:ids, and the list of ``div`` elements.
    """
    xml_id_attr = "{http://www.w3.org/XML/1998/namespace}id"
    id_lookup = {}
    div_list = []

    figures = django_tree.xpath("//EOAfigure[@hielement]")
    logging.debug("Found %s hielements", len(figures))

    for figure in figures:
        image_file = re.sub(r"^images", "", figure.get("file"))
        tei_id = get_tei_id(image_file, tei_tree)
        hinumber = figure.get("number")

        layer_attribute = figure.get("data-hilayer")
        if layer_attribute is not None:
            logging.info(f"Figure {hinumber} contains a data-hilayer attribute: {layer_attribute}")

        encoded = figure.get("hielement")
        # Collapse runs of spaces for the copy stored in the XML.
        normalized = re.sub(r" +", " ", encoded)
        json_text = decode_hielement(encoded, tei_id, output_dir)
        hiid, hilayers = get_json_info(json.loads(json_text))
        id_lookup[hiid] = tei_id

        div = etree.Element("div")
        div.set(xml_id_attr, tei_id)
        div.set("corresp", hiid)

        if hilayers:
            layers_element = etree.SubElement(div, "list", type="layers")
            for layer_name in hilayers.split(","):
                etree.SubElement(layers_element, "item").text = layer_name

        config = etree.SubElement(div, "ab", type="config")
        config.text = normalized
        json_config = etree.SubElement(div, "ab", type="jsonconfig")
        json_config.text = etree.CDATA(json_text)

        div_list.append(div)

    return (id_lookup, div_list)
# def create_xml_hielements ends here
def create_master_hielements(django_tree, tei_tree, output_dir):
    """Create master files for the hielements.

    Get from the Django file all figures with hielement strings and
    write a csv file that connects the xml:id of the corresponding TEI
    figure with the hielement part. These are figures that have
    hyperimage functionality enabled, which does not necessarily mean
    that they are hyperimage-exclusive.

    The rest of this information is written to TEI-like figure elements
    that are pasted manually into the TEI file. Through their xml:id
    they can be linked to the data in the csv file.

    During the tei2imxml or imxml2django step, the hielement string is
    written back.

    The Hyperimage ID is taken from the decoded hielement string; it is
    necessary to create the link with the xml:id.

    Args:
        django_tree: parsed Django XML document.
        tei_tree: parsed TEI document used to look up xml:ids.
        output_dir: ``pathlib.Path`` of the output directory; receives
            ``hi_figures.csv`` and the decoded JSON files.

    Returns:
        A dict mapping Hyperimage ids to xml:ids.
    """
    id_lookup = {}

    hielements = django_tree.xpath("//EOAfigure[@hielement]")
    logging.debug("Found %s hielements", len(hielements))

    filename = output_dir / 'hi_figures.csv'

    # Explicit UTF-8 so that non-ASCII values do not depend on, or fail
    # under, the platform's locale encoding.
    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['xmlid', 'hiid', 'layers', 'elementstring']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        for element in hielements:
            image_file = re.sub(r"^images", "", element.get("file"))
            tei_id = get_tei_id(image_file, tei_tree)
            hinumber = element.get("number")
            # xmlid = create_xmlid(hinumber)

            if element.get("data-hilayer") is not None:
                logging.info(f"Figure {hinumber} contains a data-hilayer attribute: {element.get('data-hilayer')}")

            hielement_string = element.get("hielement")
            # Collapse runs of spaces for the copy stored in the CSV.
            hielement_string_normalized = re.sub(r" +", " ", hielement_string)
            json_decoded = decode_hielement(hielement_string, tei_id, output_dir)
            json_object = json.loads(json_decoded)
            hiid, hilayers = get_json_info(json_object)
            id_lookup[hiid] = tei_id

            writer.writerow({
                'xmlid': tei_id,
                'hiid': hiid,
                'layers': hilayers,
                'elementstring': hielement_string_normalized,
            })

    logging.info("Wrote " + str(filename))

    # create_copypaste_figures(hielements)

    return id_lookup
# def create_master_hielements ends here
def create_master_hilayers(django_tree, id_lookup, output_dir):
    """Create an overview over hilayers.

    Hilayers are annotations on the images (polygon markings).

    The two attributes data-hilayer and href have to be retained as they
    occur. They connect to a part of the encoded string in hi-elements.

    data-hilayer is a unique string. So, in the TEI file, the phrase to
    be linked is surrounded by a ref tag that contains this identifier
    (select) and a common type.

    The CSV contains the href and the data-hilayer values.

    Django: was mounted (see Fig. <a href="#Fig28" data-hilayer="Fig28note" class="HILink">2.8</a>)
    TEI: was mounted (see Fig. <ref target="#chap02_fig8" type="number-hi" select="Fig28note"/>

    Args:
        django_tree: parsed Django XML document.
        id_lookup: dict mapping Hyperimage ids to xml:ids; handed on to
            ``create_copypaste_layers``.
        output_dir: ``pathlib.Path`` of the output directory; receives
            ``hi_layers.csv``.
    """
    hilayers = django_tree.xpath("//a[@data-hilayer]")
    logging.debug("Found %s hilayers", len(hilayers))

    filename = output_dir / 'hi_layers.csv'

    # Group the layer names by the figure they point to (href without
    # its first character), keeping document order.
    layerdict = {}
    for link in hilayers:
        linkref = link.get("href")[1:]
        layer = link.get("data-hilayer")
        if not layer:
            # The attribute exists (XPath filter) but is empty.
            logging.warning("Did not find data-hilayer attribute!")
        else:
            layerdict.setdefault(linkref, []).append(layer)

    # Explicit UTF-8 so that non-ASCII layer names do not depend on, or
    # fail under, the platform's locale encoding.
    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['href', 'hilayers']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        for linkref, layers in layerdict.items():
            writer.writerow({'href': linkref, 'hilayers': ",".join(layers)})

    logging.info("Wrote " + str(filename))

    create_copypaste_layers(hilayers, id_lookup, output_dir)

    return
# def create_master_hilayers ends here
def create_master_hilinks(django_tree, id_lookup, output_dir):
    """Find the hilinks.

    These are links to hyperimage images that provide extra
    demonstration possibilities (like side by side, and so on). The href
    values here are not unique, they simply link to the viewer.

    Also in this case, the TEI file still contains the commands. They
    need to be written to the CSV, while a ref has to be inserted in the
    TEI file.

    Problem here: some refs are already inserted here to cater for the
    formatting of the figure numbers. Needs to be dealt with! Possibly
    through @corresp.

    In Django, they should come out as:
    <a href="#Fig74" class="HILink">7.4</a>

    Two files are written: ``hi_links.csv`` (href and link text) in
    ``output_dir`` and ``xml_snippets/hi_linkscopypaste.xml`` with the
    links converted in place to ``ref`` elements.

    Args:
        django_tree: parsed Django XML document.
        id_lookup: dict mapping Hyperimage ids to xml:ids.
        output_dir: ``pathlib.Path`` of the output directory.
    """
    filename = output_dir / 'hi_links.csv'

    hilinks = django_tree.xpath("//a[contains(@class, 'HILink') and not(@data-hilayer)]")
    logging.debug("Found %s hilinks", len(hilinks))

    # Explicit UTF-8 so that non-ASCII link text does not depend on, or
    # fail under, the platform's locale encoding.
    with open(str(filename), 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['href', 'text']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        for link in hilinks:
            if link.text:
                linktext = link.text
            elif len(link):
                # No direct text: fall back to the serialised first child.
                linktext = etree.tostring(link[0], with_tail=False, encoding='unicode')
            else:
                # Completely empty link; nothing to record as text.
                linktext = ""

            writer.writerow({'href': link.get("href")[1:], 'text': linktext})

    # NOTE(review): this pattern can match the empty string, and any text
    # that merely starts with a digit yields a non-empty match. Kept
    # unchanged; confirm whether a full match on the figure number was
    # intended.
    figure_number_pattern = re.compile(r"[0-9]{,2}(\.[0-9][0-9])?")

    copypaste_tree = etree.Element("reflinks")

    for link in hilinks:
        link.tag = "ref"
        link.tail = ""

        # link.text is None when the link only contains child elements;
        # treat that like text without a figure number.
        match_figure_name = figure_number_pattern.match(link.text or "")
        if match_figure_name.group():
            link.set("type", "number-hi")
        else:
            link.set("type", "text-hi")

        link.set("target", f"#{id_lookup[link.get('href')[1:]]}")
        copypaste_tree.append(link)

        attributes_to_be_stripped = ["href", "class"]
        for attribute in attributes_to_be_stripped:
            try:
                link.attrib.pop(attribute)
            except KeyError:
                logging.warning("Attribute %s not found.", attribute)

    output = output_dir / "xml_snippets" / "hi_linkscopypaste.xml"
    tree = etree.ElementTree(copypaste_tree)
    tree.write(str(output), pretty_print=True, xml_declaration=True, encoding="utf-8")
    logging.info("Wrote %s." % output)

    return
# def create_master_hilinks ends here
def get_tei_comments(tei_tree, output_dir):
    """Write the HYPERIMAGE comments of the TEI file to a text file.

    Every XML comment whose serialisation starts with
    ``<!-- HYPERIMAGE`` is written on its own line to
    ``<output_dir>/hi_comments.txt``; other comments are only logged.

    Args:
        tei_tree: parsed TEI document.
        output_dir: ``pathlib.Path`` of the output directory.
    """
    filename = output_dir / 'hi_comments.txt'

    comments = tei_tree.xpath("//comment()")
    logging.debug("Found %s comments", len(comments))

    # Explicit UTF-8: the comments are Unicode strings and must not
    # depend on, or fail under, the platform's locale encoding.
    with open(str(filename), 'w', newline="\n", encoding="utf-8") as textfile:
        for comment in comments:
            comment_line = etree.tostring(comment, with_tail=False, encoding='unicode')
            if comment_line.startswith("<!-- HYPERIMAGE"):
                textfile.write(comment_line + "\n")
            else:
                logging.info("Leaving out comment that starts with %s.", comment_line[0:30])

    return
# def get_tei_comments ends here
def main():
    """Parse the command line and create the Hyperimage standoff files.

    Depending on ``--csv``, either a CSV master file or a TEI XML file
    with the hielements is written to the output directory, which is
    created together with its ``json`` and ``xml_snippets``
    subdirectories if necessary.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("djangofile", help="The enriched Django file.")
    parser.add_argument("teifile", help="The original TEI file.")
    parser.add_argument("-c", "--csv", help="Use CSV output instead of XML", action="store_true")
    parser.add_argument(
        "-o", "--output-dir",
        default=DEFAULT_OUTPUT_DIR / "../hyperimage",
        help="output directory"
    )

    args = parser.parse_args()

    output_dir = Path(args.output_dir)

    # parents=True: the default "<base>/../hyperimage" cannot be created
    # with a plain mkdir while <base> itself does not exist.
    # exist_ok=True replaces the separate (racy) existence check.
    for directory in (output_dir, output_dir / "json", output_dir / "xml_snippets"):
        directory.mkdir(parents=True, exist_ok=True)

    django_tree = etree.parse(args.djangofile)
    tei_tree = etree.parse(args.teifile)

    ##############
    # First step #
    ##############
    if args.csv:
        logging.debug("Creating CSV as output formats.")
        id_lookup = create_master_hielements(django_tree, tei_tree, output_dir)
        # Above step has been done, the figure elements have to be matched
        # with the TEI file

        ###############
        # Second step #
        ###############
        # create_master_hilayers(django_tree, id_lookup, output_dir)
        # Above step has been done. The refs have to be included in the TEI
        # file, and the command has to be put into the CSV. The annotation
        # has already been done, this is just for documentation.

        ##############
        # Third step #
        ##############
        # create_master_hilinks(django_tree, id_lookup, output_dir)

        ###################
        # Additional step #
        ###################
        # get_tei_comments(tei_tree, output_dir)
    else:
        output_file = output_dir / 'hi_figures.xml'
        logging.debug("Creating TEI XML as output format.")
        id_lookup, div_list = create_xml_hielements(django_tree, tei_tree, output_dir)
        write_xml_result(div_list, output_file)
# def main ends here
if __name__ == '__main__': | |
main() | |
# finis |