Skip to content
Permalink
master
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
#!/usr/bin/env python3
# -*- coding: utf-8; mode: python -*-
"""Extract hyperimage information from Django.xml to files in order
to integrate it into TEI file.
"""
__version__ = "1.0"
__date__ = "20181120"
__author__ = "kthoden@mpiwg-berlin.mpg.de"
import argparse
import base64
import logging
import sys
import json
import csv
import re
import os
from pathlib import Path
from lxml import etree
# Log everything from DEBUG upwards, with timestamp and level name.
logging.basicConfig(
    level=logging.DEBUG,
    format=' %(asctime)s - %(levelname)s - %(message)s',
)

# TEI namespace URI, the prefix map used in XPath queries, and the
# namespace in Clark notation ("{uri}") for building qualified names.
ns_tei = "http://www.tei-c.org/ns/1.0"
NS_MAP = {"t": ns_tei}
TEI = "{" + ns_tei + "}"

# Base output directory: $OUTPUT_DIR when set, ./output otherwise.
DEFAULT_OUTPUT_DIR = Path(os.environ.get('OUTPUT_DIR', './output'))
def write_xml_result(div_list, xml_path):
    """Combine a generated TEI header with the given divs and write an XML file.

    A minimal ``teiHeader`` (title, publication statement and source
    description) is created, every element of ``div_list`` is appended
    to ``text/body`` and the tree is serialised to ``xml_path`` as UTF-8.

    The elements are built without a namespace. After serialisation the
    file is read back and the TEI default namespace declaration is added
    to the ``<TEI>`` start tag by plain string replacement.

    Args:
        div_list: iterable of lxml elements that become children of ``body``.
        xml_path: path of the XML file to write.
    """
    tei_root = etree.Element("TEI")

    tei_header = etree.SubElement(tei_root, "teiHeader")
    filedesc = etree.SubElement(tei_header, "fileDesc")
    titlestmt = etree.SubElement(filedesc, "titleStmt")
    etree.SubElement(titlestmt, "title").text = "Hyperimage code"
    pub_statement = etree.SubElement(filedesc, "publicationStmt")
    etree.SubElement(pub_statement, "title").text = "Data to insert into TEI document"
    sourcedesc = etree.SubElement(filedesc, "sourceDesc")
    etree.SubElement(sourcedesc, "p").text = "Created by histandoff.py"

    tei_text = etree.SubElement(tei_root, "text")
    tei_body = etree.SubElement(tei_text, "body")
    for div in div_list:
        tei_body.append(div)

    etree.ElementTree(tei_root).write(
        str(xml_path), pretty_print=True, xml_declaration=True, encoding="utf-8"
    )
    logging.info(f"Wrote XML file: {xml_path}.")

    # Add the TEI default namespace to the root element (brute force
    # solution). The file was serialised as UTF-8 above, so it has to be
    # read and rewritten as UTF-8 as well instead of relying on the
    # locale's default encoding.
    ns_string = ' xmlns="http://www.tei-c.org/ns/1.0"'
    with open(xml_path, 'r', encoding="utf-8") as textfile:
        xml_as_string = textfile.read()
    with_namespace = xml_as_string.replace("<TEI>", f"<TEI{ns_string}>")
    with open(xml_path, 'w', encoding="utf-8") as amended_textfile:
        amended_textfile.write(with_namespace)
# def write_xml_result ends here
def create_xmlid(number):
    """Create a placeholder xml:id from a figure number.

    A number of the form ``<chapter>.<figure>`` (one or two digits each,
    separated by a literal dot) becomes ``chapNN_figM-hi`` with the
    chapter zero-padded to two digits. Any other value has spaces and
    plus signs replaced by underscores and is prefixed with ``figid-``.
    The result is always prefixed with ``please_adjust_``.

    Args:
        number: the figure number as a string, e.g. ``"2.8"``.

    Returns:
        The generated identifier string.
    """
    # The dot is escaped so that only "chapter.figure" matches; an
    # unescaped "." would also accept e.g. "123" as chapter 1, figure 3.
    chapter_figure = re.match(r"([0-9]{1,2})\.([0-9]{1,2})$", number)
    if chapter_figure:
        chapter, figure = chapter_figure.groups()
        # The id replaces the raw number instead of being appended to it.
        identifier = "chap{:02d}_fig{}-hi".format(int(chapter), figure)
    else:
        identifier = "figid-" + number.replace(" ", "_").replace("+", "_")
    return "please_adjust_" + identifier
# def create_xmlid ends here
def get_tei_id(image_file, tei_tree):
    """Look up the xml:id of the TEI figure that shows ``image_file``.

    The TEI tree is searched for ``figure/graphic`` elements whose
    ``url`` is ``images/<image_file>``. The xml:id is read from the
    parent of the graphic. If the image is referenced several times, all
    parents must carry the same xml:id.

    The program exits with status 1 when no graphic is found or when the
    matches belong to figures with different xml:ids.
    """
    def get_parent_id(graphic_element):
        """Return the xml:id attribute of the graphic's parent element."""
        return graphic_element.getparent().attrib["{http://www.w3.org/XML/1998/namespace}id"]
    # def get_parent_id ends here

    graphics = tei_tree.xpath(f"//t:figure/t:graphic[@url='images/{image_file}']", namespaces=NS_MAP)

    if len(graphics) == 0:
        logging.error(f"Found no graphic with filename {image_file}. Exiting!")
        sys.exit(1)

    if len(graphics) == 1:
        return get_parent_id(graphics[0])

    # Several references: acceptable only if they all share one xml:id.
    logging.warning(f"Image {image_file} is referenced not once, but {len(graphics)} times!")
    figure_ids = [get_parent_id(graphic) for graphic in graphics]
    if len(set(figure_ids)) != 1:
        logging.error(f"There is more than one xml:id for {image_file}. You should check the source. Exiting")
        sys.exit(1)
    return figure_ids[0]
# def get_tei_id ends here
def create_copypaste_figures(list_of_elements, output_dir):
    """Create an XML file with ready-made elements for Hyperimage-activated figures.

    Each element of ``list_of_elements`` is turned in place into a
    ``figure`` of type ``hionly`` with a generated xml:id, a ``graphic``
    child inserted before its caption, and the caption renamed to
    ``head``. The element is moved into a ``figures`` root and the
    original Django attributes are removed.

    Args:
        list_of_elements: lxml elements carrying ``number`` and ``file``
            attributes and a ``caption`` child (an element without a
            caption raises IndexError).
        output_dir: ``pathlib.Path`` of the output directory; the result
            is written to ``xml_snippets/hi_figurescopypaste.xml`` in it.
    """
    copypaste_tree = etree.Element("figures")
    attributes_to_be_stripped = ["order", "file", "width", "height", "number", "hielement"]

    for element in list_of_elements:
        hinumber = element.get("number")
        xmlid = create_xmlid(hinumber)
        element.tag = "figure"
        element.set("type", "hionly")
        element.attrib["{http://www.w3.org/XML/1998/namespace}id"] = xmlid

        graphic = etree.Element("graphic", scale="50")
        graphic.set("url", element.get("file").replace("images", "images/"))
        caption = element.xpath("./caption")[0]
        caption.tag = "head"
        caption.addprevious(graphic)

        copypaste_tree.append(element)
        for attribute in attributes_to_be_stripped:
            try:
                element.attrib.pop(attribute)
            except KeyError:
                logging.warning("Attribute %s not found.", attribute)

    # Build the path from the output_dir argument, as the sibling
    # copy-paste writers do; no module-level snippets directory exists.
    output = output_dir / "xml_snippets" / "hi_figurescopypaste.xml"
    tree = etree.ElementTree(copypaste_tree)
    tree.write(str(output), pretty_print=True, xml_declaration=True, encoding="utf-8")
    logging.info("Wrote %s.", output)
#def create_copypaste_figures ends here
def create_copypaste_layers(list_of_elements, id_lookup, output_dir):
    """Write ready-made ``ref`` elements for references to Hyperimage layers.

    Every element in ``list_of_elements`` is converted in place into a
    ``ref`` of type ``text-hi``. Its ``select`` is taken from the
    ``data-hilayer`` attribute and its ``target`` is resolved through
    ``id_lookup`` using the ``href`` value without its first character.
    The refs are collected under a ``reflayers`` root and written to
    ``xml_snippets/hi_layerscopypaste.xml`` below ``output_dir``.
    """
    root = etree.Element("reflayers")
    root.append(etree.Comment(
        "Please observe: if the text of the reference is the\n"
        "number of an image, change the @type value to number-hi."
    ))

    obsolete_attributes = ("href", "data-hilayer", "class")
    for layer in list_of_elements:
        layer.tag = "ref"
        layer.tail = ""
        layer.set("type", "text-hi")
        layer.set("select", layer.get("data-hilayer"))
        layer.set("target", f"#{id_lookup[layer.get('href')[1:]]}")
        root.append(layer)
        # Drop the Django attributes now that their values are carried over.
        for name in obsolete_attributes:
            try:
                layer.attrib.pop(name)
            except KeyError:
                logging.warning("Attribute %s not found.", name)

    output = output_dir / "xml_snippets" / "hi_layerscopypaste.xml"
    etree.ElementTree(root).write(
        str(output), pretty_print=True, xml_declaration=True, encoding="utf-8"
    )
    logging.info("Wrote %s." % output)
    return
#def create_copypaste_layers ends here
def dump_json_file(json_string, filename):
    """Write ``json_string`` to ``filename`` as UTF-8 text.

    Args:
        json_string: the already serialised JSON text.
        filename: path of the file to (over)write.
    """
    # Explicit UTF-8: the text may contain non-ASCII characters and must
    # not depend on the locale's default encoding.
    with open(filename, "w", encoding="utf-8") as json_file:
        json_file.write(json_string)
    logging.info("Wrote %s", filename)
# def dump_json_file ends here
def decode_hielement(hielement_string, hinumber, output_dir):
    """Decode a base64 encoded hielement string to JSON text.

    Newlines and tabs are removed from the decoded text. The result is
    written to ``json/<hinumber>.json`` below ``output_dir`` and is also
    returned.
    """
    decoded_text = base64.decodebytes(hielement_string.encode()).decode("utf-8")
    json_string = decoded_text.replace("\n", "").replace("\t", "")
    dump_json_file(json_string, output_dir / "json" / f"{hinumber}.json")
    return json_string
# def decode_hielement ends here
def get_json_info(json_decoded):
    """Return the Hyperimage ID and the comma-joined layer names.

    Both values are read from the ``view`` entry of the decoded JSON:
    ``id`` as is, and the keys of ``layers`` joined with commas.
    """
    view = json_decoded["view"]
    hiid = view["id"]
    layer_names = ",".join(view["layers"].keys())
    return hiid, layer_names
# def get_json_info ends here
def create_xml_hielements(django_tree, tei_tree, output_dir):
    """Build one ``div`` per Hyperimage figure for the TEI-like master file.

    For every ``EOAfigure`` with a ``hielement`` attribute in the Django
    tree, the matching TEI xml:id is looked up, the hielement string is
    decoded to JSON, and a ``div`` is created holding the layer names
    (if any), the whitespace-normalised hielement string and the JSON
    as CDATA. Keeping the data as XML makes it easier to insert in an
    XSLT environment.

    Returns:
        A tuple ``(id_lookup, div_list)``: a dict mapping Hyperimage IDs
        to xml:ids, and the list of created ``div`` elements.
    """
    id_lookup = {}
    div_list = []

    hielements = django_tree.xpath("//EOAfigure[@hielement]")
    logging.debug("Found %s hielements", len(hielements))

    for element in hielements:
        image_file = re.sub(r"^images", "", element.get("file"))
        tei_id = get_tei_id(image_file, tei_tree)
        hinumber = element.get("number")
        if element.get("data-hilayer") is not None:
            logging.info(f"Figure {hinumber} contains a data-hilayer attribute: {element.get('data-hilayer')}")

        hielement_string = element.get("hielement")
        hielement_string_normalized = re.sub(r" +", " ", hielement_string)
        json_decoded = decode_hielement(hielement_string, tei_id, output_dir)
        hiid, hilayers = get_json_info(json.loads(json_decoded))
        id_lookup[hiid] = tei_id

        div_element = etree.Element("div")
        div_element.attrib["{http://www.w3.org/XML/1998/namespace}id"] = tei_id
        div_element.set("corresp", hiid)

        if hilayers:
            list_element = etree.SubElement(div_element, "list", type="layers")
            for layer_name in hilayers.split(","):
                item_element = etree.SubElement(list_element, "item")
                item_element.text = layer_name

        config_element = etree.SubElement(div_element, "ab", type="config")
        config_element.text = hielement_string_normalized
        json_element = etree.SubElement(div_element, "ab", type="jsonconfig")
        json_element.text = etree.CDATA(json_decoded)

        div_list.append(div_element)

    return (id_lookup, div_list)
# def create_xml_hielements ends here
def create_master_hielements(django_tree, tei_tree, output_dir):
    """Create the CSV master file for the hielements.

    All ``EOAfigure`` elements with a ``hielement`` attribute are read
    from the Django tree. These are figures that have Hyperimage
    functionality enabled. For each of them one row is written to
    ``hi_figures.csv`` in ``output_dir`` with the xml:id of the matching
    TEI figure, the Hyperimage ID, the layer names, and the
    whitespace-normalised hielement string. The Hyperimage ID and layer
    names are taken from the decoded hielement string, which is also
    dumped as a JSON file.

    Through the xml:id, the figures in the TEI file can be linked to the
    data in the CSV file. During the tei2imxml or imxml2django step, the
    hielement string is written back.

    Args:
        django_tree: parsed Django XML tree.
        tei_tree: parsed TEI tree used to resolve the xml:ids.
        output_dir: ``pathlib.Path`` of the output directory.

    Returns:
        A dict mapping Hyperimage IDs to TEI xml:ids.
    """
    id_lookup = {}
    hielements = django_tree.xpath("//EOAfigure[@hielement]")
    logging.debug("Found %s hielements", len(hielements))
    filename = output_dir / 'hi_figures.csv'
    fieldnames = ['xmlid', 'hiid', 'layers', 'elementstring']

    # Explicit UTF-8 so that non-ASCII values do not depend on the
    # locale's default encoding.
    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for element in hielements:
            image_file = re.sub(r"^images", "", element.get("file"))
            tei_id = get_tei_id(image_file, tei_tree)
            hinumber = element.get("number")
            if element.get("data-hilayer") is not None:
                logging.info(f"Figure {hinumber} contains a data-hilayer attribute: {element.get('data-hilayer')}")

            hielement_string = element.get("hielement")
            hielement_string_normalized = re.sub(r" +", " ", hielement_string)
            json_decoded = decode_hielement(hielement_string, tei_id, output_dir)
            hiid, hilayers = get_json_info(json.loads(json_decoded))
            id_lookup[hiid] = tei_id
            writer.writerow({
                'xmlid': tei_id,
                'hiid': hiid,
                'layers': hilayers,
                'elementstring': hielement_string_normalized,
            })

    logging.info("Wrote " + str(filename))
    return id_lookup
# def create_master_hielements ends here
def create_master_hilayers(django_tree, id_lookup, output_dir):
    """Create an overview of the hilayers.

    Hilayers are annotations on the images (polygon markings).

    The two attributes data-hilayer and href have to be retained as they
    occur. They connect to a part of the encoded string in the
    hielements. data-hilayer is a unique string. So, in the TEI file,
    the phrase to be linked is surrounded by a ref tag that contains
    this identifier and a common type.

    The CSV (``hi_layers.csv`` in ``output_dir``) contains one row per
    href with the comma-joined data-hilayer values that point to it.
    Afterwards the links are converted into ``ref`` elements by
    ``create_copypaste_layers``.

    Django: was mounted (see Fig. <a href="#Fig28" data-hilayer="Fig28note" class="HILink">2.8</a>)

    TEI: was mounted (see Fig. <ref target="#chap02_fig8" type="number-hi" select="Fig28note"/>)

    Args:
        django_tree: parsed Django XML tree.
        id_lookup: dict mapping Hyperimage IDs to TEI xml:ids.
        output_dir: ``pathlib.Path`` of the output directory.
    """
    hilayers = django_tree.xpath("//a[@data-hilayer]")
    logging.debug("Found %s hilayers", len(hilayers))
    filename = output_dir / 'hi_layers.csv'

    # Collect the layer names per link target (href without its first
    # character), keeping first-seen order.
    layerdict = {}
    for link in hilayers:
        linkref = link.get("href")[1:]
        layer = link.get("data-hilayer")
        if not layer:
            logging.warning("Did not find data-hilayer attribute!")
            continue
        if linkref in layerdict:
            layerdict[linkref] += f",{layer}"
        else:
            layerdict[linkref] = layer

    # Explicit UTF-8 so that non-ASCII layer names do not depend on the
    # locale's default encoding.
    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=['href', 'hilayers'])
        writer.writeheader()
        for linkref, layers in layerdict.items():
            writer.writerow({'href': linkref, 'hilayers': layers})
    logging.info("Wrote " + str(filename))

    create_copypaste_layers(hilayers, id_lookup, output_dir)
# def create_master_hilayers ends here
def create_master_hilinks(django_tree, id_lookup, output_dir):
    """Find the hilinks and write them as CSV and as ``ref`` snippets.

    These are links to hyperimage images that provide extra
    demonstration possibilities (like side by side, and so on). The href
    values here are not unique, they simply link to the viewer.

    Also in this case, the TEI file still contains the commands. They
    need to be written to the CSV, while a ref has to be inserted in the
    TEI file.

    Problem here: some refs are already inserted to cater for the
    formatting of the figure numbers. Needs to be dealt with! Possibly
    through @corresp.

    In Django, they should come out as:
    <a href="#Fig74" class="HILink">7.4</a>

    Two files are written below ``output_dir``: ``hi_links.csv`` (href
    and link text) and ``xml_snippets/hi_linkscopypaste.xml`` (the links
    converted in place to ``ref`` elements).

    Args:
        django_tree: parsed Django XML tree.
        id_lookup: dict mapping Hyperimage IDs to TEI xml:ids; a link
            whose href is not contained raises KeyError.
        output_dir: ``pathlib.Path`` of the output directory.
    """
    filename = output_dir / 'hi_links.csv'
    hilinks = django_tree.xpath("//a[contains(@class, 'HILink') and not(@data-hilayer)]")
    logging.debug("Found %s hilinks", len(hilinks))

    # Explicit UTF-8 so that non-ASCII link texts do not depend on the
    # locale's default encoding.
    with open(str(filename), 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=['href', 'text'])
        writer.writeheader()
        for link in hilinks:
            if link.text:
                linktext = link.text
            else:
                # No direct text: fall back to the serialised first child,
                # or to an empty string for a completely empty link.
                children = list(link)
                if children:
                    linktext = etree.tostring(children[0], with_tail=False, encoding='unicode')
                else:
                    linktext = ""
            writer.writerow({'href': link.get("href")[1:], 'text': linktext})

    # NOTE(review): this pattern can match the empty string, so the check
    # below effectively tests whether the text starts with a digit.
    figure_number_pattern = re.compile(r"[0-9]{,2}(\.[0-9][0-9])?")
    attributes_to_be_stripped = ["href", "class"]
    copypaste_tree = etree.Element("reflinks")
    for link in hilinks:
        link.tag = "ref"
        link.tail = ""
        # link.text is None when the link only has child elements; treat
        # that as non-numeric text instead of crashing in re.match.
        match_figure_name = figure_number_pattern.match(link.text or "")
        if match_figure_name.group():
            link.set("type", "number-hi")
        else:
            link.set("type", "text-hi")
        link.set("target", f"#{id_lookup[link.get('href')[1:]]}")
        copypaste_tree.append(link)
        for attribute in attributes_to_be_stripped:
            try:
                link.attrib.pop(attribute)
            except KeyError:
                logging.warning("Attribute %s not found.", attribute)

    output = output_dir / "xml_snippets" / "hi_linkscopypaste.xml"
    tree = etree.ElementTree(copypaste_tree)
    tree.write(str(output), pretty_print=True, xml_declaration=True, encoding="utf-8")
    logging.info("Wrote %s.", output)
# def create_master_hilinks ends here
def get_tei_comments(tei_tree, output_dir):
    """Write the HYPERIMAGE comments of the TEI file to a text file.

    All XML comments of ``tei_tree`` are serialised; those starting with
    ``<!-- HYPERIMAGE`` are written, one per line, to
    ``hi_comments.txt`` in ``output_dir``. Other comments are only
    mentioned in the log.

    Args:
        tei_tree: parsed TEI tree.
        output_dir: ``pathlib.Path`` of the output directory.
    """
    filename = output_dir / 'hi_comments.txt'
    comments = tei_tree.xpath("//comment()")
    logging.debug("Found %s comments", len(comments))

    # Explicit UTF-8 so that non-ASCII comment text does not depend on
    # the locale's default encoding.
    with open(str(filename), 'w', newline="\n", encoding="utf-8") as textfile:
        for comment in comments:
            comment_line = etree.tostring(comment, with_tail=False, encoding='unicode')
            if comment_line.startswith("<!-- HYPERIMAGE"):
                textfile.write(comment_line + "\n")
            else:
                logging.info("Leaving out comment that starts with %s.", comment_line[0:30])
# def get_tei_comments ends here
def main():
    """Parse the command line and extract the Hyperimage data.

    The Django file and the TEI file are parsed. With ``--csv`` the
    hielements are written to a CSV file; otherwise they are written as
    a TEI-like XML file ``hi_figures.xml``. The output directory and its
    ``json`` and ``xml_snippets`` subdirectories are created if needed.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("djangofile", help="The enriched Django file.")
    parser.add_argument("teifile", help="The original TEI file.")
    parser.add_argument("-c", "--csv", help="Use CSV output instead of XML", action="store_true")
    parser.add_argument(
        "-o", "--output-dir",
        default=DEFAULT_OUTPUT_DIR / "../hyperimage",
        help="output directory"
    )
    args = parser.parse_args()

    output_dir = Path(args.output_dir)
    # parents=True also creates missing intermediate directories (the
    # default path goes through DEFAULT_OUTPUT_DIR); exist_ok=True avoids
    # the race between checking for existence and creating.
    for directory in (output_dir, output_dir / "json", output_dir / "xml_snippets"):
        directory.mkdir(parents=True, exist_ok=True)

    django_tree = etree.parse(args.djangofile)
    tei_tree = etree.parse(args.teifile)

    ##############
    # First step #
    ##############
    if args.csv:
        logging.debug("Creating CSV as output formats.")
        id_lookup = create_master_hielements(django_tree, tei_tree, output_dir)
        # Above step has been done, the figure elements have to be matched
        # with the TEI file

        ###############
        # Second step #
        ###############
        # create_master_hilayers(django_tree, id_lookup, output_dir)
        # Above step has been done. The refs have to be included in the TEI
        # file, and the command has to be put into the CSV. The annotation
        # has already been done, this is just for documentation.

        ##############
        # Third step #
        ##############
        # create_master_hilinks(django_tree, id_lookup, output_dir)

        ###################
        # Additional step #
        ###################
        # get_tei_comments(tei_tree, output_dir)
    else:
        output_file = output_dir / 'hi_figures.xml'
        logging.debug("Creating TEI XML as output format.")
        id_lookup, div_list = create_xml_hielements(django_tree, tei_tree, output_dir)
        write_xml_result(div_list, output_file)
# def main ends here


if __name__ == '__main__':
    main()
# finis