# histandoff.py

#!/usr/bin/env python3
# -*- coding: utf-8; mode: python -*-

"""Extract hyperimage information from Django.xml to files in order
to integrate it into TEI file.
"""

__version__ = "1.0"
__date__ = "20181120"
__author__ = "kthoden@mpiwg-berlin.mpg.de"

import argparse
import base64
import logging
import sys
import json
import csv
import re
import os
from pathlib import Path
from lxml import etree

# Log everything from DEBUG upwards, prefixed with timestamp and level.
logging.basicConfig(level=logging.DEBUG, format=' %(asctime)s - %(levelname)s - %(message)s')

# TEI namespace URI, the prefix map used for XPath queries and the
# "{uri}" prefix for building namespace-qualified tag names.
ns_tei = "http://www.tei-c.org/ns/1.0"
NS_MAP = dict(t=ns_tei)
TEI = "{" + ns_tei + "}"

# Base output directory: $OUTPUT_DIR when set, otherwise ./output.
DEFAULT_OUTPUT_DIR = Path(os.environ.get('OUTPUT_DIR', './output'))


def write_xml_result(div_list, xml_path):
    """Wrap the given elements in a minimal TEI document and write it.

    Builds a ``TEI`` root holding a ``teiHeader`` (title, publication
    statement, source description) and a ``text/body`` that receives
    every element of ``div_list``. The tree is serialized without a
    namespace; afterwards the TEI default namespace is patched into the
    root start tag of the written file.

    Args:
        div_list: iterable of lxml elements to append to the body.
        xml_path: path of the XML file to write.
    """

    tei_root = etree.Element("TEI")

    tei_header = etree.SubElement(tei_root, "teiHeader")
    filedesc = etree.SubElement(tei_header, "fileDesc")
    titlestmt = etree.SubElement(filedesc, "titleStmt")
    etree.SubElement(titlestmt, "title").text = "Hyperimage code"
    pub_statement = etree.SubElement(filedesc, "publicationStmt")
    etree.SubElement(pub_statement, "title").text = "Data to insert into TEI document"
    sourcedesc = etree.SubElement(filedesc, "sourceDesc")
    etree.SubElement(sourcedesc, "p").text = "Created by histandoff.py"

    tei_text = etree.SubElement(tei_root, "text")
    tei_body = etree.SubElement(tei_text, "body")
    for div in div_list:
        tei_body.append(div)

    xml_tree = etree.ElementTree(tei_root)
    xml_tree.write(str(xml_path), pretty_print=True, xml_declaration=True, encoding="utf-8")
    logging.info(f"Wrote XML file: {xml_path}.")

    # Add the TEI default namespace by patching the serialized root tag
    # (brute force solution: the elements above carry no namespace).
    ns_string = ' xmlns="http://www.tei-c.org/ns/1.0"'

    # The file was written as UTF-8 above, so it has to be read and
    # rewritten as UTF-8 as well, not in the platform default encoding.
    with open(xml_path, 'r', encoding="utf-8") as textfile:
        xml_as_string = textfile.read()

    with_namespace = xml_as_string.replace("<TEI>", f"<TEI{ns_string}>")

    with open(xml_path, 'w', encoding="utf-8") as amended_textfile:
        amended_textfile.write(with_namespace)

    return
# def write_xml_result ends here


def create_xmlid(number):
    """Create a placeholder xml:id from a figure number.

    A number of the form ``<chapter>.<figure>`` (one or two digits each)
    becomes ``chapNN_figM-hi`` with the chapter zero-padded to two
    digits. Any other value gets its spaces and plus signs replaced by
    underscores and is prefixed with ``figid-``. The result always
    starts with ``please_adjust_``.

    Args:
        number: the figure number as a string.

    Returns:
        The generated id string.
    """

    # The dot is escaped so that only a literal "." separates chapter
    # and figure number.
    chap_string = re.compile(r"([0-9]{1,2})\.([0-9]{1,2})$")

    match_convention = chap_string.match(number)

    if match_convention:
        chapter, figure = match_convention.groups()
        # Replace the number by the id; appending to it would leave the
        # raw number in front of the generated id.
        xmlid = "chap{:02d}_fig{}-hi".format(int(chapter), figure)
    else:
        xmlid = "figid-" + number.replace(" ", "_").replace("+", "_")

    return "please_adjust_" + xmlid
# def create_xmlid ends here


def get_tei_id(image_file, tei_tree):
    """Retrieve the xml:id of the TEI figure that shows ``image_file``.

    Looks for ``figure/graphic`` elements in the TEI namespace whose
    ``url`` is ``images/<image_file>`` and returns the ``xml:id`` of the
    parent ``figure``. When the image is referenced several times, all
    parent figures have to carry the same id.

    Args:
        image_file: file name of the image without the ``images/`` part.
        tei_tree: parsed TEI document offering lxml's ``xpath`` method.

    Returns:
        The xml:id of the figure.

    The program exits with status 1 (after logging an error) when no
    graphic is found, when a figure has no xml:id, or when figures with
    different ids reference the image.
    """

    xml_id_attribute = "{http://www.w3.org/XML/1998/namespace}id"

    # The URL is handed over as an XPath variable so that quotes in the
    # file name cannot break the expression.
    graphics = tei_tree.xpath(
        "//t:figure/t:graphic[@url=$url]",
        namespaces=NS_MAP,
        url=f"images/{image_file}",
    )

    if not graphics:
        logging.error(f"Found no graphic with filename {image_file}. Exiting!")
        sys.exit(1)

    if len(graphics) > 1:
        logging.warning(f"Image {image_file} is referenced not once, but {len(graphics)} times!")

    figure_ids = set()
    for graphic in graphics:
        figure_id = graphic.getparent().get(xml_id_attribute)
        if figure_id is None:
            logging.error(f"The figure containing {image_file} has no xml:id. Exiting!")
            sys.exit(1)
        figure_ids.add(figure_id)

    if len(figure_ids) != 1:
        logging.error(f"There is more than one xml:id for {image_file}. You should check the source. Exiting")
        sys.exit(1)

    return figure_ids.pop()
# def get_tei_id ends here


def create_copypaste_figures(list_of_elements, output_dir):
    """Write ready-made figure elements for Hyperimage-activated figures.

    Every element is turned in place into a ``figure`` of type
    ``hionly`` with a generated xml:id, receives a ``graphic`` child in
    front of its caption (which is renamed to ``head``) and is moved
    into a ``figures`` root. The result is written to
    ``<output_dir>/xml_snippets/hi_figurescopypaste.xml``.

    Args:
        list_of_elements: lxml figure elements; they are modified and
            re-parented.
        output_dir: base output directory as a ``pathlib.Path``.
    """

    copypaste_tree = etree.Element("figures")
    for element in list_of_elements:
        xmlid = create_xmlid(element.get("number"))
        element.tag = "figure"
        element.set("type", "hionly")
        element.attrib["{http://www.w3.org/XML/1998/namespace}id"] = xmlid

        graphic = etree.Element("graphic", scale="50")
        # Only the leading "images" gets the slash; a later occurrence
        # of the word inside the file name is left alone.
        graphic.set("url", re.sub(r"^images", "images/", element.get("file")))

        captions = element.xpath("./caption")
        if captions:
            caption = captions[0]
            caption.tag = "head"
            caption.addprevious(graphic)
        else:
            logging.warning("Figure %s has no caption.", xmlid)
            element.append(graphic)

        copypaste_tree.append(element)
        attributes_to_be_stripped = ["order", "file", "width", "height", "number", "hielement"]
        for attribute in attributes_to_be_stripped:
            try:
                element.attrib.pop(attribute)
            except KeyError:
                logging.warning("Attribute %s not found.", attribute)

    # Derive the target from the output_dir argument; the snippets
    # directory variable of main() is not visible in this function.
    output = output_dir / "xml_snippets" / "hi_figurescopypaste.xml"
    tree = etree.ElementTree(copypaste_tree)
    tree.write(str(output), pretty_print=True, xml_declaration=True, encoding="utf-8")
    logging.info("Wrote %s." % output)

    return
#def create_copypaste_figures ends here


def create_copypaste_layers(list_of_elements, id_lookup, output_dir):
    """Write ready-made ref elements for references to Hyperimage layers.

    Every element is turned in place into a ``ref`` of type ``text-hi``
    whose ``select`` is the former ``data-hilayer`` value and whose
    ``target`` is the id that ``id_lookup`` holds for the href (without
    its first character). The refs are collected below a ``reflayers``
    root and written to
    ``<output_dir>/xml_snippets/hi_layerscopypaste.xml``.
    """

    root = etree.Element("reflayers")
    root.append(etree.Comment(
        "Please observe: if the text of the reference is the\n"
        "    number of an image, change the @type value to number-hi."
    ))

    obsolete_attributes = ("href", "data-hilayer", "class")

    for ref in list_of_elements:
        ref.tag = "ref"
        ref.tail = ""

        ref.set("type", "text-hi")
        ref.set("select", ref.get("data-hilayer"))
        ref.set("target", f"#{id_lookup[ref.get('href')[1:]]}")

        root.append(ref)

        # Drop the source attributes once their values have been used.
        for name in obsolete_attributes:
            if name in ref.attrib:
                del ref.attrib[name]
            else:
                logging.warning("Attribute %s not found.", name)

    output = output_dir / "xml_snippets" / "hi_layerscopypaste.xml"
    etree.ElementTree(root).write(str(output), pretty_print=True, xml_declaration=True, encoding="utf-8")
    logging.info("Wrote %s." % output)

    return
#def create_copypaste_layers ends here


def dump_json_file(json_string, filename):
    """Write a JSON string to a file.

    Args:
        json_string: the text to write.
        filename: path of the file to create or overwrite.
    """

    # Write UTF-8 explicitly so the result does not depend on the
    # platform's default encoding.
    with open(filename, "w", encoding="utf-8") as json_file:
        json_file.write(json_string)
        logging.info("Wrote %s", filename)

    return
# def dump_json_file ends here


def decode_hielement(hielement_string, hinumber, output_dir):
    """Decode a base64 encoded hielement string to JSON text.

    Newlines and tabs are removed from the decoded text, which is then
    stored as ``<output_dir>/json/<hinumber>.json`` and returned.
    """

    decoded = base64.decodebytes(hielement_string.encode()).decode("utf-8")
    json_string = decoded.replace("\n", "").replace("\t", "")

    dump_json_file(json_string, output_dir / "json" / f"{hinumber}.json")

    return json_string
# def decode_hielement ends here


def get_json_info(json_decoded):
    """Return the Hyperimage id and the comma-joined layer names.

    Both values are read from the ``view`` entry of the decoded JSON:
    the id from ``id``, the layer names from the keys of ``layers``.
    """

    view = json_decoded["view"]
    hiid = view["id"]
    layer_names = view["layers"].keys()

    return hiid, ",".join(layer_names)
# def get_json_info ends here


def create_xml_hielements(django_tree, tei_tree, output_dir):
    """Build one div element per Hyperimage figure of the Django file.

    For every ``EOAfigure`` with an ``hielement`` attribute the xml:id
    of the matching TEI figure is looked up, the hielement string is
    decoded (which also writes a JSON file) and a ``div`` is built that
    carries the xml:id, the Hyperimage id as ``corresp``, an optional
    list of layer names, the whitespace-normalized hielement string and
    the decoded JSON as CDATA.

    Returns:
        A tuple of the Hyperimage-id-to-xml:id dictionary and the list
        of div elements.
    """

    xml_id = "{http://www.w3.org/XML/1998/namespace}id"
    lookup = {}
    divs = []

    figures = django_tree.xpath("//EOAfigure[@hielement]")
    logging.debug("Found %s hielements", len(figures))
    for figure in figures:
        image_file = re.sub(r"^images", "", figure.get("file"))
        tei_id = get_tei_id(image_file, tei_tree)
        hinumber = figure.get("number")
        if figure.get("data-hilayer") is not None:
            logging.info(f"Figure {hinumber} contains a data-hilayer attribute: {figure.get('data-hilayer')}")

        encoded = figure.get("hielement")
        normalized = re.sub(r" +", " ", encoded)
        json_decoded = decode_hielement(encoded, tei_id, output_dir)
        hiid, hilayers = get_json_info(json.loads(json_decoded))
        lookup[hiid] = tei_id

        div = etree.Element("div")
        div.attrib[xml_id] = tei_id
        div.set("corresp", hiid)
        if hilayers:
            layer_list = etree.SubElement(div, "list", type="layers")
            for layer_name in hilayers.split(","):
                etree.SubElement(layer_list, "item").text = layer_name
        etree.SubElement(div, "ab", type="config").text = normalized
        etree.SubElement(div, "ab", type="jsonconfig").text = etree.CDATA(json_decoded)

        divs.append(div)

    return (lookup, divs)
# def create_xml_hielements ends here


def create_master_hielements(django_tree, tei_tree, output_dir):
    """Write a CSV master file for the hielements.

    For every ``EOAfigure`` with an ``hielement`` attribute in the
    Django file, the xml:id of the matching TEI figure is looked up and
    the hielement string is decoded (which also writes a JSON file).
    One row per figure is written to ``<output_dir>/hi_figures.csv``
    with the columns ``xmlid``, ``hiid``, ``layers`` and
    ``elementstring`` (the hielement string with runs of spaces
    collapsed).

    Args:
        django_tree: parsed Django XML document.
        tei_tree: parsed TEI document.
        output_dir: base output directory as a ``pathlib.Path``.

    Returns:
        Dictionary mapping each Hyperimage id to the figure's xml:id.
    """

    id_lookup = {}

    hielements = django_tree.xpath("//EOAfigure[@hielement]")
    logging.debug("Found %s hielements", len(hielements))
    filename = output_dir / 'hi_figures.csv'
    # UTF-8 is set explicitly so that ids or layer names outside ASCII
    # are written the same way on every platform.
    with open(filename, 'w', newline='', encoding="utf-8") as csvfile:
        fieldnames = ['xmlid', 'hiid', 'layers', 'elementstring']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for element in hielements:
            image_file = re.sub(r"^images", "", element.get("file"))
            tei_id = get_tei_id(image_file, tei_tree)
            hinumber = element.get("number")
            if element.get("data-hilayer") is not None:
                logging.info(f"Figure {hinumber} contains a data-hilayer attribute: {element.get('data-hilayer')}")

            hielement_string = element.get("hielement")
            hielement_string_normalized = re.sub(r" +", " ", hielement_string)
            json_decoded = decode_hielement(hielement_string, tei_id, output_dir)
            hiid, hilayers = get_json_info(json.loads(json_decoded))
            id_lookup[hiid] = tei_id

            writer.writerow({'xmlid': tei_id, 'hiid': hiid, 'layers': hilayers, 'elementstring': hielement_string_normalized})

    logging.info("Wrote " + str(filename))

    return id_lookup
# def create_master_hielements ends here


def create_master_hilayers(django_tree, id_lookup, output_dir):
    """Create an overview over hilayers.

    Collects all ``a`` elements of the Django file that carry a
    ``data-hilayer`` attribute, groups the layer names by href (without
    its first character) and writes them to
    ``<output_dir>/hi_layers.csv`` with the columns ``href`` and
    ``hilayers`` (comma-separated). Afterwards the same elements are
    handed to ``create_copypaste_layers``.

    Example of the conversion:

    Django: (see Fig. <a href="#Fig28" data-hilayer="Fig28note" class="HILink">2.8</a>)
    TEI: (see Fig. <ref target="#chap02_fig8" type="number-hi" select="Fig28note"/>

    Args:
        django_tree: parsed Django XML document.
        id_lookup: dictionary mapping Hyperimage ids to xml:ids.
        output_dir: base output directory as a ``pathlib.Path``.
    """

    hilayers = django_tree.xpath("//a[@data-hilayer]")
    logging.debug("Found %s hilayers", len(hilayers))
    filename = output_dir / 'hi_layers.csv'

    layerdict = {}

    for link in hilayers:
        layer = link.get("data-hilayer")
        if not layer:
            # The attribute is present (the XPath requires it) but
            # empty. Skip the link instead of recording a layer name
            # left over from a previous iteration.
            logging.warning("Did not find data-hilayer attribute!")
            continue

        linkref = link.get("href")[1:]
        layerdict.setdefault(linkref, []).append(layer)

    with open(filename, 'w', newline='', encoding="utf-8") as csvfile:
        fieldnames = ['href', 'hilayers']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for linkref, layers in layerdict.items():
            writer.writerow({'href': linkref, 'hilayers': ",".join(layers)})

    logging.info("Wrote " + str(filename))

    create_copypaste_layers(hilayers, id_lookup, output_dir)

    return
# def create_master_hilayers ends here


def create_master_hilinks(django_tree, id_lookup, output_dir):
    """Find the hilinks and write them as CSV and as ref snippets.

    Hilinks are ``a`` elements whose class contains ``HILink`` and that
    have no ``data-hilayer`` attribute, for instance:

    <a href="#Fig74" class="HILink">7.4</a>

    Their href (without its first character) and text are written to
    ``<output_dir>/hi_links.csv``. Then every link is turned in place
    into a ``ref`` whose ``target`` is taken from ``id_lookup`` and
    whose ``type`` is ``number-hi`` or ``text-hi``; the refs are
    written to ``<output_dir>/xml_snippets/hi_linkscopypaste.xml``.

    Args:
        django_tree: parsed Django XML document.
        id_lookup: dictionary mapping Hyperimage ids to xml:ids.
        output_dir: base output directory as a ``pathlib.Path``.
    """

    filename = output_dir / 'hi_links.csv'

    hilinks = django_tree.xpath("//a[contains(@class, 'HILink') and not(@data-hilayer)]")
    logging.debug("Found %s hilinks", len(hilinks))
    with open(str(filename), 'w', newline='', encoding="utf-8") as csvfile:
        fieldnames = ['href', 'text']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for link in hilinks:
            if link.text:
                linktext = link.text
            elif len(link):
                # No leading text: fall back to the serialized first child.
                linktext = etree.tostring(link[0], with_tail=False, encoding='unicode')
            else:
                linktext = ""
            writer.writerow({'href': link.get("href")[1:], 'text': linktext})

    # Every part of this pattern is optional, so it always matches; the
    # matched text is non-empty only if the link text starts with a digit.
    figure_number_pattern = re.compile(r"[0-9]{,2}(\.[0-9][0-9])?")

    copypaste_tree = etree.Element("reflinks")
    for link in hilinks:
        link.tag = "ref"
        link.tail = ""
        # link.text is None when the link starts with a child element;
        # such links are treated as text references.
        match_figure_name = figure_number_pattern.match(link.text or "")
        if match_figure_name.group():
            link.set("type", "number-hi")
        else:
            link.set("type", "text-hi")
        link.set("target", f"#{id_lookup[link.get('href')[1:]]}")

        copypaste_tree.append(link)

        attributes_to_be_stripped = ["href", "class"]
        for attribute in attributes_to_be_stripped:
            try:
                link.attrib.pop(attribute)
            except KeyError:
                logging.warning("Attribute %s not found.", attribute)

    output = output_dir / "xml_snippets" / "hi_linkscopypaste.xml"
    tree = etree.ElementTree(copypaste_tree)
    tree.write(str(output), pretty_print=True, xml_declaration=True, encoding="utf-8")
    logging.info("Wrote %s." % output)

    return
# def create_master_hilinks ends here

def get_tei_comments(tei_tree, output_dir):
    """Write the HYPERIMAGE comments of the TEI file to a text file.

    Every XML comment whose serialization starts with
    ``<!-- HYPERIMAGE`` is written on its own line to
    ``<output_dir>/hi_comments.txt``; all other comments are only
    mentioned in the log.

    Args:
        tei_tree: parsed TEI document.
        output_dir: base output directory as a ``pathlib.Path``.
    """

    filename = output_dir / 'hi_comments.txt'

    comments = tei_tree.xpath("//comment()")
    logging.debug("Found %s comments", len(comments))
    # UTF-8 is set explicitly: comment text may contain characters the
    # platform's default encoding cannot represent.
    with open(str(filename), 'w', newline="\n", encoding="utf-8") as textfile:
        for comment in comments:
            comment_line = etree.tostring(comment, with_tail=False, encoding='unicode')
            if comment_line.startswith("<!-- HYPERIMAGE"):
                textfile.write(comment_line + "\n")
            else:
                logging.info("Leaving out comment that starts with %s.", comment_line[0:30])
    return
# def get_tei_comments ends here

def main():
    """Parse the command line, prepare the output directories and run.

    With ``--csv`` the hielements are written as CSV, otherwise as a
    TEI XML file ``hi_figures.xml`` in the output directory.
    """

    parser = argparse.ArgumentParser()
    parser.add_argument("djangofile", help="The enriched Django file.")
    parser.add_argument("teifile", help="The original TEI file.")
    parser.add_argument("-c", "--csv", help="Use CSV output instead of XML", action="store_true")
    parser.add_argument(
            "-o", "--output-dir",
            default = DEFAULT_OUTPUT_DIR / "../hyperimage",
            help="output directory"
    )
    args = parser.parse_args()

    output_dir = Path(args.output_dir)

    # Create the directories including missing parents. The default
    # path runs through DEFAULT_OUTPUT_DIR, which may not exist yet.
    for directory in (output_dir, output_dir / "json", output_dir / "xml_snippets"):
        directory.mkdir(parents=True, exist_ok=True)

    django_tree = etree.parse(args.djangofile)
    tei_tree = etree.parse(args.teifile)

    ##############
    # First step #
    ##############
    if args.csv:
        logging.debug("Creating CSV as output formats.")
        id_lookup = create_master_hielements(django_tree, tei_tree, output_dir)
        # Above step has been done, the figure elements have to be matched
        # with the TEI file

        ###############
        # Second step #
        ###############
        # create_master_hilayers(django_tree, id_lookup, output_dir)

        # Above step has been done. The refs have to be included in the TEI
        # file, and the command has to be put into the CSV. The annotation
        # has already been done, this is just for documentation.

        ##############
        # Third step #
        ##############
        # create_master_hilinks(django_tree, id_lookup, output_dir)

        ###################
        # Additional step #
        ###################
        # get_tei_comments(tei_tree, output_dir)
    else:
        output_file = output_dir / 'hi_figures.xml'
        logging.debug("Creating TEI XML as output format.")
        id_lookup, div_list = create_xml_hielements(django_tree, tei_tree, output_dir)
        write_xml_result(div_list, output_file)
# def main ends here

if __name__ == '__main__':
    main()
# finis
	#!/usr/bin/env python3
	# -*- coding: utf-8; mode: python -*-

	"""Extract hyperimage information from Django.xml to files in order
	to integrate it into TEI file.
	"""

	__version__ = "1.0"
	__date__ = "20181120"
	__author__ = "kthoden@mpiwg-berlin.mpg.de"

	import argparse
	import base64
	import logging
	import sys
	import json
	import csv
	import re
	import os
	from pathlib import Path
	from lxml import etree

	# Log everything from DEBUG upwards, prefixed with timestamp and level.
	logging.basicConfig(level=logging.DEBUG, format=' %(asctime)s - %(levelname)s - %(message)s')

	# TEI namespace URI, the prefix map used for XPath queries and the
	# "{uri}" prefix for building namespace-qualified tag names.
	ns_tei = "http://www.tei-c.org/ns/1.0"
	NS_MAP = dict(t=ns_tei)
	TEI = "{" + ns_tei + "}"

	# Base output directory: $OUTPUT_DIR when set, otherwise ./output.
	DEFAULT_OUTPUT_DIR = Path(os.environ.get('OUTPUT_DIR', './output'))


	def write_xml_result(div_list, xml_path):
	    """Wrap the given elements in a minimal TEI document and write it.

	    Builds a ``TEI`` root holding a ``teiHeader`` (title, publication
	    statement, source description) and a ``text/body`` that receives
	    every element of ``div_list``. The tree is serialized without a
	    namespace; afterwards the TEI default namespace is patched into the
	    root start tag of the written file.

	    Args:
	        div_list: iterable of lxml elements to append to the body.
	        xml_path: path of the XML file to write.
	    """

	    tei_root = etree.Element("TEI")

	    tei_header = etree.SubElement(tei_root, "teiHeader")
	    filedesc = etree.SubElement(tei_header, "fileDesc")
	    titlestmt = etree.SubElement(filedesc, "titleStmt")
	    etree.SubElement(titlestmt, "title").text = "Hyperimage code"
	    pub_statement = etree.SubElement(filedesc, "publicationStmt")
	    etree.SubElement(pub_statement, "title").text = "Data to insert into TEI document"
	    sourcedesc = etree.SubElement(filedesc, "sourceDesc")
	    etree.SubElement(sourcedesc, "p").text = "Created by histandoff.py"

	    tei_text = etree.SubElement(tei_root, "text")
	    tei_body = etree.SubElement(tei_text, "body")
	    for div in div_list:
	        tei_body.append(div)

	    xml_tree = etree.ElementTree(tei_root)
	    xml_tree.write(str(xml_path), pretty_print=True, xml_declaration=True, encoding="utf-8")
	    logging.info(f"Wrote XML file: {xml_path}.")

	    # Add the TEI default namespace by patching the serialized root tag
	    # (brute force solution: the elements above carry no namespace).
	    ns_string = ' xmlns="http://www.tei-c.org/ns/1.0"'

	    # The file was written as UTF-8 above, so it has to be read and
	    # rewritten as UTF-8 as well, not in the platform default encoding.
	    with open(xml_path, 'r', encoding="utf-8") as textfile:
	        xml_as_string = textfile.read()

	    with_namespace = xml_as_string.replace("<TEI>", f"<TEI{ns_string}>")

	    with open(xml_path, 'w', encoding="utf-8") as amended_textfile:
	        amended_textfile.write(with_namespace)

	    return
	# def write_xml_result ends here


	def create_xmlid(number):
	    """Create a placeholder xml:id from a figure number.

	    A number of the form ``<chapter>.<figure>`` (one or two digits each)
	    becomes ``chapNN_figM-hi`` with the chapter zero-padded to two
	    digits. Any other value gets its spaces and plus signs replaced by
	    underscores and is prefixed with ``figid-``. The result always
	    starts with ``please_adjust_``.

	    Args:
	        number: the figure number as a string.

	    Returns:
	        The generated id string.
	    """

	    # The dot is escaped so that only a literal "." separates chapter
	    # and figure number.
	    chap_string = re.compile(r"([0-9]{1,2})\.([0-9]{1,2})$")

	    match_convention = chap_string.match(number)

	    if match_convention:
	        chapter, figure = match_convention.groups()
	        # Replace the number by the id; appending to it would leave the
	        # raw number in front of the generated id.
	        xmlid = "chap{:02d}_fig{}-hi".format(int(chapter), figure)
	    else:
	        xmlid = "figid-" + number.replace(" ", "_").replace("+", "_")

	    return "please_adjust_" + xmlid
	# def create_xmlid ends here


	def get_tei_id(image_file, tei_tree):
	    """Retrieve the xml:id of the TEI figure that shows ``image_file``.

	    Looks for ``figure/graphic`` elements in the TEI namespace whose
	    ``url`` is ``images/<image_file>`` and returns the ``xml:id`` of the
	    parent ``figure``. When the image is referenced several times, all
	    parent figures have to carry the same id.

	    Args:
	        image_file: file name of the image without the ``images/`` part.
	        tei_tree: parsed TEI document offering lxml's ``xpath`` method.

	    Returns:
	        The xml:id of the figure.

	    The program exits with status 1 (after logging an error) when no
	    graphic is found, when a figure has no xml:id, or when figures with
	    different ids reference the image.
	    """

	    xml_id_attribute = "{http://www.w3.org/XML/1998/namespace}id"

	    # The URL is handed over as an XPath variable so that quotes in the
	    # file name cannot break the expression.
	    graphics = tei_tree.xpath(
	        "//t:figure/t:graphic[@url=$url]",
	        namespaces=NS_MAP,
	        url=f"images/{image_file}",
	    )

	    if not graphics:
	        logging.error(f"Found no graphic with filename {image_file}. Exiting!")
	        sys.exit(1)

	    if len(graphics) > 1:
	        logging.warning(f"Image {image_file} is referenced not once, but {len(graphics)} times!")

	    figure_ids = set()
	    for graphic in graphics:
	        figure_id = graphic.getparent().get(xml_id_attribute)
	        if figure_id is None:
	            logging.error(f"The figure containing {image_file} has no xml:id. Exiting!")
	            sys.exit(1)
	        figure_ids.add(figure_id)

	    if len(figure_ids) != 1:
	        logging.error(f"There is more than one xml:id for {image_file}. You should check the source. Exiting")
	        sys.exit(1)

	    return figure_ids.pop()
	# def get_tei_id ends here


	def create_copypaste_figures(list_of_elements, output_dir):
	    """Write ready-made figure elements for Hyperimage-activated figures.

	    Every element is turned in place into a ``figure`` of type
	    ``hionly`` with a generated xml:id, receives a ``graphic`` child in
	    front of its caption (which is renamed to ``head``) and is moved
	    into a ``figures`` root. The result is written to
	    ``<output_dir>/xml_snippets/hi_figurescopypaste.xml``.

	    Args:
	        list_of_elements: lxml figure elements; they are modified and
	            re-parented.
	        output_dir: base output directory as a ``pathlib.Path``.
	    """

	    copypaste_tree = etree.Element("figures")
	    for element in list_of_elements:
	        xmlid = create_xmlid(element.get("number"))
	        element.tag = "figure"
	        element.set("type", "hionly")
	        element.attrib["{http://www.w3.org/XML/1998/namespace}id"] = xmlid

	        graphic = etree.Element("graphic", scale="50")
	        # Only the leading "images" gets the slash; a later occurrence
	        # of the word inside the file name is left alone.
	        graphic.set("url", re.sub(r"^images", "images/", element.get("file")))

	        captions = element.xpath("./caption")
	        if captions:
	            caption = captions[0]
	            caption.tag = "head"
	            caption.addprevious(graphic)
	        else:
	            logging.warning("Figure %s has no caption.", xmlid)
	            element.append(graphic)

	        copypaste_tree.append(element)
	        attributes_to_be_stripped = ["order", "file", "width", "height", "number", "hielement"]
	        for attribute in attributes_to_be_stripped:
	            try:
	                element.attrib.pop(attribute)
	            except KeyError:
	                logging.warning("Attribute %s not found.", attribute)

	    # Derive the target from the output_dir argument; the snippets
	    # directory variable of main() is not visible in this function.
	    output = output_dir / "xml_snippets" / "hi_figurescopypaste.xml"
	    tree = etree.ElementTree(copypaste_tree)
	    tree.write(str(output), pretty_print=True, xml_declaration=True, encoding="utf-8")
	    logging.info("Wrote %s." % output)

	    return
	#def create_copypaste_figures ends here


	def create_copypaste_layers(list_of_elements, id_lookup, output_dir):
	    """Write ready-made ref elements for references to Hyperimage layers.

	    Every element is turned in place into a ``ref`` of type ``text-hi``
	    whose ``select`` is the former ``data-hilayer`` value and whose
	    ``target`` is the id that ``id_lookup`` holds for the href (without
	    its first character). The refs are collected below a ``reflayers``
	    root and written to
	    ``<output_dir>/xml_snippets/hi_layerscopypaste.xml``.

	    Args:
	        list_of_elements: lxml link elements; they are modified and
	            re-parented.
	        id_lookup: dictionary mapping Hyperimage ids to xml:ids.
	        output_dir: base output directory as a ``pathlib.Path``.
	    """

	    copypaste_tree = etree.Element("reflayers")

	    # The comment text is spelled with escapes so that its content does
	    # not depend on how this source line is indented.
	    advice = etree.Comment(
	        "Please observe: if the text of the reference is the\n"
	        "\tnumber of an image, change the @type value to number-hi."
	    )

	    copypaste_tree.append(advice)
	    for layer in list_of_elements:
	        layer.tag = "ref"
	        layer.tail = ""

	        layer.set("type", "text-hi")
	        layer.set("select", layer.get("data-hilayer"))
	        layer.set("target", f"#{id_lookup[layer.get('href')[1:]]}")

	        copypaste_tree.append(layer)

	        # Drop the source attributes once their values have been used.
	        attributes_to_be_stripped = ["href", "data-hilayer", "class"]
	        for attribute in attributes_to_be_stripped:
	            try:
	                layer.attrib.pop(attribute)
	            except KeyError:
	                logging.warning("Attribute %s not found.", attribute)

	    output = output_dir / "xml_snippets" / "hi_layerscopypaste.xml"
	    tree = etree.ElementTree(copypaste_tree)
	    tree.write(str(output), pretty_print=True, xml_declaration=True, encoding="utf-8")
	    logging.info("Wrote %s." % output)

	    return
	#def create_copypaste_layers ends here


	def dump_json_file(json_string, filename):
	    """Write a JSON string to a file.

	    Args:
	        json_string: the text to write.
	        filename: path of the file to create or overwrite.
	    """

	    # Write UTF-8 explicitly so the result does not depend on the
	    # platform's default encoding.
	    with open(filename, "w", encoding="utf-8") as json_file:
	        json_file.write(json_string)
	        logging.info("Wrote %s", filename)

	    return
	# def dump_json_file ends here


	def decode_hielement(hielement_string, hinumber, output_dir):
	    """Decode a base64 encoded hielement string to JSON text.

	    Newlines and tabs are removed from the decoded text, which is then
	    stored as ``<output_dir>/json/<hinumber>.json`` and returned.

	    Args:
	        hielement_string: the base64 encoded string.
	        hinumber: used as the stem of the JSON file name.
	        output_dir: base output directory as a ``pathlib.Path``.

	    Returns:
	        The decoded JSON text without newlines and tabs.
	    """

	    byte_string = hielement_string.encode()
	    json_representation = base64.decodebytes(byte_string).decode("utf-8")

	    json_string = json_representation.replace("\n", "").replace("\t", "")

	    filename = output_dir / "json" / f"{hinumber}.json"
	    dump_json_file(json_string, filename)

	    return json_string
	# def decode_hielement ends here


	def get_json_info(json_decoded):
	    """Return the Hyperimage id and the comma-joined layer names.

	    Both values are read from the ``view`` entry of the decoded JSON:
	    the id from ``id``, the layer names from the keys of ``layers``.

	    Args:
	        json_decoded: the parsed JSON object of a hielement.

	    Returns:
	        Tuple of the id and a string of comma-separated layer names.
	    """

	    view = json_decoded["view"]
	    hiid = view["id"]
	    hilayers = ",".join(view["layers"].keys())

	    return hiid, hilayers
	# def get_json_info ends here


	def create_xml_hielements(django_tree, tei_tree, output_dir):
	    """Build one div element per Hyperimage figure of the Django file.

	    For every ``EOAfigure`` with an ``hielement`` attribute the xml:id
	    of the matching TEI figure is looked up, the hielement string is
	    decoded (which also writes a JSON file) and a ``div`` is built that
	    carries the xml:id, the Hyperimage id as ``corresp``, an optional
	    list of layer names, the whitespace-normalized hielement string and
	    the decoded JSON as CDATA.

	    Args:
	        django_tree: parsed Django XML document.
	        tei_tree: parsed TEI document.
	        output_dir: base output directory as a ``pathlib.Path``.

	    Returns:
	        A tuple of the Hyperimage-id-to-xml:id dictionary and the list
	        of div elements.
	    """

	    id_lookup = {}

	    div_list = []

	    hielements = django_tree.xpath("//EOAfigure[@hielement]")
	    logging.debug("Found %s hielements", len(hielements))
	    for element in hielements:
	        image_file = re.sub(r"^images", "", element.get("file"))
	        tei_id = get_tei_id(image_file, tei_tree)
	        hinumber = element.get("number")
	        if element.get("data-hilayer") is not None:
	            logging.info(f"Figure {hinumber} contains a data-hilayer attribute: {element.get('data-hilayer')}")

	        hielement_string = element.get("hielement")
	        hielement_string_normalized = re.sub(r" +", " ", hielement_string)
	        json_decoded = decode_hielement(hielement_string, tei_id, output_dir)
	        hiid, hilayers = get_json_info(json.loads(json_decoded))
	        id_lookup[hiid] = tei_id

	        div_element = etree.Element("div")
	        div_element.attrib["{http://www.w3.org/XML/1998/namespace}id"] = tei_id
	        div_element.set("corresp", hiid)
	        if hilayers:
	            list_element = etree.SubElement(div_element, "list", type="layers")
	            for item in hilayers.split(","):
	                etree.SubElement(list_element, "item").text = item
	        etree.SubElement(div_element, "ab", type="config").text = hielement_string_normalized
	        etree.SubElement(div_element, "ab", type="jsonconfig").text = etree.CDATA(json_decoded)

	        div_list.append(div_element)

	    return (id_lookup, div_list)
	# def create_xml_hielements ends here


	def create_master_hielements(django_tree, tei_tree, output_dir):
	    """Write a CSV master file for the hielements.

	    For every ``EOAfigure`` with an ``hielement`` attribute in the
	    Django file, the xml:id of the matching TEI figure is looked up and
	    the hielement string is decoded (which also writes a JSON file).
	    One row per figure is written to ``<output_dir>/hi_figures.csv``
	    with the columns ``xmlid``, ``hiid``, ``layers`` and
	    ``elementstring`` (the hielement string with runs of spaces
	    collapsed).

	    Args:
	        django_tree: parsed Django XML document.
	        tei_tree: parsed TEI document.
	        output_dir: base output directory as a ``pathlib.Path``.

	    Returns:
	        Dictionary mapping each Hyperimage id to the figure's xml:id.
	    """

	    id_lookup = {}

	    hielements = django_tree.xpath("//EOAfigure[@hielement]")
	    logging.debug("Found %s hielements", len(hielements))
	    filename = output_dir / 'hi_figures.csv'
	    # UTF-8 is set explicitly so that ids or layer names outside ASCII
	    # are written the same way on every platform.
	    with open(filename, 'w', newline='', encoding="utf-8") as csvfile:
	        fieldnames = ['xmlid', 'hiid', 'layers', 'elementstring']
	        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
	        writer.writeheader()
	        for element in hielements:
	            image_file = re.sub(r"^images", "", element.get("file"))
	            tei_id = get_tei_id(image_file, tei_tree)
	            hinumber = element.get("number")
	            if element.get("data-hilayer") is not None:
	                logging.info(f"Figure {hinumber} contains a data-hilayer attribute: {element.get('data-hilayer')}")

	            hielement_string = element.get("hielement")
	            hielement_string_normalized = re.sub(r" +", " ", hielement_string)
	            json_decoded = decode_hielement(hielement_string, tei_id, output_dir)
	            hiid, hilayers = get_json_info(json.loads(json_decoded))
	            id_lookup[hiid] = tei_id

	            writer.writerow({'xmlid': tei_id, 'hiid': hiid, 'layers': hilayers, 'elementstring': hielement_string_normalized})

	    logging.info("Wrote " + str(filename))

	    return id_lookup
	# def create_master_hielements ends here


	def create_master_hilayers(django_tree, id_lookup, output_dir):
	    """Create an overview over hilayers.

	    Collects all ``a`` elements of the Django file that carry a
	    ``data-hilayer`` attribute, groups the layer names by href (without
	    its first character) and writes them to
	    ``<output_dir>/hi_layers.csv`` with the columns ``href`` and
	    ``hilayers`` (comma-separated). Afterwards the same elements are
	    handed to ``create_copypaste_layers``.

	    Example of the conversion:

	    Django: (see Fig. <a href="#Fig28" data-hilayer="Fig28note" class="HILink">2.8</a>)
	    TEI: (see Fig. <ref target="#chap02_fig8" type="number-hi" select="Fig28note"/>

	    Args:
	        django_tree: parsed Django XML document.
	        id_lookup: dictionary mapping Hyperimage ids to xml:ids.
	        output_dir: base output directory as a ``pathlib.Path``.
	    """

	    hilayers = django_tree.xpath("//a[@data-hilayer]")
	    logging.debug("Found %s hilayers", len(hilayers))
	    filename = output_dir / 'hi_layers.csv'

	    layerdict = {}

	    for link in hilayers:
	        layer = link.get("data-hilayer")
	        if not layer:
	            # The attribute is present (the XPath requires it) but
	            # empty. Skip the link instead of recording a layer name
	            # left over from a previous iteration.
	            logging.warning("Did not find data-hilayer attribute!")
	            continue

	        linkref = link.get("href")[1:]
	        layerdict.setdefault(linkref, []).append(layer)

	    with open(filename, 'w', newline='', encoding="utf-8") as csvfile:
	        fieldnames = ['href', 'hilayers']
	        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
	        writer.writeheader()
	        for linkref, layers in layerdict.items():
	            writer.writerow({'href': linkref, 'hilayers': ",".join(layers)})

	    logging.info("Wrote " + str(filename))

	    create_copypaste_layers(hilayers, id_lookup, output_dir)

	    return
	# def create_master_hilayers ends here


	def create_master_hilinks(django_tree, id_lookup, output_dir):
	    """Find the hilinks

	    These are links to hyperimage images that provide extra
	    demonstration possibilities (like side by side, and so on). The href
	    values here are not unique, they simply link to the viewer.

	    Also in this case, the TEI file still contains the commands. They
	    need to be written to the CSV, while a ref has to be inserted in the
	    TEI file.

	    Problem here: some refs are already inserted here to cater for the
	    formatting of the figure numbers. Needs to be dealt with! Possibly
	    through @corresp.

	    In Django, they should come out as:

	    <a href="#Fig74" class="HILink">7.4</a>

	    Parameters:

	    django_tree -- parsed Django XML (lxml tree). The matched <a>
	                   elements are modified in place: they are renamed to
	                   <ref>, lose their tail, href and class, and are
	                   appended to a new <reflinks> tree.
	    id_lookup   -- mapping from the href value (without '#') to the id
	                   used as ref target.
	    output_dir  -- directory (path object supporting "/"); receives
	                   'hi_links.csv' and 'xml_snippets/hi_linkscopypaste.xml'.

	    Raises KeyError if a link's href has no entry in id_lookup.
	    Returns None.
	    """

	    filename = output_dir / 'hi_links.csv'

	    hilinks = django_tree.xpath("//a[contains(@class, 'HILink') and not(@data-hilayer)]")
	    logging.debug("Found %s hilinks", len(hilinks))
	    with open(str(filename), 'w', newline='') as csvfile:
	        writer = csv.DictWriter(csvfile, fieldnames=['href', 'text'])
	        writer.writeheader()
	        for link in hilinks:
	            if link.text:
	                linktext = link.text
	            elif len(link):
	                # No leading text: record the serialized first child.
	                linktext = etree.tostring(link[0], with_tail=False, encoding='unicode')
	            else:
	                # Neither text nor children; keep the row, with empty text.
	                logging.warning("Link to %s has no text.", link.get("href"))
	                linktext = ""
	            writer.writerow({'href': link.get("href")[1:], 'text': linktext})

	    figure_number_pattern = re.compile(r"[0-9]{,2}(\.[0-9][0-9])?")
	    attributes_to_be_stripped = ("href", "class")

	    copypaste_tree = etree.Element("reflinks")
	    for link in hilinks:
	        link.tag = "ref"
	        link.tail = ""
	        # The pattern can match the empty string, so a match object always
	        # exists; what matters is whether anything was actually matched.
	        # A link without text (None) is treated like a non-numeric label.
	        match_figure_name = figure_number_pattern.match(link.text or "")
	        if match_figure_name.group():
	            link.set("type", "number-hi")
	        else:
	            link.set("type", "text-hi")
	        link.set("target", f"#{id_lookup[link.get('href')[1:]]}")

	        copypaste_tree.append(link)

	        for attribute in attributes_to_be_stripped:
	            try:
	                link.attrib.pop(attribute)
	            except KeyError:
	                logging.warning("Attribute %s not found.", attribute)

	    output = output_dir / "xml_snippets" / "hi_linkscopypaste.xml"
	    tree = etree.ElementTree(copypaste_tree)
	    tree.write(str(output), pretty_print=True, xml_declaration=True, encoding="utf-8")
	    logging.info("Wrote %s.", output)
	# def create_master_hilinks ends here

	def get_tei_comments(tei_tree, output_dir):
	    """Collect the HYPERIMAGE comments of the TEI file.

	    Every XML comment found in tei_tree is serialized. Comments whose
	    serialization starts with "<!-- HYPERIMAGE" are written, one per
	    line, to 'hi_comments.txt' inside output_dir; all other comments
	    are only mentioned in the log.
	    """

	    target_path = output_dir / 'hi_comments.txt'

	    comment_nodes = tei_tree.xpath("//comment()")
	    logging.debug("Found %s comments", len(comment_nodes))

	    with open(str(target_path), 'w', newline="\n") as outfile:
	        for node in comment_nodes:
	            serialized = etree.tostring(node, with_tail=False, encoding='unicode')
	            if not serialized.startswith("<!-- HYPERIMAGE"):
	                # Not ours: just note the beginning of the comment.
	                logging.info("Leaving out comment that starts with %s.", serialized[:30])
	                continue
	            outfile.write(f"{serialized}\n")
	# def get_tei_comments ends here

	def main():
	    """The main bit

	    Parses the command line, makes sure the output directories exist,
	    parses both input files and runs the extraction.

	    Command line:

	    djangofile        -- the enriched Django file
	    teifile           -- the original TEI file
	    -c, --csv         -- write CSV files instead of the TEI XML file
	    -o, --output-dir  -- output directory; defaults to
	                         DEFAULT_OUTPUT_DIR / "../hyperimage"

	    Without --csv the result is written to 'hi_figures.xml' in the
	    output directory.
	    """

	    parser = argparse.ArgumentParser()
	    parser.add_argument("djangofile", help="The enriched Django file.")
	    parser.add_argument("teifile", help="The original TEI file.")
	    parser.add_argument("-c", "--csv", help="Use CSV output instead of XML", action="store_true")
	    parser.add_argument(
	        "-o", "--output-dir",
	        default=DEFAULT_OUTPUT_DIR / "../hyperimage",
	        help="output directory"
	    )
	    args = parser.parse_args()

	    output_dir = Path(args.output_dir)

	    # parents=True also creates missing ancestors (the default path goes
	    # through DEFAULT_OUTPUT_DIR, which need not exist yet); exist_ok=True
	    # replaces the racy "check, then create" sequence.
	    for directory in (output_dir, output_dir / "json", output_dir / "xml_snippets"):
	        directory.mkdir(parents=True, exist_ok=True)

	    django_tree = etree.parse(args.djangofile)
	    tei_tree = etree.parse(args.teifile)

	    ##############
	    # First step #
	    ##############
	    if args.csv:
	        logging.debug("Creating CSV as output formats.")
	        id_lookup = create_master_hielements(django_tree, tei_tree, output_dir)
	        # Above step has been done, the figure elements have to be matched
	        # with the TEI file

	        ###############
	        # Second step #
	        ###############
	        # create_master_hilayers(django_tree, id_lookup, output_dir)

	        # Above step has been done. The refs have to be included in the TEI
	        # file, and the command has to be put into the CSV. The annotation
	        # has already been done, this is just for documentation.

	        ##############
	        # Third step #
	        ##############
	        # create_master_hilinks(django_tree, id_lookup, output_dir)

	        ###################
	        # Additional step #
	        ###################
	        # get_tei_comments(tei_tree, output_dir)
	    else:
	        output_file = output_dir / 'hi_figures.xml'
	        logging.debug("Creating TEI XML as output format.")
	        id_lookup, div_list = create_xml_hielements(django_tree, tei_tree, output_dir)
	        write_xml_result(div_list, output_file)
	# def main ends here

	# Run the extraction only when executed as a script, not on import.
	if __name__ == "__main__":
	    main()
	# finis