jsonfixer.py

#!/usr/bin/env python3
# -*- coding: utf-8; mode: python -*-

"""
Convert fixtures with embedded EOA1.0 design to embedded EOA2.0 design.
"""

# Module metadata: script version, date stamp and maintainer contact.
__version__ = "1.0"
# NOTE(review): looks like a YYYYMMDD date stamp -- confirm.
__date__ = "20190826"
__author__ = "kthoden@mpiwg-berlin.mpg.de"

import argparse
import logging
import json
import shutil
import sys
from lxml import etree
# Configure logging as soon as the module is loaded: records of level DEBUG
# and above are shown as "<timestamp> - <level name> - <message>".
logging.basicConfig(level=logging.DEBUG, format=' %(asctime)s - %(levelname)s - %(message)s')


def treat_citations(fulltext):
    """Convert every citation span in an HTML fragment to the new markup.

    The fragment is wrapped in a temporary ``<tmp>`` element so that it can
    be parsed as one XML document even when it has several top-level nodes.
    Each ``span`` whose ``class`` is exactly ``citation`` is handed to
    ``convert_citation``, which rewrites it in place.  The tree is then
    serialised again and the temporary wrapper is cut off.

    Only the literal sequence `` & `` is escaped before parsing; any other
    markup that is not well-formed XML makes ``etree.fromstring`` raise.

    Returns the converted fragment as ``str``.
    """

    wrapped_markup = "<tmp>" + fulltext.replace(" & ", " &amp; ") + "</tmp>"
    root = etree.fromstring(wrapped_markup)

    for span in root.xpath("//span[@class='citation']"):
        convert_citation(span)

    # NOTE(review): tostring() is called with its default encoding, which
    # according to the lxml documentation is ASCII (non-ASCII characters
    # come back as character references) -- confirm this is intended.
    serialized = etree.tostring(root)

    # Drop the leading "<tmp>" and the trailing "</tmp>" of the wrapper.
    inner = serialized[len(b"<tmp>"):-len(b"</tmp>")]

    return inner.decode("utf-8")
# def treat_citations ends here


def convert_citation(citation_element):
    """Rewrite one citation element in place to the new link markup.

    The element's tag becomes ``a`` and its ``class`` attribute is set to
    ``publications-popup-text``.  The attributes ``data-toggle``, ``html``,
    ``data-placement`` and ``rel`` are removed when present; every other
    attribute (for example ``data-title`` or ``data-content``) is left
    untouched.  Nothing is returned.
    """

    citation_element.tag = "a"
    citation_element.set("class", "publications-popup-text")

    attributes = citation_element.attrib
    for obsolete in ("data-toggle", "html", "data-placement", "rel"):
        # pop() with a default is a no-op for attributes that are absent.
        attributes.pop(obsolete, None)
# def convert_citation ends here


def treat_indexsections(fulltext):
    """Flatten an accordion-style index section into headings and link lists.

    ``fulltext`` is parsed as it is, without a wrapper element, so it has to
    be a single well-formed XML element.  Newlines are removed and the
    literal sequence `` & `` is escaped before parsing.

    For every ``div`` with class ``accordion-group`` the following is
    appended to the result, in this order:

    * the ``a`` inside the group's ``div.accordion-heading``, turned into an
      ``h4`` without (un-namespaced) attributes;
    * a ``ul`` with the ``a`` children of the group's ``div.accordion-inner``,
      if there are any;
    * for each ``p`` child of that ``div``: an ``h5`` carrying the
      paragraph's leading text, followed by a ``ul`` with the paragraph's
      ``a`` children.

    Returns the converted markup as ``str``.

    The process is terminated with exit status 1 when a heading link has
    child elements.  ``IndexError`` is raised when a group has no heading
    link or no ``div.accordion-inner``.
    """

    def make_link_list(entries):
        """Return a ``ul`` that holds each element of *entries* in an ``li``."""

        link_list = etree.Element("ul")
        for entry in entries:
            list_item = etree.Element("li")
            # Appending moves the element out of its previous parent.
            list_item.append(entry)
            link_list.append(list_item)

        return link_list
    # def make_link_list ends here

    # Temporary container for the converted nodes; its own tags are cut off
    # again after serialisation.
    dummy_root = etree.Element("tmp")

    fulltext_prepared = fulltext.replace("\n", "").replace(" & ", " &amp; ")
    xml_element = etree.fromstring(fulltext_prepared)

    for group in xml_element.xpath("//div[@class='accordion-group']"):
        heading = group.xpath("div[@class='accordion-heading']/a")[0]
        head_string = heading.text
        # len() of an element is its number of children.
        if len(heading) != 0:
            logging.error("Unexpected children in index heading starting with %s. Exiting.", head_string)
            sys.exit(1)
        logging.debug("Working on %s", head_string)
        heading.tag = "h4"
        etree.strip_attributes(heading, "{}*")

        dummy_root.append(heading)

        # The search has to be relative to the current group (".//").  An
        # absolute "//" expression looks at the whole document and only
        # found the right element because the wrappers of earlier groups
        # had already lost their class attribute.
        instances_wrapper = group.xpath(".//div[@class='accordion-inner']")[0]
        # Removes "class" from the wrapper and from all of its descendants.
        etree.strip_attributes(instances_wrapper, "class")

        first_level_entries = instances_wrapper.xpath("a")
        second_level_entries = instances_wrapper.xpath("p")

        if first_level_entries:
            dummy_root.append(make_link_list(first_level_entries))

        for paragraph in second_level_entries:
            subheading = etree.Element("h5")
            subheading.text = paragraph.text
            dummy_root.append(subheading)
            # The list is appended even when the paragraph has no links.
            dummy_root.append(make_link_list(paragraph.xpath("a")))

    # Cut off the "<tmp>" and "</tmp>" of the temporary container.
    fulltext_cleaned = etree.tostring(dummy_root)[5:-6]

    return fulltext_cleaned.decode("utf-8")
# def treat_indexsections ends here


def main():
    """Command-line entry point: convert the markup stored in a JSON fixture.

    Positional argument ``jsonfile`` is the fixture to process; it is
    expected to hold a list of records with ``pk``, ``model`` and ``fields``
    keys.  With ``-c/--citations`` the ``Tablehtml`` field of every
    ``eoapublications.element`` record is passed through
    ``treat_citations``.  With ``-i/--indexsections`` the ``Html`` field of
    every ``eoapublications.indexsection`` record is passed through
    ``treat_indexsections`` unless it already starts with ``<h4``.

    Before anything is changed the file is copied to ``<name>-backup.json``.
    Afterwards the original is moved to ``<name>-beforefix.json`` and the
    converted data is written to the original path.
    """

    parser = argparse.ArgumentParser()
    parser.add_argument("jsonfile", help="jsonfile.")
    parser.add_argument("-c", "--citations", help="Convert citations.", action="store_true")
    parser.add_argument("-i", "--indexsections", help="Convert indexsections.", action="store_true")
    args = parser.parse_args()

    logging.debug("Making a backup")
    shutil.copy(args.jsonfile, _sibling_path(args.jsonfile, "-backup"))

    # JSON is read as UTF-8 explicitly instead of relying on the platform
    # default encoding.
    with open(args.jsonfile, "r", encoding="utf-8") as mj:
        jsonentries = json.load(mj)

    total = len(jsonentries)
    # enumerate() gives the real position; list.index() would rescan the
    # list for every record and report the first of several equal records.
    for position, record in enumerate(jsonentries):
        logging.debug("Looking at entry %s of %s. This is pk %s.", position, total, record["pk"])

        if args.citations and record["model"] == "eoapublications.element":
            # Only Tablehtml is converted here; the handling of the
            # Fulltext and Caption fields is currently disabled.
            logging.debug("Checking Tablehtml")
            fields = record["fields"]
            fields["Tablehtml"] = treat_citations(fields["Tablehtml"])

        if args.indexsections and record["model"] == "eoapublications.indexsection":
            fulltext = record["fields"]["Html"]
            # Sections that already start with an h4 are left alone.
            if not fulltext.startswith("<h4"):
                record["fields"]["Html"] = treat_indexsections(fulltext)

    shutil.move(args.jsonfile, _sibling_path(args.jsonfile, "-beforefix"))

    with open(args.jsonfile, "w", encoding="utf-8") as mj:
        json.dump(jsonentries, mj, indent=4, separators=(',', ': '))
# def main ends here


def _sibling_path(path, marker):
    """Return *path* with *marker* inserted in front of a trailing ``.json``.

    Only the extension at the very end of the path is considered, so a
    ``.json`` that occurs elsewhere (for instance in a directory name) is
    left alone.  When the path does not end in ``.json`` the marker is
    appended, which still yields a path different from the input.
    """

    extension = ".json"
    if path.endswith(extension):
        return path[:-len(extension)] + marker + extension
    return path + marker
# def _sibling_path ends here

# Start the conversion only when the file is executed as a script.
if __name__ == "__main__":
    main()
# finis