Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
eoa-utilities/jsonfixer.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
166 lines (125 sloc)
5.44 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# -*- coding: utf-8; mode: python -*- | |
""" | |
Convert fixtures with embedded EOA1.0 design to embedded EOA2.0 design.
""" | |
__version__ = "1.0" | |
__date__ = "20190826" | |
__author__ = "kthoden@mpiwg-berlin.mpg.de" | |
import argparse | |
import logging | |
import json | |
import shutil | |
import sys | |
from lxml import etree | |
# Configure the root logger once at import time: show everything from DEBUG
# upwards, each record prefixed with its timestamp and level name.
logging.basicConfig(
    level=logging.DEBUG,
    format=' %(asctime)s - %(levelname)s - %(message)s',
)
def treat_citations(fulltext):
    """Convert all citation spans in an HTML fragment to the new markup.

    The fragment is wrapped in a temporary ``<tmp>`` root so that it can be
    parsed as XML, every ``<span class="citation">`` is rewritten in place by
    ``convert_citation``, and the result is serialised again without the
    temporary root.

    Args:
        fulltext: HTML fragment as a string. Empty or ``None`` values are
            returned unchanged.

    Returns:
        The converted fragment as a string.
    """
    if not fulltext:
        # Nothing to convert. This also keeps a missing value from being
        # interpolated into the wrapper below as the literal text "None".
        return fulltext

    # A bare ampersand is not well-formed XML; escape it before parsing.
    fulltext_prepared = fulltext.replace(" & ", " &amp; ")
    xml_element = etree.fromstring(f"<tmp>{fulltext_prepared}</tmp>")

    for citation in xml_element.xpath("//span[@class='citation']"):
        convert_citation(citation)

    # Cut off the temporary wrapper again: the 5 bytes of "<tmp>" in front
    # and the 6 bytes of "</tmp>" at the end.
    cleaned = etree.tostring(xml_element)[5:-6]
    return cleaned.decode("utf-8")
# def treat_citations ends here
def convert_citation(citation_element):
    """Convert a citation element to the new format, in place.

    The element is renamed to ``a``, its ``class`` attribute is set to
    ``publications-popup-text`` and the attributes ``data-toggle``, ``html``,
    ``data-placement`` and ``rel`` are removed when present. Every other
    attribute (for instance ``data-title`` and ``data-content``), the text
    and the children are left as they are.

    Args:
        citation_element: The element to modify.

    Returns:
        None. The element is changed in place.
    """
    citation_element.tag = "a"
    citation_element.set("class", "publications-popup-text")

    # pop() with a default removes the attribute if it exists and is a
    # no-op otherwise, so no separate existence check is needed.
    for name in ("data-toggle", "html", "data-placement", "rel"):
        citation_element.attrib.pop(name, None)
# def convert_citation ends here
def treat_indexsections(fulltext):
    """Flatten an accordion-style index section into headings and lists.

    For every ``<div class="accordion-group">`` in the input, the output
    contains, in this order:

    * an ``h4`` made from the link inside ``div.accordion-heading``,
    * a ``ul`` holding the ``a`` elements that are direct children of the
      group's ``div.accordion-inner`` (only if there are any),
    * for every ``p`` that is a direct child of ``div.accordion-inner``: an
      ``h5`` with the paragraph's text followed by a ``ul`` of the
      paragraph's links.

    Args:
        fulltext: The index markup as a string. It must parse as a single
            XML element once newlines are removed and bare ampersands are
            escaped.

    Returns:
        The flattened markup as a string.

    Raises:
        IndexError: If a group lacks the heading link or the
            ``accordion-inner`` div.
        SystemExit: If a heading link contains child elements.
    """
    def make_link_list(entries):
        """Return a ``ul`` that holds each of *entries* in its own ``li``."""
        link_list = etree.Element("ul")
        for entry in entries:
            list_item = etree.Element("li")
            list_item.append(entry)
            link_list.append(list_item)
        return link_list
    # def make_link_list ends here

    dummy_root = etree.Element("tmp")

    # Drop newlines and escape bare ampersands, which are not well-formed
    # XML, before handing the text to the parser.
    fulltext_prepared = fulltext.replace("\n", "").replace(" & ", " &amp; ")
    xml_element = etree.fromstring(fulltext_prepared)

    for group in xml_element.xpath("//div[@class='accordion-group']"):
        heading = group.xpath("div[@class='accordion-heading']/a")[0]
        head_string = heading.text
        if len(heading):
            logging.error("Unexpected children in index heading starting with %s. Exiting.", head_string)
            sys.exit(1)
        logging.debug("Working on %s", head_string)

        # Turn the former link into a plain heading. "{}*" selects every
        # attribute without a namespace, i.e. the link's HTML attributes.
        heading.tag = "h4"
        etree.strip_attributes(heading, "{}*")
        dummy_root.append(heading)

        # Search relative to the current group (".//"). An absolute "//"
        # would look through the whole document instead of this group.
        instances_wrapper = group.xpath(".//div[@class='accordion-inner']")[0]
        etree.strip_attributes(instances_wrapper, "class")
        first_level_entries = instances_wrapper.xpath("a")
        second_level_entries = instances_wrapper.xpath("p")

        if first_level_entries:
            dummy_root.append(make_link_list(first_level_entries))

        for sub_entry in second_level_entries:
            subheading = etree.Element("h5")
            subheading.text = sub_entry.text
            dummy_root.append(subheading)
            dummy_root.append(make_link_list(sub_entry.xpath("a")))

    # Cut off the temporary wrapper again: the 5 bytes of "<tmp>" in front
    # and the 6 bytes of "</tmp>" at the end.
    fulltext_cleaned = etree.tostring(dummy_root)[5:-6]
    return fulltext_cleaned.decode("utf-8")
# def treat_indexsections ends here
def main():
    """Parse the command line and convert the given JSON fixture in place.

    Steps:

    1. Copy the input file to ``*-backup.json``.
    2. Load the entries and, depending on the options, convert them:
       ``-c`` rewrites the ``Tablehtml`` field of ``eoapublications.element``
       entries, ``-i`` rewrites the ``Html`` field of
       ``eoapublications.indexsection`` entries.
    3. Move the input file to ``*-beforefix.json`` and write the converted
       entries under the original file name.

    Both auxiliary file names are derived by replacing ``.json`` in the
    given path.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("jsonfile", help="jsonfile.")
    parser.add_argument("-c", "--citations", help="Convert citations.", action="store_true")
    parser.add_argument("-i", "--indexsections", help="Convert indexsections.", action="store_true")
    args = parser.parse_args()

    logging.debug("Making a backup")
    shutil.copy(args.jsonfile, args.jsonfile.replace(".json", "-backup.json"))

    # Read with an explicit encoding instead of the platform default.
    with open(args.jsonfile, "r", encoding="utf-8") as mj:
        jsonentries = json.load(mj)

    total = len(jsonentries)
    # enumerate() yields the real position; list.index() would rescan the
    # list on every iteration and return the first *equal* entry.
    for position, entry in enumerate(jsonentries):
        logging.debug("Looking at entry %s of %s. This is pk %s.", position, total, entry["pk"])

        if args.citations and entry["model"] == "eoapublications.element":
            # NOTE: only Tablehtml is converted here; the Fulltext and
            # Caption fields of the entry are not touched.
            fields = entry["fields"]
            logging.debug("Checking Tablehtml")
            fields["Tablehtml"] = treat_citations(fields["Tablehtml"])

        if args.indexsections and entry["model"] == "eoapublications.indexsection":
            fields = entry["fields"]
            # Markup that already starts with an h4 is left alone --
            # presumably it has been converted by an earlier run.
            if not fields["Html"].startswith("<h4"):
                fields["Html"] = treat_indexsections(fields["Html"])

    # Keep the unmodified input next to the result, then write the result
    # under the original name.
    shutil.move(args.jsonfile, args.jsonfile.replace(".json", "-beforefix.json"))
    with open(args.jsonfile, "w", encoding="utf-8") as mj:
        json.dump(jsonentries, mj, indent=4, separators=(',', ': '))
# def main ends here
# Run the conversion only when the file is executed as a script; importing
# the module must not trigger it.
if __name__ == "__main__":
    main()
# finis