Skip to content
Permalink
6d0ca52dfe
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
166 lines (125 sloc) 5.44 KB
#!/usr/bin/env python3
# -*- coding: utf-8; mode: python -*-
"""
Convert fixtures with embedded EOA1.0 design to embedded EOA2.0 design.
"""
# Script metadata: version string, date stamp (looks like YYYYMMDD) and
# author contact address.
__version__ = "1.0"
__date__ = "20190826"
__author__ = "kthoden@mpiwg-berlin.mpg.de"
import argparse
import logging
import json
import shutil
import sys
from lxml import etree
# Configure the root logger at import time: emit everything from DEBUG upwards,
# each record prefixed with a timestamp and its level name.
logging.basicConfig(level=logging.DEBUG, format=' %(asctime)s - %(levelname)s - %(message)s')
def treat_citations(fulltext):
    """Convert every EOA 1.0 citation span in an HTML fragment.

    The fragment is wrapped in a temporary ``<tmp>`` root so that it can be
    parsed as XML, each ``<span class="citation">`` is rewritten in place by
    ``convert_citation``, and the wrapper is cut off again from the result.

    Args:
        fulltext: HTML fragment as a string. Once bare ampersands are
            escaped it has to be well-formed XML.

    Returns:
        The converted fragment as a string.

    Raises:
        lxml.etree.XMLSyntaxError: If the fragment cannot be parsed as XML.
    """
    # A bare " & " is not well-formed XML, so escape it before parsing (the
    # same preparation treat_indexsections applies to its input).
    fulltext_prepared = fulltext.replace(" & ", " &amp; ")
    xml_element = etree.fromstring(f"<tmp>{fulltext_prepared}</tmp>")

    for citation in xml_element.xpath("//span[@class='citation']"):
        convert_citation(citation)

    # The serialisation is b"<tmp>...</tmp>": drop the 5-byte opening and the
    # 6-byte closing tag. An empty root serialises as b"<tmp/>", for which
    # the same slice yields an empty result.
    cleaned = etree.tostring(xml_element)[5:-6]
    return cleaned.decode("utf-8")
# def treat_citations ends here
def convert_citation(citation_element):
    """Rewrite one citation element in place as an EOA 2.0 popup link.

    The element's tag becomes ``a``, its ``class`` attribute is set to
    ``publications-popup-text`` and the attributes ``data-toggle``, ``html``,
    ``data-placement`` and ``rel`` are removed when present. All other
    attributes (for example ``data-title`` and ``data-content``) are left as
    they are.

    Args:
        citation_element: The element to modify.

    Returns:
        None; the element is changed in place.
    """
    obsolete_names = ("data-toggle", "html", "data-placement", "rel")
    attributes = citation_element.attrib
    for name in obsolete_names:
        if name in attributes:
            del attributes[name]

    citation_element.tag = "a"
    citation_element.set("class", "publications-popup-text")
# def convert_citation ends here
def treat_indexsections(fulltext):
    """Flatten an EOA 1.0 accordion index section into headings and lists.

    For every ``<div class="accordion-group">`` the heading link becomes an
    ``<h4>`` (all its un-namespaced attributes are removed). The links found
    directly inside the group's ``accordion-inner`` div are collected in a
    ``<ul>``. Every ``<p>`` directly inside that div becomes an ``<h5>``
    holding the paragraph's leading text, followed by a ``<ul>`` of the links
    in the paragraph.

    Args:
        fulltext: The index section HTML as a string. After newlines are
            removed and bare ampersands are escaped it has to be a
            well-formed XML document.

    Returns:
        The converted HTML fragment as a string.

    Raises:
        SystemExit: If a heading link contains child elements.
        IndexError: If a group has no heading link or no ``accordion-inner``
            div.
        lxml.etree.XMLSyntaxError: If the input cannot be parsed as XML.
    """
    def make_link_list(entries):
        """Return a ``<ul>`` with one ``<li>`` wrapping each given element."""
        link_list = etree.Element("ul")
        for link in entries:
            list_item = etree.Element("li")
            # Appending moves the element out of its previous parent.
            list_item.append(link)
            link_list.append(list_item)
        return link_list
    # def make_link_list ends here

    # Temporary root that collects the output; its tags are cut off at the end.
    dummy_root = etree.Element("tmp")
    fulltext_prepared = fulltext.replace("\n", "").replace(" & ", " &amp; ")
    xml_element = etree.fromstring(fulltext_prepared)

    for group in xml_element.xpath("//div[@class='accordion-group']"):
        heading = group.xpath("div[@class='accordion-heading']/a")[0]
        head_string = heading.text
        # len() of an element is its number of child elements; only plain
        # text headings are supported.
        if len(heading) != 0:
            logging.error("Unexpected children in index heading starting with %s. Exiting.", head_string)
            sys.exit(1)
        logging.debug("Working on %s", head_string)

        heading.tag = "h4"
        etree.strip_attributes(heading, "{}*")
        dummy_root.append(heading)

        # Search relative to this group (".//"). An absolute "//" expression
        # is evaluated against the whole document and would not be tied to
        # the group currently being processed.
        instances_wrapper = group.xpath(".//div[@class='accordion-inner']")[0]

        # Collect both levels before any element is moved to the output.
        first_level_entries = instances_wrapper.xpath("a")
        second_level_entries = instances_wrapper.xpath("p")

        if len(first_level_entries) > 0:
            dummy_root.append(make_link_list(first_level_entries))

        for paragraph in second_level_entries:
            subheading = etree.Element("h5")
            subheading.text = paragraph.text
            dummy_root.append(subheading)
            dummy_root.append(make_link_list(paragraph.xpath("a")))

    # Drop the 5-byte "<tmp>" and the 6-byte "</tmp>". With no output at all
    # the root serialises as b"<tmp/>" and the slice is empty.
    fulltext_cleaned = etree.tostring(dummy_root)[5:-6]
    return fulltext_cleaned.decode("utf-8")
# def treat_indexsections ends here
def main():
    """Command-line entry point: convert the requested parts of a fixture.

    Reads the JSON fixture named on the command line, optionally converts
    the citation markup in the ``Tablehtml`` field of
    ``eoapublications.element`` entries (``-c``) and the ``Html`` field of
    ``eoapublications.indexsection`` entries (``-i``), and writes the result
    back under the original file name.

    Two copies of the unmodified input are kept next to it. For
    ``name.json`` these are ``name-backup.json`` (copied before reading) and
    ``name-beforefix.json`` (the original file, moved aside before writing).

    Raises:
        SystemExit: On invalid command-line arguments (raised by argparse).
        OSError: If the fixture cannot be read, copied, moved or written.
        json.JSONDecodeError: If the fixture is not valid JSON.
    """
    from pathlib import Path

    parser = argparse.ArgumentParser()
    parser.add_argument("jsonfile", help="jsonfile.")
    parser.add_argument("-c", "--citations", help="Convert citations.", action="store_true")
    parser.add_argument("-i", "--indexsections", help="Convert indexsections.", action="store_true")
    args = parser.parse_args()

    source = Path(args.jsonfile)

    def sibling(tag):
        """Return the path of ``<stem>-<tag><suffix>`` next to the input."""
        # Only the file name is rewritten, never a directory component, and
        # the result differs from the input whatever the file's suffix is.
        return str(source.with_name(f"{source.stem}-{tag}{source.suffix}"))

    logging.debug("Making a backup")
    shutil.copy(str(source), sibling("backup"))

    with open(source, "r", encoding="utf-8") as mj:
        jsonentries = json.load(mj)

    total = len(jsonentries)
    for position, entry in enumerate(jsonentries):
        logging.debug("Looking at entry %s of %s. This is pk %s.", position, total, entry["pk"])

        if args.citations and entry["model"] == "eoapublications.element":
            # NOTE: only Tablehtml is converted here. Fulltext and Caption
            # were handled the same way by code that is disabled in this
            # version of the script.
            logging.debug("Checking Tablehtml")
            fields = entry["fields"]
            fields["Tablehtml"] = treat_citations(fields["Tablehtml"])

        if args.indexsections and entry["model"] == "eoapublications.indexsection":
            fields = entry["fields"]
            # Converted index sections begin with an <h4> heading, so such
            # entries are presumably already done and are left alone.
            if not fields["Html"].startswith("<h4"):
                fields["Html"] = treat_indexsections(fields["Html"])

    # Move the original aside, then write the converted data under its name.
    shutil.move(str(source), sibling("beforefix"))
    with open(source, "w", encoding="utf-8") as mj:
        json.dump(jsonentries, mj, indent=4, separators=(',', ': '))
# def main ends here
# Run the conversion only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
    main()
# finis