From 5cabc7f716bb61fbc69a4b9e6808d1d2dca13e86 Mon Sep 17 00:00:00 2001 From: kthoden Date: Fri, 28 Feb 2020 16:11:06 +0100 Subject: [PATCH] Popup now yields full citation (but without markup, see #41) --- src/tei2imxml.py | 59 ++++++++++++++++++++++++++++++++++---- src/utils/libeoaconvert.py | 7 +++-- 2 files changed, 58 insertions(+), 8 deletions(-) diff --git a/src/tei2imxml.py b/src/tei2imxml.py index 17f9916..4635dca 100755 --- a/src/tei2imxml.py +++ b/src/tei2imxml.py @@ -237,11 +237,16 @@ def make_publication_cfg(info_dict, translation_file): # def make_publication_cfg ends here -def sanitize_data_string(text_string): +def sanitize_data_string(text_string, newline_to_space=False): """Remove line breaks and multiple spaces""" - text_string = text_string.replace('\r', '') - text_string = text_string.replace('\n', '') + if newline_to_space: + text_string = text_string.replace('\r', ' ') + text_string = text_string.replace('\n', ' ') + else: + text_string = text_string.replace('\r', '') + text_string = text_string.replace('\n', '') + return_string = re.sub("\s\s+" , " ", text_string) return return_string.strip() @@ -324,9 +329,48 @@ def format_citations( ): """Return a dictionary of the used citations as formatted entries. - citation_dict[citekey] = (authoryear_citation, year_citation, title) + citation_dict[citekey] = (authoryear_citation, year_citation, title, full_citation) """ + def cleanup_full_citation_with_markup(citation_element, entry): + """Generate a cleaned variant of the full citation""" + + citation_element_string = etree.tostring(citation_element).decode('utf-8') + sanitized_citation = sanitize_data_string(citation_element_string, newline_to_space=True) + + prequel = f"""

""" + sequel = f"

" + + prequel_end = sanitized_citation.index(prequel) + len(prequel) + sequel_begin = sanitized_citation.index(sequel) + + escaped_citation = libeoaconvert.escape_xml(sanitized_citation[prequel_end:sequel_begin], decode=False) + + return escaped_citation + # def cleanup_full_citation_with_markup ends here + + + def cleanup_full_citation_without_markup(citation_element): + """Generate a cleaned variant of full citation, but without markup""" + + textnodes = gettext(citation_element) + sanitized_text = sanitize_data_string(textnodes, newline_to_space=True) + + return sanitized_text + # def cleanup_full_citation_without_markup ends here + + + def gettext(xmlElement): + """Get text nodes of element.""" + + xmlText = xmlElement.text or "" + for xmlChild in xmlElement: + xmlText += gettext(xmlChild) + if xmlChild.tail: + xmlText += xmlChild.tail + return xmlText + # def gettext ends here + try: cites = etree.parse(str(html_file)) except OSError: @@ -339,10 +383,12 @@ def format_citations( authoryear_citation = cites.xpath(f"//div[@class='authoryear']/p/span[@data-cites='{entry}']")[0].text year_citation = cites.xpath(f"//div[@class='year']/p/span[@data-cites='{entry}']")[0].text title = cites.xpath(f"//div[@class='title']/p/span[@data-cites='{entry}']")[0].text + # full_citation = cleanup_full_citation_with_markup(cites.xpath(f"//div[@class='full']/p[@data-cites='{entry}']")[0], entry) + full_citation = cleanup_full_citation_without_markup(cites.xpath(f"//div[@class='full']/p[@data-cites='{entry}']")[0]) except IndexError: logging.error(f"Entry {entry} was not found in HTML file. Maybe you should run the tool again without -n option. Exiting.") sys.exit(1) - citation_dict[entry] = (authoryear_citation, year_citation, title) + citation_dict[entry] = (authoryear_citation, year_citation, title, full_citation) return citation_dict # def format_citations ends here @@ -768,7 +814,7 @@ def handle_refs_default(ref): citation.set("citekey", citekey) citation.set("data-title", sanitized_citation_string) - citation.set("data-content", cited_data[citekey][2]) + citation.set("data-content", cited_data[citekey][3]) citation.text = sanitized_citation_string ############# @@ -1486,6 +1532,7 @@ def main(): logging.info("Transforming body with Hyperimage support") else: pass + body_transformed_tmp = transform_body( tei_body, cited_dict, diff --git a/src/utils/libeoaconvert.py b/src/utils/libeoaconvert.py index 9b8a6db..f04df91 100644 --- a/src/utils/libeoaconvert.py +++ b/src/utils/libeoaconvert.py @@ -467,10 +467,13 @@ def restore_xml_tags(text): # def restore_xml_tags ends here -def escape_xml(text_bytes): +def escape_xml(raw_text, decode=True): """Convert xml markup to entities""" - text = text_bytes.decode("utf-8") + if decode: + text = raw_text.decode("utf-8") + else: + text = raw_text replacements = { "&" : "&",