Popup now yields full citation (but without markup, see #41)

EditionOpenAccess · Feb 28, 2020 · 5cabc7f · 5cabc7f
1 parent 51a5089
commit 5cabc7f
Show file tree

Hide file tree

Showing 2 changed files with 58 additions and 8 deletions.
diff --git a/src/tei2imxml.py b/src/tei2imxml.py
@@ -237,11 +237,16 @@ def make_publication_cfg(info_dict, translation_file):
 # def make_publication_cfg ends here
 
 
-def sanitize_data_string(text_string):
+def sanitize_data_string(text_string, newline_to_space=False):
     """Remove line breaks and multiple spaces"""
 
-    text_string = text_string.replace('\r', '')
-    text_string = text_string.replace('\n', '')
+    if newline_to_space:
+        text_string = text_string.replace('\r', ' ')
+        text_string = text_string.replace('\n', ' ')
+    else:
+        text_string = text_string.replace('\r', '')
+        text_string = text_string.replace('\n', '')
+
     return_string = re.sub("\s\s+" , " ", text_string)
 
     return return_string.strip()
@@ -324,9 +329,48 @@ def format_citations(
 ):
     """Return a dictionary of the used citations as formatted entries.
 
-    citation_dict[citekey] = (authoryear_citation, year_citation, title)
+    citation_dict[citekey] = (authoryear_citation, year_citation, title, full_citation)
     """
 
+    def cleanup_full_citation_with_markup(citation_element, entry):
+        """Return the whitespace-normalized, XML-escaped inner markup of the <p data-cites=entry> citation element."""
+
+        citation_element_string = etree.tostring(citation_element).decode('utf-8')
+        sanitized_citation = sanitize_data_string(citation_element_string, newline_to_space=True)
+
+        prequel = f"""<p data-cites="{entry}">"""
+        sequel = f"</p>"
+
+        prequel_end = sanitized_citation.index(prequel) + len(prequel)  # str.index raises ValueError if the opening tag is not found verbatim
+        sequel_begin = sanitized_citation.index(sequel)  # position of the first "</p>" in the serialized string
+
+        escaped_citation = libeoaconvert.escape_xml(sanitized_citation[prequel_end:sequel_begin], decode=False)  # decode=False: the slice is already str, not bytes
+
+        return escaped_citation
+    # def cleanup_full_citation_with_markup ends here
+
+
+    def cleanup_full_citation_without_markup(citation_element):
+        """Return the text content of citation_element with all markup dropped and whitespace collapsed."""
+
+        textnodes = gettext(citation_element)
+        sanitized_text = sanitize_data_string(textnodes, newline_to_space=True)  # line breaks become spaces so words do not run together
+
+        return sanitized_text
+    # def cleanup_full_citation_without_markup ends here
+
+
+    def gettext(xmlElement):
+        """Return the concatenated text of xmlElement and all its descendants (child tails included, own tail excluded)."""
+
+        xmlText = xmlElement.text or ""
+        for xmlChild in xmlElement:
+            xmlText += gettext(xmlChild)
+            if xmlChild.tail:
+                xmlText += xmlChild.tail  # text that follows the child's closing tag
+        return xmlText
+    # def gettext ends here
+
     try:
         cites = etree.parse(str(html_file))
     except OSError:
@@ -339,10 +383,12 @@ def format_citations(
             authoryear_citation = cites.xpath(f"//div[@class='authoryear']/p/span[@data-cites='{entry}']")[0].text
             year_citation = cites.xpath(f"//div[@class='year']/p/span[@data-cites='{entry}']")[0].text
             title = cites.xpath(f"//div[@class='title']/p/span[@data-cites='{entry}']")[0].text
+            # full_citation = cleanup_full_citation_with_markup(cites.xpath(f"//div[@class='full']/p[@data-cites='{entry}']")[0], entry)
+            full_citation = cleanup_full_citation_without_markup(cites.xpath(f"//div[@class='full']/p[@data-cites='{entry}']")[0])
         except IndexError:
             logging.error(f"Entry {entry} was not found in HTML file. Maybe you should run the tool again without -n option. Exiting.")
             sys.exit(1)
-        citation_dict[entry] = (authoryear_citation, year_citation, title)
+        citation_dict[entry] = (authoryear_citation, year_citation, title, full_citation)
 
     return citation_dict
 # def format_citations ends here
@@ -768,7 +814,7 @@ def handle_refs_default(ref):
             citation.set("citekey", citekey)
 
         citation.set("data-title", sanitized_citation_string)
-        citation.set("data-content", cited_data[citekey][2])
+        citation.set("data-content", cited_data[citekey][3])
         citation.text = sanitized_citation_string
 
     #############
@@ -1486,6 +1532,7 @@ def main():
         logging.info("Transforming body with Hyperimage support")
     else:
         pass
+
     body_transformed_tmp = transform_body(
             tei_body,
             cited_dict,

diff --git a/src/utils/libeoaconvert.py b/src/utils/libeoaconvert.py
@@ -467,10 +467,13 @@ def restore_xml_tags(text):
 # def restore_xml_tags ends here
 
 
-def escape_xml(text_bytes):
+def escape_xml(raw_text, decode=True):
     """Convert xml markup to entities"""
 
-    text = text_bytes.decode("utf-8")
+    if decode:
+        text = raw_text.decode("utf-8")
+    else:
+        text = raw_text
 
     replacements = {
         "&" : "&amp;",