Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
Popup now yields full citation (but without markup, see #41)
  • Loading branch information
kthoden committed Feb 28, 2020
1 parent 51a5089 commit 5cabc7f
Show file tree
Hide file tree
Showing 2 changed files with 58 additions and 8 deletions.
59 changes: 53 additions & 6 deletions src/tei2imxml.py
Expand Up @@ -237,11 +237,16 @@ def make_publication_cfg(info_dict, translation_file):
# def make_publication_cfg ends here


def sanitize_data_string(text_string):
def sanitize_data_string(text_string, newline_to_space=False):
"""Remove line breaks and multiple spaces"""

text_string = text_string.replace('\r', '')
text_string = text_string.replace('\n', '')
if newline_to_space:
text_string = text_string.replace('\r', ' ')
text_string = text_string.replace('\n', ' ')
else:
text_string = text_string.replace('\r', '')
text_string = text_string.replace('\n', '')

return_string = re.sub("\s\s+" , " ", text_string)

return return_string.strip()
Expand Down Expand Up @@ -324,9 +329,48 @@ def format_citations(
):
"""Return a dictionary of the used citations as formatted entries.
citation_dict[citekey] = (authoryear_citation, year_citation, title)
citation_dict[citekey] = (authoryear_citation, year_citation, title, full_citation)
"""

def cleanup_full_citation_with_markup(citation_element, entry):
    """Return the inner markup of a full citation as entity-escaped text.

    The element is serialised, line breaks are turned into spaces and
    runs of whitespace are collapsed.  Everything between the opening
    ``<p data-cites="{entry}">`` tag and the first ``</p>`` is then cut
    out and passed through ``libeoaconvert.escape_xml``.

    :param citation_element: the ``p`` element holding the full citation
    :param entry: citekey, expected as the ``data-cites`` attribute value
    :returns: the escaped inner markup of the paragraph
    :raises ValueError: if the opening or closing tag is not found in
        the serialised string (raised by ``str.index``)
    """

    serialized = etree.tostring(citation_element).decode('utf-8')
    flattened = sanitize_data_string(serialized, newline_to_space=True)

    opening_tag = f"""<p data-cites="{entry}">"""
    closing_tag = "</p>"

    # Locate the span strictly between the two tags.
    inner_start = flattened.index(opening_tag) + len(opening_tag)
    inner_end = flattened.index(closing_tag)
    inner_markup = flattened[inner_start:inner_end]

    # The slice is already a str, so escape_xml must not decode it.
    return libeoaconvert.escape_xml(inner_markup, decode=False)
# def cleanup_full_citation_with_markup ends here


def cleanup_full_citation_without_markup(citation_element):
    """Return the plain text of a full citation, stripped of all markup.

    The text nodes of the element and its descendants are concatenated
    by ``gettext``; line breaks are then replaced by spaces and runs of
    whitespace collapsed by ``sanitize_data_string``.

    :param citation_element: the element holding the full citation
    :returns: the sanitised text content as a single-line string
    """

    return sanitize_data_string(
        gettext(citation_element),
        newline_to_space=True,
    )
# def cleanup_full_citation_without_markup ends here


def gettext(xmlElement):
    """Get text nodes of element.

    Concatenates, in document order, the element's own leading text,
    the text of every descendant (collected recursively) and the tail
    text that follows each child.

    :param xmlElement: an element exposing ``.text``, ``.tail`` and
        iteration over its children
    :returns: the combined text as one string; empty string if the
        element carries no text at all
    """

    # Collect the pieces and join once instead of growing a string
    # with ``+=`` on every child.
    fragments = [xmlElement.text or ""]
    for xmlChild in xmlElement:
        fragments.append(gettext(xmlChild))
        # The tail is the text between this child's end tag and the next
        # sibling (or the parent's end tag); it belongs to the parent.
        fragments.append(xmlChild.tail or "")
    return "".join(fragments)
# def gettext ends here

try:
cites = etree.parse(str(html_file))
except OSError:
Expand All @@ -339,10 +383,12 @@ def format_citations(
authoryear_citation = cites.xpath(f"//div[@class='authoryear']/p/span[@data-cites='{entry}']")[0].text
year_citation = cites.xpath(f"//div[@class='year']/p/span[@data-cites='{entry}']")[0].text
title = cites.xpath(f"//div[@class='title']/p/span[@data-cites='{entry}']")[0].text
# full_citation = cleanup_full_citation_with_markup(cites.xpath(f"//div[@class='full']/p[@data-cites='{entry}']")[0], entry)
full_citation = cleanup_full_citation_without_markup(cites.xpath(f"//div[@class='full']/p[@data-cites='{entry}']")[0])
except IndexError:
logging.error(f"Entry {entry} was not found in HTML file. Maybe you should run the tool again without -n option. Exiting.")
sys.exit(1)
citation_dict[entry] = (authoryear_citation, year_citation, title)
citation_dict[entry] = (authoryear_citation, year_citation, title, full_citation)

return citation_dict
# def format_citations ends here
Expand Down Expand Up @@ -768,7 +814,7 @@ def handle_refs_default(ref):
citation.set("citekey", citekey)

citation.set("data-title", sanitized_citation_string)
citation.set("data-content", cited_data[citekey][2])
citation.set("data-content", cited_data[citekey][3])
citation.text = sanitized_citation_string

#############
Expand Down Expand Up @@ -1486,6 +1532,7 @@ def main():
logging.info("Transforming body with Hyperimage support")
else:
pass

body_transformed_tmp = transform_body(
tei_body,
cited_dict,
Expand Down
7 changes: 5 additions & 2 deletions src/utils/libeoaconvert.py
Expand Up @@ -467,10 +467,13 @@ def restore_xml_tags(text):
# def restore_xml_tags ends here


def escape_xml(text_bytes):
def escape_xml(raw_text, decode=True):
"""Convert xml markup to entities"""

text = text_bytes.decode("utf-8")
if decode:
text = raw_text.decode("utf-8")
else:
text = raw_text

replacements = {
"&" : "&amp;",
Expand Down

0 comments on commit 5cabc7f

Please sign in to comment.