Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
Inserting code for references
  • Loading branch information
Klaus Thoden committed Mar 23, 2018
1 parent eece57e commit ea0f654
Showing 1 changed file with 21 additions and 5 deletions.
26 changes: 21 additions & 5 deletions fix_tei.py
Expand Up @@ -79,6 +79,18 @@ def fixup(m):
return re.sub(r"&#?\w+;", fixup, text)
# def unescape ends here

def convert_references(string):
    """Wrap every ``#...#``-delimited reference in a ``<ref>`` element.

    Each non-empty run of characters enclosed by two ``#`` signs on the
    same line is replaced by ``<ref><![CDATA[...]]></ref>``, where ``...``
    is the text that stood between the markers. The markers themselves
    are dropped.

    Args:
        string: The text to scan for reference markers.

    Returns:
        The text with all references converted. Input without any
        markers is returned unchanged.
    """

    references_pattern = re.compile(r"(#)(?P<reference>.+?)(#)")

    # One pass replaces all matches. Repeating the substitution would
    # re-match across the output of earlier replacements and corrupt it.
    # The named group carries the reference text; group 1 is only the
    # '#' delimiter.
    string, found = references_pattern.subn(
        r"<ref><![CDATA[\g<reference>]]></ref>", string
    )
    logging.info("Found %s references", found)

    return string
# def convert_references ends here

def convert_citations(string):
"""Find citation shorthand using regex.
Expand Down Expand Up @@ -384,28 +396,32 @@ def main():
xml_cleaned = cleanup_xml(xml_tree)
cleaned_path = TMP_DIR + os.path.sep + args.teifile.replace(".xml", "-cleaned.xml")
xml_cleaned.write(cleaned_path, pretty_print=True, xml_declaration=True, encoding="utf-8")
logging.info("Wrote cleanup.xml")
logging.info("Wrote %s." % cleaned_path)

# first some modifications on a string object
xml_string = etree.tostring(xml_cleaned).decode('utf-8')
mod_string, cited = convert_citations(xml_string)

# the '#' sign is a bad choice!
# mod_string = convert_references(xml_string)

mod_string2, cited = convert_citations(xml_string)

used_citekeys = [unescape(c) for c in cited]
not_cited = validate_citations(used_citekeys, bibdata)

report["len_citekeys"] = len(used_citekeys)
report["not_cited"] = not_cited

mod_string2 = convert_figures(mod_string)
mod_string3 = convert_figures(mod_string2)

debug_output = TMP_DIR + os.path.sep + args.teifile.replace(".xml", "-modified.xml")
with open(debug_output, "w") as debugfile:
debugfile.write(mod_string2)
debugfile.write(mod_string3)
logging.info("Wrote %s." % debug_output)

# check for wellformedness, read again as xml
try:
xml_tree2 = etree.fromstring(mod_string2)
xml_tree2 = etree.fromstring(mod_string3)
except etree.XMLSyntaxError:
print("\nXML syntax error when trying to parse modified tree. Dumped it to %s." % debug_output)
print("-"*60)
Expand Down

0 comments on commit ea0f654

Please sign in to comment.