NOTE(review): layout restored from a whitespace-collapsed copy of this patch.
Indentation and the position of blank context lines are inferred from the
hunk line counts; verify against the repository before applying.
In convert_references the regex group name was missing ("(?P.+?)" does not
compile); "reference" is a placeholder name. The re.sub replacement template
r"]]>" also looks truncated - confirm the intended markup.

diff --git a/fix_tei.py b/fix_tei.py
index 8d18495..3563135 100644
--- a/fix_tei.py
+++ b/fix_tei.py
@@ -79,6 +79,18 @@ def fixup(m):
     return re.sub(r"&#?\w+;", fixup, text)
 # def unescape ends here
 
+def convert_references(string):
+    """Find reference markers (#) in the text"""
+
+    references_pattern = re.compile(r"(#)(?P<reference>.+?)(#)")
+    references = re.findall(references_pattern, string)
+    logging.info("Found %s references" % len(references))
+    for reference in references:
+        string = re.sub(references_pattern, r"]]>", string)
+
+    return string
+# def convert_references ends here
+
 def convert_citations(string):
     """Find citation shorthand using regex.
 
@@ -384,11 +396,15 @@ def main():
     xml_cleaned = cleanup_xml(xml_tree)
     cleaned_path = TMP_DIR + os.path.sep + args.teifile.replace(".xml", "-cleaned.xml")
     xml_cleaned.write(cleaned_path, pretty_print=True, xml_declaration=True, encoding="utf-8")
-    logging.info("Wrote cleanup.xml")
+    logging.info("Wrote %s." % cleaned_path)
 
     # first some modifications on a string object
     xml_string = etree.tostring(xml_cleaned).decode('utf-8')
-    mod_string, cited = convert_citations(xml_string)
+
+    # the '#' sign is a bad choice!
+    # mod_string = convert_references(xml_string)
+
+    mod_string2, cited = convert_citations(xml_string)
     used_citekeys = [unescape(c) for c in cited]
     not_cited = validate_citations(used_citekeys, bibdata)
 
@@ -396,16 +412,16 @@ def main():
     report["len_citekeys"] = len(used_citekeys)
     report["not_cited"] = not_cited
 
-    mod_string2 = convert_figures(mod_string)
+    mod_string3 = convert_figures(mod_string2)
 
     debug_output = TMP_DIR + os.path.sep + args.teifile.replace(".xml", "-modified.xml")
     with open(debug_output, "w") as debugfile:
-        debugfile.write(mod_string2)
+        debugfile.write(mod_string3)
     logging.info("Wrote %s." % debug_output)
 
     # check for wellformedness, read again as xml
     try:
-        xml_tree2 = etree.fromstring(mod_string2)
+        xml_tree2 = etree.fromstring(mod_string3)
     except etree.XMLSyntaxError:
         print("\nXML syntax error when trying to parse modified tree. Dumped it to %s." % debug_output)
         print("-"*60)