From 6a5e3eb969a0c15ec19281398cb5d89e6415f5dd Mon Sep 17 00:00:00 2001 From: kthoden Date: Mon, 11 Feb 2019 16:28:12 +0100 Subject: [PATCH] Math conversion --- fix_tei.py | 44 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 42 insertions(+), 2 deletions(-) diff --git a/fix_tei.py b/fix_tei.py index 963da3f..1797f93 100644 --- a/fix_tei.py +++ b/fix_tei.py @@ -176,6 +176,27 @@ def convert_citations(string, citedrangetext): return (string, citations) # def convert_citations ends here + +def convert_math(string): + """Find math shorthand using regex. + + 2^2 = 4 + """ + formulae = [] + + math_pattern = re.compile(r"(\$|$|$)(?P.+?)(\$|$|$)") + found_math = re.findall(math_pattern, string) + logging.info(f"Found {len(found_math)} formulae.") + + string = re.sub(math_pattern, r'\g', string) + + for formula in found_math: + formulae.append(formula[1]) + + return string +# def convert_math ends here + + def parse_cited_range(list_of_xml_elements): """citedRange: split up parameters or remove element if attributes are empty""" @@ -340,6 +361,19 @@ def cleanup_xml(xml_tree): for attribute in xml_preserve: attribute.attrib.pop("{http://www.w3.org/XML/1998/namespace}space") + formulae = xmltree.xpath("//t:formula", namespaces=NS_MAP) + logging.info(f"Found {len(formulae)} formulae.") + + for formula in formulae: + hi_in_formula = formula.xpath("t:hi", namespaces=NS_MAP) + for hi in hi_in_formula: + logging.debug(f"Found something in the formula: {etree.tostring(hi)}") + if hi.attrib["rend"] == "italic": + hi.attrib.pop("rend") + else: + logging.warning(f"Found another rend attribute: {hi.get('rend')}") + hi.tag = "tagtobestripped" + empty_rend = xml_tree.xpath("//t:hi[(@rend='')]", namespaces=NS_MAP) logging.info(f"Found {len(empty_rend)} empty rend attributes.") @@ -353,6 +387,10 @@ def cleanup_xml(xml_tree): xml_parent = attribute.getparent() attribute.tag = "tagtobestripped" + seg_element = xml_tree.findall("//t:seg", namespaces=NS_MAP) + for seg in seg_element: + seg.tag = "tagtobestripped" + footnotes = xml_tree.xpath("//t:note[@place='foot']", namespaces=NS_MAP) for footnote in footnotes: footnote.set("place", "bottom") @@ -642,14 +680,16 @@ def main(): mod_string3 = convert_figures(mod_string2) + math_string = convert_math(mod_string3) + debug_output = TMP_DIR + os.path.sep + args.teifile.replace(".xml", "-modified.xml") with open(debug_output, "w") as debugfile: - debugfile.write(mod_string3) + debugfile.write(math_string) logging.info("Wrote %s." % debug_output) # check for wellformedness, read again as xml try: - xml_tree2 = etree.fromstring(mod_string3) + xml_tree2 = etree.fromstring(math_string) except etree.XMLSyntaxError: logging.error("\nXML syntax error when trying to parse modified tree. Dumped it to %s." % debug_output) print("-"*60)