diff --git a/fix_tei.py b/fix_tei.py
index 963da3f..1797f93 100644
--- a/fix_tei.py
+++ b/fix_tei.py
@@ -176,6 +176,27 @@ def convert_citations(string, citedrangetext):
return (string, citations)
# def convert_citations ends here
+
def convert_math(string: str) -> str:
    """Wrap dollar-delimited math shorthand in ``<formula>`` elements.

    Every non-empty, single-line run of text enclosed in a pair of ``$``
    characters is replaced by the same text inside a ``formula`` element;
    the delimiters themselves are dropped.

    Example: ``$2^2 = 4$`` becomes ``<formula>2^2 = 4</formula>``.

    Args:
        string: Serialized document text to scan.

    Returns:
        The text with every delimited formula wrapped. Input without a
        complete ``$...$`` pair is returned unchanged.
    """
    # Non-greedy so that several formulae on one line are matched
    # separately. "." does not match newlines, so a formula never spans
    # more than one line.
    # NOTE(review): only a literal "$" is accepted as delimiter; if other
    # delimiter spellings were intended, add them here -- TODO confirm.
    math_pattern = re.compile(r"\$(?P<formula>.+?)\$")

    # NOTE(review): the element is emitted without attributes; confirm
    # whether the target schema expects e.g. a notation attribute.
    string, number_of_formulae = math_pattern.subn(
        r"<formula>\g<formula></formula>", string
    )
    logging.info("Found %d formulae.", number_of_formulae)

    return string
# def convert_math ends here
+
+
def parse_cited_range(list_of_xml_elements):
"""citedRange: split up parameters or remove element if attributes are empty"""
@@ -340,6 +361,19 @@ def cleanup_xml(xml_tree):
for attribute in xml_preserve:
attribute.attrib.pop("{http://www.w3.org/XML/1998/namespace}space")
+ formulae = xmltree.xpath("//t:formula", namespaces=NS_MAP)
+ logging.info(f"Found {len(formulae)} formulae.")
+
+ for formula in formulae:
+ hi_in_formula = formula.xpath("t:hi", namespaces=NS_MAP)
+ for hi in hi_in_formula:
+ logging.debug(f"Found something in the formula: {etree.tostring(hi)}")
+ if hi.attrib["rend"] == "italic":
+ hi.attrib.pop("rend")
+ else:
+ logging.warning(f"Found another rend attribute: {hi.get('rend')}")
+ hi.tag = "tagtobestripped"
+
empty_rend = xml_tree.xpath("//t:hi[(@rend='')]", namespaces=NS_MAP)
logging.info(f"Found {len(empty_rend)} empty rend attributes.")
@@ -353,6 +387,10 @@ def cleanup_xml(xml_tree):
xml_parent = attribute.getparent()
attribute.tag = "tagtobestripped"
+ seg_element = xml_tree.findall("//t:seg", namespaces=NS_MAP)
+ for seg in seg_element:
+ seg.tag = "tagtobestripped"
+
footnotes = xml_tree.xpath("//t:note[@place='foot']", namespaces=NS_MAP)
for footnote in footnotes:
footnote.set("place", "bottom")
@@ -642,14 +680,16 @@ def main():
mod_string3 = convert_figures(mod_string2)
+ math_string = convert_math(mod_string3)
+
debug_output = TMP_DIR + os.path.sep + args.teifile.replace(".xml", "-modified.xml")
with open(debug_output, "w") as debugfile:
- debugfile.write(mod_string3)
+ debugfile.write(math_string)
logging.info("Wrote %s." % debug_output)
# check for wellformedness, read again as xml
try:
- xml_tree2 = etree.fromstring(mod_string3)
+ xml_tree2 = etree.fromstring(math_string)
except etree.XMLSyntaxError:
logging.error("\nXML syntax error when trying to parse modified tree. Dumped it to %s." % debug_output)
print("-"*60)