Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
Math conversion
  • Loading branch information
kthoden committed Feb 11, 2019
1 parent 4c7aa98 commit 6a5e3eb
Showing 1 changed file with 42 additions and 2 deletions.
44 changes: 42 additions & 2 deletions fix_tei.py
Expand Up @@ -176,6 +176,27 @@ def convert_citations(string, citedrangetext):
return (string, citations)
# def convert_citations ends here


def convert_math(string):
    """Convert inline TeX math shorthand into TEI ``formula`` elements.

    A run of text enclosed by dollar signs is rewritten as, for example,
    ``<formula notation="tex" rend="inline">2^2 = 4</formula>``.

    Each delimiter may be a literal ``$`` or one of the character
    references ``&#x24;`` / ``&#36;``; the opening and closing delimiter
    do not have to use the same form. The match is non-greedy and ``.``
    does not match a newline, so a formula must start and end on the same
    line. Any two delimiters on one line are paired, whatever lies
    between them.

    Args:
        string: The text (serialized XML) to search.

    Returns:
        The text with every delimited span replaced by a ``formula``
        element. The number of replacements is logged at INFO level.
    """
    math_pattern = re.compile(r"(\$|&#x24;|&#36;)(?P<contents>.+?)(\$|&#x24;|&#36;)")

    # subn() replaces and counts in a single pass, so no separate
    # findall() scan or throw-away result list is needed.
    string, formula_count = math_pattern.subn(
        r'<formula notation="tex" rend="inline">\g<contents></formula>',
        string,
    )
    logging.info(f"Found {formula_count} formulae.")

    return string
# def convert_math ends here


def parse_cited_range(list_of_xml_elements):
"""citedRange: split up parameters or remove element if attributes are empty"""

Expand Down Expand Up @@ -340,6 +361,19 @@ def cleanup_xml(xml_tree):
for attribute in xml_preserve:
attribute.attrib.pop("{http://www.w3.org/XML/1998/namespace}space")

formulae = xmltree.xpath("//t:formula", namespaces=NS_MAP)
logging.info(f"Found {len(formulae)} formulae.")

for formula in formulae:
hi_in_formula = formula.xpath("t:hi", namespaces=NS_MAP)
for hi in hi_in_formula:
logging.debug(f"Found something in the formula: {etree.tostring(hi)}")
if hi.attrib["rend"] == "italic":
hi.attrib.pop("rend")
else:
logging.warning(f"Found another rend attribute: {hi.get('rend')}")
hi.tag = "tagtobestripped"

empty_rend = xml_tree.xpath("//t:hi[(@rend='')]", namespaces=NS_MAP)
logging.info(f"Found {len(empty_rend)} empty rend attributes.")

Expand All @@ -353,6 +387,10 @@ def cleanup_xml(xml_tree):
xml_parent = attribute.getparent()
attribute.tag = "tagtobestripped"

seg_element = xml_tree.findall("//t:seg", namespaces=NS_MAP)
for seg in seg_element:
seg.tag = "tagtobestripped"

footnotes = xml_tree.xpath("//t:note[@place='foot']", namespaces=NS_MAP)
for footnote in footnotes:
footnote.set("place", "bottom")
Expand Down Expand Up @@ -642,14 +680,16 @@ def main():

mod_string3 = convert_figures(mod_string2)

math_string = convert_math(mod_string3)

debug_output = TMP_DIR + os.path.sep + args.teifile.replace(".xml", "-modified.xml")
with open(debug_output, "w") as debugfile:
debugfile.write(mod_string3)
debugfile.write(math_string)
logging.info("Wrote %s." % debug_output)

# check for wellformedness, read again as xml
try:
xml_tree2 = etree.fromstring(mod_string3)
xml_tree2 = etree.fromstring(math_string)
except etree.XMLSyntaxError:
logging.error("\nXML syntax error when trying to parse modified tree. Dumped it to %s." % debug_output)
print("-"*60)
Expand Down

0 comments on commit 6a5e3eb

Please sign in to comment.