diff --git a/fix_tei.py b/fix_tei.py index 5ae2fdd..7003999 100644 --- a/fix_tei.py +++ b/fix_tei.py @@ -316,9 +316,13 @@ def cleanup_xml(xml_tree): metypeset_attrib = xml_tree.findall("//t:*[@meTypesetSize]", namespaces=NS_MAP) color_attrib = xml_tree.xpath("//t:hi[contains(@rend, 'color') or contains(@rend, 'background')]", namespaces=NS_MAP) + hi_style_attrib = xml_tree.xpath("//t:hi[contains(@style,'font-size')]", namespaces=NS_MAP) + xml_preserve = xml_tree.xpath("//*[@xml:space]") logging.info("Found %s metypesets." % len(metypeset_attrib)) logging.info("Found %s colour attributes." % len(color_attrib)) + logging.info(f"Found {len(hi_style_attrib)} hi style attributes.") + logging.info(f"Found {len(xml_preserve)} xml:space attributes.") for attribute in metypeset_attrib: logging.info("Number of attributes: %s" % len(attribute.attrib)) @@ -327,6 +331,18 @@ def cleanup_xml(xml_tree): for attribute in color_attrib: attribute.attrib.pop("rend") + for attribute in hi_style_attrib: + attribute.attrib.pop("style") + + for attribute in xml_preserve: + attribute.attrib.pop("{http://www.w3.org/XML/1998/namespace}space") + + empty_rend = xml_tree.xpath("//t:hi[(@rend='')]", namespaces=NS_MAP) + logging.info(f"Found {len(empty_rend)} empty rend attributes.") + + for attribute in empty_rend: + attribute.attrib.pop("rend") + hi_without_attrib2 = xml_tree.findall("//t:hi", namespaces=NS_MAP) for attribute in hi_without_attrib2: