Skip to content

Commit

Permalink
Browse files (browse the repository at this point in the history)
More docx source cleaning
Signed-off-by: kthoden <kthoden@gwdg.de>
  • Loading branch information
kthoden committed Feb 8, 2019
1 parent cbf263b commit 317ab43
Showing 1 changed file with 16 additions and 0 deletions.
16 changes: 16 additions & 0 deletions fix_tei.py
Expand Up @@ -316,9 +316,13 @@ def cleanup_xml(xml_tree):

metypeset_attrib = xml_tree.findall("//t:*[@meTypesetSize]", namespaces=NS_MAP)
color_attrib = xml_tree.xpath("//t:hi[contains(@rend, 'color') or contains(@rend, 'background')]", namespaces=NS_MAP)
hi_style_attrib = xml_tree.xpath("//t:hi[contains(@style,'font-size')]", namespaces=NS_MAP)
xml_preserve = xml_tree.xpath("//*[@xml:space]")

logging.info("Found %s metypesets." % len(metypeset_attrib))
logging.info("Found %s colour attributes." % len(color_attrib))
logging.info(f"Found {len(hi_style_attrib)} hi style attributes.")
logging.info(f"Found {len(xml_preserve)} xml:space attributes.")

for attribute in metypeset_attrib:
logging.info("Number of attributes: %s" % len(attribute.attrib))
Expand All @@ -327,6 +331,18 @@ def cleanup_xml(xml_tree):
for attribute in color_attrib:
attribute.attrib.pop("rend")

for attribute in hi_style_attrib:
attribute.attrib.pop("style")

for attribute in xml_preserve:
attribute.attrib.pop("{http://www.w3.org/XML/1998/namespace}space")

empty_rend = xml_tree.xpath("//t:hi[(@rend='')]", namespaces=NS_MAP)
logging.info(f"Found {len(empty_rend)} empty rend attributes.")

for attribute in empty_rend:
attribute.attrib.pop("rend")

hi_without_attrib2 = xml_tree.findall("//t:hi", namespaces=NS_MAP)

for attribute in hi_without_attrib2:
Expand Down

0 comments on commit 317ab43

Please sign in to comment.