Skip to content

Commit

Permalink
Browse files: browse the repository at this point in the history
More command line options
Highest level of sections can be specified
  • Loading branch information
Klaus Thoden committed May 29, 2018
1 parent 914ef82 commit ba29277
Showing 1 changed file with 58 additions and 24 deletions.
82 changes: 58 additions & 24 deletions fix_tei.py
Expand Up @@ -271,25 +271,44 @@ def cleanup_xml(xml_tree):
return xml_tree return xml_tree
# def cleanup_xml ends here # def cleanup_xml ends here


def fix_document_structure(xml_tree): def fix_document_structure(xml_tree, highest_level):
"""Insert div types""" """Insert div types"""


# Unsure here, but maybe have a rule that one file is one chapter, # Unsure here, but maybe have a rule that one file is one chapter,
# so the highest level would be sections # so the highest level would be sections


chapter_divs = xml_tree.xpath("//t:body/t:div", namespaces=NS_MAP) if highest_level == "chapter":
section_divs = xml_tree.xpath("//t:body/t:div/t:div", namespaces=NS_MAP) chapter_divs = xml_tree.xpath("//t:body/t:div", namespaces=NS_MAP)
subsection_divs = xml_tree.xpath("//t:body/t:div/t:div/t:div", namespaces=NS_MAP) section_divs = xml_tree.xpath("//t:body/t:div/t:div", namespaces=NS_MAP)
subsubsection_divs = xml_tree.xpath("//t:body/t:div/t:div/t:div/t:div", namespaces=NS_MAP) subsection_divs = xml_tree.xpath("//t:body/t:div/t:div/t:div", namespaces=NS_MAP)

subsubsection_divs = xml_tree.xpath("//t:body/t:div/t:div/t:div/t:div", namespaces=NS_MAP)
for chapter in chapter_divs:
chapter.set("type", "chapter") for chapter in chapter_divs:
for section in section_divs: chapter.set("type", "chapter")
section.set("type", "section") for section in section_divs:
for subsection in subsection_divs: section.set("type", "section")
subsection.set("type", "subsection") for subsection in subsection_divs:
for subsubsection in subsubsection_divs: subsection.set("type", "subsection")
subsubsection.set("type", "subsubsection") for subsubsection in subsubsection_divs:
subsubsection.set("type", "subsubsection")

elif highest_level == "part":
part_divs = xml_tree.xpath("//t:body/t:div", namespaces=NS_MAP)
chapter_divs = xml_tree.xpath("//t:body/t:div/t:div", namespaces=NS_MAP)
section_divs = xml_tree.xpath("//t:body/t:div/t:div/t:div", namespaces=NS_MAP)
subsection_divs = xml_tree.xpath("//t:body/t:div/t:div/t:div/t:div", namespaces=NS_MAP)
subsubsection_divs = xml_tree.xpath("//t:body/t:div/t:div/t:div/t:div/t:div", namespaces=NS_MAP)

for part in part_divs:
part.set("type", "part")
for chapter in chapter_divs:
chapter.set("type", "chapter")
for section in section_divs:
section.set("type", "section")
for subsection in subsection_divs:
subsection.set("type", "subsection")
for subsubsection in subsubsection_divs:
subsubsection.set("type", "subsubsection")


# section_divs = xml_tree.xpath("//t:body/t:div", namespaces=NS_MAP) # section_divs = xml_tree.xpath("//t:body/t:div", namespaces=NS_MAP)
# subsection_divs = xml_tree.xpath("//t:body/t:div/t:div", namespaces=NS_MAP) # subsection_divs = xml_tree.xpath("//t:body/t:div/t:div", namespaces=NS_MAP)
Expand Down Expand Up @@ -377,11 +396,18 @@ def main():
"""The main bit""" """The main bit"""


parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument("-d", "--dochighestorder", default='chapter', help="Specify which divider is at the highest level, possible values: part, chapter. Default is chapter.")
parser.add_argument("-f", "--finalize", help="Finalize a publication.", action="store_true")
parser.add_argument("teifile", help="Output from oxgarage/metypeset, an TEI XML file.") parser.add_argument("teifile", help="Output from oxgarage/metypeset, an TEI XML file.")
parser.add_argument("bibfile", help="The bibliography database of the publication.") parser.add_argument("bibfile", help="The bibliography database of the publication.")
parser.add_argument("figdir", help="The directory that contains the figures belonging to the publication.") parser.add_argument("figdir", help="The directory that contains the figures belonging to the publication.")
args = parser.parse_args() args = parser.parse_args()


highest_level = args.dochighestorder
if highest_level not in ["chapter", "part"]:
sys.stderr.write("Specify either 'chapter' or 'part' as highest level. Exiting")
sys.exit()

if not os.path.exists(TMP_DIR): if not os.path.exists(TMP_DIR):
os.makedirs(TMP_DIR) os.makedirs(TMP_DIR)


Expand Down Expand Up @@ -432,18 +458,23 @@ def main():
print("-"*60) print("-"*60)
exit() exit()


all_figures = xml_tree2.xpath("//t:graphic", namespaces=NS_MAP) if args.finalize:
bad_figures = make_figure_elements(all_figures, args.figdir) pass
else:
all_figures = xml_tree2.xpath("//t:graphic", namespaces=NS_MAP)
bad_figures = make_figure_elements(all_figures, args.figdir)


report["bad_figures"] = bad_figures report["bad_figures"] = bad_figures


all_references = xml_tree2.xpath("//t:bibl", namespaces=NS_MAP) all_references = xml_tree2.xpath("//t:bibl", namespaces=NS_MAP)
bad_pageref = parse_cited_range(all_references) if args.finalize:

pass
report["bad_pageref"] = bad_pageref else:
bad_pageref = parse_cited_range(all_references)
report["bad_pageref"] = bad_pageref


tei_header = xml_tree2.xpath("//t:teiHeader", namespaces=NS_MAP) tei_header = xml_tree2.xpath("//t:teiHeader", namespaces=NS_MAP)
fix_tei_header(tei_header[0], str(args.bibfile)) fix_tei_header(tei_header[0], str(args.bibfile))


etree.strip_tags(xml_tree2, "tagtobestripped") etree.strip_tags(xml_tree2, "tagtobestripped")


Expand Down Expand Up @@ -473,14 +504,17 @@ def main():
# Pickle the 'data' dictionary using the highest protocol available. # Pickle the 'data' dictionary using the highest protocol available.
pickle.dump(data_to_pickle, f, pickle.HIGHEST_PROTOCOL) pickle.dump(data_to_pickle, f, pickle.HIGHEST_PROTOCOL)


fix_document_structure(xml_tree2) fix_document_structure(xml_tree2, highest_level)
# output # output
output = args.teifile.replace(".xml", "-out.xml") output = args.teifile.replace(".xml", "-out.xml")
tree = etree.ElementTree(xml_tree2) tree = etree.ElementTree(xml_tree2)
tree.write(output, pretty_print=True, xml_declaration=True,encoding="utf-8") tree.write(output, pretty_print=True, xml_declaration=True,encoding="utf-8")
logging.info("Wrote %s." % output) logging.info("Wrote %s." % output)


evaluate_report(report) if args.finalize:
pass
else:
evaluate_report(report)
# def main ends here # def main ends here


if __name__ == '__main__': if __name__ == '__main__':
Expand Down

0 comments on commit ba29277

Please sign in to comment.