Permalink
Browse files

More command line options

Highest level of sections can be specified
  • Loading branch information...
kthoden committed May 29, 2018
1 parent 914ef82 commit ba29277d82b2a3e4971f06b7dc2fdd5ffdf6f61e
Showing with 58 additions and 24 deletions.
  1. +58 −24 fix_tei.py
View
@@ -271,25 +271,44 @@ def cleanup_xml(xml_tree):
return xml_tree
# def cleanup_xml ends here
def fix_document_structure(xml_tree):
def fix_document_structure(xml_tree, highest_level):
"""Insert div types"""
# Unsure here, but maybe have a rule that one file is one chapter,
# so the highest level would be sections
chapter_divs = xml_tree.xpath("//t:body/t:div", namespaces=NS_MAP)
section_divs = xml_tree.xpath("//t:body/t:div/t:div", namespaces=NS_MAP)
subsection_divs = xml_tree.xpath("//t:body/t:div/t:div/t:div", namespaces=NS_MAP)
subsubsection_divs = xml_tree.xpath("//t:body/t:div/t:div/t:div/t:div", namespaces=NS_MAP)
for chapter in chapter_divs:
chapter.set("type", "chapter")
for section in section_divs:
section.set("type", "section")
for subsection in subsection_divs:
subsection.set("type", "subsection")
for subsubsection in subsubsection_divs:
subsubsection.set("type", "subsubsection")
if highest_level == "chapter":
chapter_divs = xml_tree.xpath("//t:body/t:div", namespaces=NS_MAP)
section_divs = xml_tree.xpath("//t:body/t:div/t:div", namespaces=NS_MAP)
subsection_divs = xml_tree.xpath("//t:body/t:div/t:div/t:div", namespaces=NS_MAP)
subsubsection_divs = xml_tree.xpath("//t:body/t:div/t:div/t:div/t:div", namespaces=NS_MAP)
for chapter in chapter_divs:
chapter.set("type", "chapter")
for section in section_divs:
section.set("type", "section")
for subsection in subsection_divs:
subsection.set("type", "subsection")
for subsubsection in subsubsection_divs:
subsubsection.set("type", "subsubsection")
elif highest_level == "part":
part_divs = xml_tree.xpath("//t:body/t:div", namespaces=NS_MAP)
chapter_divs = xml_tree.xpath("//t:body/t:div/t:div", namespaces=NS_MAP)
section_divs = xml_tree.xpath("//t:body/t:div/t:div/t:div", namespaces=NS_MAP)
subsection_divs = xml_tree.xpath("//t:body/t:div/t:div/t:div/t:div", namespaces=NS_MAP)
subsubsection_divs = xml_tree.xpath("//t:body/t:div/t:div/t:div/t:div/t:div", namespaces=NS_MAP)
for part in part_divs:
part.set("type", "part")
for chapter in chapter_divs:
chapter.set("type", "chapter")
for section in section_divs:
section.set("type", "section")
for subsection in subsection_divs:
subsection.set("type", "subsection")
for subsubsection in subsubsection_divs:
subsubsection.set("type", "subsubsection")
# section_divs = xml_tree.xpath("//t:body/t:div", namespaces=NS_MAP)
# subsection_divs = xml_tree.xpath("//t:body/t:div/t:div", namespaces=NS_MAP)
@@ -377,11 +396,18 @@ def main():
"""The main bit"""
parser = argparse.ArgumentParser()
parser.add_argument("-d", "--dochighestorder", default='chapter', help="Specify which divider is at the highest level, possible values: part, chapter. Default is chapter.")
parser.add_argument("-f", "--finalize", help="Finalize a publication.", action="store_true")
parser.add_argument("teifile", help="Output from oxgarage/metypeset, an TEI XML file.")
parser.add_argument("bibfile", help="The bibliography database of the publication.")
parser.add_argument("figdir", help="The directory that contains the figures belonging to the publication.")
args = parser.parse_args()
highest_level = args.dochighestorder
if highest_level not in ["chapter", "part"]:
sys.stderr.write("Specify either 'chapter' or 'part' as highest level. Exiting")
sys.exit()
if not os.path.exists(TMP_DIR):
os.makedirs(TMP_DIR)
@@ -432,18 +458,23 @@ def main():
print("-"*60)
exit()
all_figures = xml_tree2.xpath("//t:graphic", namespaces=NS_MAP)
bad_figures = make_figure_elements(all_figures, args.figdir)
if args.finalize:
pass
else:
all_figures = xml_tree2.xpath("//t:graphic", namespaces=NS_MAP)
bad_figures = make_figure_elements(all_figures, args.figdir)
report["bad_figures"] = bad_figures
report["bad_figures"] = bad_figures
all_references = xml_tree2.xpath("//t:bibl", namespaces=NS_MAP)
bad_pageref = parse_cited_range(all_references)
report["bad_pageref"] = bad_pageref
if args.finalize:
pass
else:
bad_pageref = parse_cited_range(all_references)
report["bad_pageref"] = bad_pageref
tei_header = xml_tree2.xpath("//t:teiHeader", namespaces=NS_MAP)
fix_tei_header(tei_header[0], str(args.bibfile))
tei_header = xml_tree2.xpath("//t:teiHeader", namespaces=NS_MAP)
fix_tei_header(tei_header[0], str(args.bibfile))
etree.strip_tags(xml_tree2, "tagtobestripped")
@@ -473,14 +504,17 @@ def main():
# Pickle the 'data' dictionary using the highest protocol available.
pickle.dump(data_to_pickle, f, pickle.HIGHEST_PROTOCOL)
fix_document_structure(xml_tree2)
fix_document_structure(xml_tree2, highest_level)
# output
output = args.teifile.replace(".xml", "-out.xml")
tree = etree.ElementTree(xml_tree2)
tree.write(output, pretty_print=True, xml_declaration=True,encoding="utf-8")
logging.info("Wrote %s." % output)
evaluate_report(report)
if args.finalize:
pass
else:
evaluate_report(report)
# def main ends here
if __name__ == '__main__':

0 comments on commit ba29277

Please sign in to comment.