Permalink
Browse files

More command line options

Highest level of sections can be specified
  • Loading branch information...
kthoden committed May 29, 2018
1 parent 914ef82 commit ba29277d82b2a3e4971f06b7dc2fdd5ffdf6f61e
Showing with 58 additions and 24 deletions.
  1. +58 −24 fix_tei.py
View
@@ -271,25 +271,44 @@ def cleanup_xml(xml_tree):
return xml_tree
# def cleanup_xml ends here
-def fix_document_structure(xml_tree):
+def fix_document_structure(xml_tree, highest_level):
"""Insert div types"""
# Unsure here, but maybe have a rule that one file is one chapter,
# so the highest level would be sections
- chapter_divs = xml_tree.xpath("//t:body/t:div", namespaces=NS_MAP)
- section_divs = xml_tree.xpath("//t:body/t:div/t:div", namespaces=NS_MAP)
- subsection_divs = xml_tree.xpath("//t:body/t:div/t:div/t:div", namespaces=NS_MAP)
- subsubsection_divs = xml_tree.xpath("//t:body/t:div/t:div/t:div/t:div", namespaces=NS_MAP)
-
- for chapter in chapter_divs:
- chapter.set("type", "chapter")
- for section in section_divs:
- section.set("type", "section")
- for subsection in subsection_divs:
- subsection.set("type", "subsection")
- for subsubsection in subsubsection_divs:
- subsubsection.set("type", "subsubsection")
+ if highest_level == "chapter":
+ chapter_divs = xml_tree.xpath("//t:body/t:div", namespaces=NS_MAP)
+ section_divs = xml_tree.xpath("//t:body/t:div/t:div", namespaces=NS_MAP)
+ subsection_divs = xml_tree.xpath("//t:body/t:div/t:div/t:div", namespaces=NS_MAP)
+ subsubsection_divs = xml_tree.xpath("//t:body/t:div/t:div/t:div/t:div", namespaces=NS_MAP)
+
+ for chapter in chapter_divs:
+ chapter.set("type", "chapter")
+ for section in section_divs:
+ section.set("type", "section")
+ for subsection in subsection_divs:
+ subsection.set("type", "subsection")
+ for subsubsection in subsubsection_divs:
+ subsubsection.set("type", "subsubsection")
+
+ elif highest_level == "part":
+ part_divs = xml_tree.xpath("//t:body/t:div", namespaces=NS_MAP)
+ chapter_divs = xml_tree.xpath("//t:body/t:div/t:div", namespaces=NS_MAP)
+ section_divs = xml_tree.xpath("//t:body/t:div/t:div/t:div", namespaces=NS_MAP)
+ subsection_divs = xml_tree.xpath("//t:body/t:div/t:div/t:div/t:div", namespaces=NS_MAP)
+ subsubsection_divs = xml_tree.xpath("//t:body/t:div/t:div/t:div/t:div/t:div", namespaces=NS_MAP)
+
+ for part in part_divs:
+ part.set("type", "part")
+ for chapter in chapter_divs:
+ chapter.set("type", "chapter")
+ for section in section_divs:
+ section.set("type", "section")
+ for subsection in subsection_divs:
+ subsection.set("type", "subsection")
+ for subsubsection in subsubsection_divs:
+ subsubsection.set("type", "subsubsection")
# section_divs = xml_tree.xpath("//t:body/t:div", namespaces=NS_MAP)
# subsection_divs = xml_tree.xpath("//t:body/t:div/t:div", namespaces=NS_MAP)
@@ -377,11 +396,18 @@ def main():
"""The main bit"""
parser = argparse.ArgumentParser()
+ parser.add_argument("-d", "--dochighestorder", default='chapter', help="Specify which divider is at the highest level, possible values: part, chapter. Default is chapter.")
+ parser.add_argument("-f", "--finalize", help="Finalize a publication.", action="store_true")
parser.add_argument("teifile", help="Output from oxgarage/metypeset, an TEI XML file.")
parser.add_argument("bibfile", help="The bibliography database of the publication.")
parser.add_argument("figdir", help="The directory that contains the figures belonging to the publication.")
args = parser.parse_args()
+ highest_level = args.dochighestorder
+ if highest_level not in ["chapter", "part"]:
+ sys.stderr.write("Specify either 'chapter' or 'part' as highest level. Exiting")
+ sys.exit()
+
if not os.path.exists(TMP_DIR):
os.makedirs(TMP_DIR)
@@ -432,18 +458,23 @@ def main():
print("-"*60)
exit()
- all_figures = xml_tree2.xpath("//t:graphic", namespaces=NS_MAP)
- bad_figures = make_figure_elements(all_figures, args.figdir)
+ if args.finalize:
+ pass
+ else:
+ all_figures = xml_tree2.xpath("//t:graphic", namespaces=NS_MAP)
+ bad_figures = make_figure_elements(all_figures, args.figdir)
- report["bad_figures"] = bad_figures
+ report["bad_figures"] = bad_figures
all_references = xml_tree2.xpath("//t:bibl", namespaces=NS_MAP)
- bad_pageref = parse_cited_range(all_references)
-
- report["bad_pageref"] = bad_pageref
+ if args.finalize:
+ pass
+ else:
+ bad_pageref = parse_cited_range(all_references)
+ report["bad_pageref"] = bad_pageref
- tei_header = xml_tree2.xpath("//t:teiHeader", namespaces=NS_MAP)
- fix_tei_header(tei_header[0], str(args.bibfile))
+ tei_header = xml_tree2.xpath("//t:teiHeader", namespaces=NS_MAP)
+ fix_tei_header(tei_header[0], str(args.bibfile))
etree.strip_tags(xml_tree2, "tagtobestripped")
@@ -473,14 +504,17 @@ def main():
# Pickle the 'data' dictionary using the highest protocol available.
pickle.dump(data_to_pickle, f, pickle.HIGHEST_PROTOCOL)
- fix_document_structure(xml_tree2)
+ fix_document_structure(xml_tree2, highest_level)
# output
output = args.teifile.replace(".xml", "-out.xml")
tree = etree.ElementTree(xml_tree2)
tree.write(output, pretty_print=True, xml_declaration=True,encoding="utf-8")
logging.info("Wrote %s." % output)
- evaluate_report(report)
+ if args.finalize:
+ pass
+ else:
+ evaluate_report(report)
# def main ends here
if __name__ == '__main__':

0 comments on commit ba29277

Please sign in to comment.