From ba29277d82b2a3e4971f06b7dc2fdd5ffdf6f61e Mon Sep 17 00:00:00 2001 From: Klaus Thoden Date: Tue, 29 May 2018 16:47:58 +0200 Subject: [PATCH] More command line options Highest level of sections can be specified --- fix_tei.py | 82 ++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 58 insertions(+), 24 deletions(-) diff --git a/fix_tei.py b/fix_tei.py index 5a46262..9143e4e 100644 --- a/fix_tei.py +++ b/fix_tei.py @@ -271,25 +271,44 @@ def cleanup_xml(xml_tree): return xml_tree # def cleanup_xml ends here -def fix_document_structure(xml_tree): +def fix_document_structure(xml_tree, highest_level): """Insert div types""" # Unsure here, but maybe have a rule that one file is one chapter, # so the highest level would be sections - chapter_divs = xml_tree.xpath("//t:body/t:div", namespaces=NS_MAP) - section_divs = xml_tree.xpath("//t:body/t:div/t:div", namespaces=NS_MAP) - subsection_divs = xml_tree.xpath("//t:body/t:div/t:div/t:div", namespaces=NS_MAP) - subsubsection_divs = xml_tree.xpath("//t:body/t:div/t:div/t:div/t:div", namespaces=NS_MAP) - - for chapter in chapter_divs: - chapter.set("type", "chapter") - for section in section_divs: - section.set("type", "section") - for subsection in subsection_divs: - subsection.set("type", "subsection") - for subsubsection in subsubsection_divs: - subsubsection.set("type", "subsubsection") + if highest_level == "chapter": + chapter_divs = xml_tree.xpath("//t:body/t:div", namespaces=NS_MAP) + section_divs = xml_tree.xpath("//t:body/t:div/t:div", namespaces=NS_MAP) + subsection_divs = xml_tree.xpath("//t:body/t:div/t:div/t:div", namespaces=NS_MAP) + subsubsection_divs = xml_tree.xpath("//t:body/t:div/t:div/t:div/t:div", namespaces=NS_MAP) + + for chapter in chapter_divs: + chapter.set("type", "chapter") + for section in section_divs: + section.set("type", "section") + for subsection in subsection_divs: + subsection.set("type", "subsection") + for subsubsection in subsubsection_divs: + subsubsection.set("type", "subsubsection") + + elif highest_level == "part": + part_divs = xml_tree.xpath("//t:body/t:div", namespaces=NS_MAP) + chapter_divs = xml_tree.xpath("//t:body/t:div/t:div", namespaces=NS_MAP) + section_divs = xml_tree.xpath("//t:body/t:div/t:div/t:div", namespaces=NS_MAP) + subsection_divs = xml_tree.xpath("//t:body/t:div/t:div/t:div/t:div", namespaces=NS_MAP) + subsubsection_divs = xml_tree.xpath("//t:body/t:div/t:div/t:div/t:div/t:div", namespaces=NS_MAP) + + for part in part_divs: + part.set("type", "part") + for chapter in chapter_divs: + chapter.set("type", "chapter") + for section in section_divs: + section.set("type", "section") + for subsection in subsection_divs: + subsection.set("type", "subsection") + for subsubsection in subsubsection_divs: + subsubsection.set("type", "subsubsection") # section_divs = xml_tree.xpath("//t:body/t:div", namespaces=NS_MAP) # subsection_divs = xml_tree.xpath("//t:body/t:div/t:div", namespaces=NS_MAP) @@ -377,11 +396,18 @@ def main(): """The main bit""" parser = argparse.ArgumentParser() + parser.add_argument("-d", "--dochighestorder", default='chapter', help="Specify which divider is at the highest level, possible values: part, chapter. Default is chapter.") + parser.add_argument("-f", "--finalize", help="Finalize a publication.", action="store_true") parser.add_argument("teifile", help="Output from oxgarage/metypeset, an TEI XML file.") parser.add_argument("bibfile", help="The bibliography database of the publication.") parser.add_argument("figdir", help="The directory that contains the figures belonging to the publication.") args = parser.parse_args() + highest_level = args.dochighestorder + if highest_level not in ["chapter", "part"]: + sys.stderr.write("Specify either 'chapter' or 'part' as highest level. Exiting") + sys.exit() + if not os.path.exists(TMP_DIR): os.makedirs(TMP_DIR) @@ -432,18 +458,23 @@ def main(): print("-"*60) exit() - all_figures = xml_tree2.xpath("//t:graphic", namespaces=NS_MAP) - bad_figures = make_figure_elements(all_figures, args.figdir) + if args.finalize: + pass + else: + all_figures = xml_tree2.xpath("//t:graphic", namespaces=NS_MAP) + bad_figures = make_figure_elements(all_figures, args.figdir) - report["bad_figures"] = bad_figures + report["bad_figures"] = bad_figures all_references = xml_tree2.xpath("//t:bibl", namespaces=NS_MAP) - bad_pageref = parse_cited_range(all_references) - - report["bad_pageref"] = bad_pageref + if args.finalize: + pass + else: + bad_pageref = parse_cited_range(all_references) + report["bad_pageref"] = bad_pageref - tei_header = xml_tree2.xpath("//t:teiHeader", namespaces=NS_MAP) - fix_tei_header(tei_header[0], str(args.bibfile)) + tei_header = xml_tree2.xpath("//t:teiHeader", namespaces=NS_MAP) + fix_tei_header(tei_header[0], str(args.bibfile)) etree.strip_tags(xml_tree2, "tagtobestripped") @@ -473,14 +504,17 @@ def main(): # Pickle the 'data' dictionary using the highest protocol available. pickle.dump(data_to_pickle, f, pickle.HIGHEST_PROTOCOL) - fix_document_structure(xml_tree2) + fix_document_structure(xml_tree2, highest_level) # output output = args.teifile.replace(".xml", "-out.xml") tree = etree.ElementTree(xml_tree2) tree.write(output, pretty_print=True, xml_declaration=True,encoding="utf-8") logging.info("Wrote %s." % output) - evaluate_report(report) + if args.finalize: + pass + else: + evaluate_report(report) # def main ends here if __name__ == '__main__':