Skip to content

Commit

Permalink
Browse files: browse the repository at this point in the history
More command line options
Highest level of sections can be specified
  • Loading branch information
Klaus Thoden committed May 29, 2018
1 parent 914ef82 commit ba29277
Showing 1 changed file with 58 additions and 24 deletions.
82 changes: 58 additions & 24 deletions fix_tei.py
Expand Up @@ -271,25 +271,44 @@ def cleanup_xml(xml_tree):
return xml_tree
# def cleanup_xml ends here

def fix_document_structure(xml_tree):
def fix_document_structure(xml_tree, highest_level):
"""Insert div types"""

# Open question: should each file be treated as exactly one chapter? If so,
# the highest division level inside a file would be the section.

chapter_divs = xml_tree.xpath("//t:body/t:div", namespaces=NS_MAP)
section_divs = xml_tree.xpath("//t:body/t:div/t:div", namespaces=NS_MAP)
subsection_divs = xml_tree.xpath("//t:body/t:div/t:div/t:div", namespaces=NS_MAP)
subsubsection_divs = xml_tree.xpath("//t:body/t:div/t:div/t:div/t:div", namespaces=NS_MAP)

for chapter in chapter_divs:
chapter.set("type", "chapter")
for section in section_divs:
section.set("type", "section")
for subsection in subsection_divs:
subsection.set("type", "subsection")
for subsubsection in subsubsection_divs:
subsubsection.set("type", "subsubsection")
if highest_level == "chapter":
chapter_divs = xml_tree.xpath("//t:body/t:div", namespaces=NS_MAP)
section_divs = xml_tree.xpath("//t:body/t:div/t:div", namespaces=NS_MAP)
subsection_divs = xml_tree.xpath("//t:body/t:div/t:div/t:div", namespaces=NS_MAP)
subsubsection_divs = xml_tree.xpath("//t:body/t:div/t:div/t:div/t:div", namespaces=NS_MAP)

for chapter in chapter_divs:
chapter.set("type", "chapter")
for section in section_divs:
section.set("type", "section")
for subsection in subsection_divs:
subsection.set("type", "subsection")
for subsubsection in subsubsection_divs:
subsubsection.set("type", "subsubsection")

elif highest_level == "part":
part_divs = xml_tree.xpath("//t:body/t:div", namespaces=NS_MAP)
chapter_divs = xml_tree.xpath("//t:body/t:div/t:div", namespaces=NS_MAP)
section_divs = xml_tree.xpath("//t:body/t:div/t:div/t:div", namespaces=NS_MAP)
subsection_divs = xml_tree.xpath("//t:body/t:div/t:div/t:div/t:div", namespaces=NS_MAP)
subsubsection_divs = xml_tree.xpath("//t:body/t:div/t:div/t:div/t:div/t:div", namespaces=NS_MAP)

for part in part_divs:
part.set("type", "part")
for chapter in chapter_divs:
chapter.set("type", "chapter")
for section in section_divs:
section.set("type", "section")
for subsection in subsection_divs:
subsection.set("type", "subsection")
for subsubsection in subsubsection_divs:
subsubsection.set("type", "subsubsection")

# section_divs = xml_tree.xpath("//t:body/t:div", namespaces=NS_MAP)
# subsection_divs = xml_tree.xpath("//t:body/t:div/t:div", namespaces=NS_MAP)
Expand Down Expand Up @@ -377,11 +396,18 @@ def main():
"""The main bit"""

parser = argparse.ArgumentParser()
parser.add_argument("-d", "--dochighestorder", default='chapter', help="Specify which divider is at the highest level, possible values: part, chapter. Default is chapter.")
parser.add_argument("-f", "--finalize", help="Finalize a publication.", action="store_true")
parser.add_argument("teifile", help="Output from oxgarage/metypeset, an TEI XML file.")
parser.add_argument("bibfile", help="The bibliography database of the publication.")
parser.add_argument("figdir", help="The directory that contains the figures belonging to the publication.")
args = parser.parse_args()

highest_level = args.dochighestorder
if highest_level not in ["chapter", "part"]:
sys.stderr.write("Specify either 'chapter' or 'part' as highest level. Exiting")
sys.exit()

if not os.path.exists(TMP_DIR):
os.makedirs(TMP_DIR)

Expand Down Expand Up @@ -432,18 +458,23 @@ def main():
print("-"*60)
exit()

all_figures = xml_tree2.xpath("//t:graphic", namespaces=NS_MAP)
bad_figures = make_figure_elements(all_figures, args.figdir)
if args.finalize:
pass
else:
all_figures = xml_tree2.xpath("//t:graphic", namespaces=NS_MAP)
bad_figures = make_figure_elements(all_figures, args.figdir)

report["bad_figures"] = bad_figures
report["bad_figures"] = bad_figures

all_references = xml_tree2.xpath("//t:bibl", namespaces=NS_MAP)
bad_pageref = parse_cited_range(all_references)

report["bad_pageref"] = bad_pageref
if args.finalize:
pass
else:
bad_pageref = parse_cited_range(all_references)
report["bad_pageref"] = bad_pageref

tei_header = xml_tree2.xpath("//t:teiHeader", namespaces=NS_MAP)
fix_tei_header(tei_header[0], str(args.bibfile))
tei_header = xml_tree2.xpath("//t:teiHeader", namespaces=NS_MAP)
fix_tei_header(tei_header[0], str(args.bibfile))

etree.strip_tags(xml_tree2, "tagtobestripped")

Expand Down Expand Up @@ -473,14 +504,17 @@ def main():
# Pickle the 'data' dictionary using the highest protocol available.
pickle.dump(data_to_pickle, f, pickle.HIGHEST_PROTOCOL)

fix_document_structure(xml_tree2)
fix_document_structure(xml_tree2, highest_level)
# output
output = args.teifile.replace(".xml", "-out.xml")
tree = etree.ElementTree(xml_tree2)
tree.write(output, pretty_print=True, xml_declaration=True,encoding="utf-8")
logging.info("Wrote %s." % output)

evaluate_report(report)
if args.finalize:
pass
else:
evaluate_report(report)
# def main ends here

if __name__ == '__main__':
Expand Down

0 comments on commit ba29277

Please sign in to comment.