From 45d4a94e609c9b4b0b9744b6ea7784719f5d8699 Mon Sep 17 00:00:00 2001
From: Klaus Thoden
Date: Tue, 20 Nov 2018 14:43:47 +0100
Subject: [PATCH] Get labels from XML file

---
Review notes:
- a TEI chapter id that is missing from the aux file now ends the run
  with an error message instead of an IndexError; ids go through re.escape
- the offset option is converted by argparse (type=int)
- the leftover print() of the references is now a logging.debug() call
- output paths are built with os.path.join
- the index line below still carries the blob ids of the original commit,
  and blank context lines were restored by hand from the hunk line counts;
  check both against find_chapters.py before applying

 find_chapters.py | 55 ++++++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 48 insertions(+), 7 deletions(-)

diff --git a/find_chapters.py b/find_chapters.py
index 5e97167..71353c2 100644
--- a/find_chapters.py
+++ b/find_chapters.py
@@ -26,6 +26,7 @@
 __author__ = "kthoden@mpiwg-berlin.mpg.de"
 
 import re
+import os
 import sys
 import argparse
 import logging
@@ -33,12 +34,31 @@
 
 logging.basicConfig(level=logging.DEBUG, format=' %(asctime)s - %(levelname)s - %(message)s')
 
+OUTPUT_DIR = os.path.expanduser("chapter_files")
+
 CHAPTER_LABEL_REGEX = r"chap\d+_.*?"
 # \newlabel{chapter01_caraffa}{{1}{5}{Objects of Value: Challenging Conventional Hierarchies in the Photo Archive\EOAauthor {Costanza Caraffa}}{section*.4}{}}
 NEWLABEL_REGEX = r"\\newlabel\{" + CHAPTER_LABEL_REGEX + "\}\{\{\d+\}\{(\d+)\}"
 
 # declare rules how to name chapter files
 
+def get_labels_from_xml(xmlfile):
+    """Return the xml:id of every chapter div in a TEI file.
+
+    xmlfile is the TEI document handed to etree.parse.  The result of
+    the xpath search is returned unchanged: the @xml:id values of all
+    tei:div elements whose type attribute is 'chapter'.
+    """
+
+    # imported locally, so lxml is only loaded when this function is called
+    from lxml import etree
+
+    xml_tree = etree.parse(xmlfile)
+    eoa_chapters = xml_tree.xpath("//tei:div[@type='chapter']/@xml:id", namespaces={"tei" : "http://www.tei-c.org/ns/1.0"})
+
+    return eoa_chapters
+# def get_labels_from_xml ends here
+
 def write_pdf_file(output, filename):
     """Write PDF object into file."""
 
@@ -57,12 +77,30 @@ def main():
     parser = argparse.ArgumentParser()
     parser.add_argument("auxfile", help="The auxfile that is used to read the page numbers from.")
     parser.add_argument("pdffile", help="The PDF file that is going to be split in chapters.")
+    parser.add_argument("-t", "--teifile", help="The TEI file for getting the labels of chapters.")
+    parser.add_argument("-o", "--offset", type=int, default=0, help="An offset to be added to the pages. You have to manually find out the number. Should be around 7.")
     args = parser.parse_args()
 
+    offset = args.offset
+
     with open(args.auxfile) as aux_file:
         aux_lines = aux_file.read()
 
-    references = re.findall(NEWLABEL_REGEX, aux_lines)
+    if args.teifile:
+        # one page number per TEI chapter id, looked up in the aux file
+        references = []
+        for chapter_id in get_labels_from_xml(args.teifile):
+            # the id is literal text, so escape it before building the pattern
+            newlabel_regex = r"\\newlabel\{" + re.escape(chapter_id) + r"\}\{\{\d+\}\{(\d+)\}"
+            page_match = re.search(newlabel_regex, aux_lines)
+            if page_match is None:
+                logging.error("No \\newlabel entry for chapter id %s in %s. Exiting", chapter_id, args.auxfile)
+                sys.exit(1)
+            references.append(page_match.group(1))
+    else:
+        references = re.findall(NEWLABEL_REGEX, aux_lines)
+
+    logging.debug("Chapter start pages: %s", references)
 
     if not references:
         logging.error("No chapter marks found. Exiting")
@@ -73,30 +111,33 @@ def main():
 
     pdf_object = PdfFileWriter()
 
+    if not os.path.exists(OUTPUT_DIR):
+        os.makedirs(OUTPUT_DIR)
+
     # the preface
-    for pdfpage in range(0, int(references[0]) - 2):
+    for pdfpage in range(0, int(references[0]) - 2 + offset):
         pdf_object.addPage(input1.getPage(pdfpage))
 
-    write_pdf_file(pdf_object, "chap_frontmatter.pdf")
+    write_pdf_file(pdf_object, os.path.join(OUTPUT_DIR, "chap_frontmatter.pdf"))
     del pdf_object
 
     for chapter_break in references[:-1]:
         pdf_object = PdfFileWriter()
         current_position = references.index(chapter_break)
 
-        for pdfpage in range(int(chapter_break) - 1, int(references[current_position + 1]) - 1):
+        for pdfpage in range(int(chapter_break) - 1 + offset, int(references[current_position + 1]) - 1 + offset):
             pdf_object.addPage(input1.getPage(pdfpage))
 
         chapter_filename = "chap{:02d}.pdf".format(current_position + 1)
-        write_pdf_file(pdf_object, chapter_filename)
+        write_pdf_file(pdf_object, os.path.join(OUTPUT_DIR, chapter_filename))
 
     # the last bit
     del pdf_object
     pdf_object = PdfFileWriter()
-    for pdfpage in range(int(references[-1]) -1, input1.getNumPages()):
+    for pdfpage in range(int(references[-1]) - 1 + offset, input1.getNumPages()):
         pdf_object.addPage(input1.getPage(pdfpage))
     chapter_filename = "chap{:02d}.pdf".format(len(references))
-    write_pdf_file(pdf_object, chapter_filename)
+    write_pdf_file(pdf_object, os.path.join(OUTPUT_DIR, chapter_filename))
 # def main ends here
 
 if __name__ == '__main__':