Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
Get labels from XML file
  • Loading branch information
Klaus Thoden committed Nov 20, 2018
1 parent c53cfd4 commit 45d4a94
Showing 1 changed file with 38 additions and 7 deletions.
45 changes: 38 additions & 7 deletions find_chapters.py
Expand Up @@ -26,19 +26,33 @@
__author__ = "kthoden@mpiwg-berlin.mpg.de" __author__ = "kthoden@mpiwg-berlin.mpg.de"


import re import re
import os
import sys import sys
import argparse import argparse
import logging import logging
from PyPDF2 import PdfFileWriter, PdfFileReader from PyPDF2 import PdfFileWriter, PdfFileReader


logging.basicConfig(level=logging.DEBUG, format=' %(asctime)s - %(levelname)s - %(message)s') logging.basicConfig(level=logging.DEBUG, format=' %(asctime)s - %(levelname)s - %(message)s')


# Directory that receives the generated chapter PDF files.
OUTPUT_DIR = os.path.expanduser("chapter_files")

# Pattern for the label names that mark the beginning of a chapter.
CHAPTER_LABEL_REGEX = r"chap\d+_.*?"
# \newlabel{chapter01_caraffa}{{1}{5}{Objects of Value: Challenging Conventional Hierarchies in the Photo Archive\EOAauthor {Costanza Caraffa}}{section*.4}{}}
# The single capture group is the second braced number of the \newlabel entry.
# Both halves are raw strings so that the regex escapes (\}, \{, \d) are not
# read as (invalid) Python string escape sequences.
NEWLABEL_REGEX = r"\\newlabel\{" + CHAPTER_LABEL_REGEX + r"\}\{\{\d+\}\{(\d+)\}"


# declare rules how to name chapter files # declare rules how to name chapter files


def get_labels_from_xml(xmlfile):
    """Return the xml:id values of the chapter divisions in a TEI file.

    The file is parsed with lxml and queried with XPath for the
    ``xml:id`` attribute of every ``tei:div`` element whose ``type``
    attribute is ``chapter``. The XPath result is returned unchanged.
    """

    # imported locally, so lxml is only needed when this function is called
    from lxml import etree

    tei_namespaces = {"tei": "http://www.tei-c.org/ns/1.0"}
    chapter_id_query = "//tei:div[@type='chapter']/@xml:id"

    return etree.parse(xmlfile).xpath(chapter_id_query, namespaces=tei_namespaces)
# def get_labels_from_xml ends here

def write_pdf_file(output, filename): def write_pdf_file(output, filename):
"""Write PDF object into file.""" """Write PDF object into file."""


Expand All @@ -57,12 +71,26 @@ def main():
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument("auxfile", help="The auxfile that is used to read the page numbers from.") parser.add_argument("auxfile", help="The auxfile that is used to read the page numbers from.")
parser.add_argument("pdffile", help="The PDF file that is going to be split in chapters.") parser.add_argument("pdffile", help="The PDF file that is going to be split in chapters.")
parser.add_argument("-t", "--teifile", help="The TEI file for getting the labels of chapters.")
parser.add_argument("-o", "--offset", help="An offset to be added to the pages. You have to manually find out the number. Should be around 7.", default=0)
args = parser.parse_args() args = parser.parse_args()


offset = int(args.offset)

with open(args.auxfile) as aux_file: with open(args.auxfile) as aux_file:
aux_lines = aux_file.read() aux_lines = aux_file.read()


references = re.findall(NEWLABEL_REGEX, aux_lines) if args.teifile:
references = []
chapter_ids = get_labels_from_xml(args.teifile)
for chapter_id in chapter_ids:
newlabel_regex = r"\\newlabel\{" + chapter_id + "\}\{\{\d+\}\{(\d+)\}"
ding = re.findall(newlabel_regex, aux_lines)[0]
references.append(ding)
else:
references = re.findall(NEWLABEL_REGEX, aux_lines)

print(references)


if not references: if not references:
logging.error("No chapter marks found. Exiting") logging.error("No chapter marks found. Exiting")
Expand All @@ -73,30 +101,33 @@ def main():


pdf_object = PdfFileWriter() pdf_object = PdfFileWriter()


if not os.path.exists(OUTPUT_DIR):
os.makedirs(OUTPUT_DIR)

# the preface # the preface
for pdfpage in range(0, int(references[0]) - 2): for pdfpage in range(0, int(references[0]) - 2 + offset):
pdf_object.addPage(input1.getPage(pdfpage)) pdf_object.addPage(input1.getPage(pdfpage))
write_pdf_file(pdf_object, "chap_frontmatter.pdf") write_pdf_file(pdf_object, OUTPUT_DIR + os.path.sep + "chap_frontmatter.pdf")


del pdf_object del pdf_object


for chapter_break in references[:-1]: for chapter_break in references[:-1]:
pdf_object = PdfFileWriter() pdf_object = PdfFileWriter()
current_position = references.index(chapter_break) current_position = references.index(chapter_break)
for pdfpage in range(int(chapter_break) - 1, int(references[current_position + 1]) - 1): for pdfpage in range(int(chapter_break) - 1 + offset, int(references[current_position + 1]) - 1 + offset):
pdf_object.addPage(input1.getPage(pdfpage)) pdf_object.addPage(input1.getPage(pdfpage))


chapter_filename = "chap{:02d}.pdf".format(current_position + 1) chapter_filename = "chap{:02d}.pdf".format(current_position + 1)
write_pdf_file(pdf_object, chapter_filename) write_pdf_file(pdf_object, OUTPUT_DIR + os.path.sep + chapter_filename)


# the last bit # the last bit
del pdf_object del pdf_object
pdf_object = PdfFileWriter() pdf_object = PdfFileWriter()
for pdfpage in range(int(references[-1]) -1, input1.getNumPages()): for pdfpage in range(int(references[-1]) - 1 + offset, input1.getNumPages()):
pdf_object.addPage(input1.getPage(pdfpage)) pdf_object.addPage(input1.getPage(pdfpage))


chapter_filename = "chap{:02d}.pdf".format(len(references)) chapter_filename = "chap{:02d}.pdf".format(len(references))
write_pdf_file(pdf_object, chapter_filename) write_pdf_file(pdf_object, OUTPUT_DIR + os.path.sep + chapter_filename)
# def main ends here # def main ends here


if __name__ == '__main__': if __name__ == '__main__':
Expand Down

0 comments on commit 45d4a94

Please sign in to comment.