Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
Some changes
  • Loading branch information
Klaus Thoden committed Nov 20, 2018
1 parent 78961df commit 37cc481
Showing 1 changed file with 7 additions and 2 deletions.
9 changes: 7 additions & 2 deletions find_chapters.py
Expand Up @@ -26,13 +26,14 @@
__author__ = "kthoden@mpiwg-berlin.mpg.de"

import re
import sys
import argparse
import logging
from PyPDF2 import PdfFileWriter, PdfFileReader

logging.basicConfig(level=logging.DEBUG, format=' %(asctime)s - %(levelname)s - %(message)s')

CHAPTER_LABEL_REGEX = r"chapter\d+_.*?"
# Pattern for the label prefix that identifies a chapter in the aux file,
# e.g. "chap01_caraffa". The trailing lazy ".*?" consumes the rest of the
# label up to the closing brace required by NEWLABEL_REGEX.
CHAPTER_LABEL_REGEX = r"chap\d+_.*?"
# Example aux line:
# \newlabel{chapter01_caraffa}{{1}{5}{Objects of Value: Challenging Conventional Hierarchies in the Photo Archive\EOAauthor {Costanza Caraffa}}{section*.4}{}}
# NOTE(review): the example above uses a "chapter01_" label, which the
# "chap\d+_" pattern does not match -- confirm which label scheme is current.
#
# The single capture group is the second braced number after the label
# (5 in the example; presumably the page number -- confirm against LaTeX).
# Every fragment is a raw string so that "\}", "\{" and "\d" reach the regex
# engine verbatim instead of being treated as (invalid) string escapes.
NEWLABEL_REGEX = r"\\newlabel\{" + CHAPTER_LABEL_REGEX + r"\}\{\{\d+\}\{(\d+)\}"

Expand All @@ -55,14 +56,18 @@ def main():

parser = argparse.ArgumentParser()
parser.add_argument("auxfile", help="The auxfile that is used to read the page numbers from.")
parser.add_argument("pdffile", help="The PDF file that is going to be split in chapters..")
parser.add_argument("pdffile", help="The PDF file that is going to be split in chapters.")
args = parser.parse_args()

with open(args.auxfile) as aux_file:
aux_lines = aux_file.read()

references = re.findall(NEWLABEL_REGEX, aux_lines)

if not references:
logging.error("No chapter marks found. Exiting")
sys.exit()

input1 = PdfFileReader(open(args.pdffile, "rb"))
logging.debug("Input has %d pages.", input1.getNumPages())

Expand Down

0 comments on commit 37cc481

Please sign in to comment.