Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
Rough work: PDF splitter by chapter
  • Loading branch information
Klaus Thoden committed Aug 10, 2018
1 parent e6ca2f0 commit 8ed53a6
Showing 1 changed file with 93 additions and 0 deletions.
93 changes: 93 additions & 0 deletions find_chapters.py
@@ -0,0 +1,93 @@
#!/usr/bin/env python3
# -*- coding: utf-8; mode: python -*-

"""
Find pages of a chapter in a LaTeX aux file
This is from Hack #92 of the O'Reilly book "LaTeX Hacks"
(9783897214774), where it was implemented in Perl. It depends on a
label command per chapter, and the book suggests including an
automatic label in the titlesec command for consistency.
"""

__version__ = "1.0"
__date__ = "20180810"
__author__ = "kthoden@mpiwg-berlin.mpg.de"

import re
import argparse
import logging
from PyPDF2 import PdfFileWriter, PdfFileReader

# Emit every message from DEBUG upwards, prefixed with timestamp and level.
logging.basicConfig(level=logging.DEBUG, format=' %(asctime)s - %(levelname)s - %(message)s')

# Name of a chapter label: "chapter", a number, an underscore and free text.
CHAPTER_LABEL_REGEX = r"chapter\d+_.*?"
# Matches the \newlabel entry of a chapter label and captures the second
# braced number of its value (the page; 5 in the sample line below).
# \newlabel{chapter01_caraffa}{{1}{5}{Objects of Value: Challenging Conventional Hierarchies in the Photo Archive\EOAauthor {Costanza Caraffa}}{section*.4}{}}
# Every fragment is a raw string so that the regex escapes (\}, \{, \d) are
# not parsed as (invalid) Python string escapes.
NEWLABEL_REGEX = r"\\newlabel\{" + CHAPTER_LABEL_REGEX + r"\}\{\{\d+\}\{(\d+)\}"

def main():
    """Copy the pages of the first chapter of a PDF into ``chapter.pdf``.

    Command-line arguments:
        auxfile: LaTeX aux file that is searched with NEWLABEL_REGEX.
        pdffile: PDF file the pages are copied from.

    The numbers captured by NEWLABEL_REGEX are used as 1-based start pages
    of the chapters, in the order they appear in the aux file. The pages
    from the first chapter's start up to (not including) the second
    chapter's start are written to ``chapter.pdf`` in the current directory.
    If only one chapter label is found, the chapter runs to the last page
    of the PDF.

    Raises:
        SystemExit: if the aux file contains no chapter label.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("auxfile", help="The auxfile that is used to read the page numbers from.")
    parser.add_argument("pdffile", help="The PDF file that is going to be split in chapters..")
    args = parser.parse_args()

    with open(args.auxfile) as aux_file:
        aux_content = aux_file.read()

    references = re.findall(NEWLABEL_REGEX, aux_content)

    print(references)

    # Without at least one chapter label there is nothing to extract.
    if not references:
        raise SystemExit("No chapter labels found in %s." % args.auxfile)

    output_filename = "chapter.pdf"

    # The input stream has to stay open until the output is written,
    # because the page objects are read from it when the writer serializes.
    with open(args.pdffile, "rb") as pdf_stream:
        input1 = PdfFileReader(pdf_stream)
        num_pages = input1.getNumPages()
        logging.debug("Input is %s and has %d pages.", args.pdffile, num_pages)

        # Aux page numbers are 1-based, PDF page indices are 0-based.
        # NOTE(review): this assumes the page numbers in the aux file equal
        # the physical PDF page numbers (no front-matter offset) - confirm.
        first_page = int(references[0]) - 1
        if len(references) > 1:
            end_page = int(references[1]) - 1
        else:
            # Only one chapter: it extends to the end of the document.
            end_page = num_pages

        output = PdfFileWriter()
        for pdfpage in range(first_page, end_page):
            output.addPage(input1.getPage(pdfpage))

        with open(output_filename, "wb") as output_stream:
            output.write(output_stream)

    logging.debug("Wrote %s.", output_filename)
# def main ends here

# Run the splitter only when this file is executed as a script.
if __name__ == "__main__":
    main()
# finis

# my $offset = 0;

# sub read_chap {
# my ($filename) = $_[0];
# print "looking at $filename\n";
# my ($fh) = new IO::File "$filename";
# while (<$fh>) {
# push @chapters, $1 + $offset
# if /^\\newlabel\{chapter\d+_.*?\}\{\{\d+\}\{(\d+)\}/;
# $offset = $1
# if /^\\newlabel\{find-chap-offset\}\{\{(\d+)\}/;
# read_chap($1)
# if /^\\\@input\{(.*)\}/;
# }
# $fh->close;
# }

# my $filename = shift;
# $filename =~ s/\..*?$//;

# read_chap($filename.".aux");
# push @chapters, '';

# my $from = $chapters[$ch];
# my $to = $chapters[$ch+1] ? $chapters[$ch+1]-1 : 'end';

# print "$from-$to"

# # !system
# # "pdftk $filename.pdf cat $from-$to output $filename-$ch.pdf";
# # or die "$0: Problems with pdftk\n";

0 comments on commit 8ed53a6

Please sign in to comment.