Rough work: PDF splitter by chapter

EditionOpenAccess · Aug 10, 2018 · 8ed53a6 · 8ed53a6
1 parent e6ca2f0
commit 8ed53a6
Showing 1 changed file with 93 additions and 0 deletions.
diff --git a/find_chapters.py b/find_chapters.py
@@ -0,0 +1,93 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8; mode: python -*-
+
+"""
+Find pages of a chapter in a LaTeX aux file
+
+This is based on Hack #92 of the O'Reilly book "LaTeX Hacks"
+(9783897214774), where it was implemented in Perl. It depends on a
+label command per chapter, and the book suggests including an
+automatic label in the titlesec command for consistency.
+"""
+
+__version__ = "1.0"
+__date__ = "20180810"
+__author__ = "kthoden@mpiwg-berlin.mpg.de"
+
+import re
+import argparse
+import logging
+from PyPDF2 import PdfFileWriter, PdfFileReader
+
+# Log everything from DEBUG upwards, prefixed with timestamp and level.
+logging.basicConfig(level=logging.DEBUG, format=' %(asctime)s - %(levelname)s - %(message)s')
+
+# Name of a chapter label: "chapter", a number, an underscore and a
+# (non-greedy) free-form rest, e.g. "chapter01_caraffa".
+CHAPTER_LABEL_REGEX = r"chapter\d+_.*?"
+# Example of an aux line that is matched:
+# \newlabel{chapter01_caraffa}{{1}{5}{Objects of Value: Challenging Conventional Hierarchies in the Photo Archive\EOAauthor {Costanza Caraffa}}{section*.4}{}}
+# The single capture group is the second braced number ("5" in the
+# example), which main() uses as the chapter's page number.
+# Every fragment is a raw string so the backslashes reach the regex
+# engine unchanged and no invalid-escape warning is triggered.
+NEWLABEL_REGEX = r"\\newlabel\{" + CHAPTER_LABEL_REGEX + r"\}\{\{\d+\}\{(\d+)\}"
+
+def main():
+    """Extract the first chapter of a PDF into "chapter.pdf".
+
+    Reads two positional command-line arguments: the LaTeX aux file and
+    the PDF file. Every match of NEWLABEL_REGEX in the aux file yields
+    the start page of one chapter. The pages from the first chapter's
+    start page up to (but excluding) the second chapter's start page
+    are copied into a new file "chapter.pdf" in the current directory.
+
+    Raises:
+        SystemExit: if the aux file contains fewer than two chapter
+            labels, so that no page range can be determined.
+    """
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("auxfile", help="The auxfile that is used to read the page numbers from.")
+    parser.add_argument("pdffile", help="The PDF file that is going to be split in chapters..")
+    args = parser.parse_args()
+
+    with open(args.auxfile) as aux_file:
+        aux_content = aux_file.read()
+
+    # One captured page number (as a string) per chapter label, in file order.
+    references = re.findall(NEWLABEL_REGEX, aux_content)
+
+    print(references)
+
+    # Two labels are required: the start of the chapter to extract and
+    # the start of the following chapter, which marks where to stop.
+    if len(references) < 2:
+        raise SystemExit(
+            "Found %d chapter label(s) in %s, but at least two are needed."
+            % (len(references), args.auxfile)
+        )
+
+    # Convert the 1-based page numbers to 0-based page indices.
+    # NOTE(review): this assumes the page numbers in the aux file equal
+    # the physical page positions in the PDF (no front-matter offset) -
+    # the Perl original kept below handled an offset; confirm.
+    first_page = int(references[0]) - 1
+    end_page = int(references[1]) - 1
+
+    output_filename = "chapter.pdf"
+
+    # Keep the input file open until the output has been written: the
+    # writer pulls the page data from the reader's stream when writing.
+    with open(args.pdffile, "rb") as input_stream:
+        input1 = PdfFileReader(input_stream)
+        logging.debug("Input is %s and has %d pages.", args.pdffile, input1.getNumPages())
+
+        output = PdfFileWriter()
+        for pdfpage in range(first_page, end_page):
+            output.addPage(input1.getPage(pdfpage))
+
+        with open(output_filename, 'wb') as output_stream:
+            output.write(output_stream)
+
+    logging.debug("Wrote %s.", output_filename)
+
+# def main ends here
+
+if __name__ == '__main__':
+    main()
+# finis
+
+# my $offset = 0;
+
+# sub read_chap {
+#     my ($filename) = $_[0];
+#     print "looking at $filename\n";
+#     my ($fh) = new IO::File "$filename";
+#     while (<$fh>) {
+#         push @chapters, $1 + $offset
+#             if /^\\newlabel\{chapter\d+_.*?\}\{\{\d+\}\{(\d+)\}/;
+#         $offset = $1
+#             if /^\\newlabel\{find-chap-offset\}\{\{(\d+)\}/;
+#         read_chap($1)
+#             if /^\\\@input\{(.*)\}/;
+#     }
+#     $fh->close;
+# }
+
+# my $filename = shift;
+# $filename =~ s/\..*?$//;
+
+# read_chap($filename.".aux");
+# push @chapters, '';
+
+# my $from = $chapters[$ch];
+# my $to = $chapters[$ch+1] ? $chapters[$ch+1]-1 : 'end';
+
+# print "$from-$to"
+
+# # !system
+# #     "pdftk $filename.pdf cat $from-$to output $filename-$ch.pdf";
+# # or die "$0: Problems with pdftk\n";