From 1372275edd80caaaa38ff00b3c90f0f70008d376 Mon Sep 17 00:00:00 2001
From: Klaus Thoden <kthoden@mpiwg-berlin.mpg.de>
Date: Thu, 13 Sep 2018 17:14:20 +0200
Subject: [PATCH] Adding find_chapters.py

---
 doc/COSMOS.md    |  14 +----
 find_chapters.py | 133 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 135 insertions(+), 12 deletions(-)
 create mode 100644 find_chapters.py

diff --git a/doc/COSMOS.md b/doc/COSMOS.md
index 526b1dd..b326d4f 100644
--- a/doc/COSMOS.md
+++ b/doc/COSMOS.md
@@ -16,17 +16,7 @@ Contains the most important programs for the whole document conversion workflow.
  - `mkimage.py` : Create an automatically generated dummy cover to be used during testing.
  - `tei2eoatex.xsl` : An XSL converter from TEI-XML to EOATeX
  - `tei2imxml.py` : A converter from TEI to customized DocBook XML.
-
-## chapterdownload_frontmatter
-A program for generating LaTeX frontmatters for chapter downloads. In
-fact, it provides functions for a general query on book metadata. A
-variant creates ONIX type XML.
-
-The main functionality has been built into the new eoa-django platform
-and can be run via `manage.py`. A recent addition is
-`find_chapters.py` which uses data from the LaTeX auxiliary files to
-determine the chapter breakpoints and splits the book pdf into the
-single chapters that are made available for download, as well.
+ - `find_chapters.py` : Use LaTeX auxiliary files to split a PDF into chapters
 
 ## eoa-csl
 A nearly-abandoned version of a CSL configuration for EOA. Currently
@@ -38,7 +28,7 @@ functionality offered via the `manage.py` interface is currently
 expanded and offers
  - `bib2tei` : Converts a bibtex bibliography file to a TEI-XML `listBibl` structure
  - `check_tei_output`
- - `publication_add_chapter_frontpages`
+ - `publication_add_chapter_frontpages` : Generates frontmatters for chapter downloads
  - `publication_export_tei` : Exports a publication from the database as an TEI-XML file
  - `publication_list`
  - `publicationimport`
diff --git a/find_chapters.py b/find_chapters.py
new file mode 100644
index 0000000..108690d
--- /dev/null
+++ b/find_chapters.py
@@ -0,0 +1,133 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8; mode: python -*-
+
+"""
+Find pages of a chapter in a LaTeX aux file
+
+This is from Hack #92 from the O'Reilly Book "LaTeX Hacks"
+(9783897214774) where it was implemented in Perl. It depends on a
+label command per chapter, and the book suggests including an
+automatic label in the titlesec command for consistency.
+
+The question to be answered is whether this method corresponds to how
+imxml2django splits the files.
+
+If built into Django, the whole process of cutting up the book and
+uploading chapter files and prefixing them with chapter info could be
+further automated. However, the output needs to be checked!
+
+Maybe as a preprocessing step (and not to schlepp the aux file
+around), include the chapter page information in a config file before
+uploading (or use a PDF metadata field for that?)
+"""
+
+__version__ = "1.0"
+__date__ = "20180810"
+__author__ = "kthoden@mpiwg-berlin.mpg.de"
+
+import re
+import argparse
+import logging
+from PyPDF2 import PdfFileWriter, PdfFileReader
+
+logging.basicConfig(level=logging.DEBUG, format=' %(asctime)s - %(levelname)s - %(message)s')
+
+CHAPTER_LABEL_REGEX = r"chapter\d+_.*?"
+# \newlabel{chapter01_caraffa}{{1}{5}{Objects of Value: Challenging Conventional Hierarchies in the Photo Archive\EOAauthor {Costanza Caraffa}}{section*.4}{}}
+NEWLABEL_REGEX = r"\\newlabel\{" + CHAPTER_LABEL_REGEX + r"\}\{\{\d+\}\{(\d+)\}"
+
+# declare rules how to name chapter files
+
+def write_pdf_file(output, filename):
+    """Write PDF object ``output`` into the file named ``filename``.
+
+    The file is opened in binary mode and closed even if writing fails.
+    """
+    with open(filename, 'wb') as output_stream:
+        output.write(output_stream)
+    logging.debug("Wrote %s.", filename)
+
+    return
+# def write_pdf_file ends here
+
+def main():
+    """Split a book PDF into front matter and chapter files.
+
+    Reads the page number of every chapter label from the LaTeX aux
+    file and writes ``chap_frontmatter.pdf`` plus one ``chapNN.pdf``
+    per label; file names are relative, so output lands in the cwd.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument("auxfile", help="The auxfile that is used to read the page numbers from.")
+    parser.add_argument("pdffile", help="The PDF file that is going to be split in chapters..")
+    args = parser.parse_args()
+
+    with open(args.auxfile) as aux_file:
+        aux_lines = aux_file.read()
+
+    # first page of each chapter as recorded in the aux file
+    references = [int(page) for page in re.findall(NEWLABEL_REGEX, aux_lines)]
+    if not references:
+        # nothing to split; exit with a usage error instead of an IndexError
+        parser.error("No chapter labels found in %s." % args.auxfile)
+
+    # keep the input open while pages are copied, close it afterwards
+    with open(args.pdffile, "rb") as pdf_stream:
+        input1 = PdfFileReader(pdf_stream)
+        logging.debug("Input has %d pages.", input1.getNumPages())
+
+        # the preface
+        # NOTE(review): the page directly before the first chapter ends up
+        # in no output file; kept from the original, confirm it is intended.
+        pdf_object = PdfFileWriter()
+        for pdfpage in range(0, references[0] - 2):
+            pdf_object.addPage(input1.getPage(pdfpage))
+        write_pdf_file(pdf_object, "chap_frontmatter.pdf")
+
+        # each chapter runs up to the next break, the last one to the end
+        chapter_ends = references[1:] + [input1.getNumPages() + 1]
+        # enumerate, not list.index(): equal page numbers must not collide
+        for number, (first, following) in enumerate(zip(references, chapter_ends), start=1):
+            pdf_object = PdfFileWriter()
+            for pdfpage in range(first - 1, following - 1):
+                pdf_object.addPage(input1.getPage(pdfpage))
+            write_pdf_file(pdf_object, "chap{:02d}.pdf".format(number))
+# def main ends here
+
+if __name__ == '__main__':
+    main()
+# finis
+
+# What follows is the original source code from the above-mentioned book
+
+# my $offset = 0;
+
+# sub read_chap {
+#     my ($filename) = $_[0];
+#     print "looking at $filename\n";
+#     my ($fh) = new IO::File "$filename";
+#     while (<$fh>) {
+#         push @chapters, $1 + $offset
+#             if /^\\newlabel\{chapter\d+_.*?\}\{\{\d+\}\{(\d+)\}/;
+#         $offset = $1
+#             if /^\\newlabel\{find-chap-offset\}\{\{(\d+)\}/;
+#         read_chap($1)
+#             if /^\\\@input\{(.*)\}/;
+#     }
+#     $fh->close;
+# }
+
+# my $filename = shift;
+# $filename =~ s/\..*?$//;
+
+# read_chap($filename.".aux");
+# push @chapters, '';
+
+# my $from = $chapters[$ch];
+# my $to = $chapters[$ch+1] ? $chapters[$ch+1]-1 : 'end';
+
+# print "$from-$to"
+
+# # !system
+# #     "pdftk $filename.pdf cat $from-$to output $filename-$ch.pdf";
+# # or die "$0: Problems with pdftk\n";