Stable version

EditionOpenAccess · Aug 23, 2018 · a2794fc · a2794fc
1 parent 8ed53a6
commit a2794fc
Showing 1 changed file with 51 additions and 11 deletions.
diff --git a/find_chapters.py b/find_chapters.py
@@ -8,6 +8,17 @@
 (9783897214774) where it was implemented in Perl. It depends on a
 label command per chapter and the book suggests to include an
 automatic label in the titlesec command for consistency.
+
+The question to be answered is whether this method corresponds to how
+imxml2django splits the files.
+
+If built into Django, the whole process of cutting up the book and
+uploading chapter files and prefixing them with chapter info could be
+further automated. However, output needs to be checked!
+
+Maybe as a preprocessing step (and not to schlepp the aux file
+around), include the chapter page information in a config file before
+uploading (or use a PDF metadata field for that?)
 """
 
 __version__ = "1.0"
@@ -25,6 +36,20 @@
 # \newlabel{chapter01_caraffa}{{1}{5}{Objects of Value: Challenging Conventional Hierarchies in the Photo Archive\EOAauthor {Costanza Caraffa}}{section*.4}{}}
 NEWLABEL_REGEX = r"\\newlabel\{" + CHAPTER_LABEL_REGEX + "\}\{\{\d+\}\{(\d+)\}"
 
+# declare rules for how to name chapter files
+
+def write_pdf_file(output, filename):
+    """Write PDF object into file."""
+
+    output_filename = filename
+    output_stream = open(output_filename, 'wb')
+    output.write(output_stream)
+    output_stream.close()
+    logging.debug("Wrote %s.", output_filename)
+
+    return
+# def write_pdf_file ends here
+
 def main():
     """The main bit"""
 
@@ -38,28 +63,43 @@ def main():
 
     references = re.findall(NEWLABEL_REGEX, aux_lines)
 
-    print(references)
-
     input1 = PdfFileReader(open(args.pdffile, "rb"))
-    logging.debug("Input is %s and has %d pages." % (args.pdffile, input1.getNumPages()))
+    logging.debug("Input has %d pages.", input1.getNumPages())
 
-    output = PdfFileWriter()
+    pdf_object = PdfFileWriter()
 
-    for pdfpage in range(int(references[0]) - 1, int(references[1]) - 1):
-        output.addPage(input1.getPage(pdfpage))
+    # the preface
+    for pdfpage in range(0, int(references[0]) - 2):
+        pdf_object.addPage(input1.getPage(pdfpage))
+    write_pdf_file(pdf_object, "chap_frontmatter.pdf")
 
-    output_filename = "chapter.pdf"
-    output_stream = open(output_filename, 'wb')
-    output.write(output_stream)
-    output_stream.close()
-    logging.debug("Wrote %s." % output_filename)
+    del pdf_object
 
+    for chapter_break in references[:-1]:
+        pdf_object = PdfFileWriter()
+        current_position = references.index(chapter_break)
+        for pdfpage in range(int(chapter_break) - 1, int(references[current_position + 1]) - 1):
+            pdf_object.addPage(input1.getPage(pdfpage))
+
+        chapter_filename = "chap{:02d}.pdf".format(current_position + 1)
+        write_pdf_file(pdf_object, chapter_filename)
+
+    # the last bit
+    del pdf_object
+    pdf_object = PdfFileWriter()
+    for pdfpage in range(int(references[-1]) -1, input1.getNumPages()):
+        pdf_object.addPage(input1.getPage(pdfpage))
+
+    chapter_filename = "chap{:02d}.pdf".format(len(references))
+    write_pdf_file(pdf_object, chapter_filename)
 # def main ends here
 
 if __name__ == '__main__':
     main()
 # finis
 
+# What follows is the original source code from the above-mentioned book
+
 # my $offset = 0;
 
 # sub read_chap {