diff --git a/find_chapters.py b/find_chapters.py index 6765a3d..108690d 100644 --- a/find_chapters.py +++ b/find_chapters.py @@ -8,6 +8,17 @@ (9783897214774) where it was implemented in Perl. It depends on a label command per chapter and the book suggests to include an automatic label in the titlesec command for consistency. + +The question to be answered is whether this method corresponds to how +imxml2django splits the files. + +If built into Django, the whole process of cutting up the book and +uploading chapter files and prefixing them with chapter info could be +further automated. However, output needs to be checked! + +Maybe as a preprocessing step (and not to schlep the aux file +around), include the chapter page information in a config file before +uploading (or use a PDF metadata field for that?) """ __version__ = "1.0" @@ -25,6 +36,20 @@ # \newlabel{chapter01_caraffa}{{1}{5}{Objects of Value: Challenging Conventional Hierarchies in the Photo Archive\EOAauthor {Costanza Caraffa}}{section*.4}{}} NEWLABEL_REGEX = r"\\newlabel\{" + CHAPTER_LABEL_REGEX + "\}\{\{\d+\}\{(\d+)\}" +# declare rules how to name chapter files + +def write_pdf_file(output, filename): + """Write PDF object into file.""" + + output_filename = filename + output_stream = open(output_filename, 'wb') + output.write(output_stream) + output_stream.close() + logging.debug("Wrote %s.", output_filename) + + return +# def write_pdf_file ends here + def main(): """The main bit""" @@ -38,28 +63,43 @@ def main(): references = re.findall(NEWLABEL_REGEX, aux_lines) - print(references) - input1 = PdfFileReader(open(args.pdffile, "rb")) - logging.debug("Input is %s and has %d pages." 
% (args.pdffile, input1.getNumPages())) + logging.debug("Input has %d pages.", input1.getNumPages()) - output = PdfFileWriter() + pdf_object = PdfFileWriter() - for pdfpage in range(int(references[0]) - 1, int(references[1]) - 1): - output.addPage(input1.getPage(pdfpage)) + # the preface + for pdfpage in range(0, int(references[0]) - 2): + pdf_object.addPage(input1.getPage(pdfpage)) + write_pdf_file(pdf_object, "chap_frontmatter.pdf") - output_filename = "chapter.pdf" - output_stream = open(output_filename, 'wb') - output.write(output_stream) - output_stream.close() - logging.debug("Wrote %s." % output_filename) + del pdf_object + for chapter_break in references[:-1]: + pdf_object = PdfFileWriter() + current_position = references.index(chapter_break) + for pdfpage in range(int(chapter_break) - 1, int(references[current_position + 1]) - 1): + pdf_object.addPage(input1.getPage(pdfpage)) + + chapter_filename = "chap{:02d}.pdf".format(current_position + 1) + write_pdf_file(pdf_object, chapter_filename) + + # the last bit + del pdf_object + pdf_object = PdfFileWriter() + for pdfpage in range(int(references[-1]) -1, input1.getNumPages()): + pdf_object.addPage(input1.getPage(pdfpage)) + + chapter_filename = "chap{:02d}.pdf".format(len(references)) + write_pdf_file(pdf_object, chapter_filename) # def main ends here if __name__ == '__main__': main() # finis +# What follows is the original source code from above mentioned book + # my $offset = 0; # sub read_chap {