From 1372275edd80caaaa38ff00b3c90f0f70008d376 Mon Sep 17 00:00:00 2001 From: Klaus Thoden Date: Thu, 13 Sep 2018 17:14:20 +0200 Subject: [PATCH] Adding find_chapters.py --- doc/COSMOS.md | 14 +---- find_chapters.py | 133 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 135 insertions(+), 12 deletions(-) create mode 100644 find_chapters.py diff --git a/doc/COSMOS.md b/doc/COSMOS.md index 526b1dd..b326d4f 100644 --- a/doc/COSMOS.md +++ b/doc/COSMOS.md @@ -16,17 +16,7 @@ Contains the most important programs for the whole document conversion workflow. - `mkimage.py` : Create an automatically generated dummy cover to be used during testing. - `tei2eoatex.xsl` : An XSL converter from TEI-XML to EOATeX - `tei2imxml.py` : A converter from TEI to customized DocBook XML. - -## chapterdownload_frontmatter -A program for generating LaTeX frontmatters for chapter downloads. In -fact, it provides functions for a general query on book metadata. A -variant creates ONIX type XML. - -The main functionality has been built into the new eoa-django platform -and can be run via `manage.py`. A recent addition is -`find_chapters.py` which uses data from the LaTeX auxiliary files to -determine the chapter breakpoints and splits the book pdf into the -single chapters that are made available for download, as well. + - `find_chapters.py` : Use LaTeX auxiliary files to split a PDF into chapters ## eoa-csl A nearly-abandoned version of a CSL configuration for EOA. 
Currently @@ -38,7 +28,7 @@ functionality offered via the `manage.py` interface is currently expanded and offers - `bib2tei` : Converts a bibtex bibliography file to a TEI-XML `listBibl` structure - `check_tei_output` - - `publication_add_chapter_frontpages` + - `publication_add_chapter_frontpages` : Generates frontmatters for chapter downloads - `publication_export_tei` : Exports a publication from the database as an TEI-XML file - `publication_list` - `publicationimport` diff --git a/find_chapters.py b/find_chapters.py new file mode 100644 index 0000000..108690d --- /dev/null +++ b/find_chapters.py @@ -0,0 +1,133 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8; mode: python -*- + +""" +Find pages of a chapter in a LaTeX aux file + +This is from Hack #92 from the O'Reilly Book "LaTeX Hacks" +(9783897214774) where it was implemented in Perl. It depends on a +label command per chapter and the book suggests including an +automatic label in the titlesec command for consistency. + +The question to be answered is whether this method corresponds to how +imxml2django splits the files. + +If built into Django, the whole process of cutting up the book and +uploading chapter files and prefixing them with chapter info could be +further automated. However, output needs to be checked! + +Maybe as a preprocessing step (and not to schlepp the aux file +around), include the chapter page information in a config file before +uploading (or use a PDF metadata field for that?) +""" + +__version__ = "1.0" +__date__ = "20180810" +__author__ = "kthoden@mpiwg-berlin.mpg.de" + +import re +import argparse +import logging +from PyPDF2 import PdfFileWriter, PdfFileReader + +logging.basicConfig(level=logging.DEBUG, format=' %(asctime)s - %(levelname)s - %(message)s') + +CHAPTER_LABEL_REGEX = r"chapter\d+_.*?" 
# Example of an aux line this pattern is meant to match:
# \newlabel{chapter01_caraffa}{{1}{5}{Objects of Value: Challenging Conventional Hierarchies in the Photo Archive\EOAauthor {Costanza Caraffa}}{section*.4}{}}
# The single capture group is the second braced number (the page, "5" above).
# Both halves are raw strings so that the backslashes reach the regex engine
# unchanged and no invalid-escape warning is raised at compile time.
NEWLABEL_REGEX = r"\\newlabel\{" + CHAPTER_LABEL_REGEX + r"\}\{\{\d+\}\{(\d+)\}"

# declare rules how to name chapter files


def write_pdf_file(output, filename):
    """Write PDF object into file.

    Args:
        output: A writer object exposing ``write(stream)``; ``main`` passes
            a ``PdfFileWriter``.
        filename: Path of the file to create or overwrite.

    Returns:
        None.
    """

    # The context manager closes the handle even if output.write() raises.
    with open(filename, 'wb') as output_stream:
        output.write(output_stream)
    logging.debug("Wrote %s.", filename)
# def write_pdf_file ends here


def _chapter_page_ranges(chapter_starts, total_pages):
    """Turn chapter start pages into (filename, first_index, stop_index) tuples.

    Args:
        chapter_starts: 1-based start page of every chapter, in file order.
        total_pages: Number of pages in the input PDF.

    Returns:
        One tuple per chapter. ``first_index`` is the 0-based index of the
        first page, ``stop_index`` is exclusive. A chapter ends where the
        next one starts; the last chapter runs to the end of the PDF.
    """

    # Sentinel "start" one past the last page closes the final chapter.
    next_starts = list(chapter_starts[1:]) + [total_pages + 1]
    ranges = []
    # Pair by position instead of list.index(): index() returns the first
    # match and therefore mixes up chapters whose labels share a page number.
    for number, (start, next_start) in enumerate(zip(chapter_starts, next_starts), start=1):
        ranges.append(("chap{:02d}.pdf".format(number), start - 1, next_start - 1))
    return ranges
# def _chapter_page_ranges ends here


def _extract_pages(reader, first_index, stop_index, filename):
    """Copy pages [first_index, stop_index) of reader into a new PDF file."""

    writer = PdfFileWriter()
    for page_index in range(first_index, stop_index):
        writer.addPage(reader.getPage(page_index))
    write_pdf_file(writer, filename)
# def _extract_pages ends here


def main():
    """The main bit

    Reads the aux file and the PDF named on the command line, collects the
    page number of every label matched by NEWLABEL_REGEX and writes
    ``chap_frontmatter.pdf`` plus one ``chapNN.pdf`` per matched label into
    the current working directory.

    Raises:
        SystemExit: If the aux file contains no matching chapter label.
    """

    parser = argparse.ArgumentParser()
    parser.add_argument("auxfile", help="The auxfile that is used to read the page numbers from.")
    parser.add_argument("pdffile", help="The PDF file that is going to be split in chapters..")
    args = parser.parse_args()

    with open(args.auxfile) as aux_file:
        aux_text = aux_file.read()

    # NOTE(review): assumes the page numbers recorded in the aux file equal
    # the 1-based physical page positions in the PDF - confirm; the Perl
    # original quoted below applied an offset for this.
    chapter_starts = [int(page) for page in re.findall(NEWLABEL_REGEX, aux_text)]
    if not chapter_starts:
        # Without this guard the front matter step fails with a bare IndexError.
        raise SystemExit("No chapter labels found in {}.".format(args.auxfile))

    # Keep the input open while pages are copied and written, then close it.
    with open(args.pdffile, "rb") as pdf_stream:
        input1 = PdfFileReader(pdf_stream)
        total_pages = input1.getNumPages()
        logging.debug("Input has %d pages.", total_pages)

        # the preface
        # NOTE(review): the stop index is "first chapter start - 2", so the
        # page directly before the first chapter ends up in no output file.
        # Confirm this is intended (e.g. a blank page) and not an off-by-one.
        _extract_pages(input1, 0, chapter_starts[0] - 2, "chap_frontmatter.pdf")

        for chapter_filename, first_index, stop_index in _chapter_page_ranges(chapter_starts, total_pages):
            _extract_pages(input1, first_index, stop_index, chapter_filename)
# def main ends here


if __name__ == '__main__':
    main()
# finis

# What follows is the original source code from above mentioned book

# my $offset = 0;

# sub read_chap {
#     my ($filename) = $_[0];
#     print "looking at $filename\n";
#     my ($fh) = new IO::File "$filename";
#     while (<$fh>) {
#         push @chapters, $1 + $offset
#             if /^\\newlabel\{chapter\d+_.*?\}\{\{\d+\}\{(\d+)\}/;
#         $offset = $1
#             if /^\\newlabel\{find-chap-offset\}\{\{(\d+)\}/;
#         read_chap($1)
#             if /^\\\@input\{(.*)\}/;
#     }
#     $fh->close;
# }

# my $filename = shift;
# $filename =~ s/\..*?$//;

# read_chap($filename.".aux");
# push @chapters, '';

# my $from = $chapters[$ch];
# my $to = $chapters[$ch+1] ? $chapters[$ch+1]-1 : 'end';

# print "$from-$to"

# # !system
# # "pdftk $filename.pdf cat $from-$to output $filename-$ch.pdf";
# # or die "$0: Problems with pdftk\n";