# find_chapters.py

#!/usr/bin/env python3
# -*- coding: utf-8; mode: python -*-

"""
Find pages of a chapter in a LaTeX aux file

This is from Hack #92 from the O'Reilly Book "LaTeX Hacks"
(9783897214774) where it was implemented in Perl. It depends on a
label command per chapter and the book suggests to include an
automatic label in the titlesec command for consistency.

The question to be answered is whether this method corresponds to how
imxml2django splits the files.

If built into Django, the whole process of cutting up the book and
uploading chapter files and prefixing them with chapter info could be
further automated. However, output needs to be checked!

Maybe as a preprocessing step (and not to schlepp the aux file
around), include the chapter page information in a config file before
uploading (or use a PDF metadata field for that?)
"""

__version__ = "1.0"
__date__ = "20180810"
__author__ = "kthoden@mpiwg-berlin.mpg.de"

import re
import sys
import argparse
import logging
from PyPDF2 import PdfFileWriter, PdfFileReader

logging.basicConfig(level=logging.DEBUG, format=' %(asctime)s - %(levelname)s - %(message)s')

# Name of a chapter label. Both the short "chap01_..." and the long
# "chapter01_..." spelling are accepted: the sample aux line below and the
# original Perl hack use the long form.
CHAPTER_LABEL_REGEX = r"chap(?:ter)?\d+_.*?"
# \newlabel{chapter01_caraffa}{{1}{5}{Objects of Value: Challenging Conventional Hierarchies in the Photo Archive\EOAauthor {Costanza Caraffa}}{section*.4}{}}
# The single capture group is the second braced number of the label entry
# ("5" in the sample above), which is used as the chapter's start page.
# Every fragment is a raw string so the regex escapes reach the re module
# unchanged instead of being (invalid) Python string escapes.
NEWLABEL_REGEX = r"\\newlabel\{" + CHAPTER_LABEL_REGEX + r"\}\{\{\d+\}\{(\d+)\}"

# declare rules how to name chapter files

def write_pdf_file(output, filename):
    """Write PDF object into file.

    Args:
        output: Object with a ``write(stream)`` method that serialises
            itself into a binary stream (in this script a PdfFileWriter).
        filename: Path of the file to write; it is opened in ``'wb'`` mode,
            so an existing file is overwritten.

    Returns:
        None.
    """

    # The context manager closes the stream even when output.write() raises,
    # so a failed write does not leak an open file handle.
    with open(filename, 'wb') as output_stream:
        output.write(output_stream)
    logging.debug("Wrote %s.", filename)
# def write_pdf_file ends here

def main():
    """Split a PDF into a front matter file and one file per chapter.

    Command line arguments:
        auxfile: LaTeX aux file; every match of NEWLABEL_REGEX in it yields
            the start page of one chapter, in file order.
        pdffile: The PDF that is cut up.

    Output files are written to the current working directory:
    ``chap_frontmatter.pdf`` and ``chap01.pdf``, ``chap02.pdf``, ...

    Exits with status 1 if the aux file contains no chapter label.
    """

    parser = argparse.ArgumentParser()
    parser.add_argument("auxfile", help="The auxfile that is used to read the page numbers from.")
    parser.add_argument("pdffile", help="The PDF file that is going to be split in chapters.")
    args = parser.parse_args()

    # Only ASCII label entries are of interest, so undecodable bytes in
    # chapter titles are replaced instead of aborting the run.
    with open(args.auxfile, encoding="utf-8", errors="replace") as aux_file:
        aux_contents = aux_file.read()

    chapter_starts = [int(page) for page in re.findall(NEWLABEL_REGEX, aux_contents)]

    if not chapter_starts:
        logging.error("No chapter marks found. Exiting")
        # Non-zero status so that calling scripts can detect the failure.
        sys.exit(1)

    # NOTE(review): the page numbers from the aux file are used directly as
    # 1-based positions in the PDF; confirm this holds when the front matter
    # has its own (e.g. roman) numbering.

    # The input stream stays open for the whole block because pages are
    # fetched from the reader while the output files are assembled.
    with open(args.pdffile, "rb") as pdf_stream:
        input1 = PdfFileReader(pdf_stream)
        num_pages = input1.getNumPages()
        logging.debug("Input has %d pages.", num_pages)

        # the preface
        # NOTE(review): the range stops two pages before the first chapter
        # start, so the page directly preceding the first chapter ends up in
        # no output file. Kept as is; confirm whether this is intentional
        # (e.g. to drop a blank page).
        frontmatter = PdfFileWriter()
        for pdfpage in range(0, chapter_starts[0] - 2):
            frontmatter.addPage(input1.getPage(pdfpage))
        write_pdf_file(frontmatter, "chap_frontmatter.pdf")

        # Every chapter runs up to the page before the next chapter starts;
        # the last one runs to the end of the document. Pairing by position
        # (instead of looking a page number up in the list) keeps chapters
        # apart even if two of them start on the same page.
        next_starts = chapter_starts[1:] + [num_pages + 1]
        for number, (first_page, next_start) in enumerate(zip(chapter_starts, next_starts), start=1):
            chapter = PdfFileWriter()
            # Aux page numbers are 1-based, getPage() is 0-based.
            for pdfpage in range(first_page - 1, next_start - 1):
                chapter.addPage(input1.getPage(pdfpage))
            write_pdf_file(chapter, "chap{:02d}.pdf".format(number))
# def main ends here

# Run the splitter only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
# finis

# What follows is the original source code from above mentioned book

# my $offset = 0;

# sub read_chap {
#     my ($filename) = $_[0];
#     print "looking at $filename\n";
#     my ($fh) = new IO::File "$filename";
#     while (<$fh>) {
#         push @chapters, $1 + $offset
#             if /^\\newlabel\{chapter\d+_.*?\}\{\{\d+\}\{(\d+)\}/;
#         $offset = $1
#             if /^\\newlabel\{find-chap-offset\}\{\{(\d+)\}/;
#         read_chap($1)
#             if /^\\\@input\{(.*)\}/;
#     }
#     $fh->close;
# }

# my $filename = shift;
# $filename =~ s/\..*?$//;

# read_chap($filename.".aux");
# push @chapters, '';

# my $from = $chapters[$ch];
# my $to = $chapters[$ch+1] ? $chapters[$ch+1]-1 : 'end';

# print "$from-$to"

# # !system
# #     "pdftk $filename.pdf cat $from-$to output $filename-$ch.pdf";
# # or die "$0: Problems with pdftk\n";
	#!/usr/bin/env python3
	# -*- coding: utf-8; mode: python -*-

	"""
	Find pages of a chapter in a LaTeX aux file

	This is from Hack #92 from the O'Reilly Book "LaTeX Hacks"
	(9783897214774) where it was implemented in Perl. It depends on a
	label command per chapter and the book suggests to include an
	automatic label in the titlesec command for consistency.

	The question to be answered is whether this method corresponds to how
	imxml2django splits the files.

	If built into Django, the whole process of cutting up the book and
	uploading chapter files and prefixing them with chapter info could be
	further automated. However, output needs to be checked!

	Maybe as a preprocessing step (and not to schlepp the aux file
	around), include the chapter page information in a config file before
	uploading (or use a PDF metadata field for that?)
	"""

	__version__ = "1.0"
	__date__ = "20180810"
	__author__ = "kthoden@mpiwg-berlin.mpg.de"

	import re
	import sys
	import argparse
	import logging
	from PyPDF2 import PdfFileWriter, PdfFileReader

	logging.basicConfig(level=logging.DEBUG, format=' %(asctime)s - %(levelname)s - %(message)s')

	# Name of a chapter label. Both the short "chap01_..." and the long
	# "chapter01_..." spelling are accepted: the sample aux line below and the
	# original Perl hack use the long form.
	CHAPTER_LABEL_REGEX = r"chap(?:ter)?\d+_.*?"
	# \newlabel{chapter01_caraffa}{{1}{5}{Objects of Value: Challenging Conventional Hierarchies in the Photo Archive\EOAauthor {Costanza Caraffa}}{section*.4}{}}
	# The single capture group is the second braced number of the label entry
	# ("5" in the sample above), which is used as the chapter's start page.
	# Every fragment is a raw string so the regex escapes reach the re module
	# unchanged instead of being (invalid) Python string escapes.
	NEWLABEL_REGEX = r"\\newlabel\{" + CHAPTER_LABEL_REGEX + r"\}\{\{\d+\}\{(\d+)\}"

	# declare rules how to name chapter files

	def write_pdf_file(output, filename):
	    """Write PDF object into file.

	    Args:
	        output: Object with a ``write(stream)`` method that serialises
	            itself into a binary stream (in this script a PdfFileWriter).
	        filename: Path of the file to write; it is opened in ``'wb'`` mode,
	            so an existing file is overwritten.

	    Returns:
	        None.
	    """

	    # The context manager closes the stream even when output.write() raises,
	    # so a failed write does not leak an open file handle.
	    with open(filename, 'wb') as output_stream:
	        output.write(output_stream)
	    logging.debug("Wrote %s.", filename)
	# def write_pdf_file ends here

	def main():
	    """Split a PDF into a front matter file and one file per chapter.

	    Command line arguments:
	        auxfile: LaTeX aux file; every match of NEWLABEL_REGEX in it yields
	            the start page of one chapter, in file order.
	        pdffile: The PDF that is cut up.

	    Output files are written to the current working directory:
	    ``chap_frontmatter.pdf`` and ``chap01.pdf``, ``chap02.pdf``, ...

	    Exits with status 1 if the aux file contains no chapter label.
	    """

	    parser = argparse.ArgumentParser()
	    parser.add_argument("auxfile", help="The auxfile that is used to read the page numbers from.")
	    parser.add_argument("pdffile", help="The PDF file that is going to be split in chapters.")
	    args = parser.parse_args()

	    # Only ASCII label entries are of interest, so undecodable bytes in
	    # chapter titles are replaced instead of aborting the run.
	    with open(args.auxfile, encoding="utf-8", errors="replace") as aux_file:
	        aux_contents = aux_file.read()

	    chapter_starts = [int(page) for page in re.findall(NEWLABEL_REGEX, aux_contents)]

	    if not chapter_starts:
	        logging.error("No chapter marks found. Exiting")
	        # Non-zero status so that calling scripts can detect the failure.
	        sys.exit(1)

	    # NOTE(review): the page numbers from the aux file are used directly as
	    # 1-based positions in the PDF; confirm this holds when the front matter
	    # has its own (e.g. roman) numbering.

	    # The input stream stays open for the whole block because pages are
	    # fetched from the reader while the output files are assembled.
	    with open(args.pdffile, "rb") as pdf_stream:
	        input1 = PdfFileReader(pdf_stream)
	        num_pages = input1.getNumPages()
	        logging.debug("Input has %d pages.", num_pages)

	        # the preface
	        # NOTE(review): the range stops two pages before the first chapter
	        # start, so the page directly preceding the first chapter ends up in
	        # no output file. Kept as is; confirm whether this is intentional
	        # (e.g. to drop a blank page).
	        frontmatter = PdfFileWriter()
	        for pdfpage in range(0, chapter_starts[0] - 2):
	            frontmatter.addPage(input1.getPage(pdfpage))
	        write_pdf_file(frontmatter, "chap_frontmatter.pdf")

	        # Every chapter runs up to the page before the next chapter starts;
	        # the last one runs to the end of the document. Pairing by position
	        # (instead of looking a page number up in the list) keeps chapters
	        # apart even if two of them start on the same page.
	        next_starts = chapter_starts[1:] + [num_pages + 1]
	        for number, (first_page, next_start) in enumerate(zip(chapter_starts, next_starts), start=1):
	            chapter = PdfFileWriter()
	            # Aux page numbers are 1-based, getPage() is 0-based.
	            for pdfpage in range(first_page - 1, next_start - 1):
	                chapter.addPage(input1.getPage(pdfpage))
	            write_pdf_file(chapter, "chap{:02d}.pdf".format(number))
	# def main ends here

	# Run the splitter only when this file is executed as a script,
	# not when it is imported as a module.
	if __name__ == '__main__':
	    main()
	# finis

	# What follows is the original source code from above mentioned book

	# my $offset = 0;

	# sub read_chap {
	# my ($filename) = $_[0];
	# print "looking at $filename\n";
	# my ($fh) = new IO::File "$filename";
	# while (<$fh>) {
	# push @chapters, $1 + $offset
	# if /^\\newlabel\{chapter\d+_.*?\}\{\{\d+\}\{(\d+)\}/;
	# $offset = $1
	# if /^\\newlabel\{find-chap-offset\}\{\{(\d+)\}/;
	# read_chap($1)
	# if /^\\\@input\{(.*)\}/;
	# }
	# $fh->close;
	# }

	# my $filename = shift;
	# $filename =~ s/\..*?$//;

	# read_chap($filename.".aux");
	# push @chapters, '';

	# my $from = $chapters[$ch];
	# my $to = $chapters[$ch+1] ? $chapters[$ch+1]-1 : 'end';

	# print "$from-$to"

	# # !system
	# # "pdftk $filename.pdf cat $from-$to output $filename-$ch.pdf";
	# # or die "$0: Problems with pdftk\n";