Skip to content
Permalink
dcde1bd2ed
Switch branches/tags
Go to file
 
 
Cannot retrieve contributors at this time
138 lines (103 sloc) 4.2 KB
#!/usr/bin/env python3
# -*- coding: utf-8; mode: python -*-
"""
Find pages of a chapter in a LaTeX aux file
This is from Hack #92 from the O'Reilly Book "LaTeX Hacks"
(9783897214774) where it was implemented in Perl. It depends on a
label command per chapter and the book suggests to include an
automatic label in the titlesec command for consistency.
The question to be answered is whether this method corresponds to how
imxml2django splits the files.
If built into Django, the whole process of cutting up the book and
uploading chapter files and prefixing them with chapter info could be
further automated. However, output needs to be checked!
Maybe as a preprocessing step (and not to schlepp the aux file
around), include the chapter page information in a config file before
uploading (or use a PDF metadata field for that?)
"""
# Module metadata: version string, date stamp (apparently YYYYMMDD) and
# author contact address.
__version__ = "1.0"
__date__ = "20180810"
__author__ = "kthoden@mpiwg-berlin.mpg.de"
import re
import sys
import argparse
import logging
from PyPDF2 import PdfFileWriter, PdfFileReader
# Log everything from DEBUG upwards, prefixed with timestamp and level.
logging.basicConfig(level=logging.DEBUG, format=' %(asctime)s - %(levelname)s - %(message)s')

# Pattern for the label name attached to each chapter: "chap", digits, an
# underscore and an arbitrary (lazily matched) suffix.
# NOTE(review): the sample aux line below uses "chapter01_...", which this
# pattern does not match ("chap" must be followed directly by digits) --
# confirm which label scheme the books actually use.
CHAPTER_LABEL_REGEX = r"chap\d+_.*?"
# \newlabel{chapter01_caraffa}{{1}{5}{Objects of Value: Challenging Conventional Hierarchies in the Photo Archive\EOAauthor {Costanza Caraffa}}{section*.4}{}}
# Capture group 1 is the second braced number of the \newlabel entry, which
# main() uses as the chapter's starting page.
# Both fragments are raw strings so that "\}", "\{" and "\d" reach the regex
# engine without relying on Python's deprecated handling of invalid string
# escape sequences.
NEWLABEL_REGEX = r"\\newlabel\{" + CHAPTER_LABEL_REGEX + r"\}\{\{\d+\}\{(\d+)\}"
# declare rules how to name chapter files
def write_pdf_file(output, filename):
    """Write a PDF writer object into a file.

    Args:
        output: Object exposing ``write(stream)``; it is called with the
            opened binary file (``main`` passes a ``PdfFileWriter``).
        filename: Path of the file to create or overwrite.

    Returns:
        None.
    """
    # The context manager closes the file even when output.write() raises,
    # so a failed write does not leak the file handle.
    with open(filename, 'wb') as output_stream:
        output.write(output_stream)
    logging.debug("Wrote %s.", filename)
# def write_pdf_file ends here
def _write_page_range(reader, start, stop, filename):
    """Copy pages ``start`` (inclusive) to ``stop`` (exclusive) into a new PDF.

    ``start`` and ``stop`` are zero-based page indices of ``reader``. An
    empty range still produces an output file via ``write_pdf_file``.
    """
    pdf_object = PdfFileWriter()
    for pdfpage in range(start, stop):
        pdf_object.addPage(reader.getPage(pdfpage))
    write_pdf_file(pdf_object, filename)


def main():
    """Split a PDF into front matter and one file per chapter.

    Reads the two positional command line arguments (aux file, PDF file),
    collects the page number of every chapter label found in the aux file
    and writes ``chap_frontmatter.pdf`` plus ``chap01.pdf``, ``chap02.pdf``,
    ... into the current working directory.

    The page numbers from the aux file are used directly as one-based PDF
    page positions -- assumes printed page numbers and physical PDF pages
    coincide; TODO confirm for books with separately numbered front matter.

    Raises:
        SystemExit: with status 1 when the aux file contains no chapter
            label (previously the script exited with status 0 here).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("auxfile", help="The auxfile that is used to read the page numbers from.")
    parser.add_argument("pdffile", help="The PDF file that is going to be split in chapters.")
    args = parser.parse_args()

    with open(args.auxfile) as aux_file:
        aux_content = aux_file.read()

    references = re.findall(NEWLABEL_REGEX, aux_content)
    if not references:
        logging.error("No chapter marks found. Exiting")
        # Signal the failure to the calling shell instead of exiting with 0.
        sys.exit(1)

    # Zero-based index of the first page of every chapter, in aux file order.
    first_indices = [int(page) - 1 for page in references]

    # Keep the input open while pages are copied and close it afterwards.
    with open(args.pdffile, "rb") as pdf_stream:
        input1 = PdfFileReader(pdf_stream)
        total_pages = input1.getNumPages()
        logging.debug("Input has %d pages.", total_pages)

        # the preface
        # NOTE(review): the front matter stops one page short of the first
        # chapter, so the page directly before chapter 1 ends up in no
        # output file. Kept as in the original -- confirm whether that page
        # is dropped deliberately (e.g. a blank or divider page).
        _write_page_range(input1, 0, first_indices[0] - 1, "chap_frontmatter.pdf")

        # Each chapter runs up to the start of the next one; the last
        # chapter runs to the end of the document. Pairing by position
        # (instead of list.index) keeps numbering and page ranges correct
        # even when two chapters report the same start page.
        stop_indices = first_indices[1:] + [total_pages]
        for number, (start, stop) in enumerate(zip(first_indices, stop_indices), start=1):
            chapter_filename = "chap{:02d}.pdf".format(number)
            _write_page_range(input1, start, stop, chapter_filename)
# def main ends here
# Script entry point: run main() only when this file is executed directly,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
# finis
# What follows is the original source code from above mentioned book
# my $offset = 0;
# sub read_chap {
# my ($filename) = $_[0];
# print "looking at $filename\n";
# my ($fh) = new IO::File "$filename";
# while (<$fh>) {
# push @chapters, $1 + $offset
# if /^\\newlabel\{chapter\d+_.*?\}\{\{\d+\}\{(\d+)\}/;
# $offset = $1
# if /^\\newlabel\{find-chap-offset\}\{\{(\d+)\}/;
# read_chap($1)
# if /^\\\@input\{(.*)\}/;
# }
# $fh->close;
# }
# my $filename = shift;
# $filename =~ s/\..*?$//;
# read_chap($filename.".aux");
# push @chapters, '';
# my $from = $chapters[$ch];
# my $to = $chapters[$ch+1] ? $chapters[$ch+1]-1 : 'end';
# print "$from-$to"
# # !system
# # "pdftk $filename.pdf cat $from-$to output $filename-$ch.pdf";
# # or die "$0: Problems with pdftk\n";