Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Browse files
Browse the repository at this point in the history
Adding find_chapters.py
- Loading branch information
Klaus Thoden
committed
Sep 13, 2018
1 parent
a2a4ac5
commit 1372275
Showing
2 changed files
with
135 additions
and
12 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,133 @@ | ||
#!/usr/bin/env python3 | ||
# -*- coding: utf-8; mode: python -*- | ||
|
||
""" | ||
Find pages of a chapter in a LaTeX aux file | ||
This is from Hack #92 from the O'Reilly Book "LaTeX Hacks" | ||
(9783897214774) where it was implemented in Perl. It depends on a | ||
label command per chapter and the book suggests to include an | ||
automatic label in the titlesec command for consistency. | ||
The question to be answered is whether this method corresponds to how | ||
imxml2django splits the files. | ||
If built into Django, the whole process of cutting up the book and | ||
uploading chapter files and prefixing them with chapter info could be | ||
further automated. However, output needs to be checked! | ||
Maybe as a preprocessing step (and not to schlepp the aux file | ||
around), include the chapter page information in a config file before | ||
uploading (or use a PDF metadata field for that?) | ||
""" | ||
|
||
__version__ = "1.0" | ||
__date__ = "20180810" | ||
__author__ = "kthoden@mpiwg-berlin.mpg.de" | ||
|
||
import re | ||
import argparse | ||
import logging | ||
from PyPDF2 import PdfFileWriter, PdfFileReader | ||
|
||
# Configure the root logger once at import time: everything from DEBUG
# upwards is emitted, prefixed with a timestamp and the level name.
logging.basicConfig(
    level=logging.DEBUG,
    format=" %(asctime)s - %(levelname)s - %(message)s",
)
|
||
# Pattern for the label name that marks the start of a chapter,
# e.g. "chapter01_caraffa".
CHAPTER_LABEL_REGEX = r"chapter\d+_.*?"
# Matches aux-file lines such as the one below and captures the second
# number of the first brace pair (the "5" in "{{1}{5}"):
# \newlabel{chapter01_caraffa}{{1}{5}{Objects of Value: Challenging Conventional Hierarchies in the Photo Archive\EOAauthor {Costanza Caraffa}}{section*.4}{}}
# All pieces are raw strings so that "\}", "\{" and "\d" reach the regex
# engine untouched instead of relying on Python passing unknown string
# escapes through (which triggers a warning on newer interpreters).
NEWLABEL_REGEX = r"\\newlabel\{" + CHAPTER_LABEL_REGEX + r"\}\{\{\d+\}\{(\d+)\}"
|
||
# declare rules how to name chapter files | ||
|
||
def write_pdf_file(output, filename):
    """Write PDF object into file.

    Args:
        output: Object providing a ``write(stream)`` method that serialises
            itself into a binary stream (in this script a PyPDF2
            ``PdfFileWriter``).
        filename: Path of the file to write; it is opened in ``'wb'`` mode,
            so an existing file is overwritten.

    Returns:
        None.
    """
    # The context manager guarantees the handle is closed even when
    # output.write() raises, so no file descriptor is leaked.
    with open(filename, 'wb') as output_stream:
        output.write(output_stream)
    logging.debug("Wrote %s.", filename)
# def write_pdf_file ends here
|
||
def _write_page_range(reader, first, last, filename):
    """Copy pages first (inclusive) to last (exclusive) of reader into filename.

    Indices are zero-based positions in the PDF. An empty or inverted range
    still produces a file, just without pages.
    """
    writer = PdfFileWriter()
    for page_index in range(first, last):
        writer.addPage(reader.getPage(page_index))
    write_pdf_file(writer, filename)


def main():
    """The main bit: split a PDF into front matter and one file per chapter.

    Reads two positional command line arguments, the LaTeX aux file and the
    PDF file. Every match of NEWLABEL_REGEX in the aux file yields the start
    page of one chapter. The results are written to the current working
    directory as ``chap_frontmatter.pdf`` and ``chap01.pdf``, ``chap02.pdf``,
    ... in the order the labels appear in the aux file.

    Raises:
        SystemExit: If the aux file contains no chapter labels.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("auxfile", help="The auxfile that is used to read the page numbers from.")
    parser.add_argument("pdffile", help="The PDF file that is going to be split in chapters..")
    args = parser.parse_args()

    with open(args.auxfile) as aux_file:
        aux_lines = aux_file.read()

    # Convert once; the regex only captures digits, so int() cannot fail.
    chapter_starts = [int(page) for page in re.findall(NEWLABEL_REGEX, aux_lines)]
    if not chapter_starts:
        # Without this guard the code below would die with a bare IndexError.
        raise SystemExit("No chapter labels found in {}.".format(args.auxfile))

    # Keep the PDF open while pages are copied and close it reliably
    # afterwards (the handle used to be left open).
    with open(args.pdffile, "rb") as pdf_stream:
        input1 = PdfFileReader(pdf_stream)
        num_pages = input1.getNumPages()
        logging.debug("Input has %d pages.", num_pages)

        # the preface
        # NOTE(review): the "- 2" means the page directly before the first
        # chapter ends up in no output file. Kept as in the original;
        # confirm whether that page is intentionally dropped.
        _write_page_range(input1, 0, chapter_starts[0] - 2, "chap_frontmatter.pdf")

        # Pair every chapter start with the start of the following chapter.
        # The sentinel num_pages + 1 makes the last chapter run to the end
        # of the document. Pairing by position (instead of looking the value
        # up with list.index) keeps chapters apart even if two labels report
        # the same page number.
        # Assumes the aux page numbers equal 1-based positions in the PDF -
        # TODO confirm for books with separately numbered front matter.
        next_starts = chapter_starts[1:] + [num_pages + 1]
        for number, (start, next_start) in enumerate(zip(chapter_starts, next_starts), start=1):
            chapter_filename = "chap{:02d}.pdf".format(number)
            _write_page_range(input1, start - 1, next_start - 1, chapter_filename)
# def main ends here
|
||
# Run the splitter only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
# finis
|
||
# What follows is the original source code from above mentioned book | ||
|
||
# my $offset = 0; | ||
|
||
# sub read_chap { | ||
# my ($filename) = $_[0]; | ||
# print "looking at $filename\n"; | ||
# my ($fh) = new IO::File "$filename"; | ||
# while (<$fh>) { | ||
# push @chapters, $1 + $offset | ||
# if /^\\newlabel\{chapter\d+_.*?\}\{\{\d+\}\{(\d+)\}/; | ||
# $offset = $1 | ||
# if /^\\newlabel\{find-chap-offset\}\{\{(\d+)\}/; | ||
# read_chap($1) | ||
# if /^\\\@input\{(.*)\}/; | ||
# } | ||
# $fh->close; | ||
# } | ||
|
||
# my $filename = shift; | ||
# $filename =~ s/\..*?$//; | ||
|
||
# read_chap($filename.".aux"); | ||
# push @chapters, ''; | ||
|
||
# my $from = $chapters[$ch]; | ||
# my $to = $chapters[$ch+1] ? $chapters[$ch+1]-1 : 'end'; | ||
|
||
# print "$from-$to" | ||
|
||
# # !system | ||
# # "pdftk $filename.pdf cat $from-$to output $filename-$ch.pdf"; | ||
# # or die "$0: Problems with pdftk\n"; |