Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
eoa_metadator/find_chapters.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
138 lines (103 sloc)
4.2 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# -*- coding: utf-8; mode: python -*- | |
""" | |
Find pages of a chapter in a LaTeX aux file
This is from Hack #92 from the O'Reilly Book "LaTeX Hacks" | |
(9783897214774) where it was implemented in Perl. It depends on a | |
label command per chapter and the book suggests to include an | |
automatic label in the titlesec command for consistency. | |
The question to be answered is whether this method corresponds to how | |
imxml2django splits the files. | |
If built into Django, the whole process of cutting up the book and | |
uploading chapter files and prefixing them with chapter info could be | |
further automated. However, output needs to be checked!
Maybe as a preprocessing step (and not to schlepp the aux file | |
around), include the chapter page information in a config file before | |
uploading (or use a PDF metadata field for that?) | |
""" | |
__version__ = "1.0" | |
__date__ = "20180810" | |
__author__ = "kthoden@mpiwg-berlin.mpg.de" | |
import re | |
import sys | |
import argparse | |
import logging | |
from PyPDF2 import PdfFileWriter, PdfFileReader | |
# Log everything from DEBUG upwards, prefixed with a timestamp and the level.
logging.basicConfig(level=logging.DEBUG, format=' %(asctime)s - %(levelname)s - %(message)s')

# Label names that mark the start of a chapter: "chap", digits, an underscore
# and an arbitrary (lazily matched) suffix.
CHAPTER_LABEL_REGEX = r"chap\d+_.*?"

# Matches the \newlabel entry of a chapter label in the aux file and captures
# the second braced number of the label data, which main() uses as the page
# on which the chapter starts. Sample aux line:
# \newlabel{chapter01_caraffa}{{1}{5}{Objects of Value: Challenging Conventional Hierarchies in the Photo Archive\EOAauthor {Costanza Caraffa}}{section*.4}{}}
# NOTE(review): the sample label starts with "chapter01_", which
# CHAPTER_LABEL_REGEX does not match ("chap" must be followed directly by
# digits) - confirm which label convention the books actually use.
# Both halves are raw strings so that \{, \} and \d reach the regex engine
# without triggering Python's invalid-escape-sequence warning.
NEWLABEL_REGEX = r"\\newlabel\{" + CHAPTER_LABEL_REGEX + r"\}\{\{\d+\}\{(\d+)\}"
# declare rules how to name chapter files | |
def write_pdf_file(output, filename):
    """Write PDF object into file.

    Args:
        output: Object exposing a ``write(stream)`` method that serialises
            the PDF into a binary stream (main() passes a PdfFileWriter).
        filename: Path of the file to create; an existing file is
            overwritten.

    Returns:
        None.
    """
    # The context manager closes the file even when output.write() raises,
    # so a failed write does not leak the open handle.
    with open(filename, 'wb') as output_stream:
        output.write(output_stream)
    logging.debug("Wrote %s.", filename)
# def write_pdf_file ends here
def _write_page_range(reader, page_indexes, filename):
    """Copy the given zero-based pages of *reader* into a new PDF file."""
    pdf_object = PdfFileWriter()
    for pdfpage in page_indexes:
        pdf_object.addPage(reader.getPage(pdfpage))
    write_pdf_file(pdf_object, filename)
# def _write_page_range ends here


def main():
    """Split a book PDF into a front matter file and one file per chapter.

    Reads two positional command line arguments: the LaTeX aux file and the
    PDF file. Every match of NEWLABEL_REGEX in the aux file yields the page
    on which a chapter starts. The output is written to the current working
    directory as "chap_frontmatter.pdf" followed by "chap01.pdf",
    "chap02.pdf", ... in the order the labels appear in the aux file.

    Exits with status 1 when the aux file contains no chapter label.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("auxfile", help="The auxfile that is used to read the page numbers from.")
    parser.add_argument("pdffile", help="The PDF file that is going to be split in chapters.")
    args = parser.parse_args()

    with open(args.auxfile) as aux_file:
        aux_lines = aux_file.read()

    references = re.findall(NEWLABEL_REGEX, aux_lines)
    if not references:
        logging.error("No chapter marks found. Exiting")
        # Non-zero status so that calling scripts can detect the failure.
        sys.exit(1)

    # Page numbers from the aux file are treated as one-based PDF pages.
    start_pages = [int(reference) for reference in references]

    # Keep the input open until every output file is written: the reader
    # fetches page data from the stream while the writers serialise it.
    with open(args.pdffile, "rb") as pdf_stream:
        input1 = PdfFileReader(pdf_stream)
        num_pages = input1.getNumPages()
        logging.debug("Input has %d pages.", num_pages)

        # the preface
        # NOTE(review): this range stops two pages before the first chapter's
        # page number, so the page directly preceding the first chapter ends
        # up in no output file - presumably an empty page; confirm.
        _write_page_range(input1, range(0, start_pages[0] - 2), "chap_frontmatter.pdf")

        # Each chapter runs up to the page before the next chapter starts;
        # the last one runs to the end of the document. Pairing by position
        # (instead of looking the value up with list.index) keeps chapter
        # numbers correct even if two labels report the same page.
        end_pages = start_pages[1:] + [num_pages + 1]
        for chapter_number, (first_page, next_start) in enumerate(
                zip(start_pages, end_pages), start=1):
            chapter_filename = "chap{:02d}.pdf".format(chapter_number)
            _write_page_range(input1, range(first_page - 1, next_start - 1), chapter_filename)
# def main ends here
# Run the splitter only when this file is executed as a script.
if "__main__" == __name__:
    main()
# finis
# What follows is the original source code from above mentioned book | |
# my $offset = 0; | |
# sub read_chap { | |
# my ($filename) = $_[0]; | |
# print "looking at $filename\n"; | |
# my ($fh) = new IO::File "$filename"; | |
# while (<$fh>) { | |
# push @chapters, $1 + $offset | |
# if /^\\newlabel\{chapter\d+_.*?\}\{\{\d+\}\{(\d+)\}/; | |
# $offset = $1 | |
# if /^\\newlabel\{find-chap-offset\}\{\{(\d+)\}/; | |
# read_chap($1) | |
# if /^\\\@input\{(.*)\}/; | |
# } | |
# $fh->close; | |
# } | |
# my $filename = shift; | |
# $filename =~ s/\..*?$//; | |
# read_chap($filename.".aux"); | |
# push @chapters, ''; | |
# my $from = $chapters[$ch]; | |
# my $to = $chapters[$ch+1] ? $chapters[$ch+1]-1 : 'end'; | |
# print "$from-$to" | |
# # !system | |
# # "pdftk $filename.pdf cat $from-$to output $filename-$ch.pdf"; | |
# # or die "$0: Problems with pdftk\n"; |