Skip to content
Permalink
Browse files

Get labels from XML file

  • Loading branch information...
kthoden committed Nov 20, 2018
1 parent c53cfd4 commit 45d4a94e609c9b4b0b9744b6ea7784719f5d8699
Showing with 38 additions and 7 deletions.
  1. +38 −7 find_chapters.py
@@ -26,19 +26,33 @@
__author__ = "kthoden@mpiwg-berlin.mpg.de"

import re
import os
import sys
import argparse
import logging
from PyPDF2 import PdfFileWriter, PdfFileReader

# Configure the root logger once for the whole script: DEBUG level and a
# "timestamp - level - message" line format.
logging.basicConfig(level=logging.DEBUG, format=' %(asctime)s - %(levelname)s - %(message)s')

# Directory that receives the generated PDF files. The name contains no "~",
# so expanduser() hands it back unchanged.
OUTPUT_DIR = os.path.expanduser("chapter_files")

# Label name of a chapter: "chap", one or more digits, "_" and a lazy tail.
CHAPTER_LABEL_REGEX = r"chap\d+_.*?"
# \newlabel{chapter01_caraffa}{{1}{5}{Objects of Value: Challenging Conventional Hierarchies in the Photo Archive\EOAauthor {Costanza Caraffa}}{section*.4}{}}
# NOTE(review): the sample label above ("chapter01_...") is NOT matched by
# CHAPTER_LABEL_REGEX, which needs digits directly after "chap" -- confirm
# which label scheme the aux files really use.
# Group 1 captures the second braced number of the \newlabel entry ("5" in
# the sample line above).
# Every fragment is a raw string so that "\{", "\}" and "\d" reach the regex
# engine verbatim instead of relying on Python keeping invalid string escapes.
NEWLABEL_REGEX = r"\\newlabel\{" + CHAPTER_LABEL_REGEX + r"\}\{\{\d+\}\{(\d+)\}"

# declare rules for how to name chapter files

def get_labels_from_xml(xmlfile):
    """Look up the ``xml:id`` of every chapter division in a TEI file.

    ``xmlfile`` is handed to ``lxml.etree.parse``. The parsed tree is then
    queried with XPath for the ``xml:id`` attribute of each ``tei:div``
    element whose ``type`` attribute is ``chapter``. The XPath result is
    returned unchanged (presumably a list of id strings -- confirm against
    the lxml documentation).
    """

    # lxml is imported here, inside the function, rather than at module level.
    from lxml import etree

    tei_namespaces = {"tei": "http://www.tei-c.org/ns/1.0"}
    chapter_id_query = "//tei:div[@type='chapter']/@xml:id"

    document = etree.parse(xmlfile)
    return document.xpath(chapter_id_query, namespaces=tei_namespaces)
# def get_labels_from_xml ends here

def write_pdf_file(output, filename):
"""Write PDF object into file."""

@@ -57,12 +71,26 @@ def main():
parser = argparse.ArgumentParser()
parser.add_argument("auxfile", help="The auxfile that is used to read the page numbers from.")
parser.add_argument("pdffile", help="The PDF file that is going to be split in chapters.")
parser.add_argument("-t", "--teifile", help="The TEI file for getting the labels of chapters.")
parser.add_argument("-o", "--offset", help="An offset to be added to the pages. You have to manually find out the number. Should be around 7.", default=0)
args = parser.parse_args()

offset = int(args.offset)

with open(args.auxfile) as aux_file:
aux_lines = aux_file.read()

references = re.findall(NEWLABEL_REGEX, aux_lines)
if args.teifile:
references = []
chapter_ids = get_labels_from_xml(args.teifile)
for chapter_id in chapter_ids:
newlabel_regex = r"\\newlabel\{" + chapter_id + "\}\{\{\d+\}\{(\d+)\}"
ding = re.findall(newlabel_regex, aux_lines)[0]
references.append(ding)
else:
references = re.findall(NEWLABEL_REGEX, aux_lines)

print(references)

if not references:
logging.error("No chapter marks found. Exiting")
@@ -73,30 +101,33 @@ def main():

pdf_object = PdfFileWriter()

if not os.path.exists(OUTPUT_DIR):
os.makedirs(OUTPUT_DIR)

# the preface
for pdfpage in range(0, int(references[0]) - 2):
for pdfpage in range(0, int(references[0]) - 2 + offset):
pdf_object.addPage(input1.getPage(pdfpage))
write_pdf_file(pdf_object, "chap_frontmatter.pdf")
write_pdf_file(pdf_object, OUTPUT_DIR + os.path.sep + "chap_frontmatter.pdf")

del pdf_object

for chapter_break in references[:-1]:
pdf_object = PdfFileWriter()
current_position = references.index(chapter_break)
for pdfpage in range(int(chapter_break) - 1, int(references[current_position + 1]) - 1):
for pdfpage in range(int(chapter_break) - 1 + offset, int(references[current_position + 1]) - 1 + offset):
pdf_object.addPage(input1.getPage(pdfpage))

chapter_filename = "chap{:02d}.pdf".format(current_position + 1)
write_pdf_file(pdf_object, chapter_filename)
write_pdf_file(pdf_object, OUTPUT_DIR + os.path.sep + chapter_filename)

# the last bit
del pdf_object
pdf_object = PdfFileWriter()
for pdfpage in range(int(references[-1]) -1, input1.getNumPages()):
for pdfpage in range(int(references[-1]) - 1 + offset, input1.getNumPages()):
pdf_object.addPage(input1.getPage(pdfpage))

chapter_filename = "chap{:02d}.pdf".format(len(references))
write_pdf_file(pdf_object, chapter_filename)
write_pdf_file(pdf_object, OUTPUT_DIR + os.path.sep + chapter_filename)
# def main ends here

if __name__ == '__main__':

0 comments on commit 45d4a94

Please sign in to comment.
You can’t perform that action at this time.