# build_frontmatter.py

#!/usr/bin/python3
# -*- coding: utf-8; mode: python -*-
# Module metadata. The description is assigned to __doc__ explicitly because
# it does not come first in the file, so it is not picked up as a module
# docstring automatically.
__version__ = "1.0"
__date__ = "20170208"
__author__ = "kthoden@mpiwg-berlin.mpg.de"
__doc__ = """Generates LaTeX code for informative frontmatters. These
will be attached to the chapter PDFs of EOA publications we offer for
download"""

import string
import sys
import re
import os
from datetime import datetime
import psycopg2
import psycopg2.extras

# using https://wiki.postgresql.org/wiki/Psycopg2_Tutorial

# A publication name is one of the four lowercase series keywords directly
# followed by a one- or two-digit number (e.g. "studies12"). The trailing "$"
# rejects anything after the number; the pattern is used with re.match, which
# anchors it at the start of the string.
PUBLICATION_PATTERN = re.compile("(?P<series>studies|sources|proceedings|textbooks)(?P<number>[1-9]?[0-9]{1})$")
# Directory prefix written into the generated "<publication>_copycommand.sh".
MEDIA_DIR = "/home/editionopenaccess/eoa/website/website/media/"
# Host from which cover images and chapter PDFs are fetched ("<host>/media/<file>").
PRODUCTION_URL = "http://eoa-production.rz-berlin.mpg.de"

def print_error(message):
    """Report a problem on stderr.

    The text is prefixed with ``[ERROR]`` and terminated by a newline.
    """

    print("[ERROR] %s" % message, file=sys.stderr)
# def print_error ends here

def check_publication(input_string):
    """Validate the name of a publication.

    Args:
        input_string: name given by the user, e.g. ``studies12``.

    Returns:
        True when the name matches PUBLICATION_PATTERN.

    Raises:
        SystemExit: with exit status 1 when the name is not valid.
    """

    if PUBLICATION_PATTERN.match(input_string) is None:
        print_error("Name of publication %s is not valid. Exiting." % input_string)
        # A non-zero status lets a calling shell notice the failure;
        # a bare sys.exit() would report success.
        sys.exit(1)

    sys.stdout.write("Name of the publication %s is valid.\n" % input_string)
    return True
# def check_publication ends here

def connect_db():
    """Connect to the database.

    Returns:
        An open psycopg2 connection, or None when the connection could not
        be established (the reason is reported on stderr).
    """

    try:
        connection = psycopg2.connect("dbname='eoa_2017' user='kthoden' host='localhost'")# password=''")
    except psycopg2.Error as error:
        # Only database errors are handled here; print_error() already adds
        # the "[ERROR]" prefix and the trailing newline.
        print_error("Could not connect: %s" % error)
        return None

    sys.stdout.write("Connection established.\n")
    return connection
# def connect_db ends here

def get_publication_id(input_string, eoa_cursor):
    """Query the database for the id of a publication.

    Args:
        input_string: publication name such as ``studies12``.
        eoa_cursor: database cursor used to run the query.

    Returns:
        The ``id`` column of the first matching row. When several rows
        match, an error is reported and the first one is still used.

    Raises:
        SystemExit: with status 1 when the name is not valid or no
            publication matches.
    """

    id_match = PUBLICATION_PATTERN.match(input_string)
    if id_match is None:
        print_error("Name of publication %s is not valid. Exiting." % input_string)
        sys.exit(1)

    eoa_series = id_match.group('series')
    eoa_number = id_match.group('number')

    # The values are passed separately so that the driver does the quoting.
    query_string = """SELECT "Title", "id" FROM publications_publication WHERE "Serie" = %s AND "Number" = %s"""

    eoa_cursor.execute(query_string, (eoa_series, eoa_number))

    rows = eoa_cursor.fetchall()
    if len(rows) > 1:
        print_error("There should be only one database entry that matches the input. Found %s" % len(rows))
    elif len(rows) == 0:
        print_error("It seems like there is no such as publication %s %s. Exiting." % (eoa_series.title(), eoa_number))
        sys.exit(1)
    else:
        print("The title of the publication you selected is '%s'." % rows[0][0])

    return rows[0][1]
# def get_publication_id ends here

def get_publication_info(eoa_pub_id, eoa_cursor):
    """Fetch the complete database record of one publication.

    Args:
        eoa_pub_id: value of the ``id`` column of the publication.
        eoa_cursor: database cursor used to run the query.

    Returns:
        The list of rows returned by the cursor.
    """

    # The id is handed to the driver as a parameter instead of being pasted
    # into the SQL text.
    query_string = """SELECT * FROM publications_publication WHERE "id" = %s """

    eoa_cursor.execute(query_string, (eoa_pub_id,))

    return eoa_cursor.fetchall()
# def get_publication_info ends here

def get_chapters(eoa_pub_id, eoa_cursor):
    """Query the database for the chapters of a publication.

    Args:
        eoa_pub_id: id of the publication the chapters belong to.
        eoa_cursor: database cursor used to run the query.

    Returns:
        A list of rows (order, title, five author fields, PDF file), sorted
        by order. Chapters without an attached PDF file are reported on
        stderr and left out.
    """

    query_string = """SELECT "Order", "Title", "Chapterauthor1",
    "Chapterauthor2", "Chapterauthor3", "Chapterauthor4",
    "Chapterauthor5", "Chapterpdffile" FROM publications_chapter WHERE "Publication_id"
    = %s ORDER BY "Order" ASC"""

    eoa_cursor.execute(query_string, (eoa_pub_id,))

    chapters_with_file = []

    for row in eoa_cursor.fetchall():
        # Column 7 is "Chapterpdffile"; the truth test covers both an empty
        # string and a NULL value.
        if row[7]:
            chapters_with_file.append(row)
        else:
            print_error("There seems to be no file attached to chapter '%s'. Removing it." % row[1])

    return chapters_with_file
# get_chapters ends here

def format_authors(result_list, start_range, end_range):
    """Collect the authors of a database row and join them for display.

    The fields result_list[start_range] up to, but not including,
    result_list[end_range] are inspected; empty fields are skipped.

    Returns a tuple: the authors as one string ("A", "A and B",
    "A, B and C", ...) and the list of the non-empty author fields.
    """
    authors = [result_list[position]
               for position in range(start_range, end_range)
               if len(result_list[position]) > 0]

    if not authors:
        authors_as_string = ""
    elif len(authors) == 1:
        authors_as_string = """%s""" % (authors[0])
    else:
        # everything but the last author is separated by commas
        authors_as_string = ", ".join(authors[:-1]) + " and %s" % (authors[-1])

    return (authors_as_string, authors)
# def format_authors ends here

def format_authors_xml(eoa_publication_info):
    """Build the XML ``<author>`` elements for the authors of a publication.

    Args:
        eoa_publication_info: mapping with the keys ``Publicationauthor1``
            to ``Publicationauthor5``; empty entries are skipped.

    Returns:
        All ``<author>`` elements as one string ("" when there is no
        author). The first word of a name becomes ``<firstname>``, the last
        word ``<lastname>``.
    """
    from xml.sax.saxutils import escape

    author_elements = []

    for authornumber in range(1, 6):
        tmp_author = eoa_publication_info["Publicationauthor%d" % authornumber]
        # split() without an argument ignores leading, trailing and repeated
        # blanks, so no empty name part can be produced.
        nameparts = tmp_author.split() if tmp_author else []
        if not nameparts:
            continue
        # Escaping keeps "&", "<" and ">" in a name from breaking the XML.
        author_elements.append("""
    <author primary_contact="true" user_group_ref="Author">
      <firstname>%s</firstname>
      <lastname>%s</lastname>
      <email>author@example.com</email>
    </author>""" % (escape(nameparts[0]), escape(nameparts[-1])))

    return "".join(author_elements)
# def format_authors_xml ends here

def format_title(title_string, is_book_subtitle=False, unformatted=False):
    """Turn the HTML markup of a title into LaTeX markup, or strip it.

    ``<br/>`` becomes a blank and character entities are resolved. With
    is_book_subtitle a non-empty title is prefixed with " : ". With
    unformatted the tags ``<em>``, ``<sub>`` and ``<b>`` are removed and
    only their content is kept; otherwise they are mapped to ``\\emph``,
    ``\\textsubscript`` and ``\\textbf``.
    """

    # (pattern, LaTeX replacement), listed in the order they are applied
    markup = (
        (re.compile("<em>(.*?)</em>"), r"\\emph{\g<1>}"),
        (re.compile("<sub>(.*?)</sub>"), r"\\textsubscript{\g<1>}"),
        (re.compile("<b>(.*?)</b>"), r"\\textbf{\g<1>}"),
    )

    chapter_title = unescape(title_string.replace("<br/>", " "))

    if is_book_subtitle is True and len(title_string) > 0:
        chapter_title = r" : " + chapter_title

    for pattern, latex_replacement in markup:
        replacement = r"\g<1>" if unformatted is True else latex_replacement
        chapter_title = pattern.sub(replacement, chapter_title)

    return chapter_title
# def format_title ends here

def unescape(text):
    """Replace HTML/XML character references and named entities in text.

    Decimal (``&#65;``) and hexadecimal (``&#x41;``) references as well as
    named entities (``&amp;``) are turned into the character they stand
    for. Anything that cannot be resolved is left untouched.

    With thanks to http://effbot.org/zone/re-sub.htm#unescape-html.
    Modified to work with Python3.
    """
    import html.entities

    def resolve(match):
        """Return the character for one reference, or the reference itself."""

        token = match.group(0)
        if token.startswith("&#"):
            # numeric character reference
            is_hex = token.startswith("&#x")
            try:
                return chr(int(token[3:-1], 16) if is_hex else int(token[2:-1]))
            except ValueError:
                return token
        # named entity
        try:
            return chr(html.entities.name2codepoint[token[1:-1]])
        except KeyError:
            return token

    return re.sub(r"&#?\w+;", resolve, text)
# def unescape ends here

def format_chapter_info(chapter_result):
    """Return the LaTeX title and the author line of one chapter row.

    Field 1 of chapter_result is the title, fields 2 to 6 are the authors.
    The author line is "" when the chapter has no authors.
    """

    formatted_chapter_title = format_title(chapter_result[1].rstrip())

    list_of_authors_string = format_authors(chapter_result, 2, 7)[0]
    authors_line = list_of_authors_string if len(list_of_authors_string) > 0 else ""

    return (formatted_chapter_title, authors_line)
# def format_chapter_info ends here

def which_publisher(series):
    """Pick website and publisher name for a series.

    Only the series "sources" (lowercase) has its own website and
    publisher; every other value gets the Edition Open Access defaults.

    Returns a tuple of base url and publisher string.
    """

    if series != "sources":
        return ("http://edition-open-access.de",
                r"Max Planck Research Library for the History and Development\\of Knowledge")

    return ("http://edition-open-sources.org", "Edition Open Sources")
# def which_publisher ends here

def choose_geometry(eoa_series):
    """Select page geometry and font size for a series.

    Args:
        eoa_series: capitalised series name: "Sources", "Studies",
            "Proceedings" or "Textbooks".

    Returns:
        A tuple of the LaTeX ``\\usepackage{geometry}`` line and the font
        size option.

    Raises:
        ValueError: for any other series name (instead of failing on an
            unassigned local variable).
    """

    small_format = (
        r"\usepackage[paperwidth=148mm,paperheight=210mm,inner=20mm,outer=15mm,top=13mm,bottom=15mm,includehead]{geometry}",
        "fontsize=9pt",
    )
    layouts = {
        "Sources": (
            r"\usepackage[a4paper,inner=30mm,outer=30mm,top=14mm,bottom=20mm,includehead]{geometry}",
            "fontsize=12pt",
        ),
        "Studies": (
            r"\usepackage[paperwidth=170mm,paperheight=240mm,inner=22mm,outer=20mm,top=14mm,bottom=20mm,includehead]{geometry}",
            "fontsize=10pt",
        ),
        "Proceedings": small_format,
        "Textbooks": small_format,
    }

    try:
        return layouts[eoa_series]
    except KeyError:
        raise ValueError("No page geometry defined for series %r" % (eoa_series,)) from None
# def choose_geometry ends here

def format_publication_info(eoa_publication_info):
    """Prepare the publication-level values for the templates.

    eoa_publication_info is read by column name ("Serie", "Number",
    "Title", ...). As a side effect the cover image of the publication is
    downloaded into the current directory.

    Return a dictionary of items.
    """

    info = eoa_publication_info

    base_url, publisher_string = which_publisher(info["Serie"])
    download_cover_image("%s/media/%s" % (PRODUCTION_URL, info["Coverbig"]))

    publication_url = "%s/%s/%s/" % (base_url, info["Serie"], info["Number"])

    # the suffix is separated from the author names by a blank
    pub_suffix = ""
    if len(info["Publicationauthorsuffix"]) > 0:
        pub_suffix = " " + info["Publicationauthorsuffix"]

    licence_string = format_licence(info["Publicationlicense"])
    shoplink_string = format_shoplink(info['Shoplink'])
    book_authors_string = format_authors_xml(info)

    return {
        "bookauthors": book_authors_string,
        "pubsuffix": pub_suffix,
        "booktitle": format_title(info["Title"]),
        "booksubtitle": format_title(info["Subtitle"], is_book_subtitle=True, unformatted=True),
        "publisher": publisher_string,
        "series": info["Serie"].title(),
        "number": info["Number"],
        "isbn": info["Isbn"],
        "pubdate": info["Datepublished"].strftime("%Y-%m-%d"),
        "url": publication_url,
        "licence": licence_string,
        "shoplink": shoplink_string,
    }
# def format_publication_info ends here

def format_licence(publication_licence):
    """Return the LaTeX text that ends the licence sentence.

    For "by-nc-sa" this is the Creative Commons notice with its URL, for
    every other value just the closing full stop.
    """

    by_nc_sa = r" under Creative Commons by-nc-sa 3.0 Germany Licence.\\\url{http://creativecommons.org/licenses/by-nc-sa/3.0/de/}"

    return by_nc_sa if publication_licence == "by-nc-sa" else "."
# def format_licence ends here

def format_shoplink(input_string, raw=False):
    """Parse the shoplink entry, an HTML link to the print-on-demand shop.

    With raw left at False the LaTeX line for a known shop is returned
    ("" for an unknown shop or unparsable input). Otherwise the tuple
    (url, company) is returned; both are "k.A." when nothing was parsed.
    """

    known_shops = {
        "epubli.de": r"Neopubli GmbH, Berlin\par\url{%s}",
        "pro-business.com": r"PRO BUSINESS digital printing Deutschland GmbH, Berlin\par\url{%s}",
    }

    shoplink_match = re.match('<a href="(?P<book_url>.*?)">(?P<company>.*?)</a>', input_string)

    if shoplink_match is None:
        company = shop_url = "k.A."
    else:
        shop_url = shoplink_match.group('book_url')
        company = shoplink_match.group('company')

    if raw != False:
        return (shop_url, company)

    if company in known_shops:
        return known_shops[company] % shop_url
    return ""
# def format_shoplink ends here

def download_cover_image(image_url):
    """Fetch the cover image and store it as Coverimage.jpg.

    The file is written to the current working directory.

    Code from
    https://stackoverflow.com/questions/8286352
    """

    from urllib import request

    target_file = "./Coverimage.jpg"
    request.urlretrieve(image_url, target_file)
# def download_cover_image

def download_chapter_pdf(chapter_url, destination):
    """Download one chapter PDF from the website.

    Args:
        chapter_url: address of the PDF; blanks are percent-encoded before
            the request is made.
        destination: file name, relative to the current directory, under
            which the PDF is stored.

    Returns:
        None. An HTTP error is reported on stderr and not raised.
    """

    import urllib.error
    import urllib.request

    try:
        urllib.request.urlretrieve(chapter_url.replace(" ", "%20"), "./" + destination)
    except urllib.error.HTTPError as error:
        # Report what the server actually answered, not a fixed status.
        print_error("Program received an HTTP Error %s: %s for %s. Maybe there are no chapter files?" % (error.code, error.reason, chapter_url))
# def download_chapter_pdf

def file_base64(filepath):
    """Base64 encode a file.

    Args:
        filepath: path of the file to read.

    Returns:
        The encoded content as bytes, broken into lines that each end
        with a newline.

    https://code.tutsplus.com/tutorials/base64-encoding-and-decoding-using-python--cms-25588
    """
    import base64

    with open(filepath, "rb") as input_file:
        # encodebytes() replaces encodestring(), which no longer exists in
        # current Python versions.
        return base64.encodebytes(input_file.read())
# def file_base64 ends here

def run_latex(command):
    """Run the LaTeX command line and wait until it has finished.

    command is a single string; it is split into its arguments the way a
    shell would do it, but no shell is started.
    """

    from shlex import split
    from subprocess import call

    call(split(command))
# def run_latex ends here

def add_pdf_info(pdf_filename, list_of_authors, title_for_pdf, subject_string):
    """Put the frontmatter page in front of a chapter PDF and set metadata.

    Reads ``<pdf_filename>_orig.pdf`` and ``<pdf_filename>_frontmatter.pdf``
    and writes ``<pdf_filename>.pdf``. The ``_orig`` file is deleted
    afterwards.

    Args:
        pdf_filename: base name of the chapter PDF, without extension.
        list_of_authors: string stored as ``/Author``.
        title_for_pdf: string stored as ``/Title``.
        subject_string: string stored as ``/Subject``.
    """
    # code taken from http://kitchingroup.cheme.cmu.edu/blog/2013/06/13/Reading-and-writing-pdf-metadata/

    from PyPDF2 import PdfFileWriter, PdfFileReader
    from PyPDF2.generic import NameObject, createStringObject

    # ('Proc1Pre', 'Proc1Pre-orig.pdf', '', 'Vorwort')

    original_name = pdf_filename + '_orig.pdf'
    output_name = pdf_filename + '_out.pdf'

    # Both inputs stay open until the result has been written, as in the
    # original flow; the with-statement closes them even if a step fails.
    with open(original_name, 'rb') as original_pdf, \
            open(pdf_filename + '_frontmatter.pdf', 'rb') as frontmatter_pdf:
        pdf_orig = PdfFileReader(original_pdf)
        pdf_frontmatter = PdfFileReader(frontmatter_pdf)

        writer = PdfFileWriter()

        writer.addPage(pdf_frontmatter.getPage(0))

        for page in range(pdf_orig.getNumPages()):
            writer.addPage(pdf_orig.getPage(page))

        info_dict = writer._info.getObject()

        # Carry over the existing metadata first.
        # NOTE(review): documentInfo is assumed to be None for a PDF
        # without an info dictionary - confirm against PyPDF2.
        info = pdf_orig.documentInfo
        if info is not None:
            for key in info:
                info_dict.update({NameObject(key): createStringObject(info[key])})

        info_dict.update({NameObject('/Title'): createStringObject(title_for_pdf)})
        info_dict.update({NameObject('/Subject'): createStringObject(subject_string)})
        info_dict.update({NameObject('/Author'): createStringObject(list_of_authors)})

        # It does not appear possible to alter in place.
        with open(output_name, 'wb') as new_pdf:
            writer.write(new_pdf)

    os.unlink(original_name)
    os.rename(output_name, pdf_filename + '.pdf')
# def add_pdf_info ends here

def create_chapter_frontmatter(eoa_publication):
    """Create a frontmatter page for every chapter PDF of a publication.

    For each chapter that has a PDF file: download the PDF, fill
    ./frontpage_template.tex, typeset it with xelatex, put the resulting
    page in front of the chapter and set the PDF metadata. All files are
    created in ./generated_files/, together with
    ``<publication>_copycommand.sh``, which holds the cp commands for
    backing up and replacing the files below MEDIA_DIR.

    Args:
        eoa_publication: publication name such as ``studies12``.

    Raises:
        SystemExit: when the name is not valid, the database is not
            reachable or the publication has no chapter files.
    """
    from shlex import quote

    # validate input
    check_publication(eoa_publication)

    # setting up database
    eoa_db = connect_db()
    if eoa_db is None:
        sys.exit(1)
    # format_publication_info() reads the columns by name, which a plain
    # cursor does not support; the rows can still be indexed by position.
    eoa_cursor = eoa_db.cursor(cursor_factory=psycopg2.extras.DictCursor)

    eoa_pub_id = get_publication_id(eoa_publication, eoa_cursor)
    eoa_publication_info = get_publication_info(eoa_pub_id, eoa_cursor)[0]

    # the template file
    with open("./frontpage_template.tex", "r") as tmp_template:
        frontmatter_template_string = string.Template(tmp_template.read())

    chapter_files = get_chapters(eoa_pub_id, eoa_cursor)

    if len(chapter_files) == 0:
        print_error("Found no chapter files.")
        sys.exit("Exiting")

    print("Found %d chapter files." % (len(chapter_files)))

    previous_directory = os.getcwd()
    os.chdir("./generated_files/")

    try:
        # The publication data is the same for every chapter, so it is
        # prepared only once. This also downloads Coverimage.jpg into the
        # working directory.
        item_for_template = format_publication_info(eoa_publication_info)
        geometry_string, fontsize_string = choose_geometry(item_for_template["series"])

        # item_for_template["bookauthors"] holds XML elements; the LaTeX
        # page and the PDF metadata need the names as plain text.
        book_authors = format_authors(
            [eoa_publication_info["Publicationauthor%d" % number] for number in range(1, 6)],
            0, 5)[0]

        # The numeric indices used here before do not work on a dictionary.
        # The keys follow the order in which format_publication_info()
        # fills it (2 = booktitle, 3 = booksubtitle, 0 = bookauthors, ...).
        subject_string = "A chapter from %s%s by %s (%s). %s, %s %s. %s" % (
            item_for_template["booktitle"], item_for_template["booksubtitle"],
            book_authors, item_for_template["pubdate"],
            item_for_template["publisher"], item_for_template["series"],
            item_for_template["number"], item_for_template["url"])

        list_of_auxfiles = []

        with open(eoa_publication + "_copycommand.sh", "w") as command_file:
            command_file.write("#!/bin/bash\n")

            for chapter in chapter_files:
                # get the original pdf file
                chapter_url = "%s/media/%s" % (PRODUCTION_URL, chapter[7])

                pdf_filename = chapter_url.split("/")[-1]
                pdffilename_front, pdffilename_ext = os.path.splitext(pdf_filename)
                original_pdf_file = pdffilename_front + "_orig" + pdffilename_ext

                list_of_auxfiles.append(pdffilename_front + '_frontmatter.pdf')

                print("download_chapter_pdf from", chapter_url)
                download_chapter_pdf(chapter_url, original_pdf_file)

                tex_filename = eoa_publication + "ch" + str(chapter[0]) + ".tex"

                # getting data for the template
                formatted_chapter_title, authors_line = format_chapter_info(chapter)

                # chapters without own authors are attributed to the book authors
                if len(authors_line) == 0:
                    authors_line = book_authors

                formatted_chapter_authors = r"\emph{%s:}" % authors_line

                # fill in the blanks
                frontmatter_replacement = frontmatter_template_string.substitute(
                    FONTSIZE=fontsize_string,
                    GEOMETRY_SETTINGS=geometry_string,
                    FORMATTED_CHAPTER_TITLE=formatted_chapter_title,
                    CHAPTER_AUTHORS_LINE=formatted_chapter_authors,
                    FORMAT_AUTHORS=book_authors,
                    FORMATTED_SHOPLINK=item_for_template["shoplink"],
                    LICENCE=item_for_template["licence"],
                    PUB_SUFFIX=item_for_template["pubsuffix"],
                    FORMAT_TITLE=item_for_template["booktitle"],
                    FORMAT_SUBTITLE=item_for_template["booksubtitle"].replace(" : ", "~:~"),
                    PUBLISHER_STRING=item_for_template["publisher"],
                    EOA_SERIES=item_for_template["series"],
                    SERIES_NUMBER=item_for_template["number"],
                    ISBN_CODE=item_for_template["isbn"],
                    PUB_DATE=item_for_template["pubdate"],
                    PUBLICATION_URL=item_for_template["url"])

                with open("./" + tex_filename, "w") as outfile:
                    outfile.write(frontmatter_replacement)

                unformatted_chapter_title = format_title(chapter[1].rstrip(), unformatted=True)

                # generate PDF file
                print("Typsetting the frontmatter to chapter '%s'" % unformatted_chapter_title)

                latex_command = "xelatex --interaction=batchmode -jobname='%s_frontmatter' %s" % (pdffilename_front, tex_filename)
                run_latex(latex_command)

                add_pdf_info(pdffilename_front, authors_line, unformatted_chapter_title, subject_string)

                # Paths are quoted so that blanks in file names do not
                # break the generated shell script.
                media_path = MEDIA_DIR + chapter[7]
                command_file.write("cp %s %s\n" % (quote(media_path), quote(media_path + ".bak")))
                command_file.write("cp %s %s\n" % (quote(pdf_filename), quote(media_path)))

        print("Removing aux files.")

        for auxfile in os.listdir("."):
            # anchored, so only real .aux/.log/.tex files are removed
            if re.search(r'\.(aux|log|tex)$', auxfile):
                list_of_auxfiles.append(auxfile)

        for file_for_deletion in list_of_auxfiles:
            os.unlink(file_for_deletion)

        print("Removing other files.")
        os.unlink("Coverimage.jpg")
    finally:
        # back to normal, also when one of the steps above failed
        os.chdir(previous_directory)
# def create_chapter_frontmatter ends here

def create_omp_native_xml(eoa_publication):
    """Use the database infos for creating input for OMP.

    Fills ./data/native_template.xml with the data of the publication and
    writes the result to ./generated_files/<publication>.xml.

    Args:
        eoa_publication: publication name such as ``studies12``.

    Raises:
        SystemExit: when the name is not valid, the database is not
            reachable or the publication is unknown.
    """
    # validate input
    check_publication(eoa_publication)

    # setting up database
    eoa_db = connect_db()
    if eoa_db is None:
        sys.exit(1)
    eoa_cursor = eoa_db.cursor(cursor_factory=psycopg2.extras.DictCursor)

    eoa_pub_id = get_publication_id(eoa_publication, eoa_cursor)
    eoa_publication_info = get_publication_info(eoa_pub_id, eoa_cursor)[0]

    # the template file
    with open("./data/native_template.xml", "r") as tmp_template:
        frontmatter_template_string = string.Template(tmp_template.read())

    xml_filename = eoa_publication + ".xml"

    item_for_template = format_publication_info(eoa_publication_info)

    supplierurl, suppliercomp = format_shoplink(eoa_publication_info["Shoplink"], raw=True)

    if len(eoa_publication_info["Subtitle"]) > 0:
        onix_subtitle = """<onix:Subtitle>%s</onix:Subtitle>""" % eoa_publication_info["Subtitle"]
        omp_subtitle = """<subtitle locale="en_US">%s</subtitle>""" % eoa_publication_info["Subtitle"]
    else:
        omp_subtitle = onix_subtitle = ""

    # The encoded PDF arrives as bytes; substituting bytes would insert
    # their repr ("b'...'") into the XML instead of the data itself.
    encoded_pdf = file_base64("./data/dummy.pdf")
    if isinstance(encoded_pdf, bytes):
        encoded_pdf = encoded_pdf.decode("ascii")

    # fill in the blanks
    frontmatter_replacement = frontmatter_template_string.substitute(
        INTERNAL_ID=item_for_template["number"],
        FORMAT_AUTHORS=item_for_template["bookauthors"],
        FORMAT_TITLE=item_for_template["booktitle"],
        OMP_SUBTITLE=omp_subtitle,
        ONIX_SUBTITLE=onix_subtitle,
        PUBLISHER_STRING=item_for_template["publisher"],
        EOA_SERIES=item_for_template["series"],
        SERIES_NUMBER=item_for_template["number"],
        ISBN_CODE=item_for_template["isbn"],
        PUB_DATE=item_for_template["pubdate"],
        PUBLICATION_URL=item_for_template["url"],
        ABSTRACT=eoa_publication_info["Descriptionlong"].replace("<br/>", ""),
        BASE64_PDF=encoded_pdf,
        PRICE=eoa_publication_info["Price"],
        TODAY=datetime.today().strftime("%Y-%m-%d"),
        SUPPLIER_COMP=suppliercomp,
        SUPPLIER_URL=supplierurl,
        PAGES=eoa_publication_info["Pages"],
        SUBMISSION_NAME="%s_%d_submission" % (item_for_template["series"], item_for_template["number"]),
        PUBDATE_00=eoa_publication_info["Datepublished"].strftime("%Y%m%d")
    )

    # The output file is opened only after the template has been filled,
    # so a failure above does not leave an empty file behind.
    with open("./generated_files/" + xml_filename, "w") as outfile:
        outfile.write(frontmatter_replacement)
# def create_omp_native_xml ends here

# Command line entry point: exactly one argument, the publication name.
if __name__ == '__main__':
    if len(sys.argv) == 1:
        print_error("You must specify a publication!")
        # usage errors end with a non-zero status
        sys.exit(1)
    elif len(sys.argv) > 2:
        print_error("You can work with only one publication at a time!")
        sys.exit(1)
    create_omp_native_xml(sys.argv[-1])
    # Alternative mode, currently switched off:
    # create_chapter_frontmatter(sys.argv[-1])
# finis
	#!/usr/bin/python3
	# -- coding: utf-8; mode: python --
	# Module metadata of the second, tab-indented copy of this script.
	__version__ = "1.0"
	__date__ = "20170208"
	__author__ = "kthoden@mpiwg-berlin.mpg.de"
	__doc__ = """Generates LaTeX code for informative frontmatters. These
	will be attached to the chapter PDFs of EOA publications we offer for
	download"""

	import string
	import sys
	import re
	import os
	from datetime import datetime
	import psycopg2
	import psycopg2.extras

	# using https://wiki.postgresql.org/wiki/Psycopg2_Tutorial

	# A publication name is one of the four lowercase series keywords directly
	# followed by a one- or two-digit number (e.g. "studies12"). The series are
	# alternatives, so they are separated by a plain "|"; an escaped "\|" would
	# only match a literal pipe character.
	PUBLICATION_PATTERN = re.compile("(?P<series>studies|sources|proceedings|textbooks)(?P<number>[1-9]?[0-9]{1})$")
	# Directory prefix written into the generated copy-command script.
	MEDIA_DIR = "/home/editionopenaccess/eoa/website/website/media/"
	# Host from which cover images and chapter PDFs are fetched.
	PRODUCTION_URL = "http://eoa-production.rz-berlin.mpg.de"

	def print_error(message):
	    """Print error message to stderr.

	    The text is prefixed with ``[ERROR]`` and terminated by a newline.
	    """

	    message_string = "[ERROR] %s\n" % message
	    sys.stderr.write(message_string)
	# def print_error ends here

	def check_publication(input_string):
	    """Validate the name of a publication.

	    Returns True when input_string matches PUBLICATION_PATTERN. Otherwise
	    an error is reported and SystemExit is raised with status 1.
	    """

	    if PUBLICATION_PATTERN.match(input_string) is None:
	        print_error("Name of publication %s is not valid. Exiting." % input_string)
	        # non-zero status, so that a calling shell notices the failure
	        sys.exit(1)

	    sys.stdout.write("Name of the publication %s is valid.\n" % input_string)
	    return True
	# def check_publication ends here

	def connect_db():
	    """Connect to the database.

	    Returns an open psycopg2 connection, or None when the connection
	    could not be established (the reason is reported on stderr).
	    """

	    try:
	        connection = psycopg2.connect("dbname='eoa_2017' user='kthoden' host='localhost'")# password=''")
	    except psycopg2.Error as error:
	        # print_error() already adds the "[ERROR]" prefix and the newline
	        print_error("Could not connect: %s" % error)
	        return None

	    sys.stdout.write("Connection established.\n")
	    return connection
	# def connect_db ends here

	def get_publication_id(input_string, eoa_cursor):
	    """Query the database for the publication id.

	    input_string is a publication name such as ``studies12``, eoa_cursor
	    the database cursor. Returns the ``id`` column of the first matching
	    row. Raises SystemExit (status 1) when the name is not valid or no
	    publication matches.
	    """

	    id_match = PUBLICATION_PATTERN.match(input_string)
	    if id_match is None:
	        print_error("Name of publication %s is not valid. Exiting." % input_string)
	        sys.exit(1)

	    eoa_series = id_match.group('series')
	    eoa_number = id_match.group('number')

	    # the values are passed separately so that the driver does the quoting
	    query_string = """SELECT "Title", "id" FROM publications_publication WHERE "Serie" = %s AND "Number" = %s"""

	    eoa_cursor.execute(query_string, (eoa_series, eoa_number))

	    rows = eoa_cursor.fetchall()
	    if len(rows) > 1:
	        print_error("There should be only one database entry that matches the input. Found %s" % len(rows))
	    elif len(rows) == 0:
	        print_error("It seems like there is no such as publication %s %s. Exiting." % (eoa_series.title(), eoa_number))
	        sys.exit(1)
	    else:
	        print("The title of the publication you selected is '%s'." % rows[0][0])

	    return rows[0][1]
	# def get_publication_id ends here

	def get_publication_info(eoa_pub_id, eoa_cursor):
	    """Fetch the complete database record of one publication.

	    eoa_pub_id is the value of the ``id`` column, eoa_cursor the database
	    cursor. Returns the list of rows delivered by the cursor.
	    """

	    # the id is a query parameter, it is not pasted into the SQL text
	    query_string = """SELECT * FROM publications_publication WHERE "id" = %s """

	    eoa_cursor.execute(query_string, (eoa_pub_id,))

	    return eoa_cursor.fetchall()
	# def get_publication_info ends here

	def get_chapters(eoa_pub_id, eoa_cursor):
	    """Query the database for the chapters of a publication.

	    Returns a list of rows (order, title, five author fields, PDF file)
	    sorted by order. Chapters without an attached PDF file are reported
	    on stderr and left out.
	    """

	    query_string = (
	        'SELECT "Order", "Title", "Chapterauthor1", '
	        '"Chapterauthor2", "Chapterauthor3", "Chapterauthor4", '
	        '"Chapterauthor5", "Chapterpdffile" FROM publications_chapter '
	        'WHERE "Publication_id" = %s ORDER BY "Order" ASC'
	    )

	    eoa_cursor.execute(query_string, (eoa_pub_id,))

	    chapters_with_file = []

	    for row in eoa_cursor.fetchall():
	        # column 7 is "Chapterpdffile"; it may be empty or NULL
	        if row[7]:
	            chapters_with_file.append(row)
	        else:
	            print_error("There seems to be no file attached to chapter '%s'. Removing it." % row[1])

	    return chapters_with_file
	# get_chapters ends here

	def format_authors(result_list, start_range, end_range):
	    """Format the list of authors.

	    The fields result_list[start_range] up to, but not including,
	    result_list[end_range] are inspected; empty fields are skipped.
	    Returns both a formatted string ("A", "A and B", "A, B and C", ...)
	    and the pure list of authors.
	    """
	    # now dealing with the authors
	    authors = []

	    for author in range(start_range, end_range):
	        if len(result_list[author]) > 0:
	            authors.append(result_list[author])

	    if len(authors) == 0:
	        authors_as_string = ""
	    elif len(authors) == 1:
	        authors_as_string = """%s""" % (authors[0])
	    else:
	        # everything but the last author is separated by commas
	        authors_as_string = ", ".join(authors[:-1]) + " and %s" % (authors[-1])

	    return (authors_as_string, authors)
	# def format_authors ends here

	def format_authors_xml(eoa_publication_info):
	    """Build the XML ``<author>`` elements for the authors of a publication.

	    eoa_publication_info is a mapping with the keys ``Publicationauthor1``
	    to ``Publicationauthor5``; empty entries are skipped. Returns all
	    elements as one string. The first word of a name becomes
	    ``<firstname>``, the last word ``<lastname>``.
	    """
	    from xml.sax.saxutils import escape

	    author_template = (
	        '\n    <author primary_contact="true" user_group_ref="Author">'
	        '\n      <firstname>%s</firstname>'
	        '\n      <lastname>%s</lastname>'
	        '\n      <email>author@example.com</email>'
	        '\n    </author>'
	    )

	    author_elements = []

	    for authornumber in range(1, 6):
	        tmp_author = eoa_publication_info["Publicationauthor%d" % authornumber]
	        # split() without an argument ignores surplus blanks
	        nameparts = tmp_author.split() if tmp_author else []
	        if not nameparts:
	            continue
	        # escaping keeps "&", "<" and ">" in a name from breaking the XML
	        author_elements.append(author_template % (escape(nameparts[0]), escape(nameparts[-1])))

	    return "".join(author_elements)
	# def format_authors_xml ends here

	def format_title(title_string, is_book_subtitle=False, unformatted=False):
	    """Convert html tags to their LaTeX counterpart.

	    ``<br/>`` becomes a blank and character entities are resolved. With
	    is_book_subtitle a non-empty title is prefixed with " : ". With
	    unformatted the tags ``<em>``, ``<sub>`` and ``<b>`` are dropped and
	    only their content is kept.
	    """

	    EMPHASIS = re.compile("<em>(.*?)</em>")
	    LOWER = re.compile("<sub>(.*?)</sub>")
	    BOLD = re.compile("<b>(.*?)</b>")

	    chapter_title = title_string.replace("<br/>", " ")
	    chapter_title = unescape(chapter_title)

	    if is_book_subtitle is True and len(title_string) > 0:
	        chapter_title = r" : " + chapter_title

	    if unformatted is True:
	        chapter_title = re.sub(EMPHASIS, r"\g<1>", chapter_title)
	        chapter_title = re.sub(LOWER, r"\g<1>", chapter_title)
	        chapter_title = re.sub(BOLD, r"\g<1>", chapter_title)
	    else:
	        chapter_title = re.sub(EMPHASIS, r"\\emph{\g<1>}", chapter_title)
	        chapter_title = re.sub(LOWER, r"\\textsubscript{\g<1>}", chapter_title)
	        chapter_title = re.sub(BOLD, r"\\textbf{\g<1>}", chapter_title)

	    return chapter_title
	# def format_title ends here

	def unescape(text):
	    """Remove HTML or XML character references and entities from a text
	    string. Return a Unicode string.

	    References that cannot be resolved are left as they are.

	    With thanks to http://effbot.org/zone/re-sub.htm#unescape-html.
	    Modified to work with Python3.
	    """
	    import html.entities

	    def fixup(character):
	        """Fix one character."""

	        text = character.group(0)
	        if text[:2] == "&#":
	            # character reference
	            try:
	                if text[:3] == "&#x":
	                    return chr(int(text[3:-1], 16))
	                else:
	                    return chr(int(text[2:-1]))
	            except ValueError:
	                pass
	        else:
	            # named entity
	            try:
	                text = chr(html.entities.name2codepoint[text[1:-1]])
	            except KeyError:
	                pass
	        return text # leave as is
	    return re.sub(r"&#?\w+;", fixup, text)
	# def unescape ends here

	def format_chapter_info(chapter_result):
	    """Return a tuple with infos for the chapter bit.

	    Field 1 of chapter_result is the title, fields 2 to 6 are the authors.
	    The result holds the LaTeX title and the author line, which is ""
	    when the chapter has no authors.
	    """

	    chapter_title = chapter_result[1].rstrip()

	    formatted_chapter_title = format_title(chapter_title)

	    list_of_authors_string = format_authors(chapter_result, 2, 7)[0]

	    authors_line = ""

	    if len(list_of_authors_string) > 0:
	        authors_line = list_of_authors_string

	    return (formatted_chapter_title, authors_line)
	# def format_chapter_info ends here

	def which_publisher(series):
	    """Make layout dependant on series.

	    Only the series "sources" (lowercase) has its own website and
	    publisher; every other value gets the Edition Open Access defaults.

	    Return base url as well as publisher string.
	    """

	    if series == "sources":
	        base_url = "http://edition-open-sources.org"
	        publisher_name = "Edition Open Sources"
	    else:
	        base_url = "http://edition-open-access.de"
	        publisher_name = r"Max Planck Research Library for the History and Development\\of Knowledge"

	    return (base_url, publisher_name)
	# def which_publisher ends here

	def choose_geometry(eoa_series):
	    """Adjust page geometry and paper size to series.

	    eoa_series is the capitalised series name ("Sources", "Studies",
	    "Proceedings" or "Textbooks"). Returns the LaTeX geometry line and the
	    font size option. Raises ValueError for any other name.
	    """

	    small_format = (
	        r"\usepackage[paperwidth=148mm,paperheight=210mm,inner=20mm,outer=15mm,top=13mm,bottom=15mm,includehead]{geometry}",
	        "fontsize=9pt",
	    )
	    layouts = {
	        "Sources": (
	            r"\usepackage[a4paper,inner=30mm,outer=30mm,top=14mm,bottom=20mm,includehead]{geometry}",
	            "fontsize=12pt",
	        ),
	        "Studies": (
	            r"\usepackage[paperwidth=170mm,paperheight=240mm,inner=22mm,outer=20mm,top=14mm,bottom=20mm,includehead]{geometry}",
	            "fontsize=10pt",
	        ),
	        "Proceedings": small_format,
	        "Textbooks": small_format,
	    }

	    try:
	        return layouts[eoa_series]
	    except KeyError:
	        raise ValueError("No page geometry defined for series %r" % (eoa_series,)) from None
	# def choose_geometry ends here

	def format_publication_info(eoa_publication_info):
	    """Provide strings for the publication info.

	    eoa_publication_info is read by column name ("Serie", "Number",
	    "Title", ...). As a side effect the cover image is downloaded into
	    the current directory.

	    Return a dictionary of items.
	    """

	    pub_suffix = ""

	    base_url, publisher_string = which_publisher(eoa_publication_info["Serie"])
	    cover_url = "%s/media/%s" % (PRODUCTION_URL, eoa_publication_info["Coverbig"])
	    download_cover_image(cover_url)

	    publication_url = "%s/%s/%s/" % (base_url, eoa_publication_info["Serie"], eoa_publication_info["Number"])

	    # the suffix is separated from the author names by a blank
	    if len(eoa_publication_info["Publicationauthorsuffix"]) > 0:
	        pub_suffix = " " + eoa_publication_info["Publicationauthorsuffix"]

	    licence_string = format_licence(eoa_publication_info["Publicationlicense"])
	    shoplink_string = format_shoplink(eoa_publication_info['Shoplink'])
	    book_authors_string = format_authors_xml(eoa_publication_info)

	    items_to_return = {
	        "bookauthors": book_authors_string,
	        "pubsuffix": pub_suffix,
	        "booktitle": format_title(eoa_publication_info["Title"]),
	        "booksubtitle": format_title(eoa_publication_info["Subtitle"], is_book_subtitle=True, unformatted=True),
	        "publisher": publisher_string,
	        "series": eoa_publication_info["Serie"].title(),
	        "number": eoa_publication_info["Number"],
	        "isbn": eoa_publication_info["Isbn"],
	        "pubdate": eoa_publication_info["Datepublished"].strftime("%Y-%m-%d"),
	        "url": publication_url,
	        "licence": licence_string,
	        "shoplink": shoplink_string,
	    }

	    return items_to_return
	# def format_publication_info ends here

	def format_licence(publication_licence):
	    """Return the licence clause for the given licence code.

	    For "by-nc-sa" this is a LaTeX fragment naming the Creative Commons
	    licence and its URL; for every other value it is just a full stop.
	    """

	    if publication_licence != "by-nc-sa":
	        return "."

	    return r" under Creative Commons by-nc-sa 3.0 Germany Licence.\\\url{http://creativecommons.org/licenses/by-nc-sa/3.0/de/}"
	# def format_licence ends here

	def format_shoplink(input_string, raw=False):
	    """Parse the shoplink entry.

	    input_string is expected to start with a single HTML anchor of the
	    form <a href="URL">COMPANY</a>. A value of None or a string that
	    does not have this form is treated as "no shop link".

	    With raw=False (the default) return a LaTeX line naming the
	    printing company followed by the shop URL, or an empty string if
	    the company is not one of the known ones. With raw=True return the
	    tuple (shop_url, company); both are "k.A." if nothing was parsed.
	    """

	    # ".*?" (any length, non-greedy) is needed for both groups: ".?"
	    # matches at most one character and therefore never a real URL.
	    SHOPLINK_PATTERN = re.compile(r'<a href="(?P<book_url>.*?)">(?P<company>.*?)</a>')

	    company = shop_url = "k.A."

	    shoplink_match = SHOPLINK_PATTERN.match(input_string or "")
	    if shoplink_match is not None:
	        shop_url = shoplink_match.group('book_url')
	        company = shoplink_match.group('company')

	    if raw:
	        return (shop_url, company)

	    if company == "epubli.de":
	        return r"Neopubli GmbH, Berlin\par\url{%s}" % shop_url
	    if company == "pro-business.com":
	        return r"PRO BUSINESS digital printing Deutschland GmbH, Berlin\par\url{%s}" % shop_url
	    return ""
	# def format_shoplink ends here

	def download_cover_image(image_url):
	    """Fetch the cover image from image_url.

	    The image is stored as Coverimage.jpg in the current working
	    directory. Nothing is returned.

	    Code from
	    https://stackoverflow.com/questions/8286352
	    """

	    from urllib import request as url_request

	    cover_target = "./Coverimage.jpg"
	    url_request.urlretrieve(image_url, cover_target)
	# def download_cover_image

	def download_chapter_pdf(chapter_url, destination):
	    """Download an individual chapter PDF from the website.

	    chapter_url is the address of the PDF; blanks in it are replaced
	    by %20 before the request is made. destination is the file name,
	    relative to the current working directory, the PDF is saved under.

	    Nothing is returned. An HTTP error is reported on stderr with the
	    status the server actually sent and is not re-raised, so the
	    destination file may be missing afterwards.
	    """

	    # urllib.error is imported explicitly instead of relying on
	    # urllib.request pulling it in.
	    import urllib.error
	    import urllib.request

	    try:
	        urllib.request.urlretrieve(chapter_url.replace(" ", "%20"), "./" + destination)
	    except urllib.error.HTTPError as http_error:
	        print_error("Program received an HTTP Error %s: %s. Maybe there are no chapter files?" % (http_error.code, http_error.reason))
	# def download_chapter_pdf

	def file_base64(filepath):
	    """Base64 encode a file

	    The file at filepath is read in binary mode. The return value is a
	    bytes object holding the Base64 text, broken into lines of at most
	    76 characters, each ending in a newline.

	    https://code.tutsplus.com/tutorials/base64-encoding-and-decoding-using-python--cms-25588
	    """
	    import base64

	    with open(filepath, "rb") as binary_file:
	        file_content = binary_file.read()

	    # base64.encodestring() was removed in Python 3.9; encodebytes()
	    # is its replacement and produces the same output.
	    return base64.encodebytes(file_content)
	# def file_base64 ends here

	def run_latex(command):
	    """Run the LaTeX compiler.

	    command is a complete command line in one string. It is split
	    into arguments with shell-like quoting rules and executed without
	    a shell; the function waits for the program to finish. The exit
	    status is not returned.
	    """

	    from shlex import split as split_command_line
	    from subprocess import call

	    call(split_command_line(command))
	# def run_latex ends here

	def add_pdf_info(pdf_filename, list_of_authors, title_for_pdf, subject_string):
	    """Put the frontmatter page in front of a chapter PDF and add metadata.

	    pdf_filename is the file name without extension. The function
	    reads <pdf_filename>_orig.pdf and <pdf_filename>_frontmatter.pdf,
	    writes the first frontmatter page followed by all pages of the
	    original, copies the original's document information and sets
	    /Title, /Subject and /Author from title_for_pdf, subject_string
	    and list_of_authors.

	    The result is written to <pdf_filename>_out.pdf; afterwards the
	    _orig file is deleted and the result renamed to <pdf_filename>.pdf.
	    """
	    # code taken from http://kitchingroup.cheme.cmu.edu/blog/2013/06/13/Reading-and-writing-pdf-metadata/

	    from PyPDF2 import PdfFileWriter, PdfFileReader
	    from PyPDF2.generic import NameObject, createStringObject

	    original_path = pdf_filename + '_orig.pdf'
	    frontmatter_path = pdf_filename + '_frontmatter.pdf'
	    output_path = pdf_filename + '_out.pdf'

	    # Both inputs stay open until the writer has finished, because the
	    # pages are read from them while writing. The with statement closes
	    # all files, including the frontmatter file, even if an error occurs.
	    with open(original_path, 'rb') as original_pdf, open(frontmatter_path, 'rb') as frontmatter_pdf:
	        pdf_orig = PdfFileReader(original_pdf)
	        pdf_frontmatter = PdfFileReader(frontmatter_pdf)

	        writer = PdfFileWriter()

	        writer.addPage(pdf_frontmatter.getPage(0))

	        for page in range(pdf_orig.getNumPages()):
	            writer.addPage(pdf_orig.getPage(page))

	        info_dict = writer._info.getObject()

	        # a PDF without an information dictionary yields None here
	        info = pdf_orig.documentInfo or {}
	        for key in info:
	            info_dict.update({NameObject(key): createStringObject(info[key])})

	        info_dict.update({NameObject('/Title'): createStringObject(title_for_pdf)})
	        info_dict.update({NameObject('/Subject'): createStringObject(subject_string)})
	        info_dict.update({NameObject('/Author'): createStringObject(list_of_authors)})

	        # It does not appear possible to alter in place.
	        with open(output_path, 'wb') as new_pdf:
	            writer.write(new_pdf)

	    os.unlink(original_path)
	    os.rename(output_path, pdf_filename + '.pdf')
	# def add_pdf_info ends here

	def create_chapter_frontmatter(eoa_publication):
	    """Main function: typeset a frontmatter page for every chapter PDF.

	    eoa_publication is the publication name, series plus number, as
	    accepted by check_publication(). The publication and its chapters
	    are looked up in the database. For each chapter the PDF is
	    downloaded, the template ./frontpage_template.tex is filled in and
	    typeset with xelatex, and the resulting page is put in front of
	    the chapter PDF by add_pdf_info().

	    All files are produced in ./generated_files/, together with the
	    shell script <eoa_publication>_copycommand.sh containing the cp
	    commands for installing the new PDFs below MEDIA_DIR. Auxiliary
	    files and the cover image are removed at the end.

	    The program exits if the database connection fails or if no
	    chapter files are found.
	    """

	    import shlex

	    # validate input
	    check_publication(eoa_publication)

	    # setting up database
	    eoa_db = connect_db()
	    if eoa_db is None:
	        # connect_db() has already reported the failure
	        sys.exit("Exiting")

	    try:
	        # Rows of a DictCursor can be read by position (chapter[7]) and
	        # by column name, which format_publication_info() relies on.
	        eoa_cursor = eoa_db.cursor(cursor_factory=psycopg2.extras.DictCursor)

	        eoa_pub_id = get_publication_id(eoa_publication, eoa_cursor)
	        eoa_publication_info = get_publication_info(eoa_pub_id, eoa_cursor)
	        chapter_files = get_chapters(eoa_pub_id, eoa_cursor)
	    finally:
	        eoa_db.close()

	    # the template file
	    with open("./frontpage_template.tex", "r") as tmp_template:
	        frontmatter_template_string = string.Template(tmp_template.read())

	    if not chapter_files:
	        print_error("Found no chapter files.")
	        sys.exit("Exiting")

	    print("Found %d chapter files." % (len(chapter_files)))

	    previous_directory = os.getcwd()
	    os.chdir("./generated_files/")

	    try:
	        # Book-level data is the same for every chapter, so it is
	        # gathered once. This also downloads the cover image into the
	        # current directory.
	        item_for_template = format_publication_info(eoa_publication_info[0])

	        geometry_string, fontsize_string = choose_geometry(item_for_template["series"])

	        # item_for_template is a dictionary and has to be read by key.
	        subject_string = "A chapter from %s%s by %s (%s). %s, %s %s. %s" % (
	            item_for_template["booktitle"],
	            item_for_template["booksubtitle"],
	            item_for_template["bookauthors"],
	            item_for_template["pubdate"],
	            item_for_template["publisher"],
	            item_for_template["series"],
	            item_for_template["number"],
	            item_for_template["url"])

	        list_of_auxfiles = []

	        with open(eoa_publication + "_copycommand.sh", "w") as command_file:
	            command_file.write("#!/bin/bash\n")

	            for chapter in chapter_files:
	                # get the original pdf file
	                chapter_url = "%s/media/%s" % (PRODUCTION_URL, chapter[7])

	                pdf_filename = chapter_url.split("/")[-1]
	                pdffilename_front, pdffilename_ext = os.path.splitext(pdf_filename)
	                original_pdf_file = pdffilename_front + "_orig" + pdffilename_ext

	                list_of_auxfiles.append(pdffilename_front + '_frontmatter.pdf')

	                print("download_chapter_pdf from", chapter_url)
	                download_chapter_pdf(chapter_url, original_pdf_file)

	                # getting data for the template
	                formatted_chapter_title, authors_line = format_chapter_info(chapter)

	                # chapters without authors of their own get the book authors
	                if len(authors_line) == 0:
	                    authors_line = item_for_template["bookauthors"]

	                formatted_chapter_authors = r"\emph{%s:}" % authors_line

	                # fill in the blanks
	                frontmatter_replacement = frontmatter_template_string.substitute(
	                    FONTSIZE=fontsize_string,
	                    GEOMETRY_SETTINGS=geometry_string,
	                    FORMATTED_CHAPTER_TITLE=formatted_chapter_title,
	                    CHAPTER_AUTHORS_LINE=formatted_chapter_authors,
	                    FORMAT_AUTHORS=item_for_template["bookauthors"],
	                    FORMATTED_SHOPLINK=item_for_template["shoplink"],
	                    LICENCE=item_for_template["licence"],
	                    PUB_SUFFIX=item_for_template["pubsuffix"],
	                    FORMAT_TITLE=item_for_template["booktitle"],
	                    FORMAT_SUBTITLE=item_for_template["booksubtitle"].replace(" : ", "~:~"),
	                    PUBLISHER_STRING=item_for_template["publisher"],
	                    EOA_SERIES=item_for_template["series"],
	                    SERIES_NUMBER=item_for_template["number"],
	                    ISBN_CODE=item_for_template["isbn"],
	                    PUB_DATE=item_for_template["pubdate"],
	                    PUBLICATION_URL=item_for_template["url"])

	                tex_filename = eoa_publication + "ch" + str(chapter[0]) + ".tex"

	                with open("./" + tex_filename, "w") as outfile:
	                    outfile.write(frontmatter_replacement)

	                unformatted_chapter_title = format_title(chapter[1].rstrip(), unformatted=True)

	                # generate PDF file
	                print("Typsetting the frontmatter to chapter '%s'" % unformatted_chapter_title)

	                latex_command = "xelatex --interaction=batchmode -jobname='%s_frontmatter' %s" % (pdffilename_front, tex_filename)
	                run_latex(latex_command)

	                add_pdf_info(pdffilename_front, authors_line, unformatted_chapter_title, subject_string)

	                # Paths are quoted for the shell so that names with
	                # blanks survive; plain names are written unchanged.
	                media_target = "%s%s" % (MEDIA_DIR, chapter[7])
	                command_file.write("cp %s %s\n" % (shlex.quote(media_target), shlex.quote(media_target + ".bak")))
	                command_file.write("cp %s %s\n" % (shlex.quote(pdf_filename), shlex.quote(media_target)))

	        print("Removing aux files.")

	        # "|" must not be escaped, otherwise it only matches a literal
	        # pipe character; "$" restricts the match to the extension.
	        auxfile_pattern = re.compile(r'\.(aux|log|tex)$')

	        for auxfile in os.listdir("."):
	            if auxfile_pattern.search(auxfile):
	                list_of_auxfiles.append(auxfile)

	        # a set, so that no file is deleted twice
	        for file_for_deletion in sorted(set(list_of_auxfiles)):
	            os.unlink(file_for_deletion)

	        print("Removing other files.")
	        os.unlink("Coverimage.jpg")
	    finally:
	        # back to normal, also after an error
	        os.chdir(previous_directory)
	# def create_chapter_frontmatter ends here

	def create_omp_native_xml(eoa_publication):
	    """Use the database infos for creating input for OMP

	    eoa_publication is the publication name, series plus number, as
	    accepted by check_publication(). The publication row is read from
	    the database, the template ./data/native_template.xml is filled in
	    and the result is written to ./generated_files/<eoa_publication>.xml.

	    Collecting the data calls format_publication_info(), which also
	    downloads the cover image into the current directory.

	    The program exits if the database connection fails.
	    """
	    # validate input
	    check_publication(eoa_publication)

	    # setting up database
	    eoa_db = connect_db()
	    if eoa_db is None:
	        # connect_db() has already reported the failure
	        sys.exit("Exiting")

	    try:
	        eoa_cursor = eoa_db.cursor(cursor_factory=psycopg2.extras.DictCursor)

	        eoa_pub_id = get_publication_id(eoa_publication, eoa_cursor)
	        eoa_publication_info = get_publication_info(eoa_pub_id, eoa_cursor)[0]
	    finally:
	        eoa_db.close()

	    # the template file
	    with open("./data/native_template.xml", "r") as tmp_template:
	        frontmatter_template_string = string.Template(tmp_template.read())

	    xml_filename = eoa_publication + ".xml"

	    item_for_template = format_publication_info(eoa_publication_info)

	    supplierurl, suppliercomp = format_shoplink(eoa_publication_info["Shoplink"], raw=True)

	    # an empty or missing subtitle produces no subtitle elements at all
	    if eoa_publication_info["Subtitle"]:
	        onix_subtitle = """<onix:Subtitle>%s</onix:Subtitle>""" % eoa_publication_info["Subtitle"]
	        omp_subtitle = """<subtitle locale="en_US">%s</subtitle>""" % eoa_publication_info["Subtitle"]
	    else:
	        omp_subtitle = onix_subtitle = ""

	    # file_base64() returns bytes. Substituting bytes into the text
	    # template would insert their repr, b'...', into the XML.
	    encoded_pdf = file_base64("./data/dummy.pdf")
	    if isinstance(encoded_pdf, bytes):
	        encoded_pdf = encoded_pdf.decode("ascii")

	    # fill in the blanks
	    frontmatter_replacement = frontmatter_template_string.substitute(
	        INTERNAL_ID=item_for_template["number"],
	        FORMAT_AUTHORS=item_for_template["bookauthors"],
	        FORMAT_TITLE=item_for_template["booktitle"],
	        OMP_SUBTITLE=omp_subtitle,
	        ONIX_SUBTITLE=onix_subtitle,
	        PUBLISHER_STRING=item_for_template["publisher"],
	        EOA_SERIES=item_for_template["series"],
	        SERIES_NUMBER=item_for_template["number"],
	        ISBN_CODE=item_for_template["isbn"],
	        PUB_DATE=item_for_template["pubdate"],
	        PUBLICATION_URL=item_for_template["url"],
	        ABSTRACT=eoa_publication_info["Descriptionlong"].replace("<br/>", ""),
	        BASE64_PDF=encoded_pdf,
	        PRICE=eoa_publication_info["Price"],
	        TODAY=datetime.today().strftime("%Y-%m-%d"),
	        SUPPLIER_COMP=suppliercomp,
	        SUPPLIER_URL=supplierurl,
	        PAGES=eoa_publication_info["Pages"],
	        SUBMISSION_NAME="%s_%d_submission" % (item_for_template["series"], item_for_template["number"]),
	        PUBDATE_00=eoa_publication_info["Datepublished"].strftime("%Y%m%d")
	    )

	    # The output file is opened only now, so a failing substitution
	    # does not leave an empty file behind.
	    with open("./generated_files/" + xml_filename, "w") as outfile:
	        outfile.write(frontmatter_replacement)
	# def create_omp_native_xml ends here

	if __name__ == '__main__':
	    # Exactly one command line argument is expected: the publication.
	    number_of_arguments = len(sys.argv)

	    if number_of_arguments == 1:
	        print_error("You must specify a publication!")
	        sys.exit()

	    if number_of_arguments > 2:
	        print_error("You can work with only one publication at a time!")
	        sys.exit()

	    create_omp_native_xml(sys.argv[-1])
	    # alternative entry point:
	    # create_chapter_frontmatter(sys.argv[-1])
	# finis