build_frontmatter.py

#!/usr/bin/python3
# -*- coding: utf-8; mode: python -*-
__version__ = "2.0"
__date__ = "20170208"
__author__ = "kthoden@mpiwg-berlin.mpg.de"
__doc__ = """Generates LaTeX code for informative frontmatters. These will be
attached to the chapter PDFs of EOA publications we offer for
download"""

import string
import sys
import re
import os
import json
import csv
import datetime
from pathlib import Path
import urllib.request
import logging
import argparse
import configparser

# Log everything from DEBUG level upwards, each line with time stamp and level.
logging.basicConfig(level=logging.DEBUG, format=' %(asctime)s - %(levelname)s - %(message)s')

# Directory that contains this script (symbolic links resolved) and the
# name of the script without its extension.
BASE_DIR = Path( os.path.realpath(__file__) ).parent
SCRIPT_NAME = Path( __file__).stem

# using https://wiki.postgresql.org/wiki/Psycopg2_Tutorial

# A publication name is one of the four series names directly followed
# by a one- or two-digit number without leading zero, e.g. "studies1"
# or "sources12". The named groups "series" and "number" give access to
# the two parts.
PUBLICATION_PATTERN = re.compile("(?P<series>studies|sources|proceedings|textbooks)(?P<number>[1-9]?[0-9]{1})$")


def check_publication(input_string):
    """Check that the input string is a valid publication name.

    A name is valid if it matches PUBLICATION_PATTERN.

    Return True for a valid name. For an invalid name an error is
    logged and the program is terminated with exit status 1.
    """

    if re.match(PUBLICATION_PATTERN, input_string) is None:
        logging.error("The name of publication is not valid: %s. Exiting.", input_string)
        # An invalid name is a failure: do not report success (status 0)
        # to the calling shell.
        sys.exit(1)

    logging.info("The name of the publication is valid: %s", input_string)
    return True
# def check_publication ends here


def get_db_cursor(DB_FORMAT, CONFIG):
    """Depending on selected database, connect to it and get a cursor.

    DB_FORMAT is either "postgres" or "sqlite". CONFIG is the parsed
    configuration; its section "postgres" (keys database_name, user,
    host, password) or "sqlite" (key database_file) is read.

    Return a cursor whose rows can be accessed by column name as well
    as by index. Raise ValueError for any other DB_FORMAT. If the
    connection to PostgreSQL fails, the program is terminated with exit
    status 1.
    """

    def get_credentials_postgres(CONFIG):
        """Parse login information from config file"""

        db_cred = CONFIG["postgres"]

        credentials = "dbname=%s user=%s host=%s password=%s" % (db_cred["database_name"], db_cred["user"], db_cred["host"], db_cred["password"])

        # presumably the values are written with double quotes in the
        # config file; the connection string wants single quotes
        return credentials.replace('"', "'")
    # def get_credentials_postgres ends here


    def connect_db_postgres(credentials):
        """Connect to the database"""

        import psycopg2

        try:
            connection = psycopg2.connect(credentials)
        except psycopg2.Error as connection_error:
            # Only database errors are expected here. Anything else
            # (KeyboardInterrupt, programming errors) must not be hidden.
            logging.error("Could not connect. Exiting.")
            logging.debug("Reason: %s", connection_error)
            sys.exit(1)

        logging.info("Connection established.")
        return connection
    # def connect_db_postgres ends here


    if DB_FORMAT == "postgres":
        import psycopg2.extras
        credentials = get_credentials_postgres(CONFIG)
        eoa_db = connect_db_postgres(credentials)
        eoa_cursor = eoa_db.cursor(cursor_factory=psycopg2.extras.DictCursor)
    elif DB_FORMAT == "sqlite":
        import sqlite3
        database_file = CONFIG["sqlite"]["database_file"]
        logging.debug("Using database %s", database_file)
        eoa_db = sqlite3.connect(database_file.replace('"', ""))
        eoa_db.row_factory = sqlite3.Row
        eoa_cursor = eoa_db.cursor()
    else:
        # Without this branch the function failed with an
        # UnboundLocalError that did not name the real problem.
        raise ValueError("Unsupported database format: %r" % (DB_FORMAT,))

    return eoa_cursor
# def get_db_cursor ends here


def get_publication_id(input_string, eoa_cursor, eoaclassic):
    """Queries the database for the publication id.

    input_string is the name of the publication (series and number,
    see PUBLICATION_PATTERN), eoa_cursor a database cursor. eoaclassic
    selects the table: "publications_publication" if true,
    "eoapublications_publication" otherwise.

    Return the id of the first matching database entry. If the name is
    not valid or if no entry is found, an error is logged and the
    program is terminated with exit status 1. More than one matching
    entry only causes a warning.
    """

    if eoaclassic:
        db_table = "publications_publication"
    else:
        db_table = "eoapublications_publication"

    id_match = re.match(PUBLICATION_PATTERN, input_string)
    if id_match is None:
        # Without this check an invalid name ended in an AttributeError.
        logging.error("The name of publication is not valid: %s. Exiting.", input_string)
        sys.exit(1)

    eoa_series = id_match.group('series')
    eoa_number = id_match.group('number')

    # Both values are restricted by PUBLICATION_PATTERN to a fixed set
    # of words and to digits, so they cannot carry SQL syntax.
    query_string = """SELECT "Title", "id" FROM {} WHERE "Serie" = '{}' AND "Number" = '{}'""".format(db_table, eoa_series, eoa_number)

    eoa_cursor.execute(query_string)

    rows = eoa_cursor.fetchall()
    if len(rows) == 0:
        series_and_title = eoa_series.title() + " " + eoa_number
        logging.error("It seems like there is no such as publication %s. Exiting.", series_and_title)
        sys.exit(1)

    if len(rows) > 1:
        logging.warning("There should be only one database entry that matches the input. Found %s", len(rows))
    else:
        logging.info("The title of the publication you selected is '%s'.", rows[0][0])

    return rows[0][1]
# def get_publication_id ends here


def get_publication_info(eoa_pub_id, eoa_cursor, eoaclassic):
    """Fetch all columns of the publication with the given id.

    eoaclassic selects the table: "publications_publication" if true,
    "eoapublications_publication" otherwise.

    Return everything the cursor delivers for the query, i.e. a list
    of rows.
    """

    table_name = "publications_publication" if eoaclassic else "eoapublications_publication"

    eoa_cursor.execute("""SELECT * FROM {} WHERE "id" = '{}'""".format(table_name, eoa_pub_id))

    return eoa_cursor.fetchall()
# def get_publication_info ends here


def get_chapters(eoa_pub_id, eoa_cursor, eoaclassic, include_pdfless=False):
    """Queries database for information about the individual chapters.

    eoaclassic selects the table: "publications_chapter" if true,
    "eoapublications_chapter" otherwise.

    Return a list with one row per chapter of the publication, sorted
    by "Order". The columns of a row are, by index: 0 Order, 1 Title,
    2-6 Chapterauthor1 to Chapterauthor5, 7 Chapterpdffile,
    8 Doichapter, 9 Chapterlanguage, 10 id, 11 Number.

    Chapters without a PDF file are left out (with a warning) unless
    include_pdfless is true.
    """

    if eoaclassic:
        db_table = "publications_chapter"
    else:
        db_table = "eoapublications_chapter"

    query_string = """SELECT "Order", "Title", "Chapterauthor1", "Chapterauthor2",
    "Chapterauthor3", "Chapterauthor4", "Chapterauthor5",
    "Chapterpdffile", "Doichapter", "Chapterlanguage", "id", "Number"
    FROM {} WHERE "Publication_id" = '{}' ORDER BY "Order" ASC""".format(db_table, eoa_pub_id)

    eoa_cursor.execute(query_string)

    rows = eoa_cursor.fetchall()

    if include_pdfless:
        return rows

    chapters_with_pdf = []
    for row in rows:
        # An empty string as well as a NULL value (None) in the field
        # "Chapterpdffile" means that there is no file.
        if row[7]:
            chapters_with_pdf.append(row)
        else:
            logging.warning("There seems to be no file attached to chapter '%s'. Removing it.", row[1])

    return chapters_with_pdf
# get_chapters ends here


def get_sections(eoa_chap_id, eoa_cursor, eoaclassic):
    """Get sections of a chapter from database

    eoaclassic selects the table: "publications_element" if true,
    "eoapublications_element" otherwise.

    Return a list of rows with the columns "Number" and "Fulltext" of
    all elements of kind "eoasection" in the chapter, sorted by
    "Order".
    """

    if eoaclassic:
        db_table = "publications_element"
    else:
        db_table = "eoapublications_element"

    # The value eoasection has to be in single quotes. In double quotes
    # it is an identifier (a column name) in SQL; PostgreSQL rejects
    # the query then and SQLite accepts it only as a legacy exception.
    query_string = """SELECT "Number", "Fulltext" FROM {} WHERE "Kind" = 'eoasection' AND "Chapter_id" =
    '{}' ORDER BY "Order" ASC""".format(db_table, eoa_chap_id)

    eoa_cursor.execute(query_string)

    rows = eoa_cursor.fetchall()

    return rows
# def get_sections ends here


def format_authors(eoa_publication_info, pubtype):
    """Format the list of authors

    eoa_publication_info is a database row (or any mapping) of a book
    or of a chapter. pubtype says which of both it is: for "book" the
    fields Publicationauthor1 to Publicationauthor5 are read, for
    "chapter" the fields Chapterauthor1 to Chapterauthor5. Empty fields
    and fields without a value (None) are skipped.

    Return a tuple: the authors as one string ("A", "A and B",
    "A, B, and C") and the authors as a list. Raise ValueError for any
    other pubtype.
    """

    author_types = {"book": "Publicationauthor", "chapter": "Chapterauthor"}

    if pubtype not in author_types:
        # This used to end in an UnboundLocalError further down.
        raise ValueError("Unknown publication type: %r" % (pubtype,))

    author_type = author_types[pubtype]

    authors = []
    for authornumber in range(1, 6):
        tmp_author = eoa_publication_info[author_type + str(authornumber)]
        if tmp_author:
            authors.append(tmp_author)

    if len(authors) <= 2:
        # no author, one author or "A and B"
        authors_as_string = " and ".join(authors)
    else:
        authors_as_string = ", ".join(authors[:-1]) + ", and " + authors[-1]

    return authors_as_string, authors
# def format_authors ends here


def format_authors_xml(eoa_publication_info):
    """Format the authors of a book as XML elements

    Reads the fields Publicationauthor1 to Publicationauthor5 of the
    database row. For every field that is not empty, an author element
    is created: the first word of the name becomes the first name, the
    last word the last name. The e-mail address is a fixed dummy
    value.

    Return all elements as one string.
    """

    author_template = """
    <author primary_contact="true" user_group_ref="Author">
      <firstname>%s</firstname>
      <lastname>%s</lastname>
      <email>author@example.com</email>
    </author>"""

    xml_chunks = []

    for position in range(1, 6):
        full_name = eoa_publication_info["Publicationauthor" + str(position)]
        if len(full_name) == 0:
            continue
        words = full_name.split(" ")
        xml_chunks.append(author_template % (words[0], words[-1]))

    return "".join(xml_chunks)
# def format_authors_xml ends here


def format_title(title_string, is_book_subtitle=False, unformatted=False):
    """Convert html tags to their LaTeX counterpart.

    Line breaks (the tag <br/> as well as newline characters) become
    spaces, runs of spaces are reduced to one space and character
    references are resolved by unescape().

    If is_book_subtitle is True and the title is not empty, " : " is
    put in front of the result. If unformatted is True, the tags em,
    sub and b are removed and only their content is kept; otherwise
    they are converted to \\emph, \\textsubscript and \\textbf.
    """

    EMPHASIS = re.compile("<em>(.*?)</em>")
    LOWER = re.compile("<sub>(.*?)</sub>")
    BOLD = re.compile("<b>(.*?)</b>")

    # Both replacements have to be applied one after the other. Before,
    # the second one started from title_string again, so that the <br/>
    # tags stayed in the title.
    chapter_title = title_string.replace("<br/>", " ")
    chapter_title = chapter_title.replace("\n", " ")
    chapter_title = re.sub(r" +", " ", chapter_title)
    chapter_title = unescape(chapter_title)

    if is_book_subtitle is True and len(title_string) > 0:
        chapter_title = r" : " + chapter_title

    if unformatted is True:
        replacements = ((EMPHASIS, r"\g<1>"),
                        (LOWER, r"\g<1>"),
                        (BOLD, r"\g<1>"))
    else:
        replacements = ((EMPHASIS, r"\\emph{\g<1>}"),
                        (LOWER, r"\\textsubscript{\g<1>}"),
                        (BOLD, r"\\textbf{\g<1>}"))

    for tag_pattern, replacement in replacements:
        chapter_title = tag_pattern.sub(replacement, chapter_title)

    return chapter_title
# def format_title ends here


def unescape(text):
    """Replace HTML or XML character references and entities in a text
    by the characters they stand for. Return the resulting string.

    Numeric references (decimal as in &#65; or hexadecimal as in
    &#x41;) and named entities (as in &amp;) are handled. A reference
    that can not be resolved is left as it is.

    With thanks to http://effbot.org/zone/re-sub.htm#unescape-html.
    Modified to work with Python3.
    """
    import html.entities

    def fixup(character):
        """Return the replacement for one matched reference."""

        reference = character.group(0)

        if reference.startswith("&#"):
            # character reference, the number is between prefix and ";"
            is_hexadecimal = reference.startswith("&#x")
            try:
                if is_hexadecimal:
                    return chr(int(reference[3:-1], 16))
                return chr(int(reference[2:-1]))
            except ValueError:
                return reference # leave as is

        # named entity
        codepoint = html.entities.name2codepoint.get(reference[1:-1])
        if codepoint is None:
            return reference # leave as is
        return chr(codepoint)

    return re.sub(r"&#?\w+;", fixup, text)
# def unescape ends here


def format_chapter_info(chapter_result):
    """Return title and authors of a chapter, ready for the template.

    chapter_result is a database row of a chapter that is accessed by
    field name. The title is passed through format_title() after
    removing whitespace at its end, the authors are joined by
    format_authors().

    Return a tuple of two strings: title and authors. The second one
    is empty if the chapter has no authors.
    """

    title_for_latex = format_title(chapter_result["Title"].rstrip())

    author_names = format_authors(chapter_result, "chapter")[0]

    return title_for_latex, (author_names if len(author_names) > 0 else "")
# def format_chapter_info ends here


def which_publisher(series):
    """Select web address and publisher according to the series

    The series "sources" has its own web address and publisher name,
    all other series share a second pair. The main organisation is the
    same in both cases.

    Return a tuple of base url, main organisation and publisher name.
    """

    main_organisation = "Max-Planck-Gesellschaft zur Förderung der Wissenschaften"

    if series == "sources":
        return ("https://edition-open-sources.org",
                main_organisation,
                "Edition Open Sources")

    return ("https://www.mprl-series.mpg.de",
            main_organisation,
            r"Max Planck Research Library for the History and Development of Knowledge")
# def which_publisher ends here


def format_date_string(input_date, date_format):
    """Format date according to specified format.

    input_date is either a date object (datetime.date, which includes
    datetime.datetime) or a string of the form YYYY-MM-DD. date_format
    is a format string for strftime.

    Return the formatted date as a string. For any other type of
    input_date an error is logged and the program is terminated with
    exit status 1. A string in another form raises the ValueError of
    strptime.
    """

    if isinstance(input_date, datetime.date):
        return input_date.strftime(date_format)

    if isinstance(input_date, str):
        date_object = datetime.datetime.strptime(input_date, "%Y-%m-%d")
        return date_object.strftime(date_format)

    logging.error("Unrecognized date format: %s. Exiting.", type(input_date))
    # This is a failure, so the exit status must not be 0.
    sys.exit(1)
# def format_date_string ends here


def format_publication_info(eoa_publication_info, server_data):
    """Collect the strings that describe the book

    eoa_publication_info is the database row of the publication,
    accessed by field name. server_data is not used here.

    Return a dictionary with the keys bookauthors, pubsuffix,
    booktitle, booksubtitle, mainorg, publisher, series, number, isbn,
    doistring, pubdate, url, licence and shoplink.
    """

    series = eoa_publication_info["Serie"]

    base_url, main_organisation, publisher_string = which_publisher(series)

    publication_url = "{}/{}/{}/".format(base_url, series, eoa_publication_info["Number"])

    # the suffix is separated from the authors by a space
    author_suffix = eoa_publication_info["Publicationauthorsuffix"]
    pub_suffix = " " + author_suffix if len(author_suffix) > 0 else ""

    licence_string = format_licence(eoa_publication_info["Publicationlicense"])
    shoplink_string = format_shoplink(eoa_publication_info["Shoplink"])

    publication_doi = eoa_publication_info['Doipublication']
    doi_string = f", DOI {publication_doi}" if len(publication_doi) > 0 else ""

    book_authors_string, _ = format_authors(eoa_publication_info, "book")

    return {
        "bookauthors" : book_authors_string,
        "pubsuffix" : pub_suffix,
        "booktitle": format_title(eoa_publication_info["Title"]),
        "booksubtitle" : format_title(eoa_publication_info["Subtitle"], is_book_subtitle=True, unformatted=True),
        "mainorg" : main_organisation,
        "publisher" : publisher_string,
        "series" : series.title(),
        "number" : eoa_publication_info["Number"],
        "isbn" : eoa_publication_info["Isbn"],
        "doistring" : doi_string,
        "pubdate" : format_date_string(eoa_publication_info["Datepublished"], "%Y"),
        "url" : publication_url,
        "licence" : licence_string,
        "shoplink" : shoplink_string,
    }
# def format_publication_info ends here


def format_licence(publication_licence):
    """Return the end of the licence sentence for the frontmatter

    For the licences "by-nc-sa" and "by-sa" this is the name of the
    licence followed by a LaTeX line break and the web address of the
    licence text. For everything else it is only a full stop.
    """

    if publication_licence == "by-nc-sa":
        return r" under Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Germany Licence.\\\url{https://creativecommons.org/licenses/by-nc-sa/3.0/de/}"

    if publication_licence == "by-sa":
        return r" under Creative Commons Attribution-ShareAlike 4.0 International License.\\\url{https://creativecommons.org/licenses/by-sa/4.0/}"

    return "."
# def format_licence ends here


def format_shoplink(input_string, raw=False):
    """Parse the shoplink entry

    input_string is parsed as HTML. The text of the resulting element
    is taken as the company, its attribute href as the web address of
    the book in the shop.
    presumably the entry is a single link element; verify against the
    database content

    With raw=False, return a LaTeX line with name and address of the
    printing company for the two known companies (epubli.de and
    pro-business.com) and an empty string for all others. Otherwise
    return the tuple of web address and company.
    """

    from lxml import html

    link_element = html.fromstring(input_string)
    company = link_element.text
    shop_url = link_element.get("href")

    if raw != False:
        return shop_url, company

    # the %s is the place for the web address
    known_companies = {
        "epubli.de": r"epubli\,/\,neopubli GmbH, Berlin\par\url{%s}",
        "pro-business.com": r"PRO BUSINESS digital printing Deutschland GmbH, Berlin\par\url{%s}",
    }

    line_template = known_companies.get(company)
    if line_template is None:
        return ""

    return line_template % shop_url
# def format_shoplink ends here


def create_chapter_frontmatter(eoa_publication, remove_info_page, keep_source, eoaclassic, eoa_cursor, server_data, out_dir):
    """Put a frontmatter page in front of every chapter PDF of a book

    For every chapter that has a PDF file, the file is downloaded, a
    frontmatter page is typeset with xelatex from the template
    data/frontpage_template.tex and both are written into one PDF file
    with updated metadata. In addition, a shell script with the
    commands for copying the new files to the media directory is
    written (<publication>_copycommand.sh).

    eoa_publication   name of the publication, e.g. studies1
    remove_info_page  if true, the first page of the downloaded PDF
                      is left out
    keep_source       if true, the .tex files and the cover image are
                      not deleted at the end
    eoaclassic        selects the database tables, passed on to the
                      query functions
    eoa_cursor        database cursor
    server_data       mapping, the keys 'url' and 'media' are read
    out_dir           directory for all files that are created

    The function changes the working directory to out_dir and to its
    parent directory at the end. The program is terminated with exit
    status 1 if there are no chapter files or if a download fails.
    """

    def choose_geometry(eoa_series):
        """Adjust page geometry and paper size to series"""

        fontsize_string = "fontsize=10pt"

        if eoa_series == "Sources":
            geometry_string = r"\usepackage[a4paper,inner=30mm,outer=30mm,top=14mm,bottom=20mm,includehead]{geometry}"
            fontsize_string = "fontsize=12pt"
        elif eoa_series == "Studies":
            geometry_string = r"\usepackage[paperwidth=170mm,paperheight=240mm,inner=22mm,outer=20mm,top=14mm,bottom=20mm,includehead]{geometry}"
        elif eoa_series == "Proceedings" or eoa_series == "Textbooks":
            geometry_string = r"\usepackage[paperwidth=148mm,paperheight=210mm,inner=20mm,outer=15mm,top=13mm,bottom=15mm,includehead]{geometry}"
            fontsize_string = "fontsize=9pt"
        else:
            # used to end in an UnboundLocalError
            raise ValueError("No page geometry defined for series %r" % (eoa_series,))
        return geometry_string, fontsize_string
    # def choose_geometry ends here


    def add_pdf_info(pdf_filename, list_of_authors, title_for_pdf, subject_string, remove_info_page):
        """Join frontmatter page and chapter and add metadata to PDF file

        pdf_filename is the name without extension. The files
        <name>_orig.pdf and <name>_frontmatter.pdf are read, the
        result is written to <name>.pdf and <name>_orig.pdf is
        deleted.
        """
        # code taken from http://kitchingroup.cheme.cmu.edu/blog/2013/06/13/Reading-and-writing-pdf-metadata/

        from PyPDF2 import PdfFileWriter, PdfFileReader
        from PyPDF2.generic import NameObject, createStringObject

        # The input files are still read while the output is written,
        # so they have to stay open until then.
        with open(pdf_filename + '_orig.pdf', 'rb') as original_pdf:
            with open(pdf_filename + '_frontmatter.pdf', 'rb') as frontmatter_pdf:
                pdf_orig = PdfFileReader(original_pdf)
                pdf_frontmatter = PdfFileReader(frontmatter_pdf)

                writer = PdfFileWriter()

                writer.addPage(pdf_frontmatter.getPage(0))

                if remove_info_page:
                    pagerange = range(1, pdf_orig.getNumPages())
                    logging.debug("Removing old info page.")
                else:
                    pagerange = range(pdf_orig.getNumPages())
                    logging.debug("Not removing first page.")

                for page in pagerange:
                    writer.addPage(pdf_orig.getPage(page))

                info_dict = writer._info.getObject()

                # take over the old metadata, if the file has any
                info = pdf_orig.documentInfo
                if info is not None:
                    for key in info:
                        info_dict.update({NameObject(key): createStringObject(info[key])})

                info_dict.update({NameObject('/Title'): createStringObject(title_for_pdf)})
                info_dict.update({NameObject('/Subject'): createStringObject(subject_string)})
                info_dict.update({NameObject('/Author'): createStringObject(list_of_authors)})

                # It does not appear possible to alter in place.
                with open(pdf_filename + '_out.pdf', 'wb') as new_pdf:
                    writer.write(new_pdf)

        os.unlink(pdf_filename + '_orig.pdf')
        os.rename(pdf_filename + '_out.pdf', pdf_filename + '.pdf')
    # def add_pdf_info ends here


    def download_cover_image(image_url, image_path):
        """Download image from website.

        Code from
        https://stackoverflow.com/questions/8286352
        """

        urllib.request.urlretrieve(image_url, image_path)
    # def download_cover_image


    def download_chapter_pdf(chapter_url, destination):
        """Download one chapter PDF from the website.

        The file is stored as destination in the working directory.
        """

        try:
            urllib.request.urlretrieve(chapter_url.replace(" ", "%20"), "./" + destination)
        except urllib.error.HTTPError:
            logging.error("Program received an HTTP Error 403: Forbidden. Maybe there are no chapter files?. Exiting")
            sys.exit(1)
    # def download_chapter_pdf


    def run_latex(command):
        """Compile the latex"""

        import shlex
        import subprocess

        arguments = shlex.split(command)

        subprocess.call(arguments)
    # def run_latex ends here


    eoa_pub_id = get_publication_id(eoa_publication, eoa_cursor, eoaclassic)
    eoa_publication_info = get_publication_info(eoa_pub_id, eoa_cursor, eoaclassic)[0]

    cover_url = "{}/media/{}".format(server_data['url'], eoa_publication_info['Coverbig'])
    cover_image = str(out_dir) + f"/{eoa_publication}_cover.jpg"
    download_cover_image(cover_url, cover_image)

    # the template file
    with open(BASE_DIR / "data/frontpage_template.tex", "r") as tmp_template:
        frontmatter_template_string = string.Template(tmp_template.read())

    chapter_files = get_chapters(eoa_pub_id, eoa_cursor, eoaclassic)

    if len(chapter_files) == 0:
        logging.error("Found no chapter files. Exiting.")
        sys.exit(1)

    logging.info("Found %d chapter files.", len(chapter_files))

    # The data of the book is the same for all chapters.
    item_for_template = format_publication_info(eoa_publication_info, server_data)
    geometry_string, fontsize_string = choose_geometry(item_for_template["series"])

    # "bookauthors" is a string already. Taking its element [0] put
    # only the first letter of the authors into the PDF metadata.
    subject_string = "A chapter from {}{} by {} ({}). {}, {} {}. {}".format(item_for_template["booktitle"], item_for_template["booksubtitle"], item_for_template["bookauthors"], item_for_template["pubdate"], item_for_template["publisher"], item_for_template["series"], item_for_template["number"], item_for_template["url"])

    os.chdir(out_dir)

    command_filename = str(out_dir) + f"/{eoa_publication}_copycommand.sh"

    list_of_auxfiles = []

    with open(command_filename, "w") as command_file:
        command_file.write("#!/bin/bash\n")

        for chapter in chapter_files:
            # get the original pdf file
            chapter_url = "{}/media/{}".format(server_data['url'], chapter[7])

            pdf_filename = chapter_url.split("/")[-1]
            pdffilename_front, pdffilename_ext = os.path.splitext(pdf_filename)
            original_pdf_file = pdffilename_front + "_orig" + pdffilename_ext

            list_of_auxfiles.append(pdffilename_front + '_frontmatter.pdf')

            logging.debug("Download chapter PDF from %s.", chapter_url)

            download_chapter_pdf(chapter_url, original_pdf_file)

            tex_filename = eoa_publication + "ch" + str(chapter[0]) + ".tex"

            # getting data for the template
            formatted_chapter_title, authors_line = format_chapter_info(chapter)
            if len(authors_line) == 0:
                # a chapter without authors of its own is by the authors of the book
                authors_line = item_for_template["bookauthors"]

            formatted_chapter_authors = r"\emph{%s:}" % authors_line

            # the DOI field may be empty or without a value
            if chapter[8]:
                chapter_doi = f"DOI: {chapter[8].rstrip()}"
            else:
                chapter_doi = ""

            # fill in the blanks
            frontmatter_replacement = frontmatter_template_string.substitute(
                FONTSIZE=fontsize_string,
                GEOMETRY_SETTINGS=geometry_string,
                FORMATTED_CHAPTER_TITLE=formatted_chapter_title,
                DOI_CHAPTER_STRING=chapter_doi,
                CHAPTER_AUTHORS_LINE=formatted_chapter_authors,
                FORMAT_AUTHORS=item_for_template["bookauthors"],
                FORMATTED_SHOPLINK=item_for_template["shoplink"],
                LICENCE=item_for_template["licence"],
                MAINORG=item_for_template["mainorg"],
                PUB_SUFFIX=item_for_template["pubsuffix"],
                FORMAT_TITLE=item_for_template["booktitle"],
                FORMAT_SUBTITLE=item_for_template["booksubtitle"].replace(" : ", "~:~"),
                PUBLISHER_STRING=item_for_template["publisher"].replace("Development of", r"Development\\of"),
                PUBLISHER_STRING_RAW=item_for_template["publisher"],
                EOA_SERIES=item_for_template["series"],
                SERIES_NUMBER=item_for_template["number"],
                ISBN_CODE=item_for_template["isbn"],
                DOI_STRING=item_for_template["doistring"],
                PUB_DATE=item_for_template["pubdate"],
                PUBLICATION_URL=item_for_template["url"],
                COVERIMAGE=cover_image)

            with open("./" + tex_filename, "w") as outfile:
                outfile.write(frontmatter_replacement)

            unformatted_chapter_title = format_title(chapter[1].rstrip(), unformatted=True)

            # generate PDF file
            logging.info("Typesetting the frontmatter to chapter '%s'", unformatted_chapter_title)

            latex_command = "xelatex --interaction=batchmode -jobname='%s_frontmatter' %s" % (pdffilename_front, tex_filename)
            run_latex(latex_command)

            add_pdf_info(pdffilename_front, authors_line, unformatted_chapter_title, subject_string, remove_info_page)

            command_file.write("cp -v {0}{1} {0}{1}.bak\n".format(server_data['media'], chapter[7]))
            command_file.write(f"cp -v '{str(out_dir)}/{pdf_filename}' {server_data['media']}{chapter[7]}\n")

    logging.debug("Removing aux files.")

    # Only the real extension of a file counts. The patterns used
    # before were not anchored at the end of the name, so they also hit
    # names like x.login.txt and could list a file like x.tex.log twice.
    for auxfile in os.listdir("."):
        extension = os.path.splitext(auxfile)[1]
        if extension in (".aux", ".log"):
            list_of_auxfiles.append(auxfile)
        elif extension == ".tex" and not keep_source:
            list_of_auxfiles.append(auxfile)

    for file_for_deletion in list_of_auxfiles:
        os.unlink(file_for_deletion)

    logging.debug("Removing other files.")
    if not keep_source:
        os.unlink(cover_image)

    # NOTE(review): this leads to the parent of out_dir, which is not
    # necessarily the directory the program was started in - confirm
    # what the callers expect.
    os.chdir("..")
# def create_chapter_frontmatter ends here


def create_omp_native_xml(eoa_publication, eoaclassic, eoa_cursor, server_data, out_dir):
    """Use the database infos for creating input for OMP

    The template data/omp_template.xml is filled with the data of the
    publication and with the Base64 encoded content of data/dummy.pdf.
    The result is written to <out_dir>/<eoa_publication>.xml.

    eoa_publication  name of the publication, e.g. studies1
    eoaclassic       selects the database tables, passed on to the
                     query functions
    eoa_cursor       database cursor
    server_data      passed on to format_publication_info()
    out_dir          directory for the XML file
    """

    def file_base64(filepath):
        """Base64 encode a file

        Return the encoded content as a string.

        https://code.tutsplus.com/tutorials/base64-encoding-and-decoding-using-python--cms-25588
        """

        from base64 import encodebytes

        with open(filepath, "rb") as binary_file:
            base_64_encode = encodebytes(binary_file.read())

        # encodebytes returns bytes. Put into the template as they are,
        # they would show up in their printed form b'...'.
        return base_64_encode.decode("ascii")
    # def file_base64 ends here

    eoa_pub_id = get_publication_id(eoa_publication, eoa_cursor, eoaclassic)
    eoa_publication_info = get_publication_info(eoa_pub_id, eoa_cursor, eoaclassic)[0]

    # the template file
    with open(BASE_DIR / "data/omp_template.xml", "r") as tmp_template:
        frontmatter_template = tmp_template.read()

    xml_filename = str(out_dir) + f"/{eoa_publication}.xml"

    item_for_template = format_publication_info(eoa_publication_info, server_data)

    supplierurl, suppliercomp = format_shoplink(eoa_publication_info["Shoplink"], raw=True)

    if len(eoa_publication_info["Subtitle"]) > 0:
        onix_subtitle = """<onix:Subtitle>%s</onix:Subtitle>""" % eoa_publication_info["Subtitle"]
        omp_subtitle = """<subtitle locale="en_US">%s</subtitle>""" % eoa_publication_info["Subtitle"]
    else:
        omp_subtitle = onix_subtitle = ""

    frontmatter_template_string = string.Template(frontmatter_template)
    # fill in the blanks
    frontmatter_replacement = frontmatter_template_string.substitute(
        INTERNAL_ID=item_for_template["number"],
        FORMAT_AUTHORS=item_for_template["bookauthors"],
        FORMAT_TITLE=item_for_template["booktitle"],
        OMP_SUBTITLE=omp_subtitle,
        ONIX_SUBTITLE=onix_subtitle,
        PUBLISHER_STRING=item_for_template["publisher"],
        EOA_SERIES=item_for_template["series"],
        SERIES_NUMBER=item_for_template["number"],
        ISBN_CODE=item_for_template["isbn"],
        PUB_DATE=item_for_template["pubdate"],
        PUBLICATION_URL=item_for_template["url"],
        ABSTRACT=eoa_publication_info["Descriptionlong"].replace("<br/>", ""),
        BASE64_PDF=file_base64(BASE_DIR / "data/dummy.pdf"),
        PRICE=eoa_publication_info["Price"],
        TODAY=datetime.datetime.today().strftime("%Y-%m-%d"),
        SUPPLIER_COMP=suppliercomp,
        SUPPLIER_URL=supplierurl,
        PAGES=eoa_publication_info["Pages"],
        # %s instead of %d: the result is the same for an integer, and
        # a number that comes as a string does not raise a TypeError
        SUBMISSION_NAME="%s_%s_submission" % (item_for_template["series"], item_for_template["number"]),
        PUBDATE_00=format_date_string(eoa_publication_info["Datepublished"], "%Y%m%d"))

    with open(xml_filename, "w") as outfile:
        outfile.write(frontmatter_replacement)

    logging.debug("Wrote %s", xml_filename)
# def create_omp_native_xml ends here


def create_datacite_json(eoa_publication, eoaclassic, eoa_cursor, server_data, use_test_repo, CONFIG, out_dir):
    """Create JSON documents for DOI registration

    Builds one DataCite JSON document for the book and one for each
    chapter, writes them into ``out_dir`` and writes a shell script with
    one ``curl`` POST per document next to them.

    Arguments:
    eoa_publication -- publication name; used for the output file names
        and handed to get_publication_id
    eoaclassic -- handed through unchanged to the database helpers
    eoa_cursor -- database cursor, handed through to the database helpers
    server_data -- handed to format_publication_info
    use_test_repo -- if true, DOI prefixes are replaced by the configured
        test prefix and the script targets api.test.datacite.org
    CONFIG -- configuration mapping; CONFIG["doi"]["testprefix"] is read
        and its first and last character are dropped (presumably the
        surrounding quotes)
    out_dir -- output directory as pathlib.Path (the chapter file names
        are built with the ``/`` operator)

    Exits the program if the publication has no DOI yet, or if its
    licence has no rights statement defined here.
    """

    def get_viaf_list():
        """Retrieve CSV with VIAF IDs and store as list of dictionaries"""

        fieldnames = ("Name", "VID")

        online = False
        if online:
            # NOTE(review): this looks like the HTML "blob" view of the
            # file rather than the raw CSV -- confirm before enabling.
            viaf_csv_url = "https://github.molgen.mpg.de/EditionOpenAccess/eoa-utilities/blob/master/eoaviaf.csv"
            with urllib.request.urlopen(viaf_csv_url) as vcsv:
                # the response yields bytes, csv needs text
                decoded_lines = (line.decode("utf-8") for line in vcsv)
                return list(csv.DictReader(decoded_lines, delimiter=",", fieldnames=fieldnames))

        # NOTE(review): machine-specific absolute path
        viaf_csv = "/Users/kthoden/EOAKram/dev/eoa-utilities/eoaviaf.csv"
        with open(viaf_csv, 'r') as vcsv:
            return list(csv.DictReader(vcsv, delimiter=",", fieldnames=fieldnames))
    # def get_viaf_list ends here


    def get_viaf(creator, viaf_list):
        """Try to look up VIAF id from internal database

        Returns the id of the first row whose name equals ``creator``, or
        None if there is no such row or the row holds no usable id.
        """

        for row in viaf_list:
            if row["Name"] != creator:
                continue

            vid = row["VID"]
            # rows with a missing id column, or with a placeholder
            if not vid or vid in ("?", "-"):
                return None
            # several candidates are comma separated: take the first one
            return vid.split(",")[0]

        return None
    # def get_viaf ends here


    def create_creator_dict(creator):
        """Format creator names

        The last word of the name is taken as family name, everything
        before it as given name. A name consisting of a single word is
        passed on as it is, without given and family name.
        """

        name_parts = creator.rsplit(" ", 1)
        if len(name_parts) == 2:
            firstname, lastname = name_parts
            creator_dict = {
                "nameType": "Personal",
                "name": f"{lastname}, {firstname}",
                "givenName": firstname,
                "familyName": lastname
            }
        else:
            logging.warning("Cannot split '%s' into given and family name.", creator)
            creator_dict = {
                "nameType": "Personal",
                "name": creator
            }

        viaf_id = get_viaf(creator, viaf_list)
        if viaf_id:
            creator_dict["nameIdentifiers"] = [{
                "schemeUri" : "http://viaf.org/viaf/",
                "nameIdentifierScheme" : "VIAF",
                "nameIdentifier": viaf_id
            }]
        else:
            logging.warning("Proceeding without VIAF for %s", creator)

        return creator_dict
    # def create_creator_dict ends here


    def get_types_dict(pub_type):
        """Create dictionary for publication types"""

        types_dict = {
            "ris": pub_type.upper()[0:4],
            "bibtex": pub_type,
            "citeproc": pub_type,
            "schemaOrg": pub_type.title(),
            "resourceType": pub_type.title(),
            "resourceTypeGeneral": "Text"
        }

        if pub_type == "chapter":
            types_dict["bibtex"] = "inbook"

        return types_dict
    # def get_types_dict ends here


    def apply_test_prefix(doi):
        """Replace the DOI prefix by the test prefix if the test repository is used"""

        if use_test_repo:
            return re.sub(r'(10\.[0-9]{4,})/',f'{testprefix}/', doi)
        return doi
    # def apply_test_prefix ends here

    viaf_list = get_viaf_list()

    eoa_pub_id = get_publication_id(eoa_publication, eoa_cursor, eoaclassic)
    eoa_publication_info = get_publication_info(eoa_pub_id, eoa_cursor, eoaclassic)[0]
    item_for_template = format_publication_info(eoa_publication_info, server_data)

    doipublication = eoa_publication_info['Doipublication'].rstrip()
    if len(doipublication) == 0:
        logging.error("There is no DOI yet in the Django database. Please assign one first.")
        sys.exit()

    testprefix = CONFIG["doi"]["testprefix"][1:-1]
    doipublication = apply_test_prefix(doipublication)

    if use_test_repo:
        batch_filename = str(out_dir) + f"/{eoa_publication}_doiupload_test.sh"
        api_url = "https://api.test.datacite.org/dois"
    else:
        batch_filename = str(out_dir) + f"/{eoa_publication}_doiupload.sh"
        api_url = "https://api.datacite.org/dois"

    curl_command = 'curl --verbose --netrc --request POST --header "Content-Type: application/vnd.api+json"'

    chapter_files = get_chapters(eoa_pub_id, eoa_cursor, eoaclassic, include_pdfless=True)

    creators = format_authors(eoa_publication_info, "book")[1]
    creator_list_book = [create_creator_dict(creator) for creator in creators]

    main_title = {
        "lang": eoa_publication_info["Language"],
        "title": format_title(eoa_publication_info["Title"], unformatted=True)
    }

    subtitle = {
        "lang": eoa_publication_info["Language"],
        "title" : format_title(eoa_publication_info["Subtitle"], is_book_subtitle=True, unformatted=True)[3:],
        "titleType": "Subtitle"
    }

    title_list_book = [main_title, subtitle]

    subjects_dict = {}
    dates_dict = {
            "date": eoa_publication_info["Datepublished"],
            "dateType": "Issued"
    }

    # every chapter points back to the book with these two entries
    is_part_of_dict = {
        "schemeUri": None,
        "schemeType": None,
        "relationType": "IsPartOf",
        "relatedIdentifier": doipublication,
        "resourceTypeGeneral": "Text",
        "relatedIdentifierType": "DOI",
        "relatedMetadataScheme": None
    }

    container_dict = {
        "type": "Book",
        "identifier": doipublication,
        "identifiertype": "DOI",
    }

    # the book lists every chapter as a part and names the chapter
    # titles in its table of contents
    chapter_list = []
    chapter_titles = []
    for chapter in chapter_files:
        chapter_list.append({
            "schemeUri": None,
            "schemeType": None,
            "relationType": "HasPart",
            "relatedIdentifier": apply_test_prefix(chapter[8].rstrip()),
            "resourceTypeGeneral": "Text",
            "relatedIdentifierType": "DOI",
            "relatedMetadataScheme": None
        })
        chapter_titles.append(format_title(f"{chapter[11]} {chapter[1]}".strip(), unformatted=True))
    chapter_title_string = "<br/>".join(chapter_titles)

    # licence code from the database -> (rights, rightsUri, rightsIdentifier)
    known_licences = {
        "by-nc-sa": (
            "Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Germany",
            "https://creativecommons.org/licenses/by-nc-sa/3.0/de/legalcode",
            # the identifier has to name the same licence as text and
            # URI, i.e. the 3.0 Germany port and not 4.0 International
            "cc-by-nc-sa-3.0-de"
        ),
        "by-sa": (
            "Creative Commons Attribution-ShareAlike 4.0 International",
            "https://creativecommons.org/licenses/by-sa/4.0/legalcode",
            "cc-by-sa-4.0"
        )
    }

    publication_licence = eoa_publication_info["Publicationlicense"]
    if publication_licence not in known_licences:
        logging.error("No rights statement defined for licence '%s'. Exiting.", publication_licence)
        sys.exit(1)
    rights_string, rights_uri, rights_id = known_licences[publication_licence]

    rights_dict = {
          "rights": rights_string,
          "rightsUri": rights_uri,
          "schemeUri": "https://spdx.org/licenses/",
          "rightsIdentifier": rights_id,
          "rightsIdentifierScheme": "SPDX"
    }

    abstract_dict = {
        "lang": eoa_publication_info["Language"],
          "description": eoa_publication_info["Descriptionlong"],
          "descriptionType": "Abstract"
    }
    toc_dict = {
        "lang": eoa_publication_info["Language"],
          "description": chapter_title_string,
          "descriptionType": "TableOfContents"
    }

    attributes_dict_book = {
        "event" : "hide",
        "doi" : doipublication,
        "creators" : creator_list_book,
        "titles" : title_list_book,
        "publisher" : item_for_template["mainorg"],
        "publicationYear" : int(item_for_template["pubdate"]),
        "subjects" : [subjects_dict],
        "dates" : [dates_dict],
        "language" : eoa_publication_info["Language"],
        "types" : get_types_dict("book"),
        "relatedIdentifiers" : chapter_list,
        "sizes" : [],
        "formats" : [],
        "version" : None,
        "rightsList" : [rights_dict],
        "descriptions" : [abstract_dict, toc_dict],
        "geoLocations": [],
        "fundingReferences": [],
        "url": item_for_template["url"],
        "contentUrl": None,
        "metadataVersion": 4,
        "schemaVersion": "http://datacite.org/schema/kernel-4"
    }

    data_dict = {
        "id" : doipublication,
        "type" : "dois",
        "attributes" : attributes_dict_book
    }

    json_file = str(out_dir) + f"/{eoa_publication}.json"
    write_json_file(json.dumps({"data" : data_dict}), json_file)

    # the context manager closes the script even if a chapter fails
    with open(batch_filename, "w") as command_file:
        command_file.write("#!/bin/bash\n")
        command_file.write(f"{curl_command} -d @{json_file} {api_url}\n")

        for chaptercounter, chapter in enumerate(chapter_files, start=1):
            # chapters without authors of their own get the book authors
            list_of_authors = format_authors(chapter, "chapter")[1]
            if len(list_of_authors) == 0:
                list_of_authors = creators

            doichapter = apply_test_prefix(chapter[8].rstrip())

            title_chapter = {
                "lang": chapter[9],
                "title": format_title(chapter[1].rstrip(), unformatted=True)
            }

            sections = get_sections(chapter[10], eoa_cursor, eoaclassic)
            section_title_string = "<br/>".join(
                format_title(f"{section[0]} {section[1]}".strip(), unformatted=True)
                for section in sections)

            if len(section_title_string) > 0:
                description_list_chapter = [{
                    "lang": chapter[9],
                    "description": section_title_string,
                    "descriptionType": "TableOfContents"
                }]
            else:
                description_list_chapter = None

            attributes_dict_chapter = {
                "event" : "hide",
                "doi" : doichapter,
                "creators" : [create_creator_dict(creator) for creator in list_of_authors],
                "titles" : [title_chapter],
                "publisher" : item_for_template["mainorg"],
                "publicationYear" : int(item_for_template["pubdate"]),
                "language" : chapter[9],
                "dates" : [dates_dict],
                "types" : get_types_dict("chapter"),
                "relatedIdentifiers" : [is_part_of_dict],
                "container" : container_dict,
                "sizes" : [],
                "formats" : [],
                "version" : None,
                "rightsList" : [rights_dict],
                "descriptions" : description_list_chapter,
                "geoLocations": [],
                "fundingReferences": [],
                "url": f"{item_for_template['url']}{chapter[0]}/index.html",
                "schemaVersion": "http://datacite.org/schema/kernel-4"
            }

            data_dict = {
                "id" : doichapter,
                "type" : "dois",
                "attributes" : attributes_dict_chapter
            }

            json_file = out_dir / f"{eoa_publication}_chap{chaptercounter:02}.json"
            write_json_file(json.dumps({"data" : data_dict}), json_file)
            command_file.write(f"{curl_command} -d @{json_file} {api_url}\n")

    if use_test_repo:
        logging.info("Used the prefix for the test repository. Upload to api.test.datacite.org")
    else:
        logging.info("Used the prefix for the live repository. Upload to api.datacite.org")
# def create_datacite_json ends here


def write_json_file(json_object, filename):
    """Write JSON file to disk

    Arguments:
    json_object -- the already serialised JSON document, as a string
    filename -- path of the file to write; an existing file is overwritten
    """

    # the context manager closes the file even if writing fails
    with open(filename, "w") as outfile:
        outfile.write(json_object)
    logging.debug("Wrote %s", filename)
# def write_json_file ends here


def create_bibtex(eoa_publication, eoaclassic, eoa_cursor, server_data, out_dir):
    """Use the database infos to create BibTeX files

    Writes three files into ``out_dir``:
    <publication>.bib -- one entry for the book, one for each chapter
    <publication>-citations.md -- pandoc input that cites all entries
    <publication>.md -- one line "citekey: formatted citation" per entry,
        taken from the HTML pandoc renders

    pandoc is run in the current working directory: it writes tmp.html
    there (removed afterwards) and reads data/eoasc.csl relative to it.

    Arguments:
    eoa_publication -- publication name; used as cite key of the book and
        for the output file names
    eoaclassic -- handed through unchanged to the database helpers
    eoa_cursor -- database cursor, handed through to the database helpers
    server_data -- handed to format_publication_info
    out_dir -- output directory
    """

    import html
    import subprocess
    from bibtexparser.bwriter import BibTexWriter
    from bibtexparser.bibdatabase import BibDatabase
    from lxml import html as lxhtml
    from lxml.html import clean

    db = BibDatabase()

    eoa_pub_id = get_publication_id(eoa_publication, eoa_cursor, eoaclassic)
    eoa_publication_info = get_publication_info(eoa_pub_id, eoa_cursor, eoaclassic)[0]
    item_for_template = format_publication_info(eoa_publication_info, server_data)

    creators = format_authors(eoa_publication_info, "book")[1]

    # title and subtitle of the book, shared by the book entry and the
    # booktitle field of every chapter entry
    full_book_title = f"{item_for_template['booktitle']}{format_title(eoa_publication_info['Subtitle'], is_book_subtitle=True, unformatted=True)[1:]}"

    main_item = {
        "location": "Berlin",
        "title": full_book_title,
        "url": item_for_template["url"],
        # "series": item_for_template["series"],
        # "number": str(item_for_template["number"]),
        "publisher": item_for_template["mainorg"],
        "author": " and ".join(creators),
        # "urldate": datetime.datetime.today().strftime("%Y-%m-%d"),
        "date": item_for_template["pubdate"],
        "langid": eoa_publication_info["Language"],
        "doi": eoa_publication_info['Doipublication'],
        "ID": eoa_publication,
        "ENTRYTYPE": "book",
    }

    db.entries = [main_item]
    citekeys = [eoa_publication]

    chapter_files = get_chapters(eoa_pub_id, eoa_cursor, eoaclassic, include_pdfless=True)
    for chaptercounter, chapter in enumerate(chapter_files, start=1):
        # chapters without authors of their own get the book authors
        chapter_creators = format_authors(chapter, "chapter")[1]
        if len(chapter_creators) == 0:
            chapter_creators = creators

        citekey = f"{eoa_publication}_chap{chaptercounter:02}"

        chap_item = {
            "date": item_for_template["pubdate"],
            "location": "Berlin",
            "title": "{{" + format_title(chapter[1].rstrip()) + "}}",
            "booktitle": "{{" + full_book_title + "}}",
            "url": f"{item_for_template['url']}{chapter[0]}/index.html",
            # "series": item_for_template["series"],
            # "number": str(item_for_template["number"]),
            "publisher": item_for_template["mainorg"],
            "author": " and ".join(chapter_creators),
            # "urldate": datetime.datetime.today().strftime("%Y-%m-%d"),
            "langid": chapter[9],
            "doi": chapter[8],
            # "crossref": eoa_publication,
            "ID": citekey,
            "ENTRYTYPE": "incollection",
        }

        db.entries.append(chap_item)
        citekeys.append(citekey)

    # YAML header that makes pandoc print every entry without citing it
    nocite_list = ", ".join(f"@{key}" for key in citekeys)
    nocitestring = f"---\ntitle: Citations\nnocite: |\n    {nocite_list}\n---\n"

    writer = BibTexWriter()
    bib_filename = str(out_dir) + f"/{eoa_publication}.bib"
    with open(bib_filename, 'w') as bibfile:
        bibfile.write(writer.write(db))
    logging.info("Wrote %s", bib_filename)

    citations_filename = str(out_dir) + f"/{eoa_publication}-citations.md"
    with open(citations_filename, 'w') as citations_file:
        citations_file.write(nocitestring)
    logging.info("Wrote %s", citations_filename)

    # an argument list instead of a split command string, so that paths
    # containing spaces reach pandoc in one piece
    pandoc_command_arguments = [
        "pandoc", "-s",
        "-o", "tmp.html",
        "-t", "html",
        "--citeproc",
        "--csl=data/eoasc.csl",
        f"--bibliography={bib_filename}",
        citations_filename,
    ]
    subprocess.check_output(pandoc_command_arguments)
    pandoc_html = lxhtml.parse("tmp.html").getroot()
    entries = pandoc_html.find_class("csl-entry")

    # one cleaner is enough for all entries
    tag_remover = clean.Cleaner(remove_tags=["span", "div"])

    md_filename = str(out_dir) + f"/{eoa_publication}.md"
    with open(md_filename, 'w') as mdfile:
        for entry in entries:
            # drop the first four characters of the id to get the cite key
            citekey = entry.get('id')[4:]
            clean_string = tag_remover.clean_html(entry)
            # cut off the wrapper element the cleaner leaves around the text
            citestring = lxhtml.tostring(clean_string)[5:-7].decode('utf-8')
            mdfile.write(f"{citekey}: {html.unescape(citestring)} \n")
    logging.info("Wrote %s", md_filename)
    os.unlink("tmp.html")
# def create_bibtex ends here


def create_xhtml(eoa_publication, edited, eoaclassic, eoa_cursor, server_data, out_dir):
    """Use the database infos for creating an XHTML file for pro business

    Writes <publication>.html into ``out_dir``. The file holds an ordered
    list with one item per chapter: the chapter title, and for edited
    volumes also the authors in italics.

    Arguments:
    eoa_publication -- publication name; used for the output file name
    edited -- if true, the authors are added to each list item
    eoaclassic -- handed through unchanged to the database helpers
    eoa_cursor -- database cursor, handed through to the database helpers
    server_data -- handed to format_publication_info
    out_dir -- output directory

    Exits the program if the publication has no chapters.
    """

    eoa_pub_id = get_publication_id(eoa_publication, eoa_cursor, eoaclassic)
    eoa_publication_info = get_publication_info(eoa_pub_id, eoa_cursor, eoaclassic)[0]

    # NOTE(review): the result is not used below; the call is kept in case
    # which_publisher also validates the series -- confirm and remove.
    base_url = which_publisher(eoa_publication_info["Serie"])

    chapter_files = get_chapters(eoa_pub_id, eoa_cursor, eoaclassic, include_pdfless=True)

    if len(chapter_files) == 0:
        logging.error("Found no chapter files. Exiting.")
        sys.exit()

    logging.debug("Found %d chapter files.", len(chapter_files))

    # the same for every chapter, so it is fetched only once
    item_for_template = format_publication_info(eoa_publication_info, server_data)

    list_items = []
    for chapter in chapter_files:
        chapter_title = unescape(chapter[1].replace("<br/>", " "))

        # chapters without authors of their own get the book authors
        authors_line = format_chapter_info(chapter)[1]
        if len(authors_line) == 0:
            authors_line = item_for_template["bookauthors"]

        if edited:
            list_items.append("<li>{} (<i>{}</i>)</li>\n".format(chapter_title, authors_line))
        else:
            list_items.append("<li>{}</li>\n".format(chapter_title))

    xhtml_string = "<ol>\n" + "".join(list_items) + "</ol>\n"

    html_filename = str(out_dir) + f"/{eoa_publication}.html"
    # the context manager closes the file even if writing fails
    with open(html_filename, "w") as outfile:
        outfile.write(xhtml_string)
    logging.debug("Wrote %s", html_filename)
# def create_xhtml ends here

def main():
    """Parse the command line, read the configuration and create the requested output

    Exactly one of the database switches (--postgres, --sqlite) has to be
    given, and one of the output formats (-j, -b, -p, -o, -x). The
    program exits with an error message if the configuration cannot be
    read, no database is selected, the publication name is invalid or no
    output format is selected.
    """

    parser = argparse.ArgumentParser()
    parser.add_argument("publication", help="Series and number of publication without space (e.g. sources7).")
    parser.add_argument("-f", "--config", help="Config file.", default = BASE_DIR / "config.cfg",)
    parser.add_argument("-k", "--keep", help="Keep the source files for manual intervention.", action="store_true")
    parser.add_argument("-r", "--remove", help="Remove the first page in case the chapter already contains an info page.", action="store_true")
    parser.add_argument("-e", "--edited", help="If set, the publication is treated as edited volume. This prints authors' names in the xhtml mode.", action="store_true")
    parser.add_argument("-c", "--classic", help="Use EOA 1.0 database layout.", action="store_true")
    parser.add_argument("-t", "--test", help="Use test prefix for DOIs.", action="store_true")
    db_group = parser.add_mutually_exclusive_group()
    db_group.add_argument("--postgres", action="store_true", help="Use postgres database.")
    db_group.add_argument("--sqlite", action="store_true", help="Use sqlite database.")
    format_group = parser.add_mutually_exclusive_group()
    format_group.add_argument("-j", "--json", action="store_true", help="DataCite JSON format.")
    format_group.add_argument("-b", "--bibtex", action="store_true", help="BibTeX format.")
    format_group.add_argument("-p", "--pdf", action="store_true", help="Frontpage for chapter download as PDF.")
    format_group.add_argument("-o", "--omp", action="store_true", help="ONIX metadata format.")
    format_group.add_argument("-x", "--xhtml", action="store_true", help="XHTML list of chapters.")

    args = parser.parse_args()

    CONFIG = configparser.ConfigParser()
    try:
        files_read = CONFIG.read(args.config)
    except (configparser.Error, UnicodeDecodeError) as err:
        logging.error("Config file %s could not be parsed: %s. Exiting.", args.config, err)
        sys.exit(1)

    # read() skips files it cannot open without raising, so a missing
    # file only shows up as an empty list of files read
    if not files_read:
        logging.error("Config file missing. Exiting.")
        sys.exit(1)

    production_url = CONFIG["server"]["production_url"].replace('"', "")
    media_dir = CONFIG["server"]["media_dir"].replace('"', "")

    server_data = {"url" : production_url, "media" : media_dir}

    # first and last character are dropped (presumably surrounding quotes)
    output_directory = CONFIG["output"]["output_directory"][1:-1]
    out_dir = Path(output_directory).resolve()

    # also creates missing parent directories
    out_dir.mkdir(parents=True, exist_ok=True)

    if args.sqlite:
        DB_FORMAT = "sqlite"
    elif args.postgres:
        DB_FORMAT = "postgres"
    else:
        logging.error("Invalid database format specified. Exiting.")
        sys.exit(4)

    # validate input
    check_publication(args.publication)

    db_cursor = get_db_cursor(DB_FORMAT, CONFIG)

    if args.pdf:
        create_chapter_frontmatter(args.publication, args.remove, args.keep, args.classic, db_cursor, server_data, out_dir)
    elif args.json:
        create_datacite_json(args.publication, args.classic, db_cursor, server_data, args.test, CONFIG, out_dir)
    elif args.omp:
        create_omp_native_xml(args.publication, args.classic, db_cursor, server_data, out_dir)
    elif args.bibtex:
        create_bibtex(args.publication, args.classic, db_cursor, server_data, out_dir)
    elif args.xhtml:
        create_xhtml(args.publication, args.edited, args.classic, db_cursor, server_data, out_dir)
    else:
        logging.error("Specify whether you want the output as a PDF, OMP or XHTML input. Exiting.")
        sys.exit()
# def main ends here

# Run the command line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()
# finis