Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
eoa_metadator/build_frontmatter.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
632 lines (470 sloc)
21.8 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python3 | |
# -*- coding: utf-8; mode: python -*- | |
__version__ = "1.0" | |
__date__ = "20170208" | |
__author__ = "kthoden@mpiwg-berlin.mpg.de" | |
__doc__ = """Generates LaTeX code for informative frontmatters. These | |
will be attached to the chapter PDFs of EOA publications we offer for | |
download""" | |
import string | |
import sys | |
import re | |
import os | |
from datetime import datetime | |
import psycopg2 | |
import psycopg2.extras | |
# using https://wiki.postgresql.org/wiki/Psycopg2_Tutorial | |
# A publication name is one of the four series keywords directly followed by
# a one- or two-digit number without leading zero, e.g. "studies1" or
# "sources12". The named groups "series" and "number" are read by the callers.
PUBLICATION_PATTERN = re.compile("(?P<series>studies|sources|proceedings|textbooks)(?P<number>[1-9]?[0-9]{1})$")
# Directory prefix put in front of chapter file paths in the generated
# copy script.
MEDIA_DIR = "/home/editionopenaccess/eoa/website/website/media/"
# Host from which cover images and chapter PDFs are downloaded.
PRODUCTION_URL = "http://eoa-production.rz-berlin.mpg.de"
def print_error(message):
    """Write ``message`` to stderr as one line prefixed with ``[ERROR]``."""
    sys.stderr.write("[ERROR] %s\n" % message)
# def print_error ends here
def check_publication(input_string):
    """Check that ``input_string`` is a valid publication name.

    A valid name matches PUBLICATION_PATTERN. On success a confirmation is
    written to stdout and True is returned. On failure an error is written
    to stderr and the program is terminated.

    Raises:
        SystemExit: with exit status 1 if the name is not valid.
    """
    if re.match(PUBLICATION_PATTERN, input_string) is None:
        print_error("Name of publication %s is not valid. Exiting." % input_string)
        # A bare sys.exit() would report success (status 0) to the shell.
        sys.exit(1)
    sys.stdout.write("Name of the publication %s is valid.\n" % input_string)
    return True
# def check_publication ends here
def connect_db():
    """Connect to the database.

    Returns:
        The open psycopg2 connection, or None if the connection could not
        be established (an error message is written to stderr in that case).
    """
    try:
        connection = psycopg2.connect("dbname='eoa_2017' user='kthoden' host='localhost'")  # password=''
    except psycopg2.Error as error:
        # Only database errors are handled here; a bare "except:" would also
        # swallow KeyboardInterrupt and SystemExit. print_error() already
        # adds the "[ERROR]" prefix and the trailing newline.
        print_error("Could not connect: %s" % str(error).strip())
        return None
    sys.stdout.write("Connection established.\n")
    return connection
# def connect_db ends here
def get_publication_id(input_string, eoa_cursor):
    """Query the database for the id of a publication.

    Args:
        input_string: publication name such as "studies1"; it is split into
            series and number with PUBLICATION_PATTERN.
        eoa_cursor: open database cursor.

    Returns:
        The value of the "id" column of the single matching row.

    Raises:
        SystemExit: with exit status 1 if the name is not valid, if no
            publication matches, or if more than one publication matches.
    """
    id_match = re.match(PUBLICATION_PATTERN, input_string)
    if id_match is None:
        print_error("Name of publication %s is not valid. Exiting." % input_string)
        sys.exit(1)
    eoa_series = id_match.group('series')
    eoa_number = id_match.group('number')

    # Let the database driver quote the values instead of building the SQL
    # text with string interpolation.
    query_string = """SELECT "Title", "id" FROM publications_publication WHERE "Serie" = %s AND "Number" = %s"""
    eoa_cursor.execute(query_string, (eoa_series, eoa_number))
    rows = eoa_cursor.fetchall()

    if len(rows) == 0:
        print_error("It seems like there is no such as publication %s %s. Exiting." % (eoa_series.title(), eoa_number))
        sys.exit(1)
    if len(rows) > 1:
        # Continuing here would hand None to the callers as publication id.
        print_error("There should be only one database entry that matches the input. Found %s" % len(rows))
        sys.exit(1)

    print("The title of the publication you selected is '%s'." % rows[0][0])
    return rows[0][1]
# def get_publication_id ends here
def get_publication_info(eoa_pub_id, eoa_cursor):
    """Fetch all columns of one publication.

    Args:
        eoa_pub_id: value of the "id" column of the publication.
        eoa_cursor: open database cursor.

    Returns:
        The list of rows returned by the cursor for that id.
    """
    # The id is passed as a query parameter so that the driver does the
    # quoting, rather than interpolating it into the SQL text.
    query_string = """SELECT * FROM publications_publication WHERE "id" = %s"""
    eoa_cursor.execute(query_string, (eoa_pub_id,))
    return eoa_cursor.fetchall()
# def get_publication_info ends here
def get_chapters(eoa_pub_id, eoa_cursor):
    """Query the database for the chapters of one publication.

    Args:
        eoa_pub_id: id of the publication the chapters belong to.
        eoa_cursor: open database cursor.

    Returns:
        A list of rows ordered by "Order". Each row holds, by position:
        order, title, chapter authors 1-5 and the chapter PDF file.
        Chapters without a PDF file are reported on stderr and left out.
    """
    # The id is passed as a query parameter, not interpolated into the SQL.
    query_string = """SELECT "Order", "Title", "Chapterauthor1",
    "Chapterauthor2", "Chapterauthor3", "Chapterauthor4",
    "Chapterauthor5", "Chapterpdffile" FROM publications_chapter WHERE "Publication_id"
    = %s ORDER BY "Order" ASC"""
    eoa_cursor.execute(query_string, (eoa_pub_id,))

    chapters_with_file = []
    for row in eoa_cursor.fetchall():
        # Truthiness covers both an empty string and a NULL column (None),
        # for which len() would raise TypeError.
        if row[7]:
            chapters_with_file.append(row)
        else:
            print_error("There seems to be no file attached to chapter '%s'. Removing it." % row[1])
    return chapters_with_file
# get_chapters ends here
def format_authors(result_list, start_range, end_range):
    """Collect the author names from a slice of a result row.

    The entries of ``result_list`` from index ``start_range`` up to, but not
    including, ``end_range`` are inspected; empty entries are skipped.

    Returns a tuple of the names joined for display ("A", "A and B",
    "A, B and C", or "" if there is no author) and the plain list of names.
    """
    names = [result_list[position]
             for position in range(start_range, end_range)
             if len(result_list[position]) > 0]

    how_many = len(names)
    if how_many > 2:
        leading = """%s""" % names[0]
        middle = "".join(", " + name for name in names[1:-1])
        joined = leading + middle + " and %s" % (names[-1])
    elif how_many == 2:
        joined = """%s and %s""" % (names[0], names[1])
    elif how_many == 1:
        joined = """%s""" % (names[0])
    else:
        joined = ""
    return (joined, names)
# def format_authors ends here
def format_authors_xml(eoa_publication_info):
    """Render the publication authors as XML <author> elements.

    The keys "Publicationauthor1" to "Publicationauthor5" of
    ``eoa_publication_info`` are read in that order; empty values are
    skipped. The first space-separated part of a name is used as first
    name, the last part as last name.

    Returns the concatenated elements as one string ("" if there is no
    author).
    """
    author_template = """
<author primary_contact="true" user_group_ref="Author">
<firstname>%s</firstname>
<lastname>%s</lastname>
<email>author@example.com</email>
</author>"""

    xml_parts = []
    for position in (1, 2, 3, 4, 5):
        full_name = eoa_publication_info["Publicationauthor%d" % position]
        if len(full_name) > 0:
            name_tokens = full_name.split(" ")
            xml_parts.append(author_template % (name_tokens[0], name_tokens[-1]))
    return "".join(xml_parts)
# def format_authors_xml ends here
def format_title(title_string, is_book_subtitle=False, unformatted=False):
    """Turn the HTML markup of a title into LaTeX, or strip it.

    "<br/>" becomes a space and character references are resolved with
    unescape(). A non-empty book subtitle (``is_book_subtitle=True``) gets
    " : " put in front. The tags <em>, <sub> and <b> are converted to
    \\emph, \\textsubscript and \\textbf, or simply removed (keeping their
    content) when ``unformatted`` is True.
    """
    # (tag pattern, LaTeX replacement), applied in this order
    tag_rules = (
        (re.compile("<em>(.*?)</em>"), r"\\emph{\g<1>}"),
        (re.compile("<sub>(.*?)</sub>"), r"\\textsubscript{\g<1>}"),
        (re.compile("<b>(.*?)</b>"), r"\\textbf{\g<1>}"),
    )

    converted = unescape(title_string.replace("<br/>", " "))
    if is_book_subtitle is True and len(title_string) > 0:
        converted = r" : " + converted

    for tag_pattern, latex_form in tag_rules:
        replacement = r"\g<1>" if unformatted is True else latex_form
        converted = re.sub(tag_pattern, replacement, converted)
    return converted
# def format_title ends here
def unescape(text):
    """Replace HTML or XML character references and entities in ``text``.

    Numeric references (decimal "&#65;" and hexadecimal "&#x41;" /
    "&#X41;") and named entities ("&amp;") are replaced by the character
    they stand for. References that cannot be resolved (unknown name,
    malformed or out-of-range number) are left as they are.

    With thanks to http://effbot.org/zone/re-sub.htm#unescape-html.
    Modified to work with Python3.
    """
    import html.entities

    def fixup(character):
        """Return the replacement for one matched reference."""
        entity = character.group(0)
        if entity[:2] == "&#":
            # character reference
            try:
                if entity[:3].lower() == "&#x":
                    return chr(int(entity[3:-1], 16))
                return chr(int(entity[2:-1]))
            except (ValueError, OverflowError):
                # ValueError: not a number or beyond the Unicode range.
                # OverflowError: number too large for chr() to handle.
                return entity  # leave as is
        # named entity
        try:
            return chr(html.entities.name2codepoint[entity[1:-1]])
        except KeyError:
            return entity  # leave as is

    return re.sub(r"&#?\w+;", fixup, text)
# def unescape ends here
def format_chapter_info(chapter_result):
    """Return the LaTeX-formatted title and the authors line of a chapter.

    ``chapter_result`` is one chapter row: the title is taken from index 1
    (trailing whitespace removed), the authors from indices 2 to 6.
    """
    title_latex = format_title(chapter_result[1].rstrip())
    joined_authors = format_authors(chapter_result, 2, 7)[0]
    authors_line = joined_authors if len(joined_authors) > 0 else ""
    return (title_latex, authors_line)
# def format_chapter_info ends here
def which_publisher(series):
    """Select base URL and publisher name for a series.

    The series "sources" has its own site and publisher name; every other
    value gets the Edition Open Access defaults.

    Returns the tuple (base_url, publisher_name).
    """
    if series != "sources":
        return ("http://edition-open-access.de",
                r"Max Planck Research Library for the History and Development\\of Knowledge")
    return ("http://edition-open-sources.org", "Edition Open Sources")
# def which_publisher ends here
def choose_geometry(eoa_series):
    """Select page geometry and font size for a series.

    Args:
        eoa_series: capitalised series name: "Sources", "Studies",
            "Proceedings" or "Textbooks".

    Returns:
        The tuple (geometry_string, fontsize_string): the LaTeX
        \\usepackage line for the geometry package and a "fontsize=..."
        option.

    Raises:
        ValueError: if ``eoa_series`` is not one of the known series.
    """
    small_format = (
        r"\usepackage[paperwidth=148mm,paperheight=210mm,inner=20mm,outer=15mm,top=13mm,bottom=15mm,includehead]{geometry}",
        "fontsize=9pt",
    )
    layouts = {
        "Sources": (
            r"\usepackage[a4paper,inner=30mm,outer=30mm,top=14mm,bottom=20mm,includehead]{geometry}",
            "fontsize=12pt",
        ),
        "Studies": (
            r"\usepackage[paperwidth=170mm,paperheight=240mm,inner=22mm,outer=20mm,top=14mm,bottom=20mm,includehead]{geometry}",
            "fontsize=10pt",
        ),
        "Proceedings": small_format,
        "Textbooks": small_format,
    }
    if eoa_series not in layouts:
        # Without this check an unknown series would surface as an
        # UnboundLocalError on the geometry string.
        raise ValueError("Unknown series %r, expected one of: %s"
                         % (eoa_series, ", ".join(sorted(layouts))))
    return layouts[eoa_series]
# def choose_geometry ends here
def format_publication_info(eoa_publication_info):
    """Build the template values that describe a publication.

    ``eoa_publication_info`` is one publication row that is read by column
    name. As a side effect the cover image of the publication is downloaded
    through download_cover_image().

    Returns a dictionary with the keys bookauthors, pubsuffix, booktitle,
    booksubtitle, publisher, series, number, isbn, pubdate, url, licence
    and shoplink.
    """
    info = eoa_publication_info

    base_url, publisher_string = which_publisher(info["Serie"])
    download_cover_image("%s/media/%s" % (PRODUCTION_URL, info["Coverbig"]))
    publication_url = "%s/%s/%s/" % (base_url, info["Serie"], info["Number"])

    # the suffix is separated from the authors by a space, if there is one
    author_suffix = info["Publicationauthorsuffix"]
    pub_suffix = " " + author_suffix if len(author_suffix) > 0 else ""

    licence_string = format_licence(info["Publicationlicense"])
    shoplink_string = format_shoplink(info['Shoplink'])
    book_authors_string = format_authors_xml(info)

    items_to_return = {}
    items_to_return["bookauthors"] = book_authors_string
    items_to_return["pubsuffix"] = pub_suffix
    items_to_return["booktitle"] = format_title(info["Title"])
    items_to_return["booksubtitle"] = format_title(info["Subtitle"], is_book_subtitle=True, unformatted=True)
    items_to_return["publisher"] = publisher_string
    items_to_return["series"] = info["Serie"].title()
    items_to_return["number"] = info["Number"]
    items_to_return["isbn"] = info["Isbn"]
    items_to_return["pubdate"] = info["Datepublished"].strftime("%Y-%m-%d")
    items_to_return["url"] = publication_url
    items_to_return["licence"] = licence_string
    items_to_return["shoplink"] = shoplink_string
    return items_to_return
# def format_publication_info ends here
def format_licence(publication_licence):
    """Return the LaTeX text that names the licence of a publication.

    Only "by-nc-sa" produces a licence statement with its URL; for any
    other value just the closing full stop is returned.
    """
    if publication_licence != "by-nc-sa":
        return "."
    return r" under Creative Commons by-nc-sa 3.0 Germany Licence.\\\url{http://creativecommons.org/licenses/by-nc-sa/3.0/de/}"
# def format_licence ends here
def format_shoplink(input_string, raw=False):
    """Parse the shoplink entry, an HTML link of the form <a href="URL">COMPANY</a>.

    With ``raw=False`` a LaTeX line naming the print-on-demand company and
    the shop URL is returned; it is empty unless the company is "epubli.de"
    or "pro-business.com". Otherwise the tuple (shop_url, company) is
    returned, where both values are "k.A." if the entry is not such a link.
    """
    link_pattern = re.compile('<a href="(?P<book_url>.*?)">(?P<company>.*?)</a>')
    link_match = re.match(link_pattern, input_string)

    if link_match is None:
        shop_url = company = "k.A."
    else:
        shop_url = link_match.group('book_url')
        company = link_match.group('company')

    shoplink_line = ""
    if company == "epubli.de":
        shoplink_line = r"Neopubli GmbH, Berlin\par\url{%s}" % shop_url
    elif company == "pro-business.com":
        shoplink_line = r"PRO BUSINESS digital printing Deutschland GmbH, Berlin\par\url{%s}" % shop_url

    return shoplink_line if raw == False else (shop_url, company)
# def format_shoplink ends here
def download_cover_image(image_url):
    """Fetch ``image_url`` and store it as "Coverimage.jpg" in the current directory.

    Code from
    https://stackoverflow.com/questions/8286352
    """
    from urllib.request import urlretrieve

    target_path = "./Coverimage.jpg"
    urlretrieve(image_url, target_path)
# def download_cover_image
def download_chapter_pdf(chapter_url, destination):
    """Download one chapter PDF from the website.

    Args:
        chapter_url: URL of the chapter file; spaces are sent as "%20".
        destination: file name in the current directory to store it under.

    Nothing is returned. An HTTP error is reported on stderr and does not
    stop the program.
    """
    import urllib.error
    import urllib.request

    try:
        urllib.request.urlretrieve(chapter_url.replace(" ", "%20"), "./" + destination)
    except urllib.error.HTTPError as error:
        # Report the status the server actually sent instead of assuming
        # that every failure is a 403.
        print_error("Program received an HTTP Error %s: %s for %s. Maybe there are no chapter files?"
                    % (error.code, error.reason, chapter_url))
# def download_chapter_pdf
def file_base64(filepath):
    """Return the Base64 encoding of the content of a file.

    The result is a bytes object with a newline after every 76 characters
    of output and at the end.

    https://code.tutsplus.com/tutorials/base64-encoding-and-decoding-using-python--cms-25588
    """
    import base64

    with open(filepath, "rb") as binary_file:
        # encodebytes() is the successor of encodestring(), which is gone
        # since Python 3.9; the output is the same.
        return base64.encodebytes(binary_file.read())
# def file_base64 ends here
def run_latex(command):
    """Run the LaTeX ``command`` line and wait for it to finish.

    The command string is split into arguments the way a shell would do
    it; the exit status of the program is not evaluated.
    """
    from shlex import split
    from subprocess import call

    call(split(command))
# def run_latex ends here
def add_pdf_info(pdf_filename, list_of_authors, title_for_pdf, subject_string):
    """Put the frontmatter page in front of a chapter PDF and set its metadata.

    Reads "<pdf_filename>_orig.pdf" and "<pdf_filename>_frontmatter.pdf"
    and writes the first frontmatter page followed by all pages of the
    original to "<pdf_filename>_out.pdf". The document info of the original
    is copied; /Title, /Subject and /Author are then set from the
    arguments. Finally the "_orig" file is deleted and the "_out" file is
    renamed to "<pdf_filename>.pdf".

    Args:
        pdf_filename: file name without the ".pdf" extension.
        list_of_authors: string for the /Author entry.
        title_for_pdf: string for the /Title entry.
        subject_string: string for the /Subject entry.
    """
    # code taken from http://kitchingroup.cheme.cmu.edu/blog/2013/06/13/Reading-and-writing-pdf-metadata/
    from PyPDF2 import PdfFileWriter, PdfFileReader
    from PyPDF2.generic import NameObject, createStringObject

    # Both input files are closed again even if PyPDF2 raises.
    with open(pdf_filename + '_orig.pdf', 'rb') as original_pdf, \
            open(pdf_filename + '_frontmatter.pdf', 'rb') as frontmatter_pdf:
        pdf_orig = PdfFileReader(original_pdf)
        pdf_frontmatter = PdfFileReader(frontmatter_pdf)

        writer = PdfFileWriter()
        writer.addPage(pdf_frontmatter.getPage(0))
        for page in range(pdf_orig.getNumPages()):
            writer.addPage(pdf_orig.getPage(page))

        info_dict = writer._info.getObject()
        info = pdf_orig.documentInfo
        # NOTE(review): documentInfo is expected to be None for a PDF
        # without an info dictionary - confirm against the PyPDF2 version in use.
        if info is not None:
            for key in info:
                info_dict.update({NameObject(key): createStringObject(info[key])})
        info_dict.update({NameObject('/Title'): createStringObject(title_for_pdf)})
        info_dict.update({NameObject('/Subject'): createStringObject(subject_string)})
        info_dict.update({NameObject('/Author'): createStringObject(list_of_authors)})

        # It does not appear possible to alter in place. The output is
        # written while the input files are still open.
        with open(pdf_filename + '_out.pdf', 'wb') as new_pdf:
            writer.write(new_pdf)

    os.unlink(pdf_filename + '_orig.pdf')
    os.rename(pdf_filename + '_out.pdf', pdf_filename + '.pdf')
# def add_pdf_info ends here
def create_chapter_frontmatter(eoa_publication):
    """Attach a frontmatter page to every chapter PDF of a publication.

    For each chapter with a PDF file the original is downloaded, a
    frontmatter page is typeset with xelatex from
    "./frontpage_template.tex", and both are combined by add_pdf_info().
    All files are produced in "./generated_files/", together with a shell
    script "<publication>_copycommand.sh" with the cp commands to install
    the new PDFs below MEDIA_DIR. Auxiliary files are removed at the end.

    Args:
        eoa_publication: publication name such as "studies1".

    Raises:
        SystemExit: if the name is not valid, the database cannot be
            reached or the publication has no chapter files.
    """
    import shlex

    # validate input
    check_publication(eoa_publication)

    # setting up database
    eoa_db = connect_db()
    if eoa_db is None:
        sys.exit(1)
    # format_publication_info() reads the publication row by column name,
    # the chapter rows are read by position; DictCursor rows allow both.
    eoa_cursor = eoa_db.cursor(cursor_factory=psycopg2.extras.DictCursor)
    eoa_pub_id = get_publication_id(eoa_publication, eoa_cursor)
    publication_row = get_publication_info(eoa_pub_id, eoa_cursor)[0]

    # the template file
    with open("./frontpage_template.tex", "r") as tmp_template:
        frontmatter_template_string = string.Template(tmp_template.read())

    chapter_files = get_chapters(eoa_pub_id, eoa_cursor)
    if len(chapter_files) == 0:
        print_error("Found no chapter files.")
        sys.exit("Exiting")
    print("Found %d chapter files." % (len(chapter_files)))

    os.chdir("./generated_files/")
    try:
        # Data that is the same for all chapters. This is done after the
        # chdir because format_publication_info() downloads the cover image
        # into the current directory.
        item_for_template = format_publication_info(publication_row)
        # item_for_template["bookauthors"] holds OMP XML, which is of no use
        # in LaTeX or PDF metadata, so a plain list of names is built here.
        book_authors = format_authors(
            [publication_row["Publicationauthor%d" % number] for number in range(1, 6)], 0, 5)[0]
        geometry_string, fontsize_string = choose_geometry(item_for_template["series"])

        list_of_auxfiles = []
        with open(eoa_publication + "_copycommand.sh", "w") as command_file:
            command_file.write("#!/bin/bash\n")

            for chapter in chapter_files:
                # get the original pdf file
                chapter_url = "%s/media/%s" % (PRODUCTION_URL, chapter[7])
                pdf_filename = chapter_url.split("/")[-1]
                pdffilename_front, pdffilename_ext = os.path.splitext(pdf_filename)
                original_pdf_file = pdffilename_front + "_orig" + pdffilename_ext
                list_of_auxfiles.append(pdffilename_front + '_frontmatter.pdf')
                print("download_chapter_pdf from", chapter_url)
                download_chapter_pdf(chapter_url, original_pdf_file)

                # getting data for the template
                formatted_chapter_title, authors_line = format_chapter_info(chapter)
                if len(authors_line) == 0:
                    authors_line = book_authors
                formatted_chapter_authors = r"\emph{%s:}" % authors_line

                # fill in the blanks
                frontmatter_replacement = frontmatter_template_string.substitute(
                    FONTSIZE=fontsize_string,
                    GEOMETRY_SETTINGS=geometry_string,
                    FORMATTED_CHAPTER_TITLE=formatted_chapter_title,
                    CHAPTER_AUTHORS_LINE=formatted_chapter_authors,
                    FORMAT_AUTHORS=book_authors,
                    FORMATTED_SHOPLINK=item_for_template["shoplink"],
                    LICENCE=item_for_template["licence"],
                    PUB_SUFFIX=item_for_template["pubsuffix"],
                    FORMAT_TITLE=item_for_template["booktitle"],
                    FORMAT_SUBTITLE=item_for_template["booksubtitle"].replace(" : ", "~:~"),
                    PUBLISHER_STRING=item_for_template["publisher"],
                    EOA_SERIES=item_for_template["series"],
                    SERIES_NUMBER=item_for_template["number"],
                    ISBN_CODE=item_for_template["isbn"],
                    PUB_DATE=item_for_template["pubdate"],
                    PUBLICATION_URL=item_for_template["url"])

                tex_filename = eoa_publication + "ch" + str(chapter[0]) + ".tex"
                with open("./" + tex_filename, "w") as outfile:
                    outfile.write(frontmatter_replacement)

                unformatted_chapter_title = format_title(chapter[1].rstrip(), unformatted=True)
                # NOTE(review): the values used to be picked from
                # item_for_template by position (2, 3, 0, 8, 4, 5, 6, 9),
                # which fails on a dictionary. The keys below assume that
                # those positions followed the key order of the dictionary
                # built by format_publication_info() - confirm.
                subject_string = "A chapter from %s%s by %s (%s). %s, %s %s. %s" % (
                    item_for_template["booktitle"],
                    item_for_template["booksubtitle"],
                    book_authors,
                    item_for_template["pubdate"],
                    item_for_template["publisher"],
                    item_for_template["series"],
                    item_for_template["number"],
                    item_for_template["url"])

                # generate PDF file
                print("Typsetting the frontmatter to chapter '%s'" % unformatted_chapter_title)
                latex_command = "xelatex --interaction=batchmode -jobname='%s_frontmatter' %s" % (pdffilename_front, tex_filename)
                run_latex(latex_command)
                add_pdf_info(pdffilename_front, authors_line, unformatted_chapter_title, subject_string)

                # Quote the paths: file names may contain spaces.
                media_path = MEDIA_DIR + chapter[7]
                command_file.write("cp %s %s\n" % (shlex.quote(media_path), shlex.quote(media_path + ".bak")))
                command_file.write("cp %s %s\n" % (shlex.quote(pdf_filename), shlex.quote(media_path)))

        print("Removing aux files.")
        for auxfile in os.listdir("."):
            # anchored, so that only the file extension is looked at
            if re.search(r'\.(aux|log|tex)$', auxfile):
                list_of_auxfiles.append(auxfile)
        for file_for_deletion in list_of_auxfiles:
            os.unlink(file_for_deletion)

        print("Removing other files.")
        os.unlink("Coverimage.jpg")
    finally:
        # back to normal, also if something went wrong on the way
        os.chdir("..")
# def create_chapter_frontmatter ends here
def create_omp_native_xml(eoa_publication):
    """Use the database infos for creating input for OMP.

    The template "./data/native_template.xml" is filled with the data of
    the publication and written to "./generated_files/<publication>.xml".
    As a side effect of format_publication_info() the cover image is
    downloaded into the current directory.

    Args:
        eoa_publication: publication name such as "studies1".

    Raises:
        SystemExit: if the name is not valid or the database cannot be
            reached.
    """
    # validate input
    check_publication(eoa_publication)

    # setting up database
    eoa_db = connect_db()
    if eoa_db is None:
        sys.exit(1)
    eoa_cursor = eoa_db.cursor(cursor_factory=psycopg2.extras.DictCursor)
    eoa_pub_id = get_publication_id(eoa_publication, eoa_cursor)
    eoa_publication_info = get_publication_info(eoa_pub_id, eoa_cursor)[0]

    # the template file
    with open("./data/native_template.xml", "r") as tmp_template:
        frontmatter_template_string = string.Template(tmp_template.read())

    item_for_template = format_publication_info(eoa_publication_info)
    supplierurl, suppliercomp = format_shoplink(eoa_publication_info["Shoplink"], raw=True)

    if len(eoa_publication_info["Subtitle"]) > 0:
        onix_subtitle = """<onix:Subtitle>%s</onix:Subtitle>""" % eoa_publication_info["Subtitle"]
        omp_subtitle = """<subtitle locale="en_US">%s</subtitle>""" % eoa_publication_info["Subtitle"]
    else:
        omp_subtitle = onix_subtitle = ""

    # file_base64() returns bytes; substituted as they are, they would end
    # up in the XML as the text "b'...'".
    base64_pdf = file_base64("./data/dummy.pdf").decode("ascii")

    # fill in the blanks
    frontmatter_replacement = frontmatter_template_string.substitute(
        INTERNAL_ID=item_for_template["number"],
        FORMAT_AUTHORS=item_for_template["bookauthors"],
        FORMAT_TITLE=item_for_template["booktitle"],
        OMP_SUBTITLE=omp_subtitle,
        ONIX_SUBTITLE=onix_subtitle,
        PUBLISHER_STRING=item_for_template["publisher"],
        EOA_SERIES=item_for_template["series"],
        SERIES_NUMBER=item_for_template["number"],
        ISBN_CODE=item_for_template["isbn"],
        PUB_DATE=item_for_template["pubdate"],
        PUBLICATION_URL=item_for_template["url"],
        ABSTRACT=eoa_publication_info["Descriptionlong"].replace("<br/>", ""),
        BASE64_PDF=base64_pdf,
        PRICE=eoa_publication_info["Price"],
        TODAY=datetime.today().strftime("%Y-%m-%d"),
        SUPPLIER_COMP=suppliercomp,
        SUPPLIER_URL=supplierurl,
        PAGES=eoa_publication_info["Pages"],
        # %s gives the same text as %d for an integer and also works if the
        # number comes from the database as text.
        SUBMISSION_NAME="%s_%s_submission" % (item_for_template["series"], item_for_template["number"]),
        PUBDATE_00=eoa_publication_info["Datepublished"].strftime("%Y%m%d")
    )

    # The output file is only created once its content is complete, so a
    # failure above does not leave an empty file behind.
    xml_filename = eoa_publication + ".xml"
    with open("./generated_files/" + xml_filename, "w") as outfile:
        outfile.write(frontmatter_replacement)
# def create_omp_native_xml ends here
if __name__ == '__main__':
    # Exactly one argument, the name of the publication, is expected.
    # Usage errors end the program with a non-zero exit status.
    if len(sys.argv) == 1:
        print_error("You must specify a publication!")
        sys.exit(1)
    elif len(sys.argv) > 2:
        print_error("You can work with only one publication at a time!")
        sys.exit(1)
    create_omp_native_xml(sys.argv[-1])
    # create_chapter_frontmatter(sys.argv[-1])
# finis