# publicationexport.py

#!/usr/bin/python3
# -*- coding: utf-8; mode: python -*-
# Module metadata.
__version__ = "1.0"
__date__ = "20170315"  # presumably YYYYMMDD -- TODO confirm
__author__ = "kthoden@mpiwg-berlin.mpg.de"
# The module docstring is assigned explicitly instead of being written as
# the leading string literal of the file.
__doc__ = """Export document out of database"""

import sys
import re
import os
import psycopg2
import psycopg2.extras
from lxml import etree

# A publication name is one of the four series names immediately followed by
# the volume number, e.g. "studies7" or "sources110".  The number is written
# without leading zeros and may be 0-199: either "1" plus two digits, or one
# or two digits.  Spelling the three-digit case out separately is what lets
# 100-109 match; with an optional non-zero middle digit those were rejected.
# The named groups "series" and "number" are read by the database lookup.
PUBLICATION_PATTERN = re.compile(
    r"(?P<series>studies|sources|proceedings|textbooks)"
    r"(?P<number>1[0-9]{2}|[1-9]?[0-9])$"
)
# Directory that receives the exported XML files.
OUTPUT_DIR = "./output"

def print_error(message):
    """Write *message* to stderr as a single "[ERROR] ..." line.

    The text is prefixed with "[ERROR] " and terminated with a newline, so
    callers pass the bare message only.
    """
    sys.stderr.write("[ERROR] %s\n" % message)
# def print_error ends here

def check_publication(input_string):
    """Check that the input string is a valid publication name.

    A name is valid when PUBLICATION_PATTERN matches it from the start of
    the string.

    Args:
        input_string: the publication name as given by the user.

    Returns:
        True when the name is valid (a confirmation is written to stdout).

    Raises:
        SystemExit: with status 1 when the name is not valid; the reason is
            reported on stderr first.
    """

    if PUBLICATION_PATTERN.match(input_string) is None:
        print_error("Name of publication %s is not valid. Exiting." % input_string)
        # Exit with a non-zero status so that a calling shell or script can
        # tell the failure apart from a successful run.
        sys.exit(1)

    sys.stdout.write("Name of the publication %s is valid.\n" % input_string)
    return True
# def check_publication ends here

def connect_db():
    """Connect to the database.

    Returns:
        The open psycopg2 connection.

    Raises:
        SystemExit: with status 1 when the connection cannot be
            established; the database error is reported on stderr first.
    """

    try:
        connection = psycopg2.connect("dbname='eoa_2017' user='kthoden' host='localhost'")
    except psycopg2.Error as error:
        # psycopg2 reports connection failures through its own exception
        # hierarchy rooted at psycopg2.Error.  print_error already adds the
        # "[ERROR]" prefix and the trailing newline.
        print_error("Could not connect: %s" % str(error).strip())
        sys.exit(1)

    sys.stdout.write("Connection established.\n")
    return connection
# def connect_db ends here

def get_publication_id(input_string, eoa_cursor):
    """Query the database for the id of a publication.

    The series and number are taken from the input string by means of
    PUBLICATION_PATTERN and looked up in publications_publication.

    Args:
        input_string: publication name, series followed by number.
        eoa_cursor: open database cursor used for the lookup.

    Returns:
        The "id" column of the first matching row.  If several rows match,
        an error is reported but the first row is still used.

    Raises:
        SystemExit: with status 1 when the name is not a valid publication
            name or when no matching publication exists.
    """

    id_match = PUBLICATION_PATTERN.match(input_string)
    if id_match is None:
        # Without this guard the group() calls below would fail with an
        # AttributeError on None.
        print_error("Name of publication %s is not valid. Exiting." % input_string)
        sys.exit(1)

    eoa_series = id_match.group('series')
    eoa_number = id_match.group('number')

    # Let the driver bind the values instead of formatting them into the
    # SQL text by hand.
    query_string = """SELECT "Title", "id" FROM publications_publication WHERE "Serie" = %s AND "Number" = %s"""
    eoa_cursor.execute(query_string, (eoa_series, eoa_number))

    rows = eoa_cursor.fetchall()
    if not rows:
        print_error("It seems like there is no such as publication %s %s. Exiting." % (eoa_series.title(), eoa_number))
        sys.exit(1)

    if len(rows) > 1:
        print_error("There should be only one database entry that matches the input. Found %s" % len(rows))
    else:
        print("The title of the publication you selected is '%s'." % rows[0][0])

    return rows[0][1]
# def get_publication_id ends here

def get_paragraphs(chapter_id, eoa_cursor):
    """Get the ordered list of paragraphs per chapter.

    Args:
        chapter_id: value matched against the "Chapter_id" column of
            publications_element.
        eoa_cursor: open database cursor used for the query.

    Returns:
        All matching rows as returned by the cursor's fetchall(), sorted
        ascending by their "Order" column.
    """

    # The chapter id is handed to the driver as a bound parameter rather
    # than being formatted into the SQL text.
    query_string = """SELECT * FROM publications_element WHERE "Chapter_id" = %s ORDER BY "Order" ASC;"""
    eoa_cursor.execute(query_string, (chapter_id,))

    return eoa_cursor.fetchall()
# def get_paragraphs ends here

def build_paragraph_xml(paragraph_info):
    """Build the <EOAparagraph> element for one database row.

    The row's "Fulltext" value is placed inside an <EOAparagraph> wrapper
    and the result is parsed as XML, so markup contained in the text ends
    up as child elements.  The row's "Order" value is stored as the
    "order" attribute.

    NOTE(review): parsing presumably relies on "Fulltext" being a
    well-formed XML fragment -- confirm against the database contents.
    """

    wrapped_text = "<EOAparagraph>%s</EOAparagraph>" % paragraph_info["Fulltext"]

    paragraph_element = etree.fromstring(wrapped_text)
    paragraph_element.set("order", str(paragraph_info["Order"]))

    return paragraph_element
# def build_paragraph_xml ends here

def get_chapters(eoa_pub_id, eoa_cursor):
    """Query the database for the chapters of one publication.

    Args:
        eoa_pub_id: value matched against the "Publication_id" column of
            publications_chapter.
        eoa_cursor: open database cursor used for the query.

    Returns:
        All matching rows (every column) as returned by the cursor's
        fetchall(), sorted ascending by their "Order" column.
    """

    query_string = """SELECT * FROM publications_chapter WHERE
    "Publication_id" = %s ORDER BY "Order" ASC"""

    # The id is bound by the driver instead of being formatted into the
    # SQL text.  It is passed as text so that it reaches the server as a
    # quoted literal, as it did when the quotes were part of the query.
    eoa_cursor.execute(query_string, (str(eoa_pub_id),))

    return eoa_cursor.fetchall()
# get_chapters ends here

def build_chapter_xml(chap_dict):
    """Build the <EOAchapter> element for one chapter row.

    The element carries the attributes "language" (from "Chapterlanguage")
    and "order" (from "Order"), plus "number" when the row's "Number" value
    is not empty.  The chapter title becomes the text of a <head> child.

    NOTE(review): len() is applied to "Number", so it is presumably always
    a string and never NULL -- confirm against the table definition.
    """

    language = chap_dict["Chapterlanguage"]
    position = str(chap_dict["Order"])
    chapter_element = etree.Element("EOAchapter", language=language, order=position)

    # Only numbered chapters get a "number" attribute.
    if len(chap_dict["Number"]) != 0:
        chapter_element.set("number", chap_dict["Number"])

    etree.SubElement(chapter_element, "head").text = chap_dict["Title"]

    return chapter_element
# def build_chapter_xml ends here

def main(eoa_publication):
    """Export one publication from the database into an XML file.

    The publication's chapters and their paragraphs are read from the
    database, assembled into an <EOAdocument> tree and written to
    "<OUTPUT_DIR>/<eoa_publication>.xml".

    Args:
        eoa_publication: publication name, series followed by number.

    Raises:
        SystemExit: when the name is not valid, the database connection is
            unavailable or the publication cannot be found.
    """

    # Reject malformed names before touching the database; this exits on
    # failure.
    check_publication(eoa_publication)

    # setting up database
    eoa_db = connect_db()
    if eoa_db is None:
        # connect_db may report the failure itself and hand back nothing.
        sys.exit(1)

    eoa_document = etree.Element("EOAdocument")

    try:
        # The cursor is closed when the with block is left, the connection
        # in the finally clause, even if a lookup below exits or raises.
        with eoa_db.cursor(cursor_factory=psycopg2.extras.DictCursor) as eoa_cursor:
            eoa_pub_id = get_publication_id(eoa_publication, eoa_cursor)

            for chapter in get_chapters(eoa_pub_id, eoa_cursor):
                print("   ", chapter['Title'])
                chapter_xml = build_chapter_xml(chapter)

                for paragraph in get_paragraphs(chapter['id'], eoa_cursor):
                    chapter_xml.append(build_paragraph_xml(paragraph))

                eoa_document.append(chapter_xml)
    finally:
        eoa_db.close()

    # Use the same expanded path for creating the directory and for
    # writing the file; exist_ok avoids failing on an existing directory.
    output_dir = os.path.expanduser(OUTPUT_DIR)
    os.makedirs(output_dir, exist_ok=True)

    xml_file = os.path.join(output_dir, eoa_publication + ".xml")
    tree = etree.ElementTree(eoa_document)
    tree.write(xml_file, pretty_print=True, encoding="utf-8", xml_declaration=False)
# def main ends here

# Command-line entry point: exactly one argument, the publication name.
if __name__ == '__main__':
    if len(sys.argv) == 1:
        print_error("You must specify a publication!")
        # Usage errors exit with a non-zero status so callers can detect them.
        sys.exit(1)
    elif len(sys.argv) > 2:
        print_error("You can work with only one publication at a time!")
        sys.exit(1)
    main(sys.argv[1])
# finis