From 3d66bced0b1e03c2203bf6667237496f1f611913 Mon Sep 17 00:00:00 2001
From: Klaus Thoden <kthoden@mpiwg-berlin.mpg.de>
Date: Wed, 15 Mar 2017 16:20:56 +0100
Subject: [PATCH] Initial commit

---
 publicationexport.py | 166 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 166 insertions(+)
 create mode 100644 publicationexport.py
diff --git a/publicationexport.py b/publicationexport.py
new file mode 100644
index 0000000..03f68e1
--- /dev/null
+++ b/publicationexport.py
@@ -0,0 +1,166 @@
+#!/usr/bin/python3
+# -*- coding: utf-8; mode: python -*-
+__version__ = "1.0"
+__date__ = "20170315"  # YYYYMMDD
+__author__ = "kthoden@mpiwg-berlin.mpg.de"
+__doc__ = """Export document out of database"""  # assigned explicitly; serves as the module docstring
+
+import sys
+import re
+import os
+import psycopg2
+import psycopg2.extras
+from lxml import etree
+
+PUBLICATION_PATTERN = re.compile("(?P<series>studies|sources|proceedings|textbooks)(?P<number>1?[1-9]?[0-9]{1})$")  # series name directly followed by a one- to three-digit number, e.g. studies12
+OUTPUT_DIR = "./output"  # directory that receives <publication>.xml; relative to the working directory
+
+def print_error(message):
+    """Write ``message`` to stderr as a single ``[ERROR]``-prefixed line."""
+
+    # print() supplies the trailing newline.
+    print("[ERROR] %s" % message, file=sys.stderr)
+# def print_error ends here
+
+def check_publication(input_string):
+    """Return True if input_string matches PUBLICATION_PATTERN (e.g. ``studies12``).
+
+    Otherwise report the problem on stderr and exit with status 1.
+    """
+    if PUBLICATION_PATTERN.match(input_string) is None:
+        print_error("Name of publication %s is not valid. Exiting." % input_string)
+        sys.exit(1)  # non-zero, so the shell can tell the failure from success
+    sys.stdout.write("Name of the publication %s is valid.\n" % input_string)
+    return True
+# def check_publication ends here
+
+def connect_db():
+    """Return a connection to the local eoa_2017 database; exit with status 1 on failure."""
+    try:
+        connection = psycopg2.connect("dbname='eoa_2017' user='kthoden' host='localhost'")
+    except psycopg2.Error as err:  # connect() reports failures through psycopg2.Error subclasses
+        print_error("Could not connect: %s" % str(err).strip())
+        sys.exit(1)
+    sys.stdout.write("Connection established.\n")
+    return connection
+# def connect_db ends here
+
+def get_publication_id(input_string, eoa_cursor):
+    """Return the database id of the publication named by input_string.
+
+    Exits with status 1 if the name does not match PUBLICATION_PATTERN or no
+    row is found; if several rows match, reports it and uses the first."""
+    id_match = PUBLICATION_PATTERN.match(input_string)
+    if id_match is None:  # without this guard a bad name fails on .group() below
+        print_error("Name of publication %s is not valid. Exiting." % input_string)
+        sys.exit(1)
+    eoa_series, eoa_number = id_match.group('series', 'number')
+
+    # Let the driver quote the values instead of interpolating them into the SQL.
+    query_string = """SELECT "Title", "id" FROM publications_publication WHERE "Serie" = %s AND "Number" = %s"""
+    eoa_cursor.execute(query_string, (eoa_series, eoa_number))
+    rows = eoa_cursor.fetchall()
+    if not rows:
+        print_error("It seems like there is no such as publication %s %s. Exiting." % (eoa_series.title(), eoa_number))
+        sys.exit(1)
+    if len(rows) > 1:
+        print_error("There should be only one database entry that matches the input. Found %s" % len(rows))
+    else:
+        print("The title of the publication you selected is '%s'." % rows[0][0])
+    return rows[0][1]
+# def get_publication_id ends here
+
+def get_paragraphs(chapter_id, eoa_cursor):
+    """Return all publications_element rows of a chapter, sorted by "Order".
+
+    chapter_id is the chapter's database id; eoa_cursor is an open cursor.
+    The result is whatever the cursor's fetchall() delivers."""
+
+    # Pass the id as a bound parameter rather than formatting it into the SQL.
+    query_string = """SELECT * FROM publications_element WHERE "Chapter_id" = %s ORDER BY "Order" ASC;"""
+    eoa_cursor.execute(query_string, (chapter_id,))
+    return eoa_cursor.fetchall()
+# def get_paragraphs ends here
+
+def build_paragraph_xml(paragraph_info):
+    """Build an ``EOAparagraph`` element from one paragraph row.
+
+    The row's "Fulltext" value is wrapped in EOAparagraph tags and parsed
+    with etree.fromstring, so markup inside it becomes child elements;
+    "Order" is stored as the ``order`` attribute. Returns the element."""
+
+    wrapped = "<EOAparagraph>%s</EOAparagraph>" % paragraph_info["Fulltext"]
+    paragraph_element = etree.fromstring(wrapped)
+    paragraph_element.set("order", str(paragraph_info["Order"]))
+    return paragraph_element
+# def build_paragraph_xml ends here
+
+def get_chapters(eoa_pub_id, eoa_cursor):
+    """Return the chapter rows of a publication, sorted by "Order".
+
+    eoa_pub_id is the publication's database id; eoa_cursor is an open
+    cursor. All columns of publications_chapter are selected and the rows
+    are returned as delivered by the cursor's fetchall()."""
+
+    query_string = """SELECT * FROM publications_chapter WHERE
+    "Publication_id" = %s ORDER BY "Order" ASC"""
+    # Bound parameter instead of string formatting; str() keeps the value a
+    # quoted literal, the same form the query used before.
+    eoa_cursor.execute(query_string, (str(eoa_pub_id),))
+    return eoa_cursor.fetchall()
+# def get_chapters ends here
+
+def build_chapter_xml(chap_dict):
+    """Return an ``EOAchapter`` element for one chapter row.
+
+    "Chapterlanguage" and "Order" become the ``language`` and ``order``
+    attributes, "Title" the text of a ``head`` child. ``number`` is set only
+    when "Number" is non-empty; a NULL (None) value is treated as empty."""
+    eoa_chapter = etree.Element("EOAchapter", language=chap_dict["Chapterlanguage"], order=str(chap_dict["Order"]))
+    if chap_dict["Number"]:  # truthiness also covers None, where len() would raise
+        eoa_chapter.set("number", chap_dict["Number"])
+    etree.SubElement(eoa_chapter, "head").text = chap_dict["Title"]
+    return eoa_chapter
+# def build_chapter_xml ends here
+
+def main(eoa_publication):
+    """Export one publication from the database to OUTPUT_DIR/<name>.xml.
+
+    eoa_publication is the publication name, e.g. ``studies12``. Each chapter
+    becomes an EOAchapter element holding its paragraphs in database order;
+    chapter titles are printed as progress output."""
+
+    # Reject malformed names before touching the database.
+    check_publication(eoa_publication)
+    eoa_db = connect_db()
+    try:
+        eoa_cursor = eoa_db.cursor(cursor_factory=psycopg2.extras.DictCursor)
+        eoa_pub_id = get_publication_id(eoa_publication, eoa_cursor)
+        eoa_document = etree.Element("EOAdocument")
+
+        for chapter in get_chapters(eoa_pub_id, eoa_cursor):
+            print("   ", chapter['Title'])
+            chapter_xml = build_chapter_xml(chapter)
+            for paragraph in get_paragraphs(chapter['id'], eoa_cursor):
+                chapter_xml.append(build_paragraph_xml(paragraph))
+            eoa_document.append(chapter_xml)
+    finally:
+        # Release the connection even when a query fails or sys.exit() is hit.
+        eoa_db.close()
+
+    # Create and write into the same (user-expanded) directory.
+    output_dir = os.path.expanduser(OUTPUT_DIR)
+    os.makedirs(output_dir, exist_ok=True)
+    xml_file = os.path.join(output_dir, eoa_publication + ".xml")
+    etree.ElementTree(eoa_document).write(xml_file, pretty_print=True, encoding="utf-8", xml_declaration=False)
+# def main ends here
+
+if __name__ == '__main__':
+    if len(sys.argv) == 1:  # exactly one argument, the publication name, is expected
+        print_error("You must specify a publication!")
+        sys.exit(1)  # non-zero status so the shell sees the usage error
+    elif len(sys.argv) > 2:
+        print_error("You can work with only one publication at a time!")
+        sys.exit(1)
+    main(sys.argv[-1])
+# finis