#!/usr/bin/python3
# -*- coding: utf-8; mode: python -*-
#
# Provenance: contents of publicationexport.py as created by the patch
#   From 3d66bced0b1e03c2203bf6667237496f1f611913
#   From: Klaus Thoden
#   Date: Wed, 15 Mar 2017 16:20:56 +0100
#   Subject: [PATCH] Initial commit
#
# Usage: publicationexport.py <series><number>   (e.g. "studies1")
# Writes the exported document to OUTPUT_DIR/<series><number>.xml.
__version__ = "1.0"
__date__ = "20170315"
__author__ = "kthoden@mpiwg-berlin.mpg.de"
__doc__ = """Export document out of database"""

import sys
import re
import os

import psycopg2
import psycopg2.extras
from lxml import etree

# A publication name is a series keyword immediately followed by its number.
# The group names are required by get_publication_id(), which reads
# group('series') and group('number').
PUBLICATION_PATTERN = re.compile(
    r"(?P<series>studies|sources|proceedings|textbooks)"
    r"(?P<number>1?[1-9]?[0-9]{1})$"
)
OUTPUT_DIR = "./output"


def print_error(message):
    """Write ``message`` to stderr, prefixed with ``[ERROR]``."""

    message_string = "[ERROR] %s\n" % message
    sys.stderr.write(message_string)
# def print_error ends here


def check_publication(input_string):
    """Check that ``input_string`` is a valid publication name.

    Returns True when the name matches PUBLICATION_PATTERN. Otherwise an
    error is printed and the program exits with status 1.
    """

    if PUBLICATION_PATTERN.match(input_string) is None:
        print_error("Name of publication %s is not valid. Exiting." % input_string)
        sys.exit(1)

    sys.stdout.write("Name of the publication %s is valid.\n" % input_string)
    return True
# def check_publication ends here


def connect_db():
    """Connect to the database and return the open connection.

    Prints an error and exits with status 1 when the connection cannot be
    established, so callers never receive None.
    """

    try:
        connection = psycopg2.connect("dbname='eoa_2017' user='kthoden' host='localhost'")
    except psycopg2.Error as err:
        # psycopg2 reports connection failures through its own exception
        # hierarchy; print_error already adds the prefix and the newline.
        print_error("Could not connect: %s" % str(err).strip())
        sys.exit(1)

    sys.stdout.write("Connection established.\n")
    return connection
# def connect_db ends here


def get_publication_id(input_string, eoa_cursor):
    """Query the database for the publication id.

    ``input_string`` is the publication name (series plus number),
    ``eoa_cursor`` an open database cursor. Returns the value of the "id"
    column of the first matching row.

    Exits with status 1 when the name is invalid or no row matches. When
    more than one row matches, an error is printed and the id of the first
    row is still returned.
    """

    id_match = PUBLICATION_PATTERN.match(input_string)
    if id_match is None:
        print_error("Name of publication %s is not valid. Exiting." % input_string)
        sys.exit(1)

    eoa_series = id_match.group('series')
    eoa_number = id_match.group('number')

    # Values are passed separately so the driver does the quoting.
    query_string = """SELECT "Title", "id" FROM publications_publication WHERE "Serie" = %s AND "Number" = %s"""
    eoa_cursor.execute(query_string, (eoa_series, eoa_number))

    rows = eoa_cursor.fetchall()
    if not rows:
        print_error("It seems like there is no such as publication %s %s. Exiting." % (eoa_series.title(), eoa_number))
        sys.exit(1)

    if len(rows) > 1:
        print_error("There should be only one database entry that matches the input. Found %s" % len(rows))
    else:
        print("The title of the publication you selected is '%s'." % rows[0][0])

    return rows[0][1]
# def get_publication_id ends here


def get_paragraphs(chapter_id, eoa_cursor):
    """Return all publications_element rows of a chapter, ordered by "Order"."""

    query_string = """SELECT * FROM publications_element WHERE "Chapter_id" = %s ORDER BY "Order" ASC;"""
    eoa_cursor.execute(query_string, (chapter_id,))

    return eoa_cursor.fetchall()
# def get_paragraphs ends here


def build_paragraph_xml(paragraph_info):
    """Parse a paragraph row into an XML element.

    Reads the "Fulltext" and "Order" entries of ``paragraph_info``. The full
    text is parsed as XML and the order is stored in the element's "order"
    attribute. ``etree.fromstring`` raises if the text is not well-formed.
    """

    fulltext = paragraph_info["Fulltext"]

    # NOTE(review): the text is parsed exactly as stored. If the original
    # source wrapped it in an enclosing element, that markup is not visible
    # here -- confirm against the repository.
    eoa_paragraph_string = "%s" % fulltext

    eoa_paragraph = etree.fromstring(eoa_paragraph_string)
    eoa_paragraph.set("order", str(paragraph_info["Order"]))

    return eoa_paragraph
# def build_paragraph_xml ends here


def get_chapters(eoa_pub_id, eoa_cursor):
    """Return all publications_chapter rows of a publication, ordered by "Order".

    ``eoa_pub_id`` is the publication id as returned by get_publication_id().
    """

    query_string = """SELECT * FROM publications_chapter WHERE
    "Publication_id" = %s ORDER BY "Order" ASC"""
    eoa_cursor.execute(query_string, (eoa_pub_id,))

    return eoa_cursor.fetchall()
# get_chapters ends here


def build_chapter_xml(chap_dict):
    """Return an ``EOAchapter`` element for one chapter row.

    Reads "Chapterlanguage", "Order", "Number" and "Title" from
    ``chap_dict``. The "number" attribute is only set when "Number" is
    non-empty; the title becomes the text of a ``head`` child element.
    """

    eoa_chapter = etree.Element(
        "EOAchapter",
        language=chap_dict["Chapterlanguage"],
        order=str(chap_dict["Order"]),
    )

    # Truthiness also covers a NULL (None) number, where len() would raise.
    chapter_number = chap_dict["Number"]
    if chapter_number:
        eoa_chapter.set("number", str(chapter_number))

    chapter_head = etree.SubElement(eoa_chapter, "head")
    chapter_head.text = chap_dict["Title"]

    return eoa_chapter
# def build_chapter_xml ends here


def main(eoa_publication):
    """Export the publication named ``eoa_publication`` to an XML file.

    Validates the name, reads the chapters and their paragraphs from the
    database, and writes the result to OUTPUT_DIR/<eoa_publication>.xml.
    """

    # Reject bad names before opening a database connection.
    check_publication(eoa_publication)

    # setting up database
    eoa_db = connect_db()
    try:
        eoa_cursor = eoa_db.cursor(cursor_factory=psycopg2.extras.DictCursor)

        eoa_pub_id = get_publication_id(eoa_publication, eoa_cursor)

        chapters = get_chapters(eoa_pub_id, eoa_cursor)

        eoa_document = etree.Element("EOAdocument")

        for chapter in chapters:
            print(" ", chapter['Title'])
            chapter_xml = build_chapter_xml(chapter)

            for paragraph in get_paragraphs(chapter['id'], eoa_cursor):
                chapter_xml.append(build_paragraph_xml(paragraph))

            eoa_document.append(chapter_xml)
    finally:
        # Release the connection even when a helper exits or raises.
        eoa_db.close()

    os.makedirs(OUTPUT_DIR, exist_ok=True)

    xml_file = os.path.join(OUTPUT_DIR, eoa_publication + ".xml")
    tree = etree.ElementTree(eoa_document)
    tree.write(xml_file, pretty_print=True, encoding="utf-8", xml_declaration=False)
# def main ends here


if __name__ == '__main__':
    if len(sys.argv) == 1:
        print_error("You must specify a publication!")
        sys.exit(1)
    elif len(sys.argv) > 2:
        print_error("You can work with only one publication at a time!")
        sys.exit(1)
    main(sys.argv[-1])
# finis