Skip to content
This repository has been archived by the owner. It is now read-only.

Commit

Permalink
Browse files Browse the repository at this point in the history
Initial commit
  • Loading branch information
Klaus Thoden committed Mar 15, 2017
0 parents commit 3d66bce
Showing 1 changed file with 166 additions and 0 deletions.
166 changes: 166 additions & 0 deletions publicationexport.py
@@ -0,0 +1,166 @@
#!/usr/bin/python3
# -*- coding: utf-8; mode: python -*-
# Module metadata.
__version__ = "1.0"
__date__ = "20170315"
__author__ = "kthoden@mpiwg-berlin.mpg.de"
# The module description is assigned to __doc__ explicitly rather than
# written as a leading module docstring.
__doc__ = """Export document out of database"""

import sys
import re
import os
import psycopg2
import psycopg2.extras
from lxml import etree

# Publication names are a series keyword immediately followed by a number,
# e.g. "studies7" or "sources12".  The pattern is only anchored at the end
# ("$"); the code in this file applies it with re.match, which anchors the
# start as well.
# The number part accepts 0-99 and 110-199.
# NOTE(review): 100-109 and zero-padded numbers such as "05" are rejected by
# this expression -- confirm whether that gap is intended.
PUBLICATION_PATTERN = re.compile("(?P<series>studies|sources|proceedings|textbooks)(?P<number>1?[1-9]?[0-9]{1})$")
# Directory that receives the generated XML file (relative to the current
# working directory).
OUTPUT_DIR = "./output"

def print_error(message):
    """Write *message* to stderr as one line prefixed with ``[ERROR]``.

    The prefix and the trailing newline are added here, so callers pass
    only the bare message text.
    """
    sys.stderr.write("[ERROR] %s\n" % message)
# def print_error ends here

def check_publication(input_string):
    """Validate a publication name given on the command line.

    The name must match ``PUBLICATION_PATTERN`` (series keyword followed by
    a number).

    Args:
        input_string: The publication name to check.

    Returns:
        ``True`` when the name is valid.

    Raises:
        SystemExit: With status 1 when the name does not match; an error
            message is written to stderr first.
    """
    if PUBLICATION_PATTERN.match(input_string) is None:
        print_error("Name of publication %s is not valid. Exiting." % input_string)
        # Exit non-zero so that calling shells and scripts can detect the
        # failure (a bare sys.exit() would report success).
        sys.exit(1)

    sys.stdout.write("Name of the publication %s is valid.\n" % input_string)
    return True
# def check_publication ends here

def connect_db():
    """Open a connection to the local ``eoa_2017`` PostgreSQL database.

    Returns:
        The open psycopg2 connection object.

    Raises:
        SystemExit: With status 1 when the connection cannot be
            established; the reason is written to stderr first.
    """
    try:
        connection = psycopg2.connect("dbname='eoa_2017' user='kthoden' host='localhost'")
    except psycopg2.Error as err:
        # psycopg2 signals connection problems with its own exception
        # hierarchy (rooted at psycopg2.Error), not with AttributeError.
        # print_error already adds the "[ERROR]" prefix and the newline.
        print_error("Could not connect: %s" % str(err).strip())
        sys.exit(1)

    sys.stdout.write("Connection established.\n")
    return connection
# def connect_db ends here

def get_publication_id(input_string, eoa_cursor):
    """Look up the database id of the publication named by *input_string*.

    Args:
        input_string: Publication name such as ``studies7``; it is split
            into series and number with ``PUBLICATION_PATTERN``.
        eoa_cursor: Open database cursor used to run the query.

    Returns:
        The ``id`` column of the first matching row in
        ``publications_publication``.

    Raises:
        SystemExit: With status 1 when the name does not match the pattern
            or when no matching publication exists.
    """
    id_match = PUBLICATION_PATTERN.match(input_string)
    if id_match is None:
        # Without this guard an invalid name would surface as an
        # AttributeError on None.group() below.
        print_error("Name of publication %s is not valid. Exiting." % input_string)
        sys.exit(1)

    eoa_series = id_match.group('series')
    eoa_number = id_match.group('number')

    # Let the database driver quote the values instead of formatting them
    # into the SQL text by hand.
    query_string = """SELECT "Title", "id" FROM publications_publication WHERE "Serie" = %s AND "Number" = %s"""
    eoa_cursor.execute(query_string, (eoa_series, eoa_number))

    rows = eoa_cursor.fetchall()
    if not rows:
        print_error("It seems like there is no such as publication %s %s. Exiting." % (eoa_series.title(), eoa_number))
        sys.exit(1)

    if len(rows) > 1:
        # Ambiguous match: report it, then carry on with the first row.
        print_error("There should be only one database entry that matches the input. Found %s" % len(rows))
    else:
        print("The title of the publication you selected is '%s'." % rows[0][0])

    return rows[0][1]
# def get_publication_id ends here

def get_paragraphs(chapter_id, eoa_cursor):
    """Fetch all elements of one chapter in document order.

    Args:
        chapter_id: Value matched against the ``Chapter_id`` column of
            ``publications_element``.
        eoa_cursor: Open database cursor used to run the query.

    Returns:
        The rows returned by the cursor, sorted ascending by ``Order``.
    """
    query_string = """SELECT * FROM publications_element WHERE "Chapter_id" = %s ORDER BY "Order" ASC;"""

    # Pass the id as a query parameter so the driver does the quoting.
    eoa_cursor.execute(query_string, (chapter_id,))

    return eoa_cursor.fetchall()
# def get_paragraphs ends here

def build_paragraph_xml(paragraph_info):
    """Turn one element row into an ``<EOAparagraph>`` XML element.

    The row's ``Fulltext`` value is wrapped in ``<EOAparagraph>`` tags and
    parsed as XML; the row's ``Order`` value is stored, as a string, in the
    element's ``order`` attribute.
    """
    wrapped_markup = "<EOAparagraph>%s</EOAparagraph>" % paragraph_info["Fulltext"]

    paragraph_element = etree.fromstring(wrapped_markup)
    paragraph_element.set("order", str(paragraph_info["Order"]))
    return paragraph_element
# def build_paragraph_xml ends here

def get_chapters(eoa_pub_id, eoa_cursor):
    """Fetch all chapters of one publication in document order.

    Args:
        eoa_pub_id: Value matched against the ``Publication_id`` column of
            ``publications_chapter``.
        eoa_cursor: Open database cursor used to run the query.

    Returns:
        The rows returned by the cursor, sorted ascending by ``Order``.
    """
    query_string = """SELECT * FROM publications_chapter WHERE
    "Publication_id" = %s ORDER BY "Order" ASC"""

    # Pass the id as a query parameter so the driver does the quoting.  It
    # is sent as a string so the value still reaches the database as a
    # quoted literal, as it did when it was formatted into the SQL text.
    eoa_cursor.execute(query_string, (str(eoa_pub_id),))

    return eoa_cursor.fetchall()
# def get_chapters ends here

def build_chapter_xml(chap_dict):
    """Build the ``<EOAchapter>`` element for one chapter row.

    Args:
        chap_dict: Mapping that provides the keys ``Chapterlanguage``,
            ``Order``, ``Number`` and ``Title``.

    Returns:
        An ``EOAchapter`` element carrying ``language`` and ``order``
        attributes, a ``number`` attribute when the row has a non-empty
        number, and a ``<head>`` child holding the title.
    """
    eoa_chapter = etree.Element("EOAchapter", language=chap_dict["Chapterlanguage"], order=str(chap_dict["Order"]))

    # Truthiness covers both an empty string and a missing (None) number;
    # calling len() on None would raise TypeError.
    chapter_number = chap_dict["Number"]
    if chapter_number:
        eoa_chapter.set("number", chapter_number)

    chapter_head = etree.SubElement(eoa_chapter, "head")
    chapter_head.text = chap_dict["Title"]

    return eoa_chapter
# def build_chapter_xml ends here

def main(eoa_publication):
    """Export one publication from the database to an XML file.

    The publication's chapters and their paragraphs are read from the
    database, assembled into an ``<EOAdocument>`` tree and written to
    ``OUTPUT_DIR/<eoa_publication>.xml``.

    Args:
        eoa_publication: Publication name such as ``studies7``.

    Raises:
        SystemExit: When the name is invalid, the database cannot be
            reached, or the publication does not exist.
    """
    # Reject malformed names before touching the database.
    check_publication(eoa_publication)

    # setting up database
    eoa_db = connect_db()
    if eoa_db is None:
        print_error("No database connection available. Exiting.")
        sys.exit(1)

    eoa_document = etree.Element("EOAdocument")

    try:
        eoa_cursor = eoa_db.cursor(cursor_factory=psycopg2.extras.DictCursor)

        eoa_pub_id = get_publication_id(eoa_publication, eoa_cursor)

        for chapter in get_chapters(eoa_pub_id, eoa_cursor):
            print(" ", chapter['Title'])
            chapter_xml = build_chapter_xml(chapter)

            for paragraph in get_paragraphs(chapter['id'], eoa_cursor):
                chapter_xml.append(build_paragraph_xml(paragraph))

            eoa_document.append(chapter_xml)
    finally:
        # Release the connection even when a lookup fails or exits.
        eoa_db.close()

    # exist_ok avoids the check-then-create race and uses the same path
    # for the check and the creation.
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    xml_file = os.path.join(OUTPUT_DIR, eoa_publication + ".xml")
    tree = etree.ElementTree(eoa_document)
    tree.write(xml_file, pretty_print=True, encoding="utf-8", xml_declaration=False)
# def main ends here

if __name__ == '__main__':
    # Exactly one command line argument is expected: the publication name.
    # Usage errors exit with a non-zero status so that calling scripts can
    # detect them (a bare sys.exit() would report success).
    if len(sys.argv) == 1:
        print_error("You must specify a publication!")
        sys.exit(1)
    elif len(sys.argv) > 2:
        print_error("You can work with only one publication at a time!")
        sys.exit(1)
    main(sys.argv[-1])
# finis

0 comments on commit 3d66bce

Please sign in to comment.