Skip to content
This repository has been archived by the owner. It is now read-only.
Permalink
master
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
#!/usr/bin/python3
# -*- coding: utf-8; mode: python -*-
__version__ = "1.0"
__date__ = "20170315"
__author__ = "kthoden@mpiwg-berlin.mpg.de"
__doc__ = """Export document out of database"""
import sys
import re
import os
import psycopg2
import psycopg2.extras
from lxml import etree
# A publication name is one of the four series keywords immediately followed
# by the publication number (one to three digits, anchored at the end of the
# string), e.g. "studies12". The named groups "series" and "number" are read
# back by the functions that match against this pattern.
PUBLICATION_PATTERN = re.compile("(?P<series>studies|sources|proceedings|textbooks)(?P<number>1?[1-9]?[0-9]{1})$")
# Directory that receives the generated XML file; the leading "./" makes it
# relative to the current working directory.
OUTPUT_DIR = "./output"
def print_error(message):
    """Write *message* to stderr as a single ``[ERROR]``-prefixed line."""
    sys.stderr.write("[ERROR] %s\n" % message)
# def print_error ends here
def check_publication(input_string):
    """Check that *input_string* is a valid publication name.

    The name is matched against PUBLICATION_PATTERN. On success a
    confirmation is written to stdout and True is returned. On failure an
    error is written to stderr and the program exits with status 1, so that
    calling shells and scripts can detect the failure.
    """
    publication_match = re.match(PUBLICATION_PATTERN, input_string)
    if publication_match is None:
        print_error("Name of publication %s is not valid. Exiting." % input_string)
        # Non-zero status: a bare sys.exit() would report success.
        sys.exit(1)
    sys.stdout.write("Name of the publication %s is valid.\n" % input_string)
    return True
# def check_publication ends here
def connect_db(dsn="dbname='eoa_2017' user='kthoden' host='localhost'"):
    """Connect to the database and return the open connection.

    Args:
        dsn: libpq connection string handed to psycopg2.connect. The default
            is the connection this script has always used.

    Returns:
        The open psycopg2 connection.

    If the connection cannot be established, the error is reported on stderr
    and the program exits with status 1.
    """
    try:
        connection = psycopg2.connect(dsn)
    except psycopg2.Error as error:
        # psycopg2 signals connection problems with its own exception
        # hierarchy (rooted at psycopg2.Error), not with AttributeError.
        # print_error already adds the "[ERROR]" prefix and the newline.
        print_error("Could not connect: %s" % error)
        sys.exit(1)
    sys.stdout.write("Connection established.\n")
    return connection
# def connect_db ends here
def get_publication_id(input_string, eoa_cursor):
    """Query the database for the id of a publication.

    Args:
        input_string: publication name such as "studies1"; it is split into
            series and number with PUBLICATION_PATTERN.
        eoa_cursor: database cursor used to run the query.

    Returns:
        The "id" column of the single matching row.

    The program exits with status 1 when the name does not match the
    pattern, or when the database does not contain exactly one matching
    publication.
    """
    id_match = re.match(PUBLICATION_PATTERN, input_string)
    if id_match is None:
        # Without this guard an invalid name fails with AttributeError below.
        print_error("Name of publication %s is not valid. Exiting." % input_string)
        sys.exit(1)
    eoa_series = id_match.group('series')
    eoa_number = id_match.group('number')
    # Values are passed separately so the driver does the quoting.
    query_string = """SELECT "Title", "id" FROM publications_publication WHERE "Serie" = %s AND "Number" = %s"""
    eoa_cursor.execute(query_string, (eoa_series, eoa_number))
    rows = eoa_cursor.fetchall()
    if len(rows) > 1:
        print_error("There should be only one database entry that matches the input. Found %s" % len(rows))
        # An ambiguous result must not fall through and return None.
        sys.exit(1)
    if not rows:
        print_error("It seems like there is no such as publication %s %s. Exiting." % (eoa_series.title(), eoa_number))
        sys.exit(1)
    print("The title of the publication you selected is '%s'." % rows[0][0])
    return rows[0][1]
# def get_publication_id ends here
def get_paragraphs(chapter_id, eoa_cursor):
    """Get the ordered list of paragraphs per chapter.

    Args:
        chapter_id: value compared against the "Chapter_id" column.
        eoa_cursor: database cursor used to run the query.

    Returns:
        All rows of publications_element for that chapter, sorted by
        ascending "Order", as returned by the cursor's fetchall().
    """
    # The id is passed as a query parameter so the driver does the quoting,
    # rather than being interpolated into the SQL text.
    query_string = """SELECT * FROM publications_element WHERE "Chapter_id" = %s ORDER BY "Order" ASC;"""
    eoa_cursor.execute(query_string, (chapter_id,))
    return eoa_cursor.fetchall()
# def get_paragraphs ends here
def build_paragraph_xml(paragraph_info):
    """Build the <EOAparagraph> element for one database row.

    The row's "Fulltext" value is wrapped in an EOAparagraph tag and parsed
    as XML; the row's "Order" value is stored in the "order" attribute.
    """
    markup = "<EOAparagraph>%s</EOAparagraph>" % paragraph_info["Fulltext"]
    paragraph_node = etree.fromstring(markup)
    paragraph_node.set("order", str(paragraph_info["Order"]))
    return paragraph_node
# def build_paragraph_xml ends here
def get_chapters(eoa_pub_id, eoa_cursor):
    """Query the database for the chapters of one publication.

    Args:
        eoa_pub_id: value compared against the "Publication_id" column.
        eoa_cursor: database cursor used to run the query.

    Returns:
        All rows of publications_chapter for that publication, sorted by
        ascending "Order", as returned by the cursor's fetchall().
    """
    # The id is passed as a query parameter so the driver does the quoting,
    # rather than being interpolated into the SQL text.
    query_string = """SELECT * FROM publications_chapter WHERE
    "Publication_id" = %s ORDER BY "Order" ASC"""
    eoa_cursor.execute(query_string, (eoa_pub_id,))
    return eoa_cursor.fetchall()
# get_chapters ends here
def build_chapter_xml(chap_dict):
    """Return an XML tree consisting of a chapter.

    Args:
        chap_dict: mapping with the keys "Chapterlanguage", "Order",
            "Number" and "Title" (one chapter row).

    Returns:
        An <EOAchapter> element with "language" and "order" attributes, a
        "number" attribute when the chapter has a number, and a <head>
        child holding the title.
    """
    eoa_chapter = etree.Element("EOAchapter", language=chap_dict["Chapterlanguage"], order=str(chap_dict["Order"]))
    chapter_number = chap_dict["Number"]
    # Truthiness covers both the empty string and None; len() would raise
    # TypeError on None.
    if chapter_number:
        eoa_chapter.set("number", chapter_number)
    chapter_head = etree.SubElement(eoa_chapter, "head")
    chapter_head.text = chap_dict["Title"]
    return eoa_chapter
# def build_chapter_xml ends here
def main(eoa_publication):
    """Export one publication from the database into an XML file.

    Args:
        eoa_publication: publication name such as "studies1".

    The chapters of the publication and their paragraphs are assembled
    into one <EOAdocument> tree, which is written to
    OUTPUT_DIR/<eoa_publication>.xml. The output directory is created when
    it does not exist yet.
    """
    # Reject malformed names before touching the database.
    check_publication(eoa_publication)
    # setting up database
    eoa_db = connect_db()
    if eoa_db is None:
        # No connection object was handed back; nothing can be exported.
        sys.exit(1)
    try:
        eoa_cursor = eoa_db.cursor(cursor_factory=psycopg2.extras.DictCursor)
        eoa_pub_id = get_publication_id(eoa_publication, eoa_cursor)
        eoa_document = etree.Element("EOAdocument")
        for chapter in get_chapters(eoa_pub_id, eoa_cursor):
            print(" ", chapter['Title'])
            chapter_xml = build_chapter_xml(chapter)
            for paragraph in get_paragraphs(chapter['id'], eoa_cursor):
                chapter_xml.append(build_paragraph_xml(paragraph))
            eoa_document.append(chapter_xml)
    finally:
        # Release the connection even when a helper exits or raises.
        eoa_db.close()
    # Use the same expanded path for creating the directory and for the
    # file name; exist_ok avoids the check-then-create race.
    output_dir = os.path.expanduser(OUTPUT_DIR)
    os.makedirs(output_dir, exist_ok=True)
    xml_file = os.path.join(output_dir, eoa_publication + ".xml")
    tree = etree.ElementTree(eoa_document)
    tree.write(xml_file, pretty_print=True, encoding="utf-8", xml_declaration=False)
# def main ends here
if __name__ == '__main__':
    # Exactly one command-line argument, the publication name, is expected.
    # Usage errors exit with status 1 so callers can detect them.
    if len(sys.argv) == 1:
        print_error("You must specify a publication!")
        sys.exit(1)
    elif len(sys.argv) > 2:
        print_error("You can work with only one publication at a time!")
        sys.exit(1)
    main(sys.argv[-1])
# finis