This repository has been archived by the owner. It is now read-only.
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Browse files
Browse the repository at this point in the history
Initial commit
- Loading branch information
Klaus Thoden
committed
Mar 15, 2017
0 parents
commit 3d66bce
Showing
1 changed file
with
166 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,166 @@ | ||
#!/usr/bin/python3
# -*- coding: utf-8; mode: python -*-

# Module metadata: one-line description first, then release and contact data.
__doc__ = """Export document out of database"""
__version__ = "1.0"
__date__ = "20170315"
__author__ = "kthoden@mpiwg-berlin.mpg.de"
|
||
import sys | ||
import re | ||
import os | ||
import psycopg2 | ||
import psycopg2.extras | ||
from lxml import etree | ||
|
||
# A publication name is a series keyword directly followed by its number,
# e.g. "studies12".  A match exposes the named groups "series" and "number".
PUBLICATION_PATTERN = re.compile(
    "(?P<series>studies|sources|proceedings|textbooks)(?P<number>1?[1-9]?[0-9]{1})$"
)

# Directory that receives the exported XML file.
OUTPUT_DIR = "./output"
|
||
def print_error(message):
    """Report a problem on stderr.

    The text is prefixed with "[ERROR] " and terminated with a newline.
    Nothing is returned and the program keeps running.
    """

    formatted = "[ERROR] %s\n" % message
    error_stream = sys.stderr
    error_stream.write(formatted)
# def print_error ends here
|
||
def check_publication(input_string):
    """Check that input_string is a valid publication name.

    A name is valid when PUBLICATION_PATTERN matches it, i.e. a series
    keyword followed by a number (for example "studies12").

    Returns True for a valid name.  For an invalid name an error is
    written to stderr and the program is terminated through sys.exit
    with a non-zero status.
    """

    if PUBLICATION_PATTERN.match(input_string) is None:
        print_error("Name of publication %s is not valid. Exiting." % input_string)
        # A bare sys.exit() would report success (status 0) to the shell.
        sys.exit(1)

    sys.stdout.write("Name of the publication %s is valid.\n" % input_string)
    return True
# def check_publication ends here
|
||
def connect_db():
    """Connect to the database and return the open connection.

    The connection parameters (database eoa_2017, user kthoden, host
    localhost) are fixed in the connection string below.

    If the connection cannot be established, the error is reported on
    stderr and the program is terminated with a non-zero exit status.
    """

    try:
        connection = psycopg2.connect("dbname='eoa_2017' user='kthoden' host='localhost'")
    except psycopg2.Error as error:
        # psycopg2 reports connection failures as psycopg2.Error subclasses,
        # not AttributeError.  print_error already adds the "[ERROR]" prefix
        # and the trailing newline, so neither is repeated here.
        print_error("Could not connect: %s" % error)
        sys.exit(1)

    sys.stdout.write("Connection established.\n")
    return connection
# def connect_db ends here
|
||
def get_publication_id(input_string, eoa_cursor):
    """Query the database for the id of a publication.

    input_string is the publication name (series keyword plus number, as
    described by PUBLICATION_PATTERN); eoa_cursor is an open database
    cursor.  The title of the publication found is printed to stdout.

    Returns the "id" column of the first matching row.  If more than one
    row matches, an error is reported but the first row is still used.
    The program is terminated with a non-zero exit status when the name
    does not match PUBLICATION_PATTERN or when no row is found.
    """

    id_match = PUBLICATION_PATTERN.match(input_string)
    if id_match is None:
        # Without this guard an invalid name fails below with AttributeError.
        print_error("Name of publication %s is not valid. Exiting." % input_string)
        sys.exit(1)

    eoa_series = id_match.group('series')
    eoa_number = id_match.group('number')

    # The values are handed to the driver separately instead of being
    # interpolated into the SQL text, so quoting is done by psycopg2.
    query_string = """SELECT "Title", "id" FROM publications_publication WHERE "Serie" = %s AND "Number" = %s"""
    eoa_cursor.execute(query_string, (eoa_series, eoa_number))

    rows = eoa_cursor.fetchall()
    if not rows:
        print_error("It seems like there is no such as publication %s %s. Exiting." % (eoa_series.title(), eoa_number))
        sys.exit(1)

    if len(rows) > 1:
        print_error("There should be only one database entry that matches the input. Found %s" % len(rows))
    else:
        print("The title of the publication you selected is '%s'." % rows[0][0])

    return rows[0][1]
# def get_publication_id ends here
|
||
def get_paragraphs(chapter_id, eoa_cursor):
    """Get the ordered list of paragraphs of one chapter.

    Selects all rows of publications_element whose "Chapter_id" equals
    chapter_id, sorted by "Order" ascending, and returns the result of
    the cursor's fetchall().
    """

    # chapter_id is passed as a query parameter rather than formatted into
    # the SQL text, so the driver takes care of quoting.
    query_string = """SELECT * FROM publications_element WHERE "Chapter_id" = %s ORDER BY "Order" ASC;"""
    eoa_cursor.execute(query_string, (chapter_id,))

    return eoa_cursor.fetchall()
# def get_paragraphs ends here
|
||
def build_paragraph_xml(paragraph_info):
    """Build the EOAparagraph element for one database row.

    The row's "Fulltext" value is wrapped in an <EOAparagraph> tag and
    parsed as XML; the row's "Order" value is stored, as a string, in the
    "order" attribute of the resulting element, which is returned.
    """

    wrapped_text = "<EOAparagraph>%s</EOAparagraph>" % paragraph_info["Fulltext"]
    paragraph_element = etree.fromstring(wrapped_text)

    order_value = str(paragraph_info["Order"])
    paragraph_element.set("order", order_value)

    return paragraph_element
# def build_paragraph_xml ends here
|
||
def get_chapters(eoa_pub_id, eoa_cursor):
    """Query the database for the chapters of one publication.

    Selects all rows of publications_chapter whose "Publication_id"
    equals eoa_pub_id, sorted by "Order" ascending, and returns the
    result of the cursor's fetchall().
    """

    # eoa_pub_id is passed as a query parameter rather than formatted into
    # the SQL text, so the driver takes care of quoting.
    query_string = """SELECT * FROM publications_chapter WHERE
    "Publication_id" = %s ORDER BY "Order" ASC"""
    eoa_cursor.execute(query_string, (eoa_pub_id,))

    return eoa_cursor.fetchall()
# get_chapters ends here
|
||
def build_chapter_xml(chap_dict):
    """Build the EOAchapter element for one chapter row.

    The element gets the attributes "language" (from "Chapterlanguage")
    and "order" (from "Order", converted to a string).  A "number"
    attribute is added only when the row's "Number" value is not empty.
    The chapter title becomes the text of a <head> child element.
    """

    chapter_attributes = {
        "language": chap_dict["Chapterlanguage"],
        "order": str(chap_dict["Order"]),
    }
    chapter_element = etree.Element("EOAchapter", **chapter_attributes)

    chapter_number = chap_dict["Number"]
    if len(chapter_number) > 0:
        chapter_element.set("number", chapter_number)

    head_element = etree.SubElement(chapter_element, "head")
    head_element.text = chap_dict["Title"]

    return chapter_element
# def build_chapter_xml ends here
|
||
def main(eoa_publication):
    """Export one publication from the database into an XML file.

    eoa_publication is the publication name (for example "studies12").
    The chapters of the publication and their paragraphs are read from
    the database, assembled into an EOAdocument tree and written to
    <OUTPUT_DIR>/<eoa_publication>.xml.  The output directory is created
    when it does not exist yet.

    The program is terminated by the helper functions when the name is
    invalid or the publication cannot be found.
    """

    # Reject malformed names before touching the database;
    # check_publication terminates the program on an invalid name.
    check_publication(eoa_publication)

    eoa_db = connect_db()
    try:
        eoa_cursor = eoa_db.cursor(cursor_factory=psycopg2.extras.DictCursor)

        eoa_pub_id = get_publication_id(eoa_publication, eoa_cursor)

        eoa_document = etree.Element("EOAdocument")

        for chapter in get_chapters(eoa_pub_id, eoa_cursor):
            print(" ", chapter['Title'])
            chapter_xml = build_chapter_xml(chapter)

            for paragraph in get_paragraphs(chapter['id'], eoa_cursor):
                chapter_xml.append(build_paragraph_xml(paragraph))

            eoa_document.append(chapter_xml)
    finally:
        # Release the connection even when a helper exits or raises.
        eoa_db.close()

    # Use the same (expanded) path for the existence check, the directory
    # creation and the output file.
    output_dir = os.path.expanduser(OUTPUT_DIR)
    os.makedirs(output_dir, exist_ok=True)

    xml_file = os.path.join(output_dir, eoa_publication + ".xml")
    tree = etree.ElementTree(eoa_document)
    tree.write(xml_file, pretty_print=True, encoding="utf-8", xml_declaration=False)
# def main ends here
|
||
if __name__ == '__main__':
    # Exactly one command line argument, the publication name, is expected.
    # Usage errors exit with a non-zero status so callers can detect them.
    if len(sys.argv) == 1:
        print_error("You must specify a publication!")
        sys.exit(1)
    elif len(sys.argv) > 2:
        print_error("You can work with only one publication at a time!")
        sys.exit(1)
    main(sys.argv[-1])
# finis