This repository has been archived by the owner. It is now read-only.
Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
eoa_publicationexport/publicationexport.py
Go to file. This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
166 lines (120 sloc)
5.22 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python3
# -*- coding: utf-8; mode: python -*-
# Module metadata: release number, release date (YYYYMMDD) and maintainer.
__version__ = "1.0"
__date__ = "20170315"
__author__ = "kthoden@mpiwg-berlin.mpg.de"
# The module description is assigned explicitly instead of being written
# as a leading docstring.
__doc__ = "Export document out of database"
import contextlib
import os
import re
import sys

import psycopg2
import psycopg2.extras
from lxml import etree
# A valid publication name is one of the four series keywords immediately
# followed by a number that runs to the end of the string.  As written, the
# number part accepts 0-99 and 110-199 (100-109 and zero-padded values such
# as "05" do not match).
PUBLICATION_PATTERN = re.compile(
    "(?P<series>studies|sources|proceedings|textbooks)(?P<number>1?[1-9]?[0-9]{1})$"
)

# Directory that receives the exported XML files.
OUTPUT_DIR = "./output"
def print_error(message):
    """Write *message* to stderr as a single ``[ERROR] ...`` line.

    The ``[ERROR] `` prefix and the trailing newline are added here, so
    callers pass only the bare message text.
    """
    sys.stderr.write("[ERROR] %s\n" % message)
# def print_error ends here
def check_publication(input_string):
    """Validate a publication name against ``PUBLICATION_PATTERN``.

    Args:
        input_string: Publication name as given by the user,
            e.g. ``studies12``.

    Returns:
        ``True`` when the name matches the pattern.  A confirmation line
        is written to stdout in that case.

    Raises:
        SystemExit: With exit status 1 when the name does not match; an
            error message is written to stderr first.
    """
    if PUBLICATION_PATTERN.match(input_string) is None:
        print_error("Name of publication %s is not valid. Exiting." % input_string)
        # Exit with a non-zero status so that calling shells and scripts
        # can tell the run failed (a bare sys.exit() reports success).
        sys.exit(1)

    sys.stdout.write("Name of the publication %s is valid.\n" % input_string)
    return True
# def check_publication ends here
def connect_db():
    """Open a connection to the PostgreSQL database.

    The connection parameters (database ``eoa_2017``, user ``kthoden``,
    host ``localhost``) are hard-coded in the DSN below.

    Returns:
        The open psycopg2 connection.  A confirmation line is written to
        stdout once the connection is established.

    Raises:
        SystemExit: With exit status 1 when the connection cannot be
            established; the reason is written to stderr first.
    """
    try:
        connection = psycopg2.connect("dbname='eoa_2017' user='kthoden' host='localhost'")
    except psycopg2.Error as error:
        # psycopg2 reports connection failures through its own exception
        # hierarchy rooted at psycopg2.Error.  print_error() already adds
        # the "[ERROR]" prefix and the trailing newline.
        print_error("Could not connect: %s" % str(error).strip())
        sys.exit(1)

    sys.stdout.write("Connection established.\n")
    return connection
# def connect_db ends here
def get_publication_id(input_string, eoa_cursor):
    """Look up the database id of the publication named by *input_string*.

    The name is split into series and number with ``PUBLICATION_PATTERN``
    and both parts are matched against the ``"Serie"`` and ``"Number"``
    columns of ``publications_publication``.

    Args:
        input_string: Publication name, e.g. ``studies12``.
        eoa_cursor: Open database cursor used to run the query.

    Returns:
        The ``id`` column of the matching row.  When more than one row
        matches, an error is reported on stderr and the id of the first
        row is returned.

    Raises:
        SystemExit: With exit status 1 when *input_string* is not a valid
            publication name or when no matching publication exists.
    """
    id_match = PUBLICATION_PATTERN.match(input_string)
    if id_match is None:
        # Without this guard an invalid name fails with an AttributeError
        # on ``None.group`` instead of a readable message.
        print_error("Name of publication %s is not valid. Exiting." % input_string)
        sys.exit(1)

    eoa_series = id_match.group('series')
    eoa_number = id_match.group('number')

    # Values are passed as bound parameters so the driver does the quoting.
    query_string = (
        'SELECT "Title", "id" FROM publications_publication '
        'WHERE "Serie" = %s AND "Number" = %s'
    )
    eoa_cursor.execute(query_string, (eoa_series, eoa_number))
    rows = eoa_cursor.fetchall()

    if not rows:
        print_error("It seems like there is no such as publication %s %s. Exiting." % (eoa_series.title(), eoa_number))
        sys.exit(1)

    if len(rows) > 1:
        # Ambiguous result: report it, then carry on with the first match.
        print_error("There should be only one database entry that matches the input. Found %s" % len(rows))
    else:
        print("The title of the publication you selected is '%s'." % rows[0][0])

    return rows[0][1]
# def get_publication_id ends here
def get_paragraphs(chapter_id, eoa_cursor):
    """Return all elements of one chapter, ordered by their ``"Order"`` column.

    Args:
        chapter_id: Value matched against the ``"Chapter_id"`` column of
            ``publications_element``.
        eoa_cursor: Open database cursor used to run the query.

    Returns:
        The list of rows returned by ``eoa_cursor.fetchall()``.
    """
    # ``%s`` is the driver's placeholder; the value is passed separately so
    # it is quoted by the driver rather than pasted into the SQL text.
    query_string = 'SELECT * FROM publications_element WHERE "Chapter_id" = %s ORDER BY "Order" ASC;'
    eoa_cursor.execute(query_string, (chapter_id,))
    return eoa_cursor.fetchall()
# def get_paragraphs ends here
def build_paragraph_xml(paragraph_info):
    """Build the ``<EOAparagraph>`` element for one element row.

    The row's ``"Fulltext"`` value is wrapped in an ``EOAparagraph`` tag
    and parsed as XML, so any markup it contains becomes child nodes.
    The row's ``"Order"`` value is stored in the ``order`` attribute.

    Args:
        paragraph_info: Mapping-like row providing ``"Fulltext"`` and
            ``"Order"``.

    Returns:
        The parsed ``EOAparagraph`` element.
    """
    wrapped_markup = "<EOAparagraph>%s</EOAparagraph>" % paragraph_info["Fulltext"]
    paragraph_element = etree.fromstring(wrapped_markup)
    paragraph_element.set("order", str(paragraph_info["Order"]))
    return paragraph_element
# def build_paragraph_xml ends here
def get_chapters(eoa_pub_id, eoa_cursor):
    """Return all chapters of one publication, ordered by ``"Order"``.

    Args:
        eoa_pub_id: Value matched against the ``"Publication_id"`` column
            of ``publications_chapter``.
        eoa_cursor: Open database cursor used to run the query.

    Returns:
        The list of rows returned by ``eoa_cursor.fetchall()``; every
        column of the table is selected.
    """
    # ``%s`` is the driver's placeholder; the value is passed separately so
    # it is quoted by the driver rather than pasted into the SQL text.
    query_string = (
        'SELECT * FROM publications_chapter '
        'WHERE "Publication_id" = %s ORDER BY "Order" ASC'
    )
    eoa_cursor.execute(query_string, (eoa_pub_id,))
    return eoa_cursor.fetchall()
# get_chapters ends here
def build_chapter_xml(chap_dict):
    """Build the ``<EOAchapter>`` element for one chapter row.

    The element gets ``language`` and ``order`` attributes from the row's
    ``"Chapterlanguage"`` and ``"Order"`` values, a ``number`` attribute
    when ``"Number"`` is non-empty, and a ``<head>`` child holding the
    row's ``"Title"``.

    Args:
        chap_dict: Mapping-like row providing ``"Chapterlanguage"``,
            ``"Order"``, ``"Number"`` and ``"Title"``.

    Returns:
        The ``EOAchapter`` element (paragraphs are not added here).
    """
    chapter_element = etree.Element(
        "EOAchapter",
        language=chap_dict["Chapterlanguage"],
        order=str(chap_dict["Order"]),
    )

    # Only chapters that carry a number get the attribute.
    chapter_number = chap_dict["Number"]
    if len(chapter_number) > 0:
        chapter_element.set("number", chapter_number)

    head_element = etree.SubElement(chapter_element, "head")
    head_element.text = chap_dict["Title"]
    return chapter_element
# def build_chapter_xml ends here
def main(eoa_publication):
    """Export one publication from the database into an XML file.

    The publication name is validated, its chapters and their paragraphs
    are read from the database and assembled into an ``EOAdocument``
    tree, and the tree is written to ``<OUTPUT_DIR>/<name>.xml``.

    Args:
        eoa_publication: Publication name, e.g. ``studies12``.

    Raises:
        SystemExit: Raised by the helpers when the name is invalid or no
            matching publication is found.
    """
    # Reject malformed names before touching the database;
    # check_publication() exits on failure.
    check_publication(eoa_publication)

    # closing() guarantees that cursor and connection are released even
    # when a helper exits or a query fails.
    with contextlib.closing(connect_db()) as eoa_db:
        cursor = eoa_db.cursor(cursor_factory=psycopg2.extras.DictCursor)
        with contextlib.closing(cursor) as eoa_cursor:
            eoa_pub_id = get_publication_id(eoa_publication, eoa_cursor)
            chapters = get_chapters(eoa_pub_id, eoa_cursor)

            eoa_document = etree.Element("EOAdocument")
            for chapter in chapters:
                print(" ", chapter['Title'])
                chapter_xml = build_chapter_xml(chapter)
                for paragraph in get_paragraphs(chapter['id'], eoa_cursor):
                    chapter_xml.append(build_paragraph_xml(paragraph))
                eoa_document.append(chapter_xml)

    # Use one and the same (expanded) path for creating the directory and
    # for writing the file; exist_ok avoids a check-then-create race.
    output_dir = os.path.expanduser(OUTPUT_DIR)
    os.makedirs(output_dir, exist_ok=True)
    xml_file = os.path.join(output_dir, eoa_publication + ".xml")

    tree = etree.ElementTree(eoa_document)
    tree.write(xml_file, pretty_print=True, encoding="utf-8", xml_declaration=False)
# def main ends here
if __name__ == '__main__':
    # Exactly one command-line argument, the publication name, is expected.
    # Usage errors exit with a non-zero status so callers can detect them.
    if len(sys.argv) == 1:
        print_error("You must specify a publication!")
        sys.exit(1)
    elif len(sys.argv) > 2:
        print_error("You can work with only one publication at a time!")
        sys.exit(1)

    main(sys.argv[-1])
# finis