# Publication-metadata export for transform_xml.py.
#
# The standalone tei2django.py converter is deleted by this change and its
# TEI-header handling lives here instead: get_publication_info() reads the
# header fields and make_publication_cfg() writes them to
# OUTPUT_DIR/publication.cfg.  The conversion script calls them back to back:
#
#     publication_info = get_publication_info(xml_tree)
#     make_publication_cfg(publication_info)

import configparser
import os
import sys
from datetime import datetime

TMP_DIR = os.path.expanduser("tmp_files")
CSL_FILE = "/Users/kthoden/EOAKram/dev/eoa-csl/eoa.csl"
OUTPUT_DIR = os.path.expanduser("CONVERT")

# this is duplicated from libeoaconvert
dictLangFootnotes = {"it" : "Note a piè pagina", "fr" : "notes en bas de page", "de" : "Fußnoten", "en" : "Footnotes"}

# Namespace prefixes usable in the XPath expressions below.
_NS_MAP = {
    "t": "http://www.tei-c.org/ns/1.0",
    "c": "http://web.resource.org/cc/",
    "r": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
}

# Shared XPath prefixes, so that each field below only spells out its tail.
_FILE_DESC = "//t:teiHeader/t:fileDesc"
_PROFILE_DESC = "//t:teiHeader/t:profileDesc"
_TITLE_STMT = _FILE_DESC + "/t:titleStmt"
_PUBLICATION_STMT = _FILE_DESC + "/t:publicationStmt"
_SOURCE_DESC = _FILE_DESC + "/t:sourceDesc"

# (info_dict key, XPath) pairs.  Mandatory values (according to database
# schema): a missing one terminates the program.
_MANDATORY_FIELDS = (
    ("eoa_publicationdate", _PUBLICATION_STMT + "/t:date/@when"),
    ("eoa_language", _PROFILE_DESC + "/t:langUsage/t:language/@ident"),
    ("eoa_license", _PUBLICATION_STMT + "/t:availability/t:licence/text()"),
    ("eoa_number", _TITLE_STMT + "/t:title[@level='s']/@n"),
    ("eoa_series", _TITLE_STMT + "/t:title[@level='s']/text()"),
    ("eoa_title", _TITLE_STMT + "/t:title[@type='main']/text()"),
)

# Optional (according to database schema): a missing one becomes "".
_OPTIONAL_FIELDS = (
    ("eoa_subtitle", _TITLE_STMT + "/t:title[@type='sub']/text()"),
    ("eoa_isbn", _PUBLICATION_STMT + "/t:idno[@type='ISBN']/text()"),
    ("eoa_price", _FILE_DESC + "/t:extent/t:measure[@unit='EUR']/@quantity"),
    ("eoa_shoplink_url", _PUBLICATION_STMT + "/t:distributor/@xml:base"),
    ("eoa_shoplink_id", _PUBLICATION_STMT + "/t:distributor/@xml:id"),
    ("eoa_shoplink_text", _PUBLICATION_STMT + "/t:distributor/text()"),
    ("eoa_brief_desc", _SOURCE_DESC + "/t:ab[@type='BriefDescription']/text()"),
    ("eoa_detail_desc", _SOURCE_DESC + "/t:ab[@type='DetailedDescription']/text()"),
    ("eoa_additional_info", _SOURCE_DESC + "/t:ab[@type='additionalinformation']/text()"),
    ("eoa_dedication", _SOURCE_DESC + "/t:ab[@type='dedication']/text()"),
)

# Repeatable elements: every match is collected into a list of strings.
_LIST_FIELDS = (
    ("eoa_submitters", _TITLE_STMT + "/t:editor[@role='submitter']"),
    ("eoa_publicationmanagers", _TITLE_STMT + "/t:editor[@role='publicationmanager']"),
    ("eoa_publicationassistants", _TITLE_STMT + "/t:editor[@role='publicationassistant']"),
    ("eoa_editorialcoordinators", _TITLE_STMT + "/t:editor[@role='editorialcoordinator']"),
    ("eoa_copyeditors", _TITLE_STMT + "/t:editor[@role='copyeditor']"),
    ("eoa_translators", _TITLE_STMT + "/t:editor[@role='translator']"),
    ("eoa_keywords", _PROFILE_DESC + "/t:textClass/t:keywords/t:list/t:item"),
    ("eoa_authors", _TITLE_STMT + "/t:author"),
)

# Upper bounds enforced by make_publication_cfg().
_MAX_KEYWORDS = 8
_MAX_AUTHORS = 5


def _get_field(xml_tree, query_path, mandatory=False, findall=False):
    """Query the XML tree for one metadata field.

    With ``findall`` set, return a list holding the text of every element
    matching ``query_path`` (possibly empty).  Otherwise return the first
    XPath result, or "" when there is none.

    Default behaviour is to move on when nothing is found; if ``mandatory``
    is set, the program exits instead.
    """
    if findall:
        # .text is None for an element without leading text; normalise it to
        # "" so that the later ", ".join() and configparser assignments, which
        # both require strings, do not fail.
        return [element.text or ""
                for element in xml_tree.findall(query_path, namespaces=_NS_MAP)]

    matches = xml_tree.xpath(query_path, namespaces=_NS_MAP)
    if matches:
        return matches[0]
    if mandatory:
        sys.exit("Field stored in %s is mandatory. Exiting." % query_path)
    return ""
# def _get_field ends here


def get_publication_info(xml_tree):
    """Query the TEI document for metadata fields.

    ``xml_tree`` must offer ``xpath()`` and ``findall()`` methods that accept
    a ``namespaces`` argument; presumably an lxml ElementTree (confirm against
    the caller).

    Return a dictionary keyed by ``eoa_*`` names.  Single-valued fields map
    to a string ("" when an optional field is absent); the contributor,
    keyword and author fields map to lists of strings.

    Exits the program through ``sys.exit`` when a mandatory field is missing.
    """
    info_dict = {}

    for key, query_path in _MANDATORY_FIELDS:
        info_dict[key] = _get_field(xml_tree, query_path, mandatory=True)

    for key, query_path in _OPTIONAL_FIELDS:
        info_dict[key] = _get_field(xml_tree, query_path)

    for key, query_path in _LIST_FIELDS:
        info_dict[key] = _get_field(xml_tree, query_path, findall=True)

    return info_dict
# def get_publication_info ends here


def make_publication_cfg(info_dict):
    """Write the publication metadata to ``OUTPUT_DIR/publication.cfg``.

    ``info_dict`` is the dictionary produced by get_publication_info().  The
    file gets three sections: ``Technical``, ``General`` and ``Authors``.
    Keys keep their case and are separated from values by a colon.  The
    ``Authors`` section always holds ``Author1`` to ``Author5`` (unused slots
    are empty) followed by an empty ``Zusatz``.

    OUTPUT_DIR is created if it does not exist yet.  Nothing is returned; the
    name of the written file is printed.

    Raises ValueError if ``eoa_publicationdate`` is not in YYYY-MM-DD form.
    Exits the program through ``sys.exit`` when more than 8 keywords or more
    than 5 authors are given.
    """
    # NOTE(review): the default interpolation is active, so a literal "%" in
    # any value makes the assignment raise ValueError.
    config = configparser.ConfigParser(delimiters=(':',))
    # https://stackoverflow.com/questions/1611799/preserve-case-in-configparser
    config.optionxform = str

    publication_date = datetime.strptime(info_dict['eoa_publicationdate'], "%Y-%m-%d")

    # NOTE(review): only the third argument ({2}, the link text) ends up in
    # the Shoplink value; the URL and the id are passed but unused.  This
    # looks as if markup around the placeholders was lost; confirm the
    # intended template before changing it.
    shoplink = """{2}""".format(info_dict['eoa_shoplink_url'], info_dict['eoa_shoplink_id'].replace("id_", ""), info_dict['eoa_shoplink_text'])

    config['Technical'] = {
        'Serie': info_dict['eoa_series'],
        'Number': info_dict['eoa_number'],
        'Title': info_dict['eoa_title'],
        'Subtitle': info_dict['eoa_subtitle'],
        'PublicationDate': info_dict['eoa_publicationdate'],
        'PublicationYear': publication_date.strftime("%Y"),
        'ISBN': info_dict['eoa_isbn'],
        'Price': info_dict['eoa_price'],
        'Shoplink': shoplink,
        'Language': info_dict['eoa_language'],
        'License': info_dict['eoa_license'],
    }

    general = {
        'BriefDescription': info_dict['eoa_brief_desc'],
        'Submitter': ", ".join(info_dict['eoa_submitters']),
        'PublicationManagment': ", ".join(info_dict['eoa_publicationmanagers']),
        'PublicationAssistants': ", ".join(info_dict['eoa_publicationassistants']),
    }

    keywords = info_dict['eoa_keywords']
    if len(keywords) > _MAX_KEYWORDS:
        sys.exit("Too many Keywords. Up to %d are allowed. Exiting." % _MAX_KEYWORDS)
    # Number by position: looking the keyword up with list.index() would give
    # a repeated keyword the label of its first occurrence.
    for position, keyword in enumerate(keywords, start=1):
        general["Keyword" + str(position)] = keyword

    general['DetailedDescription'] = info_dict['eoa_detail_desc']
    general['AdditionalInformation'] = info_dict['eoa_additional_info']
    general['EditorialCoordination'] = ", ".join(info_dict['eoa_editorialcoordinators'])
    general['Copyediting'] = ", ".join(info_dict['eoa_copyeditors'])
    general['Dedication'] = info_dict['eoa_dedication']
    general['Translator'] = ", ".join(info_dict['eoa_translators'])
    config['General'] = general

    authors = list(info_dict['eoa_authors'])
    if len(authors) > _MAX_AUTHORS:
        sys.exit("Too many authors. Up to %d are allowed. Exiting." % _MAX_AUTHORS)
    # Always emit all author slots, padding the unused ones with "".
    authors += [""] * (_MAX_AUTHORS - len(authors))
    author_section = {}
    for position, author in enumerate(authors, start=1):
        author_section["Author" + str(position)] = author
    author_section['Zusatz'] = ""
    config['Authors'] = author_section

    # The function writes into OUTPUT_DIR, so make sure it is there.
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    output_filename = os.path.join(OUTPUT_DIR, "publication.cfg")
    # Titles and names are not limited to ASCII; do not depend on the locale.
    with open(output_filename, 'w', encoding="utf-8") as configfile:
        config.write(configfile)

    print("Wrote", output_filename)
# def make_publication_cfg ends here