def restore_xml_tags(text):
    """Convert escaped XML entity references back to literal characters.

    The following substitutions are applied, in this order:

        &lt;   => <
        &gt;   => >
        &apos; => '
        &amp;  => &

    Args:
        text: String that may contain the entity references listed above.

    Returns:
        A copy of ``text`` with those references replaced by the
        characters they stand for. Text without any of the references
        is returned unchanged.
    """

    # The ampersand is handled last on purpose: a double-escaped sequence
    # such as "&amp;lt;" must only be unescaped one level (to "&lt;"),
    # not collapsed all the way down to "<".
    replacements = (
        ("&lt;", "<"),
        ("&gt;", ">"),
        ("&apos;", "'"),
        ("&amp;", "&"),
    )

    for entity, character in replacements:
        text = text.replace(entity, character)

    return text
# def restore_xml_tags ends here
@@ -271,10 +287,11 @@ def make_figure_elements(list_of_figures, figure_directory): # def make_figure_elements ends here def cleanup_xml(xml_tree): - """Perform some cleaning on XML""" + """Perform some cleaning on XML - # also, delete elements and attributes inserted by metypeset - # and, rename elements according to our schema + Also, delete elements and attributes inserted by metypeset and + rename elements according to our schema + """ metypeset_attrib = xml_tree.findall("//t:*[@meTypesetSize]", namespaces=NS_MAP) color_attrib = xml_tree.xpath("//t:hi[contains(@rend, 'color') or contains(@rend, 'background')]", namespaces=NS_MAP) @@ -343,17 +360,6 @@ def fix_document_structure(xml_tree, highest_level): subsection.set("type", "subsection") for subsubsection in subsubsection_divs: subsubsection.set("type", "subsubsection") - - # section_divs = xml_tree.xpath("//t:body/t:div", namespaces=NS_MAP) - # subsection_divs = xml_tree.xpath("//t:body/t:div/t:div", namespaces=NS_MAP) - # subsubsection_divs = xml_tree.xpath("//t:body/t:div/t:div/t:div", namespaces=NS_MAP) - - # for section in section_divs: - # section.set("type", "section") - # for subsection in subsection_divs: - # subsection.set("type", "subsection") - # for subsubsection in subsubsection_divs: - # subsubsection.set("type", "subsubsection") # def fix_document_structure ends here def fix_tei_header(xml_tree, bibfile_string): @@ -361,16 +367,19 @@ def fix_tei_header(xml_tree, bibfile_string): title_statement = xml_tree.xpath("//t:titleStmt", namespaces=NS_MAP)[0] title_element = title_statement.find("t:title", namespaces=NS_MAP) - title_element.set("level", "s") - title_element.set("n", "20") - title_element.text = "Titel der Serie" - main_title = etree.Element("title", type="main") - main_title.text = "FotoObjekte" - title_statement.insert(0, main_title) + title_element.set("type", "main") + title_element.set("level", "m") + if title_element.text is None: + title_element.text = "Insert title of publication 
here" + else: + pass - # series = etree.SubElement(title_statement, "title", level="s", n="20").text = "Studies" - # subtitle = etree.SubElement(title_statement, "title", level="sub").text = "Artikelsammlung" + edition = xml_tree.xpath("//t:editionStmt/t:edition", namespaces=NS_MAP)[0] + edition_date = edition.find("t:date", namespaces=NS_MAP) + edition_date.clear() + edition_date.tag = "tagtobestripped" + edition.text = "First published {} by {}".format(datetime.now().strftime("%Y"), BOILERPLATES.get("Header","eoa_name")) publication_statement = xml_tree.xpath("//t:publicationStmt", namespaces=NS_MAP)[0] unknown_paragraph = publication_statement.find("t:p", namespaces=NS_MAP) @@ -378,26 +387,94 @@ def fix_tei_header(xml_tree, bibfile_string): unknown_paragraph.clear() unknown_paragraph.tag = "tagtobestripped" - etree.SubElement(publication_statement, "publisher").text = "Edition Open Access" + publisher_element = etree.SubElement(publication_statement, "publisher") + overall_org = etree.SubElement(publisher_element, "orgName", n="EOA", ref=BOILERPLATES.get("Header","eoa_url")) + overall_org.text = BOILERPLATES.get("Header","eoa_name") + publishing_org = etree.SubElement(publisher_element, "orgName", n="Press", ref=BOILERPLATES.get("Header","mprl_url")) + publishing_org.text = BOILERPLATES.get("Header","mprl_name") + pub_date = etree.SubElement(publication_statement, "date", when=datetime.now().strftime("%Y-%m-%d")) availability = etree.SubElement(publication_statement, "availability") - licence = etree.SubElement(availability, "licence", target="https://creativecommons.org/licenses/by-nc-sa/3.0/de/deed.en") - licence.text = "by-nc-sa" - # licence_text = etree.SubElement(licence, "p").text = """Distributed under the Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Germany License.""" + licence = etree.SubElement(availability, "licence", target=BOILERPLATES.get("Header","licence_url")) + licence.text = BOILERPLATES.get("Header","licence_text") + + # series 
statement + series_stmt = etree.Element("seriesStmt") + title_element = etree.SubElement(series_stmt, "title").text = "Series title" + resp_stmt = etree.SubElement(series_stmt, "respStmt") + resp_title = etree.SubElement(resp_stmt, "resp").text = "Series Editors" + resp_names = etree.SubElement(resp_stmt, "name", type="serieseditors") + resp_names.text = BOILERPLATES.get("Header","mprl_series_editors") + series_number = etree.SubElement(series_stmt, "idno", type="number").text = "number" + publication_stmt_parent = publication_statement.getparent() + series_stmt_insertion_point = get_place_in_xml_tree(publication_statement, publication_stmt_parent) + 1 + publication_stmt_parent.insert(series_stmt_insertion_point, series_stmt) source_desc = xml_tree.xpath("//t:sourceDesc", namespaces=NS_MAP)[0] - bibfile = etree.SubElement(source_desc, "ab", type="bibliography") + suggested_citation = etree.SubElement(source_desc, "ab", type="suggestedcitation").text = "Suggested Citation" + bibfile = etree.SubElement(source_desc, "ab", type="bibdatabase") etree.SubElement(bibfile, "ref", type="monograph", target=bibfile_string) + # profile description profile_desc = etree.Element("profileDesc") + brief_abstract = etree.SubElement(profile_desc, "abstract", n="BriefDescription") + brief_abstract_p = etree.SubElement(brief_abstract, "p").text = "Short abstract" + detailed_abstract = etree.SubElement(profile_desc, "abstract", n="DetailedDescription") + detailed_abstract_p = etree.SubElement(detailed_abstract, "p").text = "Long abstract" + additional_text = etree.SubElement(profile_desc, "abstract", n="additional") + additional_text_p = etree.SubElement(additional_text, "p").text = "Additional text" + textclass = etree.SubElement(profile_desc, "textClass") + keywords = etree.SubElement(textclass, "keywords") + list_keywords = etree.SubElement(keywords, "list") + keyword_item = etree.SubElement(list_keywords, "item").text = BOILERPLATES.get("Header","eoa_name") langusage = 
def add_tei_frontpart(cover_url="images/Cover.jpg",
                      cover_caption="Cover caption",
                      dedication_text="Dedication text"):
    """Build a small front part.

    The returned ``front`` element contains a cover figure (a ``graphic``
    pointing at the cover image plus a ``head`` holding the caption) and,
    optionally, a dedication ``div`` with a single ``ab`` child. All
    elements are created with unqualified tag names.

    Args:
        cover_url: Value for the ``url`` attribute of the cover graphic.
        cover_caption: Text of the cover figure's ``head`` element.
        dedication_text: Text of the dedication. Pass ``None`` to leave
            the dedication ``div`` out entirely.

    Returns:
        The newly created ``front`` element; it is not attached to any
        tree, the caller inserts it where needed.
    """

    frontpart = etree.Element("front")

    cover_image = etree.SubElement(frontpart, "figure", type="cover")
    etree.SubElement(cover_image, "graphic", url=cover_url)
    etree.SubElement(cover_image, "head").text = cover_caption

    if dedication_text is not None:
        dedication = etree.SubElement(frontpart, "div", type="dedication")
        etree.SubElement(dedication, "ab").text = dedication_text

    return frontpart
# def add_tei_frontpart ends here