diff --git a/README.md b/README.md index 2a6fba7..5194c82 100644 --- a/README.md +++ b/README.md @@ -111,17 +111,21 @@ In order to apply the workflow to any other publication copy it into the `input/ $ ./scripts/run.py # run if not yet running $ ./scripts/exec_in_container.py # enter container -1. eoatei -> eoatex +1. eoaTEI -> eoaTEI with bibliography + + $ tei_add_bibl.py input/example/tei + +1. eoaTEI -> eoaTEX $ tei2eoatex.py -f input/example/tei/exampleTEI.xml -1. eoatex -> pdf +1. eoaTEX -> pdf $ eoatex2pdf.py -f output/from_tei/eoatex/main.tex -o output/from_tei/pdf (adjust filename if necessary) -1. eoatex -> imxml (to intermediate xml) +1. eoaTEI -> imxml (to intermediate xml) $ gather_pickledata.py input/example/tei/exampleTEI.xml input/example/tei/example.bib $ tei2imxml.py -f input/example/tei/exampleTEI.xml diff --git a/src/stylesheets/teibib_to_eoa1.xsl b/src/stylesheets/teibib_to_eoa1.xsl index 9fd7776..0b00692 100644 --- a/src/stylesheets/teibib_to_eoa1.xsl +++ b/src/stylesheets/teibib_to_eoa1.xsl @@ -32,6 +32,14 @@

+
+

citetitle

+

+ + + +

+

citefull

diff --git a/src/stylesheets/tex4ht_2_tei.xsl b/src/stylesheets/tex4ht_2_tei.xsl index a17c3e5..5d80102 100644 --- a/src/stylesheets/tex4ht_2_tei.xsl +++ b/src/stylesheets/tex4ht_2_tei.xsl @@ -120,10 +120,12 @@ + + diff --git a/src/tei2imxml.py b/src/tei2imxml.py index 76472f6..97d8df5 100755 --- a/src/tei2imxml.py +++ b/src/tei2imxml.py @@ -26,7 +26,6 @@ import shlex import argparse import configparser -import bibtexparser from datetime import datetime from bs4 import BeautifulSoup from copy import deepcopy @@ -316,7 +315,10 @@ def format_reference_list(used_citekeys, html_file): return references # def format_reference_list ends here -def format_citations(used_citekeys, bibdata, html_file): +def format_citations( + used_citekeys, + html_file +): """Return a dictionary of the used citations as formatted entries. citation_dict[citekey] = (authoryear_citation, year_citation, title) @@ -329,24 +331,15 @@ def format_citations(used_citekeys, bibdata, html_file): sys.exit(1) citation_dict = {} - for entry in used_citekeys: - if entry in bibdata: - current_citation = entry - logging.debug(f"""{html_file}: {entry}.""") - try: - strTitle = bibdata[entry]["title"] - except KeyError: - logging.warning("No title found for %s", entry) - - title = strTitle - try: - authoryear_citation = cites.xpath(f"//div[@class='authoryear']/p/span[@data-cites='{entry}']")[0].text - year_citation = cites.xpath(f"//div[@class='year']/p/span[@data-cites='{entry}']")[0].text - except IndexError: - logging.error(f"Entry {entry} was not found in HTML file. Maybe you should run the tool again without -n option. Exiting.") - sys.exit(1) - citation_dict[entry] = (authoryear_citation, year_citation, title) + try: + authoryear_citation = cites.xpath(f"//div[@class='authoryear']/p/span[@data-cites='{entry}']")[0].text + year_citation = cites.xpath(f"//div[@class='year']/p/span[@data-cites='{entry}']")[0].text + title = cites.xpath(f"//div[@class='title']/p/span[@data-cites='{entry}']")[0].text + except IndexError: + logging.error(f"Entry {entry} was not found in HTML file. Maybe you should run the tool again without -n option. Exiting.") + sys.exit(1) + citation_dict[entry] = (authoryear_citation, year_citation, title) return citation_dict # def format_citations ends here @@ -691,6 +684,7 @@ def handle_refs_default(ref): eoa_citations = xml_tree.xpath("//t:bibl", namespaces=NS_MAP) for citation in eoa_citations: + # logging.debug( f"handling citation: {etree.tostring(citation)}" ) pagerange = "" cited_range = citation.xpath("t:citedRange", namespaces=NS_MAP) citeref = citation.xpath("t:ref", namespaces=NS_MAP) @@ -1177,108 +1171,6 @@ def update_ids(xml_tree, ignore_ref_errors): return xml_tree # def update_ids ends here - -def get_all_citations(xml_file): - """Retrieve citations from file """ - - all_citations = xml_file.xpath("//t:bibl/t:ref", namespaces=NS_MAP) - - all_citekeys = [] - - for citation in all_citations: - citekey = citation.get("target")[1:] - if citekey not in all_citekeys: - all_citekeys.append(citekey) - - return all_citekeys -# def get_all_citations ends here - - -def get_citations_per_chapter(xml_tree): - """If publication is anthology, store which citations are mentioned in each chapter: - - 'chap18_schwartz': {'Blodget_1857', 'CliffordMarcus_1986', - 'Hunter_2004', 'MarcusFischer_1986', 'Mitchell_1992', 'Nye_1994', - 'Schlereth_1980', 'Schwartz_2003', 'Schwartz_2011'}} - - """ - - refs_per_chapter = {} - - all_chapters = xml_tree.xpath("//t:div[@type='chapter']", namespaces=NS_MAP) - logging.info(f"Found {libeoaconvert.plural(len(all_chapters), 'chapter')}.") - - for chapter in all_chapters: - try: - chapter_id = chapter.xpath("@xml:id", namespaces=NS_MAP)[0] - except IndexError: - logging.error(f"Found a chapter without identifier. Each chapter must have one. Exiting.") - sys.exit(1) - all_refs_with_hash = chapter.xpath(".//t:bibl/t:ref/@target", namespaces=NS_MAP) - all_refs = [x[1:] for x in all_refs_with_hash] - logging.info(f"Found {libeoaconvert.plural(len(all_refs), 'reference')} in this chapter.") - refs_per_chapter[chapter_id] = set(all_refs) - - return refs_per_chapter -# def get_citations_per_chapter ends here - -def convert_bibliography_to_dict( - bib_file : Path -): - """Create a dictionary from bibliography data.""" - - parser = bibtexparser.bparser.BibTexParser() - # be a bit lax about nonstandard entry types - parser.ignore_nonstandard_types = False - - bibliography_dict = {} - - with open(bib_file) as btf: - btb = bibtexparser.load(btf, parser=parser) - bibliography_dict = btb.entries_dict - - return bibliography_dict -# def convert_bibliography_to_dict ends here - -def make_bibliography_tex4ht( - used_citekeys, - bib_data, - output_file_root, - publication_language, - TEMP_DIR, - log_dir, - input_dir - ): - """Create the HTML version of the bibliography using tex4ht - - Return the filename of the HTML file - """ - - translations = {"de" : "german", "en" : "english", "it" : "italian", "fr" : "french"} - - citations_filename_tei = Path(output_file_root).with_suffix(".tei") - - bib2html.bib2tei( - bib_file = input_dir / bib_data["source"], - citekeys = used_citekeys, - language = translations[publication_language], - temp_dir = TEMP_DIR, - output_file = citations_filename_tei, - log_dir = log_dir, - keywords = [""] - ) - - citations_filename_html = Path(output_file_root).with_suffix(".html") - bib2html.teibib_to_eoa1( - citations_filename_tei, - output_file = citations_filename_html - ) - - - return citations_filename_html -# def make_bibliography_tex4ht ends here - - def add_bibliography_monograph(xml_tree, refs_for_bib_chapter): """Add another chapter containing the bibliography.""" @@ -1493,37 +1385,32 @@ def main(): bib_data = check_bibliography(xml_tree) - citations_dict = convert_bibliography_to_dict( - INPUT_DIR / bib_data["source"] - ) - logging.debug("Creating bibliographies.") + cited_dict = {} if bib_data["type"] == "monograph": - used_citekeys = get_all_citations(xml_tree) - citations_filename_root = Path(TEMP_DIR, "formatted_citations_monograph") + bibl_info = bib2html.get_bibl_info( xml_tree ) + logging.debug( f"citekeys: {bibl_info['citekeys']}" ) - if args.no_bib4ht: - citations_filename_html = citations_filename_root.with_suffix(".html") - logging.info("Skipping creation of HTML bibliography files. Using the existing ones.") - else: - citations_filename_html = make_bibliography_tex4ht( - used_citekeys, - bib_data, - citations_filename_root, - publication_language, - TEMP_DIR, - LOG_DIR, - INPUT_DIR - ) + citations_filename_tei = \ + (INPUT_DIR / "bibliography/bibliography_all") . with_suffix(".tei") + + citations_filename_html = (TEMP_DIR / "formatted_citations_monograph") . with_suffix(".html") + bib2html.teibib_to_eoa1( + citations_filename_tei, + output_file = citations_filename_html + ) logging.info("Formatting citations now.") - cited_dict = format_citations(used_citekeys, citations_dict, citations_filename_html) - refs_for_bib_chapter = format_reference_list(used_citekeys, citations_filename_html) + # citekey -> (authoryear, year, title) + cited_dict = format_citations( + bibl_info['citekeys'], + citations_filename_html + ) + refs_for_bib_chapter = format_reference_list(bibl_info['citekeys'], citations_filename_html) elif bib_data["type"] == "anthology": - citations_per_chapter = get_citations_per_chapter(xml_tree) + bibl_info = bib2html.get_bibl_info( xml_tree ) formatted_references_dict = {} all_chapter_ids = xml_tree.xpath("//t:div[@type='chapter']/@xml:id", namespaces=NS_MAP) - cited_dict = {} for chapter_id in all_chapter_ids: used_citekeys_per_chapter = citations_per_chapter[chapter_id] @@ -1534,21 +1421,31 @@ def main(): else: citations_filename_root = Path(TEMP_DIR, f"formatted_citations_{chapter_id}") if args.no_bib4ht: - citations_filename_html_per_chapter = citations_filename_root.with_suffix(".html") + # citations_filename_html_per_chapter = citations_filename_root.with_suffix(".html") logging.info("Skipping creation of HTML bibliography files. Using the existing ones.") else: - citations_filename_html_per_chapter = make_bibliography_tex4ht( - used_citekeys_per_chapter, - bib_data, - citations_filename_root, - publication_language, - TEMP_DIR, - LOG_DIR, - INPUT_DIR + + citations_filename_tei_per_chapter = citations_filename_root . with_suffix(".tei") + if not citations_filename_tei.is_file(): + translations = {"de" : "german", "en" : "english", "it" : "italian", "fr" : "french"} + bib2html.bib2tei( + bib_file = INPUT_DIR / bib_data["source"], + citekeys = used_citekeys, + language = translations[publication_language], + temp_dir = TEMP_DIR, + output_file = citations_filename_tei_per_chapter, + log_dir = LOG_DIR, + keywords = [""] + ) + citations_filename_html_per_chapter = citations_filename_root . with_suffix(".html") + bib2html.teibib_to_eoa1( + citations_filename_tei_per_chapter, + output_file = citations_filename_html_per_chapter ) logging.info("Formatting citations now.") - cited_dict_per_chapter = format_citations(used_citekeys_per_chapter, citations_dict, citations_filename_html_per_chapter) + # citekey -> (authoryear, year, title) + cited_dict_per_chapter = format_citations(used_citekeys_per_chapter, citations_filename_html_per_chapter) # Merge dictionaries cited_dict = {**cited_dict, **cited_dict_per_chapter} @@ -1557,13 +1454,24 @@ def main(): # create a dictionary entry containing the formatted references formatted_references_dict[tmp_dict_key] = refs_for_bib_chapter logging.debug(f"cited_dict now has {libeoaconvert.plural(len(cited_dict), 'entry', plural='entries')}.") + else: + raise( Exception("unknown publication type!")) + logging.debug( cited_dict ) tei_body = xml_tree.xpath("//t:body", namespaces=NS_MAP)[0] if args.hyperimage: logging.info("Transforming body with Hyperimage support") else: pass - body_transformed_tmp = transform_body(tei_body, cited_dict, TRANSLATION_FILE, HI_XML_FILE, args.eoa_classic, publang=publication_language, hyperimage=args.hyperimage) + body_transformed_tmp = transform_body( + tei_body, + cited_dict, + TRANSLATION_FILE, + HI_XML_FILE, + args.eoa_classic, + publang=publication_language, + hyperimage=args.hyperimage + ) libeoaconvert.debug_xml_here(body_transformed_tmp, "body_transformed", DEBUG_DIR) body_transformed = etree.ElementTree(body_transformed_tmp) diff --git a/src/tei_add_bibl.py b/src/tei_add_bibl.py index c0502af..98f630f 100755 --- a/src/tei_add_bibl.py +++ b/src/tei_add_bibl.py @@ -12,10 +12,6 @@ from os import environ from shutil import rmtree, copytree, ignore_patterns, copy -def main( -): - logging.info("hallo") - BASE_DIR = Path( __file__ ).parent SCRIPT_NAME = Path( __file__).stem @@ -29,6 +25,8 @@ def main( DEFAULT_DEPENDENCIES_DIR = \ Path(environ['DEPENDENCIES_DIR'] if 'DEPENDENCIES_DIR' in environ else './dependencies') +NS_MAP = {"tei" : 'http://www.tei-c.org/ns/1.0'} + def copy_dir( src, dst, @@ -57,41 +55,52 @@ def copy_file( **opts ) -def info_from_tei( - tei_file -): - NS_MAP = {"tei" : 'http://www.tei-c.org/ns/1.0'} - translations = {"de" : "german", "en" : "english", "it" : "italian", "fr" : "french"} +def publication_info(xml_tree): + """Check TEI header for bibliography data, return relevant data as dictionary.""" - tei_tree = etree.parse(str(tei_file)) - citekeys = tei_tree.xpath( - "/tei:TEI//tei:body//tei:bibl/tei:ref/@target", - namespaces = NS_MAP - ) - citekeys = [key.lstrip('#') for key in citekeys] + bib_file = xml_tree.xpath("//tei:teiHeader/tei:fileDesc/tei:sourceDesc/tei:ab[@type='bibdatabase']/tei:ref/@target", namespaces=NS_MAP)[0] + publ_type = xml_tree.xpath("//tei:teiHeader/tei:fileDesc/tei:sourceDesc/tei:ab[@type='bibdatabase']/tei:ref/@type", namespaces=NS_MAP)[0] + if publ_type not in ["monograph", "anthology", "monograph-numeric", "anthology-numeric"]: + logging.error(f"The bibliography type {publ_type} is not allowed. Exiting") + exit(1) language = tei_tree.xpath( "/tei:TEI/tei:teiHeader/tei:profileDesc/tei:langUsage/tei:language/@ident", namespaces = NS_MAP )[0] - language = translations[language] - processing_instructions = tei_tree.xpath( - "//processing-instruction('eoa')" - ) - keywords = [] - for k in processing_instructions: - as_str = str(k).lstrip('').split(" ") - if as_str[0:2] == ["eoa", "printbibliography"]: - if len(as_str) > 2: - keywords += [as_str[2].strip('"').strip("'")] - else: - keywords += [""] - return { - "citekeys": citekeys, + "bib_file": bib_file, + "publ_type": publ_type, "language": language, - "keywords": keywords + } +def create_bibl_and_insert( + tei_tree, + temp_dir, + tei_bib_file, + tei_file, + tei_with_bibl_file +): + translations = {"de" : "german", "en" : "english", "it" : "italian", "fr" : "french"} + tei_info = bib2html.get_bibl_info( tei_tree ) + logging.debug( f"info from tei file: {tei_info}" ) + bib2html.bib2tei( + bib_file = bib_file, + citekeys = tei_info['citekeys'], + keywords = tei_info['keywords'], + language = translations[publ_info['language']], + tex_template = BASE_DIR / "data/aux/bibliography4ht.tex", + temp_dir = temp_dir, + output_file = tei_bib_file, + log_dir = temp_dir / "log" + ) + run_xslt( + tei_file, + BASE_DIR / "stylesheets/insert_bibliography.xsl", + params = [ f"tei_bib_file={tei_bib_file}" ], + output_file = tei_with_bibl_file + ) + if __name__ == '__main__': # parse args: @@ -192,34 +201,27 @@ def info_from_tei( copy_dir( publ_dir, output_dir, - ignore = ignore_patterns( tei_file_input ) if not(tei_file_input . is_absolute()) else None + # ignore = ignore_patterns( tei_file_input ) if not(tei_file_input . is_absolute()) else None ) - copy_file( - tei_file, - (output_dir / (tei_file.with_suffix("").name + "_orig")) . with_suffix( tei_file . suffix ) + tei_tree = etree.parse(str(tei_file)) + publ_info = publication_info( tei_tree ) + logging.info( f"The bibfile is '{publ_info['bib_file']}' and this publication type is '{publ_info['publ_type']}'. Language: '{publ_info['language']}'") + if publ_info["publ_type"] == "monograph": + create_bibl_and_insert( + tei_tree, + temp_dir = output_dir / "temp/all", + tei_bib_file = (output_dir/ "bibliography/bibliography_all") . with_suffix(".tei"), + tei_file = tei_file, + tei_with_bibl_file = (output_dir / (tei_file.with_suffix("").name + "_with_bibl")) . with_suffix( ".xml" ) ) - tei_info = info_from_tei( tei_file ) - - logging.debug( f"info from tei file: {tei_info}" ) - - # language = "german" - temp_dir = output_dir / "temp" - tei_bib_file = (output_dir/ "bibliography") . with_suffix(".tei") - - bib2html.bib2tei( - bib_file = bib_file, - citekeys = tei_info['citekeys'], - keywords = tei_info['keywords'], - language = tei_info['language'], - tex_template = BASE_DIR / "data/aux/bibliography4ht.tex", - temp_dir = temp_dir, - output_file = tei_bib_file, - log_dir = log_dir - ) - - run_xslt( - tei_file, - BASE_DIR / "stylesheets/insert_bibliography.xsl", - params = [ f"tei_bib_file={tei_bib_file}" ], - output_file = output_dir / tei_file.name - ) + elif publ_info["publ_type"] == "anthology": + for chap_node in tei_tree.xpath("//tei:body//tei:div[@type = 'chapter']"): + + chapter_id = chap_node.xpath("@xml:id", namespaces=NS_MAP) + create_bibl_and_insert( + tei_tree, + temp_dir = output_dir / f"temp/chap_{chapter_id}", + tei_bib_file = (output_dir/ f"bibliography/bibliography_chap_{chapter_id}") . with_suffix(".tei") + ) + else: + raise( Exception("unknown publication type!")) diff --git a/src/utils/bib2html.py b/src/utils/bib2html.py index d936e11..7c32c6e 100755 --- a/src/utils/bib2html.py +++ b/src/utils/bib2html.py @@ -35,6 +35,32 @@ BIBLIOGRAPHY_CHAPTER_NO_KEYWORD = "BIBLIOGRAPHY" BIBLIOGRAPHY_CHAPTER = "BIBLIOGRAPHY-{keyword}" +def get_bibl_info( + tei_tree +): + + NS_MAP = {"tei" : 'http://www.tei-c.org/ns/1.0'} + citekeys = tei_tree.xpath( + ".//tei:bibl/tei:ref/@target", + namespaces = NS_MAP + ) + citekeys = [key.lstrip('#') for key in citekeys] + processing_instructions = tei_tree.xpath( + ".//processing-instruction('eoa')" + ) + keywords = [] + for k in processing_instructions: + as_str = str(k).lstrip('').split(" ") + if as_str[0:2] == ["eoa", "printbibliography"]: + if len(as_str) > 2: + keywords += [as_str[2].strip('"').strip("'")] + else: + keywords += [""] + return { + "citekeys": citekeys, + "keywords": keywords + } + def latex_escape_non_ascii( input_str ): output = "" @@ -66,9 +92,9 @@ def write_dummy_latex( tmp_dir = tmp_filename.parent allcitekeys = "" - allcitekeys += "\\begin{tabular}{l l l}\n" + allcitekeys += "\\begin{tabular}{l l l l}\n" for (i,key) in enumerate(citekeys): - allcitekeys += f"\\verb|{key}| &\\cite{{{key}}}&\\cite*{{{key}}}" + allcitekeys += f"\\verb|{key}| &\\cite{{{key}}}&\\cite*{{{key}}}&\\citefield{{{key}}}{{title}}" if i < len(citekeys) - 1: allcitekeys += "\\\\" allcitekeys += "\n"