Skip to content

Commit

Permalink
improved bib->tei allows for further simplification of tei->imxml
Browse files Browse the repository at this point in the history
  • Loading branch information
EsGeh authored and EsGeh committed Dec 19, 2019
1 parent 513d929 commit 5fc7acc
Show file tree
Hide file tree
Showing 6 changed files with 169 additions and 219 deletions.
10 changes: 7 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -111,17 +111,21 @@ In order to apply the workflow to any other publication copy it into the `input/
$ ./scripts/run.py # run if not yet running
$ ./scripts/exec_in_container.py # enter container

1. eoatei -> eoatex
1. eoaTEI -> eoaTEI with bibliography

$ tei_add_bibl.py input/example/tei

1. eoaTEI -> eoaTEX

$ tei2eoatex.py -f input/example/tei/exampleTEI.xml

1. eoatex -> pdf
1. eoaTEX -> pdf

$ eoatex2pdf.py -f output/from_tei/eoatex/main.tex -o output/from_tei/pdf

(adjust filename if necessary)

1. eoatex -> imxml (to intermediate xml)
1. eoaTEI -> imxml (intermediate XML)

$ gather_pickledata.py input/example/tei/exampleTEI.xml input/example/tei/example.bib
$ tei2imxml.py -f input/example/tei/exampleTEI.xml
8 changes: 8 additions & 0 deletions src/stylesheets/teibib_to_eoa1.xsl
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,14 @@
</xsl:call-template>
</p>
</div>
<div class="title">
<h1 id="citetitle">citetitle</h1>
<p>
<xsl:call-template name="render_bib_refs">
<xsl:with-param name="type" select="'title'"/>
</xsl:call-template>
</p>
</div>
<div class="full">
<h1 id="citefull">citefull</h1>
<xsl:call-template name="render_full_bibrefs"/>
Expand Down
2 changes: 2 additions & 0 deletions src/stylesheets/tex4ht_2_tei.xsl
Original file line number Diff line number Diff line change
Expand Up @@ -120,10 +120,12 @@
<variable name="citekey" as="xs:string" select="html:td[1]"/>
<variable name="authoryear" as="xs:string" select="html:td[2]"/>
<variable name="year" as="xs:string" select="html:td[3]"/>
<variable name="title" as="xs:string" select="html:td[4]"/>
<variable name="full" as="element()" select="//html:h3[. = 'BIBLIOGRAPHY']/following-sibling::html:dl/html:dt[substring-after(@id, 'X0-') = $citekey]/following-sibling::html:dd[1]"/>
<tei:listBibl xml:id="{$citekey}">
<tei:bibl type="authoryear"><value-of select="$authoryear"/></tei:bibl>
<tei:bibl type="year"><value-of select="$year"/></tei:bibl>
<tei:bibl type="title"><value-of select="$title"/></tei:bibl>
<tei:bibl type="full"><apply-templates mode="imhtml_to_tei" select="$full/node()"/></tei:bibl>
</tei:listBibl>
</for-each>
Expand Down
218 changes: 63 additions & 155 deletions src/tei2imxml.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@
import shlex
import argparse
import configparser
import bibtexparser
from datetime import datetime
from bs4 import BeautifulSoup
from copy import deepcopy
Expand Down Expand Up @@ -316,7 +315,10 @@ def format_reference_list(used_citekeys, html_file):
return references
# def format_reference_list ends here

def format_citations(used_citekeys, bibdata, html_file):
def format_citations(
used_citekeys,
html_file
):
"""Return a dictionary of the used citations as formatted entries.
citation_dict[citekey] = (authoryear_citation, year_citation, title)
Expand All @@ -329,24 +331,15 @@ def format_citations(used_citekeys, bibdata, html_file):
sys.exit(1)

citation_dict = {}

for entry in used_citekeys:
if entry in bibdata:
current_citation = entry
logging.debug(f"""{html_file}: {entry}.""")
try:
strTitle = bibdata[entry]["title"]
except KeyError:
logging.warning("No title found for %s", entry)

title = strTitle
try:
authoryear_citation = cites.xpath(f"//div[@class='authoryear']/p/span[@data-cites='{entry}']")[0].text
year_citation = cites.xpath(f"//div[@class='year']/p/span[@data-cites='{entry}']")[0].text
except IndexError:
logging.error(f"Entry {entry} was not found in HTML file. Maybe you should run the tool again without -n option. Exiting.")
sys.exit(1)
citation_dict[entry] = (authoryear_citation, year_citation, title)
try:
authoryear_citation = cites.xpath(f"//div[@class='authoryear']/p/span[@data-cites='{entry}']")[0].text
year_citation = cites.xpath(f"//div[@class='year']/p/span[@data-cites='{entry}']")[0].text
title = cites.xpath(f"//div[@class='title']/p/span[@data-cites='{entry}']")[0].text
except IndexError:
logging.error(f"Entry {entry} was not found in HTML file. Maybe you should run the tool again without -n option. Exiting.")
sys.exit(1)
citation_dict[entry] = (authoryear_citation, year_citation, title)

return citation_dict
# def format_citations ends here
Expand Down Expand Up @@ -691,6 +684,7 @@ def handle_refs_default(ref):
eoa_citations = xml_tree.xpath("//t:bibl", namespaces=NS_MAP)

for citation in eoa_citations:
# logging.debug( f"handling citation: {etree.tostring(citation)}" )
pagerange = ""
cited_range = citation.xpath("t:citedRange", namespaces=NS_MAP)
citeref = citation.xpath("t:ref", namespaces=NS_MAP)
Expand Down Expand Up @@ -1177,108 +1171,6 @@ def update_ids(xml_tree, ignore_ref_errors):
return xml_tree
# def update_ids ends here


def get_all_citations(xml_file):
    """Collect the citekeys of all citations found in the document.

    Every ``t:bibl/t:ref`` element is looked up (namespace prefixes are
    resolved through ``NS_MAP``), its ``target`` attribute is read and the
    first character is dropped.
    NOTE(review): the dropped character is presumably the leading ``#`` of
    a local reference - confirm against the TEI input.

    :param xml_file: parsed document offering an ``xpath`` method
    :returns: list of unique citekeys in order of first appearance
    :raises TypeError: if a ``t:ref`` element has no ``target`` attribute
    """

    all_citations = xml_file.xpath("//t:bibl/t:ref", namespaces=NS_MAP)

    # Dictionary keys are unique and keep their insertion order, so this
    # de-duplicates in linear time while preserving first-appearance order
    # (the previous "not in list" check was quadratic in the citation count).
    all_citekeys = list(dict.fromkeys(
        citation.get("target")[1:] for citation in all_citations
    ))

    return all_citekeys
# def get_all_citations ends here


def get_citations_per_chapter(xml_tree):
    """Map each chapter identifier to the set of citekeys cited in it.

    Intended for anthologies, where citations are tracked per chapter.
    Example of one entry of the result:

        'chap18_schwartz': {'Blodget_1857', 'CliffordMarcus_1986',
            'Hunter_2004', 'MarcusFischer_1986', 'Mitchell_1992', 'Nye_1994',
            'Schlereth_1980', 'Schwartz_2003', 'Schwartz_2011'}

    A chapter is a ``t:div`` element with ``type='chapter'``; its key is the
    value of its ``xml:id`` attribute. The first character of every
    ``t:bibl/t:ref/@target`` value is dropped to obtain the citekey.

    Logs an error and exits with status 1 if a chapter has no ``xml:id``.
    """

    chapters = xml_tree.xpath("//t:div[@type='chapter']", namespaces=NS_MAP)
    logging.info(f"Found {libeoaconvert.plural(len(chapters), 'chapter')}.")

    citekeys_by_chapter = {}

    for chapter in chapters:
        chapter_ids = chapter.xpath("@xml:id", namespaces=NS_MAP)
        if not chapter_ids:
            logging.error(f"Found a chapter without identifier. Each chapter must have one. Exiting.")
            sys.exit(1)

        targets = chapter.xpath(".//t:bibl/t:ref/@target", namespaces=NS_MAP)
        # the reported count includes repeated citations of the same work
        logging.info(f"Found {libeoaconvert.plural(len(targets), 'reference')} in this chapter.")
        citekeys_by_chapter[chapter_ids[0]] = {target[1:] for target in targets}

    return citekeys_by_chapter
# def get_citations_per_chapter ends here

def convert_bibliography_to_dict(
        bib_file : Path
):
    """Parse a BibTeX file and return its entries as a dictionary.

    :param bib_file: path of the bibliography file to read
    :returns: the ``entries_dict`` of the database loaded by ``bibtexparser``
    """

    bibtex_parser = bibtexparser.bparser.BibTexParser()
    # do not let the parser ignore nonstandard entry types
    bibtex_parser.ignore_nonstandard_types = False

    with open(bib_file) as bib_handle:
        return bibtexparser.load(bib_handle, parser=bibtex_parser).entries_dict
# def convert_bibliography_to_dict ends here

def make_bibliography_tex4ht(
        used_citekeys,
        bib_data,
        output_file_root,
        publication_language,
        TEMP_DIR,
        log_dir,
        input_dir
):
    """Render the used bibliography entries to HTML by way of a TEI file.

    Two files are produced next to each other, both derived from
    ``output_file_root``: ``<root>.tei`` (written by ``bib2html.bib2tei``)
    and ``<root>.html`` (written by ``bib2html.teibib_to_eoa1`` from the
    TEI file).

    :param used_citekeys: citekeys to include in the bibliography
    :param bib_data: mapping whose ``"source"`` entry names the bibliography
        file, relative to ``input_dir``
    :param output_file_root: output path; its suffix is replaced
    :param publication_language: one of ``de``, ``en``, ``it``, ``fr``
    :param TEMP_DIR: passed to ``bib2tei`` as ``temp_dir``
    :param log_dir: passed to ``bib2tei`` as ``log_dir``
    :param input_dir: directory containing the bibliography file
    :returns: path of the generated HTML file
    :raises KeyError: if ``publication_language`` is not one of the
        supported language codes
    """

    language_names = {
        "de" : "german",
        "en" : "english",
        "it" : "italian",
        "fr" : "french",
    }

    output_root = Path(output_file_root)
    tei_path = output_root.with_suffix(".tei")

    bib_source = input_dir / bib_data["source"]
    language_name = language_names[publication_language]

    bib2html.bib2tei(
        bib_file = bib_source,
        citekeys = used_citekeys,
        language = language_name,
        temp_dir = TEMP_DIR,
        output_file = tei_path,
        log_dir = log_dir,
        keywords = [""]
    )

    html_path = output_root.with_suffix(".html")
    bib2html.teibib_to_eoa1(
        tei_path,
        output_file = html_path
    )

    return html_path
# def make_bibliography_tex4ht ends here


def add_bibliography_monograph(xml_tree, refs_for_bib_chapter):
"""Add another chapter containing the bibliography."""

Expand Down Expand Up @@ -1493,37 +1385,32 @@ def main():

bib_data = check_bibliography(xml_tree)

citations_dict = convert_bibliography_to_dict(
INPUT_DIR / bib_data["source"]
)

logging.debug("Creating bibliographies.")
cited_dict = {}
if bib_data["type"] == "monograph":
used_citekeys = get_all_citations(xml_tree)
citations_filename_root = Path(TEMP_DIR, "formatted_citations_monograph")
bibl_info = bib2html.get_bibl_info( xml_tree )
logging.debug( f"citekeys: {bibl_info['citekeys']}" )

if args.no_bib4ht:
citations_filename_html = citations_filename_root.with_suffix(".html")
logging.info("Skipping creation of HTML bibliography files. Using the existing ones.")
else:
citations_filename_html = make_bibliography_tex4ht(
used_citekeys,
bib_data,
citations_filename_root,
publication_language,
TEMP_DIR,
LOG_DIR,
INPUT_DIR
)
citations_filename_tei = \
(INPUT_DIR / "bibliography/bibliography_all") . with_suffix(".tei")

citations_filename_html = (TEMP_DIR / "formatted_citations_monograph") . with_suffix(".html")
bib2html.teibib_to_eoa1(
citations_filename_tei,
output_file = citations_filename_html
)

logging.info("Formatting citations now.")
cited_dict = format_citations(used_citekeys, citations_dict, citations_filename_html)
refs_for_bib_chapter = format_reference_list(used_citekeys, citations_filename_html)
# citekey -> (authoryear, year, title)
cited_dict = format_citations(
bibl_info['citekeys'],
citations_filename_html
)
refs_for_bib_chapter = format_reference_list(bibl_info['citekeys'], citations_filename_html)
elif bib_data["type"] == "anthology":
citations_per_chapter = get_citations_per_chapter(xml_tree)
bibl_info = bib2html.get_bibl_info( xml_tree )
formatted_references_dict = {}
all_chapter_ids = xml_tree.xpath("//t:div[@type='chapter']/@xml:id", namespaces=NS_MAP)
cited_dict = {}

for chapter_id in all_chapter_ids:
used_citekeys_per_chapter = citations_per_chapter[chapter_id]
Expand All @@ -1534,21 +1421,31 @@ def main():
else:
citations_filename_root = Path(TEMP_DIR, f"formatted_citations_{chapter_id}")
if args.no_bib4ht:
citations_filename_html_per_chapter = citations_filename_root.with_suffix(".html")
# citations_filename_html_per_chapter = citations_filename_root.with_suffix(".html")
logging.info("Skipping creation of HTML bibliography files. Using the existing ones.")
else:
citations_filename_html_per_chapter = make_bibliography_tex4ht(
used_citekeys_per_chapter,
bib_data,
citations_filename_root,
publication_language,
TEMP_DIR,
LOG_DIR,
INPUT_DIR

citations_filename_tei_per_chapter = citations_filename_root . with_suffix(".tei")
if not citations_filename_tei.is_file():
translations = {"de" : "german", "en" : "english", "it" : "italian", "fr" : "french"}
bib2html.bib2tei(
bib_file = INPUT_DIR / bib_data["source"],
citekeys = used_citekeys,
language = translations[publication_language],
temp_dir = TEMP_DIR,
output_file = citations_filename_tei_per_chapter,
log_dir = LOG_DIR,
keywords = [""]
)
citations_filename_html_per_chapter = citations_filename_root . with_suffix(".html")
bib2html.teibib_to_eoa1(
citations_filename_tei_per_chapter,
output_file = citations_filename_html_per_chapter
)

logging.info("Formatting citations now.")
cited_dict_per_chapter = format_citations(used_citekeys_per_chapter, citations_dict, citations_filename_html_per_chapter)
# citekey -> (authoryear, year, title)
cited_dict_per_chapter = format_citations(used_citekeys_per_chapter, citations_filename_html_per_chapter)
# Merge dictionaries
cited_dict = {**cited_dict, **cited_dict_per_chapter}

Expand All @@ -1557,13 +1454,24 @@ def main():
# create a dictionary entry containing the formatted references
formatted_references_dict[tmp_dict_key] = refs_for_bib_chapter
logging.debug(f"cited_dict now has {libeoaconvert.plural(len(cited_dict), 'entry', plural='entries')}.")
else:
raise( Exception("unknown publication type!"))
logging.debug( cited_dict )

tei_body = xml_tree.xpath("//t:body", namespaces=NS_MAP)[0]
if args.hyperimage:
logging.info("Transforming body with Hyperimage support")
else:
pass
body_transformed_tmp = transform_body(tei_body, cited_dict, TRANSLATION_FILE, HI_XML_FILE, args.eoa_classic, publang=publication_language, hyperimage=args.hyperimage)
body_transformed_tmp = transform_body(
tei_body,
cited_dict,
TRANSLATION_FILE,
HI_XML_FILE,
args.eoa_classic,
publang=publication_language,
hyperimage=args.hyperimage
)
libeoaconvert.debug_xml_here(body_transformed_tmp, "body_transformed", DEBUG_DIR)
body_transformed = etree.ElementTree(body_transformed_tmp)

Expand Down
Loading

0 comments on commit 5fc7acc

Please sign in to comment.