Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
EOASkripts/utils/bib2html.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
executable file
407 lines (341 sloc)
13.2 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# -*- coding: utf-8; mode: python -*- | |
""" | |
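Render a BibLaTeX bibliography and its citations as HTML.

A temporary LaTeX document is generated from a template, processed with
htlatex and biber, cleaned up with iconv and tidy, and the resulting XHTML
is reduced to citation spans and per-entry reference blocks.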
""" | |
__version__ = "1.0" | |
__date__ = "20190313" | |
__author__ = "kthoden@mpiwg-berlin.mpg.de" | |
from utils.load_config import exec_command, ToFile, ToLog, check_executable | |
import argparse | |
import os | |
import subprocess | |
import shlex | |
import logging | |
import string | |
import shutil | |
from lxml import etree | |
from pathlib import Path | |
import sys | |
import textwrap | |
# Location of this script; BASE_DIR is the directory two levels above it.
SCRIPT_PATH = Path(__file__)
SCRIPT_NAME = SCRIPT_PATH.name
BASE_DIR = SCRIPT_PATH.resolve().parent.parent

# logging.basicConfig(level=logging.DEBUG, format=' %(asctime)s - %(levelname)s - %(message)s')

# Prefix "x" addresses XHTML elements in the XPath queries of this module.
NS_MAP = {
    "x": 'http://www.w3.org/1999/xhtml',
}

# Chapter headings written into the LaTeX file; the same strings are searched
# for in the generated HTML to locate each bibliography.
BIBLIOGRAPHY_CHAPTER_NO_KEYWORD = "BIBLIOGRAPHY"
BIBLIOGRAPHY_CHAPTER = "BIBLIOGRAPHY {keyword}"
def latex_escape_non_ascii(input_str):
    r"""Replace every non-ASCII character by an ``\entity{<codepoint>}`` macro.

    Characters with a code point up to 0x7F are copied unchanged; every other
    character is replaced by ``\entity{N}`` where ``N`` is its decimal code
    point.

    :param input_str: text to escape
    :returns: the escaped, pure-ASCII text
    """
    # The backslash is written as "\\" on purpose: "\e" is not a valid escape
    # sequence and only worked by accident (with a warning on newer Pythons).
    return "".join(
        "\\entity{{{}}}".format(ord(c)) if ord(c) > 0x7F else c
        for c in input_str
    )
def check_executables():
    """Run ``check_executable`` for each external tool this script invokes."""
    for tool in ("htlatex", "tidy", "biber"):
        check_executable(tool)
def transform_reference(reference_element, dialect='html'):
    """Formatting transformation for reference element

    The element is serialised, flattened to a single line, stripped of its
    XHTML namespace declaration and parsed again.  In the resulting copy

    - ``<a>`` children are unwrapped (their content is kept),
    - ``<span class="ecti-1095">`` becomes ``<em>`` (or
      ``<hi rend="italic">`` for the ``tei`` dialect),
    - ``<span class="ectt-1095">`` becomes ``<code>`` (or
      ``<hi rend="monospace">`` for the ``tei`` dialect).

    :param reference_element: the ``<p>`` element holding one reference
    :param dialect: ``'tei'`` for TEI markup, anything else for HTML
    :returns: a new, cleaned-up element; the input element is not modified
    """
    serialized = etree.tostring(reference_element).decode('utf-8')
    single_line = serialized.replace("\n", " ")
    # Dropping the namespace declaration on the root puts the whole re-parsed
    # subtree into no namespace, hence the un-prefixed XPath queries below.
    without_namespace = single_line.replace(
        '<p xmlns="http://www.w3.org/1999/xhtml" class="noindent">', '<p>'
    )
    cleaned_element = etree.fromstring(without_namespace)

    # Links are only marked here and unwrapped at the very end.
    for link in cleaned_element.xpath("a", namespaces=NS_MAP):
        link.tag = "tagtobestripped"

    for ecti in cleaned_element.xpath("span[@class='ecti-1095']", namespaces=NS_MAP):
        if dialect == 'tei':
            ecti.tag = "hi"
            ecti.set("rend", "italic")
        else:
            ecti.tag = "em"
        ecti.attrib.pop('class')

    # The namespace-less form is what the stripped tree actually contains;
    # the prefixed form is kept for elements whose namespace survived.
    ectt_spans = cleaned_element.xpath(
        "span[@class='ectt-1095'] | x:span[@class='ectt-1095']",
        namespaces=NS_MAP
    )
    for ectt in ectt_spans:
        if dialect == 'tei':
            ectt.tag = "hi"
            ectt.set("rend", "monospace")
        else:
            # was "ecti.tag": retagged the wrong element / raised NameError
            ectt.tag = "code"
        ectt.attrib.pop('class')

    etree.strip_tags(cleaned_element, "tagtobestripped")
    return cleaned_element
# def transform_reference ends here
def write_dummy_latex(
        citekeys,
        bibfile,
        language,
        keywords,
        template_path,
        tmp_filename
):
    """Prepare a latex file

    Fills the LaTeX template with a table citing every key (once with
    ``cite`` and once with ``cite*``) and one bibliography chapter per
    keyword, then writes the result to ``tmp_filename``.  Next to it, in the
    same directory, the function writes

    - ``<bibfile stem>_orig.bib``: an unmodified copy of ``bibfile``,
    - ``<bibfile name>``: the bibliography with non-ASCII characters escaped,
    - ``<tmp_filename name>.orig``: the unescaped LaTeX source (debugging aid).

    :param citekeys: iterable of citation keys
    :param bibfile: path of the bibliography file (``str`` or ``Path``)
    :param language: value substituted for ``language`` in the template
    :param keywords: iterable of keywords; ``""`` selects the whole bibliography
    :param template_path: path of the ``string.Template`` LaTeX template
    :param tmp_filename: path of the LaTeX file to write (``str`` or ``Path``)
    """
    # Callers may hand in plain strings (e.g. straight from argparse).
    bibfile = Path(bibfile)
    tmp_filename = Path(tmp_filename)
    tmp_dir = tmp_filename.parent

    citation_rows = "".join(
        f"\\verb|{key}| &\\cite{{{key}}}&\\cite*{{{key}}}\\\\\n"
        for key in citekeys
    )
    allcitekeys = (
        "\\begin{tabular}{l l l}\n"
        + citation_rows
        + "\\end{tabular}\n"
    )

    with open(template_path, "r", encoding="utf-8") as tmp_template:
        fill_in_template = string.Template(tmp_template.read())

    sections = []
    for keyword in keywords:
        if keyword == "":
            chapter_heading = BIBLIOGRAPHY_CHAPTER_NO_KEYWORD
            print_command = "\\printbibliography"
        else:
            chapter_heading = BIBLIOGRAPHY_CHAPTER.format(keyword=keyword)
            print_command = f"\\printbibliography[keyword={{{keyword}}}]"
        sections.append(f"\n\\chapter{{{chapter_heading}}}\n{print_command}\n")
    bibliographies = "".join(sections)

    # Appending the full "_orig.bib" avoids with_suffix() swallowing part of
    # a stem that itself contains a dot.
    bibfile_orig = tmp_dir / (bibfile.stem + "_orig.bib")
    bibfile_local = tmp_dir / bibfile.name
    shutil.copyfile(bibfile, bibfile_orig)
    with open(bibfile_orig, "r", encoding="utf-8") as in_file, \
            open(bibfile_local, "w", encoding="utf-8") as out_file:
        for line in in_file:
            out_file.write(latex_escape_non_ascii(line))

    substitions = fill_in_template.substitute(
        language=language,
        # only the file name: the escaped copy sits next to the LaTeX file
        bibfile=bibfile.name,
        citations=allcitekeys,
        bibliographies=bibliographies
    )

    # (just for debugging: save with unescaped non-ascii characters)
    debug_copy = tmp_dir / (tmp_filename.name + ".orig")
    with open(debug_copy, "w", encoding="utf-8") as texfile:
        texfile.write(substitions)

    with open(tmp_filename, "w", encoding="utf-8") as texfile:
        texfile.write(latex_escape_non_ascii(substitions))

    logging.info(f"Wrote {tmp_filename}")
# def write_dummy_latex ends here
def run_htlatex(
        tmp_filename,
        log_dir
):
    """Create HTML file from temporary LaTeX file

    Runs htlatex, biber and htlatex again on ``tmp_filename`` (given without
    the ``.tex`` suffix); each run's output is sent to its own log file
    inside ``log_dir``.
    """
    htlatex_command = f"htlatex {tmp_filename}.tex 'xhtml,charset=utf-8,fn-in' ' -utf8' '' '--interaction=nonstopmode'"
    # alternative engine, kept for reference:
    # f"htxelatex {tmp_filename}.tex 'xhtml,charset=utf-8' ' -cunihtf -utf8' '' '--interaction=nonstopmode'"
    biber_command = f"biber {tmp_filename}"

    steps = (
        (htlatex_command, "htlatex1.log"),
        (biber_command, "biber.log"),
        (htlatex_command, "htlatex2.log"),
    )
    for command, log_name in steps:
        exec_command(
            command,
            output_to = ToFile( Path(log_dir) / log_name )
        )
# def run_htlatex ends here
def create_citations(citekeys, xml_tree, style):
    """Create citations

    Looks up, for every citekey, the formatted citation in the table that
    htlatex rendered and wraps it in
    ``<span class="citation" data-cites="KEY">``.

    :param citekeys: iterable of citation keys
    :param xml_tree: parsed XHTML document produced by htlatex
    :param style: ``"authoryear"`` or ``"year"``
    :returns: ``<div class="STYLE">`` holding a heading and a ``<p>`` with
        one span per citekey
    :raises SystemExit: (status 1) for an unknown style or a citekey that
        has no formatted citation in ``xml_tree``
    """
    logging.debug("creating citations")

    if style not in ("authoryear", "year"):
        logging.error("Unrecognized citation format, choose 'authoryear' or 'year'. Exiting")
        # non-zero status: this is an error exit
        sys.exit(1)

    # The cell holding the key is followed by the "authoryear" cell and
    # then by the "year" cell.
    column = 1 if style == "authoryear" else 2
    # The citekey is passed as an XPath variable, so quotes in a key cannot
    # break the expression.
    query = (
        "//x:table/x:tr/x:td[.//x:span[text() = $citekey] ]"
        f"/following-sibling::x:td[{column}]/text()"
    )

    surrounding_div = etree.Element("div")
    surrounding_div.set("class", style)
    heading = etree.SubElement(surrounding_div, "h1")
    heading.set("id", f"cite{style}")
    heading.text = f"cite{style}"
    p_element = etree.SubElement(surrounding_div, "p")

    for citekey in citekeys:
        logging.debug( f"working on citekey: '{citekey}', style: '{style}'" )
        citation_el = xml_tree.xpath(query, namespaces=NS_MAP, citekey=citekey)
        if not citation_el:
            logging.error( f"error parsing formatted citation: '{citekey}', style: '{style}'" )
            sys.exit( 1 )

        format_citation = citation_el[0].strip()
        logging.debug( f"formatted: '{format_citation}'" )

        # Build the element through the API instead of parsing a formatted
        # string: citation text containing '&' or '<' is then escaped
        # correctly instead of breaking the parser.
        span_element = etree.SubElement(p_element, "span")
        span_element.set("class", "citation")
        span_element.set("data-cites", citekey)
        span_element.text = format_citation

    return surrounding_div
# def create_citations ends here
def create_reference_list(reference_list):
    """Create HTML snippet for list of references

    Every ``<dt>`` child of ``reference_list`` names an entry through its
    ``id`` attribute (``X0-<citekey>``); the first ``<p>`` of the ``<dd>``
    that follows it holds the formatted reference.

    :param reference_list: the ``<dl>`` element of one bibliography
    :returns: ``<div id="refs" class="references">`` containing one
        ``<div id="ref-<citekey>">`` per entry
    """
    id_prefix = "X0-"

    reference_div = etree.Element("div")
    reference_div.set("id", "refs")
    reference_div.set("class", "references")

    for entry in reference_list.xpath("x:dt", namespaces=NS_MAP):
        entry_id = entry.get("id")
        # Strip the marker only at the start, so a citekey that happens to
        # contain "X0-" itself is left intact.
        if entry_id.startswith(id_prefix):
            entry_citekey = entry_id[len(id_prefix):]
        else:
            entry_citekey = entry_id

        reference_string = entry.xpath("following-sibling::x:dd[1]/x:p", namespaces=NS_MAP)[0]
        formatted_reference = transform_reference(reference_string)

        # Set the id through the API so special characters in a citekey are
        # escaped instead of corrupting a hand-built XML string.
        wrapper_div = etree.SubElement(reference_div, "div")
        wrapper_div.set("id", f"ref-{entry_citekey}")
        wrapper_div.append(formatted_reference)

    return reference_div
# def create_reference_list ends here
def main(
        bib_file,
        citekeys,
        tex_template,
        language,
        temp_dir,
        output_file,
        # tmp_filename = "temp",
        keywords=("",),
        log_dir="logs"
):
    """Format citations and bibliographies as HTML and write them to a file.

    A LaTeX file is generated in ``temp_dir``, converted with htlatex and
    biber, re-encoded with iconv, repaired with tidy and Unicode-normalised.
    The formatted citations and one reference list per keyword are then
    extracted and written to ``output_file``.

    :param bib_file: path of the bibliography file
    :param citekeys: iterable of citation keys to format
    :param tex_template: path of the LaTeX template
    :param language: language value handed to the template
    :param temp_dir: directory for intermediate files (created if missing)
    :param output_file: path of the resulting file; its name (without
        suffix) also names the intermediate files
    :param keywords: keywords to print separate bibliographies for; ``""``
        stands for the complete bibliography
    :param log_dir: directory receiving the htlatex/biber logs
    :returns: dict with the keys ``"references"`` (keyword -> reference
        ``<div>``), ``"citation_authoryear"`` and ``"citation_year"``
    :raises SystemExit: (status 1) if a bibliography cannot be located
    """
    import unicodedata

    # Accept plain strings as well as Path objects.
    bib_file = Path(bib_file)
    temp_dir = Path(temp_dir)
    output_file = Path(output_file)
    # Resolve before changing directory, so a relative log_dir keeps
    # referring to the caller's working directory.
    log_dir = Path(log_dir).resolve()

    temp_dir.mkdir(parents=True, exist_ok=True)
    # Harmless if the logging helper creates the directory itself.
    log_dir.mkdir(parents=True, exist_ok=True)

    tmp_filename = Path(output_file.name).with_suffix("")
    write_dummy_latex(
        citekeys,
        bib_file,
        language,
        keywords,
        template_path=tex_template,
        tmp_filename=temp_dir / tmp_filename.with_suffix(".tex")
    )

    wd = Path.cwd()
    logging.info(f"cd {temp_dir}")
    os.chdir(temp_dir)
    try:
        run_htlatex(
            tmp_filename.with_suffix(""),
            log_dir=log_dir
        )
    finally:
        # Restore the working directory even when a tool run fails.
        logging.info(f"cd {wd}")
        os.chdir(wd)

    tmp_path_html = temp_dir / tmp_filename.with_suffix(".html")
    tmp_path_html_utf8 = (temp_dir / (str(tmp_filename) + "-utf8")).with_suffix(".html")
    tmp_path_html_fixed1 = temp_dir / tmp_filename.with_suffix(".1.html")
    tmp_path_html_fixed2 = temp_dir / tmp_filename.with_suffix(".2.html")

    exec_command(
        f"iconv -f ISO-8859-1 -t UTF-8 --output={tmp_path_html_utf8} {tmp_path_html}"
    )

    # htlatex seems to produce incorrect xhtml.
    # We have to fix it (e.g. bare '&' characters get escaped).
    # Exit status 1 is accepted as well as 0.
    exec_command(
        f"tidy -numeric -output {tmp_path_html_fixed1} {tmp_path_html_utf8}",
        exit_code_ok=lambda x: x in (0, 1)
    )

    # normalize unicode, e.g. replace ligatures (like "\ufb00" -> "ff"):
    with open(tmp_path_html_fixed1, "r", encoding="utf-8") as in_file, \
            open(tmp_path_html_fixed2, "w", encoding="utf-8") as out_file:
        for line in in_file:
            out_file.write(unicodedata.normalize("NFKD", line))

    xml_tree = etree.parse(str(tmp_path_html_fixed2))

    citation_authoryear = create_citations(citekeys, xml_tree, "authoryear")
    citation_year = create_citations(citekeys, xml_tree, "year")

    bibliographies_dict = {}
    for keyword in keywords:
        if keyword == "":
            chapter_heading = BIBLIOGRAPHY_CHAPTER_NO_KEYWORD
        else:
            chapter_heading = BIBLIOGRAPHY_CHAPTER.format(keyword=keyword)
        # '<dl class="thebibliography"> ... </dl>
        # The heading is passed as an XPath variable so that quotes inside a
        # keyword cannot break the expression.
        bibliography_el = xml_tree.xpath(
            "//x:body/x:p[text() = $heading]/following-sibling::x:dl[1]",
            namespaces=NS_MAP,
            heading=chapter_heading
        )
        if len(bibliography_el) != 1:
            logging.error( f"error parsing bibliography with keyword '{keyword}'" )
            sys.exit( 1 )
        bibliographies_dict[keyword] = create_reference_list(bibliography_el[0])

    html_element = etree.Element("html")
    html_element.append(citation_authoryear)
    html_element.append(citation_year)
    for keyword in keywords:
        # The unnamed (complete) bibliography gets no 'keyword' attribute.
        attributes = {} if keyword == "" else {'keyword': keyword}
        bibl_el = etree.SubElement(html_element, "div", **attributes)
        bibl_el.append(bibliographies_dict[keyword])

    tree = etree.ElementTree(html_element)
    logging.info("writing '%s'", output_file)
    tree.write(str(output_file), pretty_print=True, xml_declaration=True, encoding="utf-8")

    return {
        "references": bibliographies_dict,
        "citation_authoryear": citation_authoryear,
        "citation_year": citation_year,
    }


# Sample reference markup, kept verbatim from the original source:
"""
<h1 id="references" class="unnumbered">References</h1>
<div id="refs" class="references">
<div id="ref-Appadurai_1986">
<p>Appadurai, Arjun, ed. (1986). <em>The Social Life of Things: Commodities in Cultural Perspective</em>. Cambridge, UK: Cambridge University Press.</p>
</div>
"""
if __name__ == '__main__':
    # Command line entry point: formats a fixed sample set of citekeys from
    # the given bibliography and prints the resulting HTML fragments.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "bibfile",
        help="File that contains the bibliography")
    parser.add_argument(
        "--tex-template",
        default = BASE_DIR / "bibformat" / "4ht" / "bibliography4ht.tex",
        help="the latex template to use for the bibliography"
    )
    parser.add_argument(
        "--temp-dir",
        default = "tmp_files",
        help="where to store temporary files"
    )
    # main() requires an output file; it was never passed before, which made
    # every invocation fail with a TypeError.
    parser.add_argument(
        "--output-file",
        default = "bibliography.html",
        help="where to write the resulting HTML file"
    )
    parser.add_argument(
        "--log-dir",
        default = "logs",
        help="where to store the log files of the external tools"
    )
    args = parser.parse_args()

    check_executables()

    citekeys = ["Edwards_2017", "Riggs:2016aa", "Bruckler:2001aa", "Zdenek:1939aa", "Caraffa_2011", "Uhlikova:2010aa", "Noll:1992aa", "Schwarz:1931aa", "Schwartz_1995", "Faber:2015ab", "Rohacek:2010aa", "Lachnit:2005aa", "Groll:1865aa", "Schlosser:1934aa", "Eitelberger:1863ab", "Wirth:1939aa", "Faber:2015aa", "Trnkova:2015aa", "Trnkova:2010aa", "Frodl:1988aa"]
    language = "de"
    translations = {"de" : "german", "en" : "english", "it" : "italian", "fr" : "french"}

    # The log directory is created up front in case the logging helper does
    # not create it on its own.
    log_dir = Path( args.log_dir )
    log_dir.mkdir( parents=True, exist_ok=True )

    # Paths are converted here: the functions below call Path methods on them.
    references_in_html = main(
        bib_file = Path( args.bibfile ),
        citekeys = citekeys,
        tex_template = args.tex_template,
        language = translations[language],
        temp_dir = Path( args.temp_dir ),
        output_file = Path( args.output_file ),
        log_dir = log_dir
    )

    # main() returns a dict of elements, so each one is serialised separately.
    fragments = [
        references_in_html["citation_authoryear"],
        references_in_html["citation_year"],
    ]
    fragments.extend( references_in_html["references"].values() )
    for fragment in fragments:
        print( etree.tostring( fragment, pretty_print=True, encoding="unicode" ) )
# finis