Skip to content
Permalink
962ed1fe3e
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
executable file 407 lines (341 sloc) 13.2 KB
#!/usr/bin/env python3
# -*- coding: utf-8; mode: python -*-
"""
Format a bibliography as HTML by way of LaTeX.

A temporary LaTeX document citing the requested keys is written from a
template, compiled with htlatex and biber, and the formatted citations
and reference-list entries are then extracted from the resulting XHTML.
"""
__version__ = "1.0"
__date__ = "20190313"
__author__ = "kthoden@mpiwg-berlin.mpg.de"
from utils.load_config import exec_command, ToFile, ToLog, check_executable
import argparse
import os
import subprocess
import shlex
import logging
import string
import shutil
from lxml import etree
from pathlib import Path
import sys
import textwrap
# Parent of the directory that contains this (resolved) file; used as the
# root for the default LaTeX template location.
BASE_DIR = Path( __file__ ).resolve().parent.parent
# Path and bare file name of this script, as given by __file__ (not resolved).
SCRIPT_PATH = Path( __file__ )
SCRIPT_NAME = SCRIPT_PATH.name
# logging.basicConfig(level=logging.DEBUG, format=' %(asctime)s - %(levelname)s - %(message)s')
# Prefix -> namespace mapping handed to every XPath query; "x" is XHTML.
NS_MAP = {"x" : 'http://www.w3.org/1999/xhtml'}
# Chapter headings written into the LaTeX document. The same strings are
# searched for in the generated HTML to locate each bibliography.
BIBLIOGRAPHY_CHAPTER_NO_KEYWORD = "BIBLIOGRAPHY"
BIBLIOGRAPHY_CHAPTER = "BIBLIOGRAPHY {keyword}"
def latex_escape_non_ascii( input_str ):
    r"""Replace every non-ASCII character with an ``\entity{N}`` macro call.

    Characters whose code point is above 0x7F are rewritten as
    ``\entity{<decimal code point>}``; all other characters are copied
    unchanged.

    :param input_str: text to escape
    :returns: the escaped text (an empty string for empty input)
    """
    # The backslash is escaped explicitly: "\e" is not a valid escape
    # sequence and only worked by accident (with a warning).
    return "".join(
        "\\entity{{{}}}".format( ord(char) ) if ord(char) > 0x7F else char
        for char in input_str
    )
def check_executables():
    """Run ``check_executable`` for each external program this script invokes.

    The programs are checked in the order htlatex, tidy, biber. What happens
    when one is missing is decided by ``check_executable`` itself.
    """
    for program in ("htlatex", "tidy", "biber"):
        check_executable( program )
def transform_reference(reference_element, dialect='html'):
    """Clean up one formatted reference paragraph produced by htlatex.

    The element is serialized, line breaks are replaced by spaces, the
    XHTML namespace declaration on the opening ``<p class="noindent">`` tag
    is dropped and the result is parsed again. After that:

    * direct ``a`` children are unwrapped (their content is kept),
    * ``span`` children of class ``ecti-1095`` become ``<em>``
      (``<hi rend="italic">`` for the ``tei`` dialect),
    * ``span`` children of class ``ectt-1095`` become ``<code>``
      (``<hi rend="monospace">`` for the ``tei`` dialect).

    :param reference_element: lxml element holding the reference paragraph
    :param dialect: ``'tei'`` for TEI markup, anything else for HTML
    :returns: a new lxml element; the input element is not modified
    """
    string_from_xml = etree.tostring(reference_element).decode('utf-8')
    removed_linebreak = string_from_xml.replace("\n", " ")
    removed_namespace = removed_linebreak.replace('<p xmlns="http://www.w3.org/1999/xhtml" class="noindent">', '<p>')
    cleaned_element = etree.fromstring(removed_namespace)

    # Links are renamed first and stripped at the very end, so that their
    # text and children stay in place.
    for link in cleaned_element.xpath("a", namespaces=NS_MAP):
        link.tag = "tagtobestripped"

    for ecti in cleaned_element.xpath("span[@class='ecti-1095']", namespaces=NS_MAP):
        if dialect == 'tei':
            ecti.tag = "hi"
            ecti.set("rend", "italic")
        else:
            ecti.tag = "em"
        ecti.attrib.pop('class')

    # NOTE(review): this query uses the "x:" prefix while the two queries
    # above do not. Once the namespace declaration has been removed the
    # children are presumably in no namespace, so this may never match.
    # Confirm the intended behaviour before changing it.
    for ectt in cleaned_element.xpath("x:span[@class='ectt-1095']", namespaces=NS_MAP):
        if dialect == 'tei':
            ectt.tag = "hi"
            ectt.set("rend", "monospace")
        else:
            # Fixed: this used to assign to `ecti`, i.e. to the loop
            # variable of the previous loop (or to an undefined name).
            ectt.tag = "code"
        ectt.attrib.pop('class')

    etree.strip_tags(cleaned_element, "tagtobestripped")
    return cleaned_element
# def transform_reference ends here
def write_dummy_latex(
        citekeys,
        bibfile,
        language,
        keywords,
        template_path,
        tmp_filename
):
    """Write the temporary LaTeX document that cites every key.

    The document is produced by filling the ``string.Template`` found at
    ``template_path``. The placeholders ``language``, ``bibfile``,
    ``citations`` and ``bibliographies`` are substituted.

    Side effects, all inside the directory of ``tmp_filename``:

    * ``<bibfile stem>_orig.bib``: unmodified copy of the bibliography
    * ``<bibfile name>``: bibliography with non-ASCII characters escaped
    * ``<tmp_filename name>.orig``: the LaTeX document before escaping
      (kept for debugging)
    * ``tmp_filename``: the LaTeX document with non-ASCII characters escaped

    :param citekeys: iterable of citation keys to cite
    :param bibfile: path of the bibliography file (``str`` or ``Path``)
    :param language: value substituted for the ``language`` placeholder
    :param keywords: iterable of keywords; one bibliography chapter is
        printed per entry, the empty string meaning "no keyword filter"
    :param template_path: path of the LaTeX template
    :param tmp_filename: path of the LaTeX file to write (``str`` or ``Path``)
    :raises KeyError: if the template contains a placeholder other than
        the four listed above
    """
    # Accept plain strings as well as Path objects.
    bibfile = Path( bibfile )
    tmp_filename = Path( tmp_filename )
    tmp_dir = tmp_filename.parent

    # One table row per key: the key itself, \cite and \cite*.
    citation_rows = "".join(
        f"\\verb|{key}| &\\cite{{{key}}}&\\cite*{{{key}}}\\\\\n"
        for key in citekeys
    )
    allcitekeys = "\\begin{tabular}{l l l}\n" + citation_rows + "\\end{tabular}\n"

    with open(template_path, "r") as tmp_template:
        fill_in_template = string.Template( tmp_template.read() )

    # One chapter per keyword. The heading text is what the HTML is later
    # searched for, so it has to come from the module constants.
    bibliography_parts = []
    for keyword in keywords:
        if keyword == "":
            chapter_heading = BIBLIOGRAPHY_CHAPTER_NO_KEYWORD
            print_options = ""
        else:
            chapter_heading = BIBLIOGRAPHY_CHAPTER.format( keyword=keyword )
            print_options = f"[keyword={{{keyword}}}]"
        bibliography_parts.append(
            f"\n\\chapter{{{chapter_heading}}}\n\\printbibliography{print_options}\n"
        )
    bibliographies = "".join( bibliography_parts )

    # Keep an untouched copy before writing the escaped version: the
    # escaped file may have the same path as the input file.
    # The name is built directly rather than with with_suffix(), which
    # would cut off everything after the first dot of a dotted stem.
    bibfile_orig = tmp_dir / (bibfile.stem + "_orig.bib")
    bibfile_local = tmp_dir / bibfile.name
    shutil.copyfile(
        bibfile,
        bibfile_orig
    )
    with open( bibfile_orig, "r" ) as in_file, open( bibfile_local, "w" ) as out_file:
        for line in in_file:
            out_file.write( latex_escape_non_ascii( line ) )

    substitions = fill_in_template.substitute(
        language = language,
        # only the file name: the escaped copy lives next to the LaTeX file
        bibfile = bibfile.name,
        citations = allcitekeys,
        bibliographies = bibliographies
    )

    # (just for debugging: save with unescaped non-ascii characters)
    with open(tmp_dir / (tmp_filename.name + ".orig"), "w") as texfile:
        texfile.write( substitions )

    with open(tmp_filename, "w") as texfile:
        texfile.write( latex_escape_non_ascii( substitions ) )

    logging.info(f"Wrote {tmp_filename}")
# def write_dummy_latex ends here
def run_htlatex(
        tmp_filename,
        log_dir
):
    """Compile the temporary LaTeX file to HTML.

    Three commands are run in order through ``exec_command``: htlatex,
    biber, and htlatex again. Each call is given a ``ToFile`` target named
    ``htlatex1.log``, ``biber.log`` and ``htlatex2.log`` inside ``log_dir``.

    :param tmp_filename: name of the LaTeX file without the ``.tex`` suffix
    :param log_dir: directory for the three log files
    """
    logs = Path(log_dir)
    htlatex_command = f"htlatex {tmp_filename}.tex 'xhtml,charset=utf-8,fn-in' ' -utf8' '' '--interaction=nonstopmode'"
    biber_command = f"biber {tmp_filename}"

    steps = (
        (htlatex_command, "htlatex1.log"),
        (biber_command, "biber.log"),
        (htlatex_command, "htlatex2.log"),
    )
    for command, log_name in steps:
        exec_command(
            command,
            output_to = ToFile( logs / log_name )
        )
# def run_htlatex ends here
def create_citations(citekeys, xml_tree, style):
    """Collect the formatted citations of one style into a ``<div>``.

    For every key the table row whose cell contains a ``span`` with exactly
    that key as text is looked up in ``xml_tree``. The first text node of
    the first following cell (style ``authoryear``) or of the second
    following cell (style ``year``) is taken as the formatted citation.

    The result has the shape::

        <div class="STYLE">
          <h1 id="citeSTYLE">citeSTYLE</h1>
          <p><span class="citation" data-cites="KEY">TEXT</span>...</p>
        </div>

    :param citekeys: iterable of citation keys
    :param xml_tree: parsed XHTML output of htlatex
    :param style: ``"authoryear"`` or ``"year"``
    :returns: the ``<div>`` element
    :raises SystemExit: with status 1 if ``style`` is unknown or a key has
        no formatted citation in ``xml_tree``
    """
    logging.debug("creating citations")

    if style not in ("authoryear", "year"):
        logging.error("Unrecognized citation format, choose 'authoryear' or 'year'. Exiting")
        # A bare sys.exit() would report success to the calling process.
        sys.exit( 1 )

    # Elements are built through the API instead of by parsing interpolated
    # strings, so that '&', '<' or quotes in the data cannot break the markup.
    surrounding_div = etree.Element("div")
    surrounding_div.set("class", style)
    heading = etree.SubElement(surrounding_div, "h1")
    heading.set("id", f"cite{style}")
    heading.text = f"cite{style}"

    # Column holding the requested style, counted from the key column.
    column = 1 if style == "authoryear" else 2
    # The key is passed as an XPath variable, which needs no quoting.
    citation_query = (
        "//x:table/x:tr/x:td[.//x:span[text() = $citekey] ]"
        f"/following-sibling::x:td[{column}]/text()"
    )

    p_element = etree.SubElement(surrounding_div, "p")
    for citekey in citekeys:
        logging.debug( f"working on citekey: '{citekey}', style: '{style}'" )

        citation_el = xml_tree.xpath(
            citation_query,
            namespaces=NS_MAP,
            citekey=citekey
        )
        if not citation_el:
            logging.error( f"error parsing formatted citation: '{citekey}', style: '{style}'" )
            sys.exit( 1 )

        format_citation = citation_el[0].strip()
        logging.debug( f"formatted: '{format_citation}'" )

        span_element = etree.SubElement(p_element, "span")
        span_element.set("class", "citation")
        span_element.set("data-cites", citekey)
        # An empty citation gives an element without text, as before.
        span_element.text = format_citation or None

    return surrounding_div
# def create_citations ends here
def create_reference_list(reference_list):
    """Build the ``<div id="refs">`` snippet for one bibliography.

    Every ``dt`` child of ``reference_list`` supplies a citation key (its
    ``id`` attribute without any ``X0-``). The first ``p`` inside the
    next ``dd`` sibling supplies the reference text, which is passed
    through ``transform_reference`` and wrapped in ``<div id="ref-KEY">``.

    :param reference_list: the ``dl`` element holding the references
    :returns: the ``<div id="refs" class="references">`` element
    """
    terms = reference_list.xpath("x:dt", namespaces=NS_MAP)
    reference_div = etree.fromstring("""<div id="refs" class="references"></div>""")

    for term in terms:
        citekey = term.get("id").replace("X0-", "")
        paragraphs = term.xpath("following-sibling::x:dd[1]/x:p", namespaces=NS_MAP)
        formatted_reference = transform_reference(paragraphs[0])

        wrapper_div = etree.fromstring(f"""<div id="ref-{citekey}"></div>""")
        wrapper_div.append(formatted_reference)
        reference_div.append(wrapper_div)

    return reference_div
# def create_reference_list ends here
def main(
        bib_file,
        citekeys,
        tex_template,
        language,
        temp_dir,
        output_file,
        keywords = None,
        log_dir = "logs"
):
    """Format citations and references of a bibliography as HTML.

    Steps:

    1. write a temporary LaTeX document into ``temp_dir``,
    2. run htlatex and biber on it (with ``temp_dir`` as working directory),
    3. convert the HTML with iconv, repair it with tidy and normalize the
       Unicode (NFKD),
    4. extract the formatted citations and one reference list per keyword,
    5. write everything to ``output_file``.

    The names of the temporary files are derived from the name of
    ``output_file``.

    :param bib_file: path of the bibliography file (``str`` or ``Path``)
    :param citekeys: citation keys to format
    :param tex_template: path of the LaTeX template
    :param language: language value handed to the template
    :param temp_dir: directory for temporary files; created if missing
    :param output_file: path of the XML/HTML file to write
    :param keywords: bibliography keywords, the empty string meaning
        "no keyword filter"; defaults to ``[""]``
    :param log_dir: directory for the htlatex and biber logs
        (``str`` or ``Path``)
    :returns: dict with the keys ``"references"`` (keyword -> reference
        ``<div>``), ``"citation_authoryear"`` and ``"citation_year"``
    :raises SystemExit: with status 1 if a bibliography cannot be located
        in the generated HTML
    """
    import unicodedata

    if keywords is None:
        keywords = [""]

    # Accept plain strings as well as Path objects for every path argument.
    bib_file = Path( bib_file )
    temp_dir = Path( temp_dir )
    output_file = Path( output_file )
    # Resolved before the working directory is changed below.
    log_dir = Path( log_dir ).resolve()

    temp_dir.mkdir( parents=True, exist_ok=True )

    tmp_filename = Path(output_file.name) . with_suffix( "" )

    write_dummy_latex(
        citekeys,
        bib_file,
        language,
        keywords,
        template_path = tex_template,
        tmp_filename = temp_dir / tmp_filename . with_suffix( ".tex" )
    )

    wd = Path.cwd()
    os.chdir( temp_dir )
    logging.info(f"cd {temp_dir}")
    try:
        run_htlatex(
            tmp_filename . with_suffix( "" ),
            log_dir = log_dir
        )
    finally:
        # Always go back, also when one of the external programs fails.
        logging.info(f"cd {wd}")
        os.chdir( wd )

    tmp_path_html = temp_dir / tmp_filename . with_suffix( ".html" )
    tmp_path_html_utf8 = (temp_dir / (str(tmp_filename) + "-utf8")) . with_suffix( ".html" )
    tmp_path_html_fixed1 = temp_dir / tmp_filename . with_suffix( ".1.html" )
    tmp_path_html_fixed2 = temp_dir / tmp_filename . with_suffix( ".2.html" )

    exec_command(
        f"iconv -f ISO-8859-1 -t UTF-8 --output={tmp_path_html_utf8} {tmp_path_html}"
    )

    # htlatex seems to produce incorrect xhtml.
    # We have to fix it
    # (this will e.g. replace '&' by '&amp;'):
    exec_command(
        f"tidy -numeric -output {tmp_path_html_fixed1} {tmp_path_html_utf8}",
        exit_code_ok = lambda x: x in (0,1)
    )

    # normalize unicode, e.g. replace ligatures (like U+FB00 "ff" by the
    # two letters "ff"):
    with open( tmp_path_html_fixed1, "r" ) as in_file, open( tmp_path_html_fixed2, "w" ) as out_file:
        for line in in_file:
            out_file.write(
                unicodedata.normalize("NFKD", line)
            )

    xml_tree = etree.parse(str(tmp_path_html_fixed2))

    citation_authoryear = create_citations(citekeys, xml_tree, "authoryear")
    citation_year = create_citations(citekeys, xml_tree, "year")

    bibliographies_dict = {}
    for keyword in keywords:
        if keyword == "":
            chapter_heading = BIBLIOGRAPHY_CHAPTER_NO_KEYWORD
        else:
            chapter_heading = BIBLIOGRAPHY_CHAPTER.format( keyword = keyword )

        # '<dl class="thebibliography"> ... </dl>
        # The heading is passed as an XPath variable, which needs no quoting.
        bibliography_el = xml_tree.xpath(
            "//x:body/x:p[text() = $heading]/following-sibling::x:dl[1]",
            namespaces = NS_MAP,
            heading = chapter_heading
        )
        if len(bibliography_el) != 1:
            logging.error( f"error parsing bibliography with keyword '{keyword}'" )
            sys.exit( 1 )

        bibliographies_dict[keyword] = create_reference_list( bibliography_el[0] )

    html_element = etree.Element("html")
    html_element.insert(0, citation_authoryear)
    html_element.insert(1, citation_year)
    for keyword in keywords:
        bibl_el = etree.SubElement( html_element, "div" )
        if keyword != "":
            bibl_el.set( "keyword", keyword )
        bibl_el.append( bibliographies_dict[keyword] )

    tree = etree.ElementTree(html_element)
    logging.info("writing '%s'", output_file)
    tree.write(str(output_file), pretty_print=True, xml_declaration=True, encoding="utf-8")

    return {
        "references": bibliographies_dict,
        "citation_authoryear": citation_authoryear,
        "citation_year": citation_year,
    }
"""
<h1 id="references" class="unnumbered">References</h1>
<div id="refs" class="references">
<div id="ref-Appadurai_1986">
<p>Appadurai, Arjun, ed. (1986). <em>The Social Life of Things: Commodities in Cultural Perspective</em>. Cambridge, UK: Cambridge University Press.</p>
</div>
"""
if __name__ == '__main__':
    # Command line entry point: formats a fixed list of sample citation
    # keys from the given bibliography file and prints the reference lists.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "bibfile",
        help="File that contains the bibliography")
    parser.add_argument(
        "--tex-template",
        default = BASE_DIR / "bibformat" / "4ht" / "bibliography4ht.tex",
        help="the latex template to use for the bibliography"
    )
    parser.add_argument(
        "--temp-dir",
        default = "tmp_files",
        help="where to store temporary files"
    )
    # main() requires an output file; without this option the call below
    # failed with a TypeError.
    parser.add_argument(
        "--output-file",
        default = "formatted_references.html",
        help="where to write the formatted citations and references"
    )
    args = parser.parse_args()

    check_executables()

    citekeys = ["Edwards_2017", "Riggs:2016aa", "Bruckler:2001aa", "Zdenek:1939aa", "Caraffa_2011", "Uhlikova:2010aa", "Noll:1992aa", "Schwarz:1931aa", "Schwartz_1995", "Faber:2015ab", "Rohacek:2010aa", "Lachnit:2005aa", "Groll:1865aa", "Schlosser:1934aa", "Eitelberger:1863ab", "Wirth:1939aa", "Faber:2015aa", "Trnkova:2015aa", "Trnkova:2010aa", "Frodl:1988aa"]
    language = "de"
    translations = {"de" : "german", "en" : "english", "it" : "italian", "fr" : "french"}

    # All paths are handed over as Path objects, and log_dir is given
    # explicitly, because main() calls Path methods on them.
    references_in_html = main(
        bib_file = Path( args.bibfile ),
        citekeys = citekeys,
        tex_template = args.tex_template,
        language = translations[language],
        temp_dir = Path( args.temp_dir ),
        output_file = Path( args.output_file ),
        log_dir = Path( "logs" )
    )

    # main() returns a dict, not an element: print each reference list.
    for reference_div in references_in_html["references"].values():
        print( etree.tostring( reference_div, pretty_print=True, encoding="unicode" ) )
# finis