Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
EOASkripts/src/utils/bib2html.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
executable file
467 lines (396 sloc)
13.4 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# -*- coding: utf-8; mode: python -*- | |
""" | |
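Convert a BibTeX bibliography to HTML and TEI.

The .bib file is rendered with LaTeX via make4ht and biber, the resulting
HTML is cleaned up with iconv and tidy, and XSLT stylesheets turn it into TEI.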
""" | |
__version__ = "1.0" | |
__date__ = "20190313" | |
__author__ = "kthoden@mpiwg-berlin.mpg.de" | |
from utils.load_config import exec_command, ToFile, ToLog, check_executable | |
from utils.run_xslt import run_xslt | |
import argparse | |
import os | |
import subprocess | |
import shlex | |
import logging | |
import string | |
import shutil | |
from lxml import etree | |
from pathlib import Path | |
import sys | |
import textwrap | |
# Location of this script; the base directory is two levels above the
# resolved script path.
SCRIPT_PATH = Path(__file__)
SCRIPT_NAME = SCRIPT_PATH.name
BASE_DIR = SCRIPT_PATH.resolve().parent.parent

# logging.basicConfig(level=logging.DEBUG, format=' %(asctime)s - %(levelname)s - %(message)s')

# Prefix map for XPath queries against XHTML documents.
NS_MAP = dict(x="http://www.w3.org/1999/xhtml")

# Section headings written into the generated LaTeX file, without and
# with a bibliography keyword.
BIBLIOGRAPHY_CHAPTER_NO_KEYWORD = "BIBLIOGRAPHY"
BIBLIOGRAPHY_CHAPTER = "BIBLIOGRAPHY-{keyword}"
def get_citations_per_chapter(xml_tree):
    """Map each chapter's ``xml:id`` to the set of citekeys cited in it.

    If the publication is an anthology, this records which citations are
    mentioned in each chapter, e.g.::

        {'chap18_schwartz': {'Blodget_1857', 'CliffordMarcus_1986',
                             'Hunter_2004', 'MarcusFischer_1986',
                             'Mitchell_1992', 'Nye_1994', 'Schlereth_1980',
                             'Schwartz_2003', 'Schwartz_2011'}}

    :param xml_tree: TEI tree offering an lxml-style ``xpath`` method.
    :returns: dict of chapter id -> set of citekeys (leading ``#`` removed).

    Logs an error and exits the process with status 1 when a chapter has
    no ``xml:id`` attribute.
    """
    tei_ns = {"tei": 'http://www.tei-c.org/ns/1.0'}
    refs_per_chapter = {}
    all_chapters = xml_tree.xpath("//tei:div[@type='chapter']", namespaces=tei_ns)
    logging.info(f"Found {len(all_chapters)}.")
    for chapter in all_chapters:
        chapter_ids = chapter.xpath("@xml:id", namespaces=tei_ns)
        if not chapter_ids:
            logging.error("Found a chapter without identifier. Each chapter must have one. Exiting.")
            sys.exit(1)
        chapter_id = chapter_ids[0]
        all_refs_with_hash = chapter.xpath(".//tei:bibl/tei:ref/@target", namespaces=tei_ns)
        # Only strip the '#' marker (as get_bibl_info does) instead of
        # blindly dropping the first character of every target.
        all_refs = [target.lstrip('#') for target in all_refs_with_hash]
        logging.info(f"Found {len(all_refs)} in this chapter.")
        refs_per_chapter[chapter_id] = set(all_refs)
    return refs_per_chapter
# def get_citations_per_chapter ends here
def get_bibl_info(
        tei_tree
):
    """Collect the citekeys and bibliography keywords used in a TEI tree.

    :param tei_tree: TEI tree offering an lxml-style ``xpath`` method.
    :returns: dict with two entries:

        ``"citekeys"``
            list of all ``tei:bibl/tei:ref`` targets, leading ``#`` removed.
        ``"keywords"``
            one entry per ``<?eoa printbibliography [keyword]?>`` processing
            instruction: the keyword with surrounding quotes removed, or
            ``""`` when the instruction carries no keyword.
    """
    tei_ns = {"tei": 'http://www.tei-c.org/ns/1.0'}
    targets = tei_tree.xpath(
        ".//tei:bibl/tei:ref/@target",
        namespaces=tei_ns
    )
    citekeys = [target.lstrip('#') for target in targets]

    keywords = []
    for instruction in tei_tree.xpath(".//processing-instruction('eoa')"):
        # split() without an argument tolerates runs of spaces, tabs and
        # newlines between the tokens; split(" ") produced empty tokens
        # there and lost the keyword.
        tokens = str(instruction).lstrip('<?').rstrip('?>').split()
        if tokens[0:2] != ["eoa", "printbibliography"]:
            continue
        if len(tokens) > 2:
            keywords.append(tokens[2].strip('"').strip("'"))
        else:
            keywords.append("")
    return {
        "citekeys": citekeys,
        "keywords": keywords
    }
def latex_escape_non_ascii( input_str ):
    r"""Replace characters above U+00FF by ``\entity{<code point>}``.

    Despite the name, characters up to code point 255 are passed through
    unchanged; only characters with a higher code point are escaped, using
    the decimal code point. U+2013 (en dash, 8211) is exempt and is kept
    as is.

    :param input_str: text to escape.
    :returns: the escaped text.
    """
    escape_lower_threshold = 255
    escape_exceptions = {8211}
    return "".join(
        # Escaped backslash: "\e" is not a valid escape sequence.
        "\\entity{{{}}}".format(ord(c))
        if ord(c) > escape_lower_threshold and ord(c) not in escape_exceptions
        else c
        for c in input_str
    )
def check_executables():
    """Run ``check_executable`` for each external program listed below."""
    # NOTE(review): make4ht and iconv are also invoked elsewhere in this
    # module but are not part of this check - confirm whether they should be.
    for program in ("htlatex", "tidy", "biber"):
        check_executable( program )
def _citation_table(citekeys):
    """Return a LaTeX tabular with one row of citation variants per citekey."""
    rows = [
        f"\\verb|{key}| &\\cite{{{key}}}&\\cite*{{{key}}}&\\citefield{{{key}}}{{title}}"
        for key in citekeys
    ]
    # Rows are separated by "\\"; the last row carries no row terminator.
    body = "\\\\\n".join(rows) + "\n" if rows else ""
    return "\\begin{tabular}{l l l l}\n" + body + "\\end{tabular}\n"


def _bibliography_section(keyword=""):
    """Return the LaTeX for one bibliography section, optionally per keyword."""
    if keyword == "":
        section_heading = BIBLIOGRAPHY_CHAPTER_NO_KEYWORD
        options = "heading=none"
    else:
        section_heading = BIBLIOGRAPHY_CHAPTER.format( keyword=keyword )
        options = f"heading=none, keyword={{{keyword}}}"
    return f"\n\\section*{{{section_heading}}}\n\\printbibliography[{options}]\n"


def write_dummy_latex(
        citekeys,
        bibfile,
        language,
        keywords,
        template_path,
        tmp_filename,
        dashed
):
    """Prepare a latex file that cites every key and prints the bibliography.

    The template at ``template_path`` is filled in via ``string.Template``
    with the placeholders ``language``, ``bibfile``, ``usepackagebiblatex``,
    ``citations``, ``bibshorthands`` and ``bibliographies``.

    Files written into the directory of ``tmp_filename``:

    * ``<bibfile stem>_orig.bib`` - unmodified copy of ``bibfile``
    * ``<bibfile name>`` - copy passed through ``latex_escape_non_ascii``
    * ``<tmp_filename name>.orig`` - unescaped LaTeX source (for debugging)
    * ``tmp_filename`` - LaTeX source passed through ``latex_escape_non_ascii``

    :param citekeys: iterable of citekeys to put into the citation table.
    :param bibfile: path of the .bib file.
    :param language: language name inserted into the template.
    :param keywords: bibliography keywords; ``""`` stands for "no keyword".
        Only used when ``dashed`` is true.
    :param template_path: path of the LaTeX template.
    :param tmp_filename: path of the LaTeX file to write.
    :param dashed: if true, one bibliography section per keyword is written
        and biblatex gets ``dashed=true``; otherwise a single bibliography
        without keyword is written and biblatex gets ``dashed=false``.
    """
    # NOTE(review): the if dashed / else nesting was reconstructed from a
    # source whose indentation was lost - confirm against the repository.
    bibfile = Path( bibfile )
    tmp_filename = Path( tmp_filename )
    tmp_dir = tmp_filename.parent

    # The escaping below works on code points, so the files are read and
    # written with an explicit encoding instead of the locale default.
    with open(template_path, "r", encoding="utf-8") as tmp_template:
        fill_in_template = string.Template(tmp_template.read())

    if dashed:
        bibliographies = "".join(
            _bibliography_section(keyword) for keyword in keywords
        )
    else:
        bibliographies = _bibliography_section()

    bibfile_orig = (tmp_dir / (bibfile.stem + "_orig")) . with_suffix( ".bib" )
    bibfile_local = tmp_dir / bibfile.name
    shutil.copyfile(
        bibfile,
        bibfile_orig
    )
    with open(bibfile_orig, "r", encoding="utf-8") as in_file, \
            open(bibfile_local, "w", encoding="utf-8") as out_file:
        for line in in_file:
            out_file.write( latex_escape_non_ascii( line ) )

    bibshorthands = "\n\\section*{Shorthands}\n\\printbiblist[heading=none]{shorthand}\n"
    usepackagebiblatexstring = ",dashed=true" if dashed else ",dashed=false"

    substitutions = fill_in_template.substitute(
        language = language,
        bibfile = bibfile.name,
        usepackagebiblatex = usepackagebiblatexstring,
        citations = _citation_table(citekeys),
        bibshorthands = bibshorthands,
        bibliographies = bibliographies
    )

    # (just for debugging: save with unescaped non-ascii characters)
    with open(tmp_dir / (tmp_filename.name + ".orig"), "w", encoding="utf-8") as texfile:
        texfile.write( substitutions )

    with open(tmp_filename, "w", encoding="utf-8") as texfile:
        texfile.write( latex_escape_non_ascii( substitutions ) )
    logging.info(f"Wrote {tmp_filename}")
# def write_dummy_latex ends here
def create_makefile(tmp_filename):
    """Write the make4ht build file to ``tmp_filename``.

    The build file registers a biber command and schedules
    htlatex, biber, htlatex, htlatex, followed by a filter on html output.

    Found on https://tex.stackexchange.com/questions/162626
    """
    build_lines = (
        'local filter = require "make4ht-filter"',
        'local process = filter{"cleanspan", "fixligatures", "hruletohr"}',
        'Make:add("biber", "biber ${input}")',
        'Make:htlatex()',
        'Make:biber()',
        'Make:htlatex()',
        'Make:htlatex()',
        'Make:match("html$",process)',
    )
    makefile_input = "\n".join(build_lines) + "\n"
    with open(tmp_filename, "w") as makefile:
        makefile.write(makefile_input)
    logging.info(f"Wrote {tmp_filename}.")
# def create_makefile ends here
def run_mk4_makefile(
        tmp_filename,
        log_dir
):
    """Create HTML from LaTeX by running make4ht on ``tmp_filename``.

    ``log_dir`` is accepted but not used by this function.
    """
    command = f"make4ht {tmp_filename}"
    exec_command( command )
# def run_mk4_makefile ends here
def run_htlatex(
        tmp_filename,
        log_dir
):
    """Create HTML file from temporary LaTeX file.

    Runs htlatex, biber and htlatex again on ``tmp_filename``; the output
    of each step goes to its own log file inside ``log_dir``.
    """
    # (an htxelatex invocation with ' -cunihtf -utf8' was tried as an alternative)
    htlatex_command = f"htlatex {tmp_filename}.tex 'xhtml,charset=utf-8,fn-in' ' -utf8' '' '--interaction=nonstopmode'"
    steps = (
        (htlatex_command, "htlatex1.log"),
        (f"biber {tmp_filename}", "biber.log"),
        (htlatex_command, "htlatex2.log"),
    )
    for command, log_name in steps:
        exec_command(
            command,
            output_to = ToFile( Path(log_dir) / log_name )
        )
# def run_htlatex ends here
def bib2tei(
        bib_file,
        citekeys,
        language,
        temp_dir,
        output_file,
        keywords = None,
        log_dir = "logs",
        tex_template = BASE_DIR / "data" / "aux" / "bibliography4ht.tex",
):
    """Convert a .bib file into a TEI bibliography written to ``output_file``.

    Two intermediate HTML files are produced inside ``temp_dir``
    (``imhtml-nondashed.html`` and ``imhtml-dashed.html``), one per
    ``dashed_status``, and then combined into TEI by ``__imhtml_2_tei``.

    :param bib_file: path of the .bib file.
    :param citekeys: citekeys to process.
    :param language: language code or name, see ``__bib2imhtml``.
    :param temp_dir: directory for intermediate files.
    :param output_file: path of the TEI file to write.
    :param keywords: bibliography keywords; defaults to ``[""]``.
    :param log_dir: directory for log files (str or Path).
    :param tex_template: LaTeX template used for both runs.
    """
    keywords = [""] if keywords is None else keywords
    # Normalise path arguments: the callee calls log_dir.resolve(), which
    # fails on the plain-string default "logs".
    bib_file = Path( bib_file )
    temp_dir = Path( temp_dir )
    log_dir = Path( log_dir )

    imhtml_file = temp_dir / "imhtml-nondashed.html"
    imhtml_dashed_file = temp_dir / "imhtml-dashed.html"
    for dashed_status, target in ((False, imhtml_file), (True, imhtml_dashed_file)):
        __bib2imhtml(
            bib_file = bib_file,
            citekeys = citekeys,
            tex_template = tex_template,
            language = language,
            temp_dir = temp_dir,
            output_file = target,
            dashed_status = dashed_status,
            keywords = keywords,
            log_dir = log_dir
        )
    __imhtml_2_tei(
        imhtml_file,
        imhtml_dashed_file,
        output_file = output_file
    )
def __bib2imhtml(
        bib_file,
        citekeys,
        tex_template,
        language,
        temp_dir,
        dashed_status,
        output_file,
        keywords,
        log_dir
):
    """Convert .bib to html inside 'temp_dir' and write it to 'output_file'.

    Steps: write the LaTeX file, run make4ht inside ``temp_dir``, convert
    the resulting HTML from ISO-8859-1 to UTF-8 with iconv, and clean it up
    with tidy, which writes ``output_file``. Nothing is returned.

    :param bib_file: path of the .bib file.
    :param citekeys: citekeys to process.
    :param tex_template: path of the LaTeX template.
    :param language: one of the codes ``de``/``en``/``it``/``fr`` or the
        corresponding name (``german``, ``english``, ``italian``, ``french``).
    :param temp_dir: working directory for intermediate files; created if
        it does not exist.
    :param dashed_status: selects the file stem (``dashed``/``nondashed``)
        and is passed on to ``write_dummy_latex``.
    :param output_file: path of the tidied HTML file.
    :param keywords: bibliography keywords, passed to ``write_dummy_latex``.
    :param log_dir: directory for log files (str or Path).
    :raises KeyError: if ``language`` is neither a known code nor a known name.
    """
    temp_dir = Path( temp_dir )
    translations = {"de" : "german", "en" : "english", "it" : "italian", "fr" : "french"}
    if language in translations.values():
        language_translated = language
    else:
        try:
            language_translated = translations[language]
        except KeyError:
            raise KeyError(f"Unsupported bibliography language: {language!r}") from None

    temp_dir.mkdir( parents=True, exist_ok=True )

    tmp_filename = Path("dashed") if dashed_status else Path("nondashed")
    write_dummy_latex(
        citekeys,
        Path( bib_file ),
        language_translated,
        keywords,
        template_path = tex_template,
        tmp_filename = temp_dir / tmp_filename . with_suffix( ".tex" ),
        dashed = dashed_status
    )

    wd = Path.cwd()
    # Resolve before changing directory so a relative log_dir keeps pointing
    # at the same place; Path() also accepts a plain string.
    log_dir = Path( log_dir ).resolve()
    os.chdir( temp_dir )
    logging.info(f"cd {temp_dir}")
    try:
        create_makefile( tmp_filename . with_suffix( ".mk4" ) )
        run_mk4_makefile(
            tmp_filename . with_suffix( "" ),
            log_dir = log_dir
        )
    finally:
        # Always return to the original working directory, even on failure.
        logging.info(f"cd {wd}")
        os.chdir( wd )

    tmp_path_html = temp_dir / tmp_filename . with_suffix( ".html" )
    tmp_path_html_utf8 = temp_dir / (str(tmp_filename) + "-utf8.html")
    exec_command(
        f"iconv -f ISO-8859-1 -t UTF-8 \"{tmp_path_html}\"",
        output_to = ToFile( tmp_path_html_utf8 )
    )
    # htlatex seems to produce incorrect xhtml.
    # We have to fix it
    # (this will e.g. replace '&' by '&amp;'):
    exec_command(
        # Paths are quoted like in the iconv call so that spaces survive.
        f"tidy -numeric -output \"{output_file}\" \"{tmp_path_html_utf8}\"",
        exit_code_ok = lambda x: x in (0,1)
    )
def __imhtml_2_tei(
        imhtml_file,
        imhtml_dashed_file,
        output_file
):
    """Run the ``tex4ht_2_tei.xsl`` stylesheet on ``imhtml_file``.

    The absolute path of ``imhtml_dashed_file`` is handed to the stylesheet
    as the ``dashed_file`` parameter; the result goes to ``output_file``.
    """
    stylesheet = BASE_DIR / "stylesheets/tex4ht_2_tei.xsl"
    dashed_param = f"dashed_file={imhtml_dashed_file.absolute()}"
    run_xslt(
        imhtml_file,
        stylesheet,
        params = [ dashed_param ],
        output_file = output_file
    )
def teibib_to_eoa1(
        tei_bibl_file: Path,
        output_file: Path
):
    """Run the ``teibib_to_eoa1.xsl`` stylesheet on ``tei_bibl_file``.

    The transformation result is written to ``output_file``.
    """
    stylesheet = BASE_DIR / "stylesheets/teibib_to_eoa1.xsl"
    run_xslt(
        tei_bibl_file,
        stylesheet,
        output_file = output_file
    )
if __name__ == '__main__':
    # The previous entry point called an undefined main() with undefined
    # names (citekeys, translations) and could only fail with NameError.
    # NOTE(review): routing the call to bib2tei and the extra optional
    # arguments below are a best guess at the intended CLI - please confirm.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "bibfile",
        help="File that contains the bibliography")
    parser.add_argument(
        "--tex-template",
        default = BASE_DIR / "data" / "aux" / "bibliography4ht.tex",
        help="the latex template to use for the bibliography"
    )
    parser.add_argument(
        "--temp-dir",
        default = "tmp_files",
        help="where to store temporary files"
    )
    parser.add_argument(
        "--citekey",
        dest = "citekeys",
        action = "append",
        default = [],
        metavar = "KEY",
        help="citekey to include (may be given several times)"
    )
    parser.add_argument(
        "--language",
        default = "de",
        help="language of the bibliography: de, en, it or fr"
    )
    parser.add_argument(
        "--output",
        default = None,
        help="TEI file to write (default: bibliography.tei.xml inside the temp dir)"
    )
    parser.add_argument(
        "--log-dir",
        default = "logs",
        help="where to store log files"
    )
    args = parser.parse_args()
    check_executables()
    temp_dir = Path( args.temp_dir )
    output_file = Path( args.output ) if args.output else temp_dir / "bibliography.tei.xml"
    bib2tei(
        bib_file = Path( args.bibfile ),
        citekeys = args.citekeys,
        language = args.language,
        temp_dir = temp_dir,
        output_file = output_file,
        log_dir = Path( args.log_dir ),
        tex_template = Path( args.tex_template )
    )
# finis