From db30e89a159af608d1957233d0171e1b20416cf0 Mon Sep 17 00:00:00 2001 From: EsGeh Date: Tue, 23 Apr 2019 16:14:14 +0200 Subject: [PATCH 1/7] fixed html bibliography generation: output should be the same as before. --- eoatex2imxml.py | 3 +-- utils/bib2html.py | 18 ++++++++++++------ 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/eoatex2imxml.py b/eoatex2imxml.py index f5255b3..aaf8c3c 100755 --- a/eoatex2imxml.py +++ b/eoatex2imxml.py @@ -1195,7 +1195,6 @@ def insert_bibliographies( citations_json, citekeys ) - # use language of the first chapter: formatted_bibl_info = bib2html.main( bib_file = bib_file, citekeys = citekeys, @@ -1402,7 +1401,6 @@ def add_bibliography_to_xml( if bibl_info is None: logging.warning("No bibliography database found.") else: - (bib_type, bib_database) = bibl_info logging.debug(f"bib type is {bib_type}") @@ -1426,6 +1424,7 @@ def add_bibliography_to_xml( if bib_type == "monograph": keyword_to_print_bibl_el = insert_bibliographies( xmlTree, + # use language of the first chapter: xmlChapters[0].get( "language" ), citations_json, ## paths: diff --git a/utils/bib2html.py b/utils/bib2html.py index efc1d10..b254c7f 100755 --- a/utils/bib2html.py +++ b/utils/bib2html.py @@ -42,7 +42,7 @@ def transform_reference(reference_element, dialect='html'): """Formatting transformation for reference element""" string_from_xml = etree.tostring(reference_element).decode('utf-8') - removed_linebreak = string_from_xml.replace("\n", "") + removed_linebreak = string_from_xml.replace("\n", " ") removed_namespace = removed_linebreak.replace('

', '

') cleaned_element = etree.fromstring(removed_namespace) @@ -269,6 +269,7 @@ def main( chapter_heading = BIBLIOGRAPHY_CHAPTER_NO_KEYWORD else: chapter_heading = BIBLIOGRAPHY_CHAPTER.format( keyword = keyword ) + # '

...
bibliography_el = xml_tree.xpath( f"//x:body/x:p[text() = '{chapter_heading}']/following-sibling::x:dl[1]", namespaces = NS_MAP @@ -276,16 +277,21 @@ def main( if( len(bibliography_el) != 1 ): logging.error( f"error parsing bibliography with keyword '{keyword}'" ) sys.exit( 1 ) - bibliographies_dict[keyword] = bibliography_el[0] + bibliography_el = bibliography_el[0] - xml_tree.xpath(f"//x:dl[@class='thebibliography']", namespaces=NS_MAP)[0] - reference_list = xml_tree.xpath(f"//x:dl[@class='thebibliography']", namespaces=NS_MAP)[0] - reference_div = create_reference_list(reference_list) + reference_div = create_reference_list(bibliography_el) + bibliographies_dict[keyword] = reference_div html_element = etree.Element("html") html_element.insert(0, citation_authoryear) html_element.insert(1, citation_year) - html_element.insert(2, reference_div) + for keyword in keywords: + bibl_el = etree.SubElement( + html_element, + "div", + **({} if keyword == "" else { 'keyword': keyword } ) + ) + bibl_el.append( bibliographies_dict[keyword] ) # print(etree.tostring(html_element)) From 57587d37d6d8d6b74a77624057c6d40c1da6187a Mon Sep 17 00:00:00 2001 From: EsGeh Date: Tue, 23 Apr 2019 16:28:58 +0200 Subject: [PATCH 2/7] some citations were chopped, should be ok now --- eoatex2imxml.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eoatex2imxml.py b/eoatex2imxml.py index aaf8c3c..36a72d9 100755 --- a/eoatex2imxml.py +++ b/eoatex2imxml.py @@ -1543,7 +1543,7 @@ def add_bibliography_to_xml( if xmlCitation.tag == "EOAciteauthoryear": strCitation = citeauthoryear_value elif xmlCitation.tag == "EOAciteyear": - strCitation = form_cit.select("#citeyear ~ p > span[data-cites='%s']" % string_citekey)[0].text[1:-1] + strCitation = form_cit.select("#citeyear ~ p > span[data-cites='%s']" % string_citekey)[0].text elif xmlCitation.tag == "EOAcitemanual": cite_text = xmlCitation.find("citetext") if cite_text.getchildren(): From 812b7e5951caf5037392fb4fd8fe5528943182f4 Mon 
Sep 17 00:00:00 2001 From: EsGeh Date: Tue, 23 Apr 2019 16:30:07 +0200 Subject: [PATCH 3/7] cleaning up dead commented code --- utils/bib2html.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/utils/bib2html.py b/utils/bib2html.py index b254c7f..f0cbce3 100755 --- a/utils/bib2html.py +++ b/utils/bib2html.py @@ -293,8 +293,6 @@ def main( ) bibl_el.append( bibliographies_dict[keyword] ) - # print(etree.tostring(html_element)) - tree = etree.ElementTree(html_element) logging.info("writing '%s'" % output_file) tree.write(str(output_file), pretty_print=True, xml_declaration=True, encoding="utf-8") From 593bb1214b140212a5016e75409c650298f2aedc Mon Sep 17 00:00:00 2001 From: EsGeh Date: Tue, 23 Apr 2019 17:07:46 +0200 Subject: [PATCH 4/7] dummy cover generation script picks an image with the right file suffix --- mkimage.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/mkimage.py b/mkimage.py index bbfcb98..11ad51d 100755 --- a/mkimage.py +++ b/mkimage.py @@ -38,21 +38,20 @@ def get_cover_image(image_path): """Choose a random landscape image from publications in this volume""" - import random - - candidates = os.listdir(image_path) - + import random, glob + extensions = ("png", "jpg") + candidates = [] + for extension in extensions: + path = f"{image_path}/**/*.{extension}" + candidates.extend( + glob.glob( path, recursive = True) + ) for image in candidates: - if image == ".DS_Store": - candidates.remove(image) - continue - tmp_image = Image.open(image_path + "/" + str(image)) + tmp_image = Image.open(str(image)) ratio = calculate_ratio(tmp_image) if ratio < 1: candidates.remove(image) - chosen_image = random.choice(candidates) - return chosen_image # def get_cover_image ends here @@ -209,7 +208,7 @@ def create_cover(metadata_dict, image_directory, cover_filename, image_is_file): text_draw.multiline_text((ptcenter,DIMENSIONS[1]-400), press_text_joined, font=small_font, align="center") if image_is_file == False: - 
image_on_cover = Image.open(os.path.join(image_directory, get_cover_image(image_directory))) + image_on_cover = Image.open(get_cover_image(image_directory)) else: image_on_cover = Image.open(image_directory) From 5db6ad0729f9745f03d8224bae0a4c2f9d7db429 Mon Sep 17 00:00:00 2001 From: EsGeh Date: Tue, 23 Apr 2019 18:50:34 +0200 Subject: [PATCH 5/7] fixed and simplified imxml2django facsimile handling --- imxml2django.py | 2 +- utils/libeoaconvert.py | 63 ++++++++++++++++++++---------------------- 2 files changed, 31 insertions(+), 34 deletions(-) diff --git a/imxml2django.py b/imxml2django.py index 31e9272..82a009c 100755 --- a/imxml2django.py +++ b/imxml2django.py @@ -1127,7 +1127,7 @@ def check_publication_cfg(configuration_file): strImageFileDir = re.sub("/", "", strImageFileDir) strImageFileName = os.path.basename(strImageFile) shutil.copy( - INPUT_DIR / strImageFile, + PUBLICATION_DIR / strImageFile, CONVERT_DIR / "django/images" / (strImageFileDir + strImageFileName) ) # shutil.copy(os.getcwd() + "/" + strImageFile, os.getcwd() + "/CONVERT/django/images/" + strImageFileDir + strImageFileName) diff --git a/utils/libeoaconvert.py b/utils/libeoaconvert.py index 6ffdd92..ce10d8f 100644 --- a/utils/libeoaconvert.py +++ b/utils/libeoaconvert.py @@ -71,34 +71,33 @@ def sanitizeImage( ): """Adjust and convert image for epub standard""" - if not os.path.exists(Path(tmp_dir) / "tmp_images/"): - os.makedirs(os.path.expanduser(Path(tmp_dir) / "tmp_images/")) + tmp_dir = Path( tmp_dir ) + strImagepath = Path( strImagepath ) + if not (tmp_dir / "tmp_images").exists(): + os.makedirs(tmp_dir / "tmp_images/") - tmp_image_dir = Path(tmp_dir) / "tmp_images/" - xelatex_sanitizeimage_logfile = open( Path(tmp_dir) / 'xelatex-run-images.log', 'w') + tmp_image_dir = tmp_dir / "tmp_images" logging.debug(strImagepath) - strCommand = GM_PATH + " identify -format \"%w\" " + str(strImagepath) - listArguments = shlex.split(strCommand) - exeShell = subprocess.check_output(listArguments, 
shell=False, universal_newlines=True) - intImageWidth = int(exeShell) + intImageWidth = int(subprocess.check_output( + shlex.split( f"{GM_PATH} identify -format \"%w\" {strImagepath}" ), + universal_newlines=True + )) if intImageWidth > 700: - strCommand = GM_PATH + " convert " + str(strImagepath) + " -resize 700x\\> " + str(strImagepath) - listArguments = shlex.split(strCommand) - subprocess.check_output(listArguments, shell=False) - strCommand = GM_PATH + " identify -format \"%h\" " + str(strImagepath) - listArguments = shlex.split(strCommand) - exeShell = subprocess.check_output(listArguments, shell=False, universal_newlines=True) - intImageHeight = int(exeShell) + exec_command( + f"{GM_PATH} convert {strImagepath} -resize 700x\\> {strImagepath}" + ) + intImageHeight = int( subprocess.check_output( + shlex.split( f"{GM_PATH} identify -format \"%h\" {strImagepath}" ), + universal_newlines=True + )) if intImageHeight > 1000: - strCommand = GM_PATH + " convert " + str(strImagepath) + " -resize x1000\\> " + str(strImagepath) - listArguments = shlex.split(strCommand) - subprocess.check_output(listArguments, shell=False) - strCommand = GM_PATH + " identify -format \"%m\" " + str(strImagepath) - listArguments = shlex.split(strCommand) - exeShell = subprocess.check_output(listArguments, shell=False, universal_newlines=True) - strFileFormat = str(exeShell) - strFileFormat = strFileFormat.strip() + exec_command( + f"{GM_PATH} convert {strImagepath} -resize x1000\\> {strImagepath}" + ) + strFileFormat = str( subprocess.check_output( + shlex.split( f"{GM_PATH} identify -format \"%m\" {strImagepath}" ) + )).strip() if strFileFormat == "JPEG": pass # print("looking at jpeg file") @@ -119,17 +118,15 @@ def sanitizeImage( # strImagepath = strNewImagepath + ".png" elif strFileFormat == "PDF": strNewImagepath = os.path.splitext(str(strImagepath))[0] - clipped_file = strImagepath.replace(".pdf", "-clipped.pdf") - - Kommando = PDFCROP_EXEC + " --margins 10 --clip --hires " + 
str(strImagepath) + " " + clipped_file - logging.debug(Kommando) + clipped_file = str(strImagepath).replace(".pdf", "-clipped.pdf") - Argumente = shlex.split(Kommando) - subprocess.call(Argumente, cwd=tmp_image_dir, stdout=xelatex_sanitizeimage_logfile) - - strCommand = GM_PATH + " convert -density 400 " + clipped_file + " " + strNewImagepath + ".png" - listArguments = shlex.split(strCommand) - subprocess.call(listArguments) + exec_command( + f"{PDFCROP_EXEC} --margins 10 --clip --hires {strImagepath} {clipped_file}", + # wd = tmp_image_dir + ) + exec_command( + f"{GM_PATH} convert -density 400 {clipped_file} {strNewImagepath}.png" + ) logging.debug("Removing two files: %s and %s " % (clipped_file, strImagepath)) os.remove(clipped_file) os.remove(strImagepath) From d22976608b482401c7583b09b7184884eb40ba19 Mon Sep 17 00:00:00 2001 From: EsGeh Date: Wed, 24 Apr 2019 17:12:27 +0200 Subject: [PATCH 6/7] fix non-ascii problems with bibliography representing them as xml entities --- bibformat/4ht/bibliography4ht.tex | 5 +++ utils/bib2html.py | 54 ++++++++++++++++++++++++++++--- 2 files changed, 54 insertions(+), 5 deletions(-) diff --git a/bibformat/4ht/bibliography4ht.tex b/bibformat/4ht/bibliography4ht.tex index 15cc7b6..9d60796 100644 --- a/bibformat/4ht/bibliography4ht.tex +++ b/bibformat/4ht/bibliography4ht.tex @@ -91,6 +91,11 @@ \bibliography{$bibfile} \begin{document} % \maketitle +\makeatletter +\def\hshchr{\expandafter\@gobble\string\#} +\def\ampchr{\expandafter\@gobble\string\&} +\def\entity#1{\HCode{\ampchr\hshchr#1;}} +\makeatother \section{Citations} diff --git a/utils/bib2html.py b/utils/bib2html.py index f0cbce3..6f4d70e 100755 --- a/utils/bib2html.py +++ b/utils/bib2html.py @@ -33,6 +33,15 @@ BIBLIOGRAPHY_CHAPTER_NO_KEYWORD = "BIBLIOGRAPHY" BIBLIOGRAPHY_CHAPTER = "BIBLIOGRAPHY {keyword}" +def latex_escape_non_ascii( input_str ): + output = "" + for c in input_str: + if ord(c) > 0x7F: + output += "\entity{{{}}}".format( ord(c) ) + else: + output += c + 
return output + def check_executables(): check_executable( "htlatex" ) check_executable( "tidy" ) @@ -82,6 +91,7 @@ def write_dummy_latex( tmp_filename ): """Prepare a latex file""" + tmp_dir = tmp_filename.parent allcitekeys = "" @@ -112,20 +122,45 @@ def write_dummy_latex( f""" \chapter{{{chapter_heading}}} \printbibliography[keyword={{{keyword}}}]\n""" + + bibfile_orig = (tmp_dir / (bibfile.stem + "_orig")) . with_suffix( ".bib" ) + bibfile_local = tmp_dir / bibfile.name + shutil.copyfile( + bibfile, + bibfile_orig + ) + import fileinput, unicodedata + with open( bibfile_local, "w") as out_file: + for line in fileinput.input(bibfile_orig): + out_file.write( + latex_escape_non_ascii( + line + ) + ) bibfile_path = \ bibfile if bibfile.is_absolute() else Path.cwd() / bibfile substitions = fill_in_template.substitute( language = language, # language = translations[language], - bibfile = bibfile_path, + bibfile = bibfile.name, + # bibfile = bibfile_path, # bibfile = '../' + bibfile, citations = allcitekeys, bibliographies = bibliographies ) + # (just for debugging: save with unescaped non-ascii characters) + with open(tmp_dir / (tmp_filename.name + ".orig"), "w") as texfile: + texfile.write( + substitions + ) with open(tmp_filename, "w") as texfile: - texfile.write(substitions) + texfile.write( + latex_escape_non_ascii( + substitions + ) + ) logging.info(f"Wrote {tmp_filename}") # def write_dummy_latex ends here @@ -137,7 +172,8 @@ def run_htlatex( ): """Create HTML file from temporary LaTeX file""" exec_command( - f"htxelatex {tmp_filename}.tex 'xhtml,charset=utf-8' ' -cunihtf -utf8' '' '--interaction=nonstopmode'", + f"htlatex {tmp_filename}.tex 'xhtml,charset=utf-8,fn-in' ' -utf8' '' '--interaction=nonstopmode'", + # f"htxelatex {tmp_filename}.tex 'xhtml,charset=utf-8' ' -cunihtf -utf8' '' '--interaction=nonstopmode'", output_to = ToFile( Path(log_dir) / "htlatex1.log" ) ) exec_command( @@ -145,7 +181,8 @@ def run_htlatex( output_to = ToFile( Path(log_dir) / 
"biber.log" ) ) exec_command( - f"htxelatex {tmp_filename}.tex 'xhtml,charset=utf-8' ' -cunihtf -utf8' '' '--interaction=nonstopmode'", + f"htlatex {tmp_filename}.tex 'xhtml,charset=utf-8,fn-in' ' -utf8' '' '--interaction=nonstopmode'", + # f"htxelatex {tmp_filename}.tex 'xhtml,charset=utf-8' ' -cunihtf -utf8' '' '--interaction=nonstopmode'", output_to = ToFile( Path(log_dir) / "htlatex2.log" ) ) # def run_htlatex ends here @@ -230,22 +267,29 @@ def main( wd = Path.cwd() log_dir = log_dir.resolve() os.chdir( temp_dir ) + logging.info(f"cd {temp_dir}") run_htlatex( tmp_filename . with_suffix( "" ), # tmp_filename, log_dir = log_dir ) + logging.info(f"cd {wd}") os.chdir( wd ) tmp_path_html = temp_dir / tmp_filename . with_suffix( ".html" ) + tmp_path_html_utf8 = (temp_dir / (str(tmp_filename) + "-utf8")) . with_suffix( ".html" ) tmp_path_html_fixed1 = temp_dir / tmp_filename . with_suffix( ".1.html" ) tmp_path_html_fixed2 = temp_dir / tmp_filename . with_suffix( ".2.html" ) + exec_command( + f"iconv -f ISO-8859-1 -t UTF-8 --output={tmp_path_html_utf8} {tmp_path_html}" + ) + # htlatex seems to produce incorrect xhtml. # We have to fix it # (this will e.g. 
replace '&' by '&amp;'): exec_command( - f"tidy -numeric -output {tmp_path_html_fixed1} {tmp_path_html}", + f"tidy -numeric -output {tmp_path_html_fixed1} {tmp_path_html_utf8}", exit_code_ok = lambda x: x in (0,1) ) import fileinput, unicodedata From e57cce7c9a5315c1cd7401a683875f1b5857dbaa Mon Sep 17 00:00:00 2001 From: EsGeh Date: Thu, 25 Apr 2019 13:31:10 +0200 Subject: [PATCH 7/7] citations in temporary html are represented as html table --- utils/bib2html.py | 48 +++++++++++++++++++++++++++++++---------------- 1 file changed, 32 insertions(+), 16 deletions(-) diff --git a/utils/bib2html.py b/utils/bib2html.py index 6f4d70e..9964cb8 100755 --- a/utils/bib2html.py +++ b/utils/bib2html.py @@ -21,6 +21,7 @@ from lxml import etree from pathlib import Path import sys +import textwrap BASE_DIR = Path( __file__ ).resolve().parent.parent SCRIPT_PATH = Path( __file__ ) @@ -94,14 +95,10 @@ def write_dummy_latex( tmp_dir = tmp_filename.parent allcitekeys = "" - + allcitekeys += "\\begin{tabular}{l l l}\n" for key in citekeys: - allcitekeys += """ -\subsection*{%s} -\subsubsection*{authoryear} -\cite{%s} -\subsubsection*{year} -\cite*{%s}\n""" % (key, key, key) + allcitekeys += f"\\verb|{key}| &\\cite{{{key}}}&\\cite*{{{key}}}\\\\\n" + allcitekeys += "\\end{tabular}\n" with open(template_path, "r") as tmp_template: template = tmp_template.read() @@ -113,15 +110,21 @@ def write_dummy_latex( if keyword == "": chapter_heading = BIBLIOGRAPHY_CHAPTER_NO_KEYWORD bibliographies += \ - f""" -\chapter{{{chapter_heading}}} -\printbibliography\n""" + textwrap.dedent( + f""" + \chapter{{{chapter_heading}}} + \printbibliography + """ + ) else: chapter_heading = BIBLIOGRAPHY_CHAPTER.format( keyword=keyword ) bibliographies += \ - f""" -\chapter{{{chapter_heading}}} -\printbibliography[keyword={{{keyword}}}]\n""" + textwrap.dedent( + f""" + \chapter{{{chapter_heading}}} + \printbibliography[keyword={{{keyword}}}] + """ + ) bibfile_orig = (tmp_dir / (bibfile.stem + "_orig")) . 
with_suffix( ".bib" ) bibfile_local = tmp_dir / bibfile.name @@ -202,11 +205,24 @@ def create_citations(citekeys, xml_tree, style): p_element = etree.Element("p") for citekey in citekeys: - logging.debug(f"working on citekey: {citekey}" ) + logging.debug( f"working on citekey: '{citekey}', style: '{style}'" ) + citation_el = None if style == "authoryear": - format_citation = xml_tree.xpath(f"//x:h4[text() = '{citekey}']/following-sibling::x:p[2]/text()", namespaces=NS_MAP)[0].strip() + citation_el = xml_tree.xpath( + f"//x:table/x:tr/x:td[.//x:span[text() = '{citekey}'] ]/following-sibling::x:td[1]/text()", + namespaces=NS_MAP + ) else: - format_citation = xml_tree.xpath(f"//x:h4[text() = '{citekey}']/following-sibling::x:p[3]/text()", namespaces=NS_MAP)[0].strip() + citation_el = xml_tree.xpath( + f"//x:table/x:tr/x:td[.//x:span[text() = '{citekey}'] ]/following-sibling::x:td[2]/text()", + namespaces=NS_MAP + ) + if( len(citation_el) == 0 ): + logging.error( f"error parsing formatted citation: '{citekey}', style: '{style}'" ) + sys.exit( 1 ) + + format_citation = citation_el[0].strip() + logging.debug( f"formatted: '{format_citation}'" ) span_element = etree.fromstring(f"""{format_citation}""") p_element.append(span_element)