Merge branch 'clean_and_fix' of https://github.molgen.mpg.de/EditionOpenAccess/EOASkripts into clean_and_fix
EditionOpenAccess · Apr 29, 2019 · 962ed1f · 962ed1f
2 parents 088deef + e57cce7
commit 962ed1f
Show file tree

Hide file tree

Showing 6 changed files with 141 additions and 77 deletions.
diff --git a/bibformat/4ht/bibliography4ht.tex b/bibformat/4ht/bibliography4ht.tex
@@ -91,6 +91,11 @@
 \bibliography{$bibfile}
 \begin{document}
 % \maketitle
+\makeatletter
+\def\hshchr{\expandafter\@gobble\string\#}
+\def\ampchr{\expandafter\@gobble\string\&}
+\def\entity#1{\HCode{\ampchr\hshchr#1;}}
+\makeatother
 
 \section{Citations}
 

diff --git a/eoatex2imxml.py b/eoatex2imxml.py
@@ -1195,7 +1195,6 @@ def insert_bibliographies(
                 citations_json,
                 citekeys
         )
-        # use language of the first chapter:
         formatted_bibl_info = bib2html.main(
             bib_file = bib_file,
             citekeys = citekeys,
@@ -1402,7 +1401,6 @@ def add_bibliography_to_xml(
 if bibl_info is None:
     logging.warning("No bibliography database found.")
 else:
-
     (bib_type, bib_database) = bibl_info
     logging.debug(f"bib type is {bib_type}")
 
@@ -1426,6 +1424,7 @@ def add_bibliography_to_xml(
     if bib_type == "monograph":
         keyword_to_print_bibl_el = insert_bibliographies(
                 xmlTree,
+                # use language of the first chapter:
                 xmlChapters[0].get( "language" ),
                 citations_json,
                 ## paths:
@@ -1544,7 +1543,7 @@ def add_bibliography_to_xml(
                 if xmlCitation.tag == "EOAciteauthoryear":
                     strCitation = citeauthoryear_value
                 elif xmlCitation.tag == "EOAciteyear":
-                    strCitation = form_cit.select("#citeyear ~ p > span[data-cites='%s']" % string_citekey)[0].text[1:-1]
+                    strCitation = form_cit.select("#citeyear ~ p > span[data-cites='%s']" % string_citekey)[0].text
                 elif xmlCitation.tag == "EOAcitemanual":
                     cite_text = xmlCitation.find("citetext")
                     if cite_text.getchildren():

diff --git a/imxml2django.py b/imxml2django.py
@@ -1127,7 +1127,7 @@ def check_publication_cfg(configuration_file):
         strImageFileDir = re.sub("/", "", strImageFileDir)
         strImageFileName = os.path.basename(strImageFile)
         shutil.copy(
-                INPUT_DIR / strImageFile,
+                PUBLICATION_DIR / strImageFile,
                 CONVERT_DIR / "django/images" / (strImageFileDir + strImageFileName)
         )
         # shutil.copy(os.getcwd() + "/" + strImageFile, os.getcwd() + "/CONVERT/django/images/" + strImageFileDir + strImageFileName)

diff --git a/mkimage.py b/mkimage.py
@@ -38,21 +38,28 @@
 def get_cover_image(image_path):
-    """Choose a random landscape image from publications in this volume"""
+    """Choose a random landscape image from publications in this volume
+
+    Searches image_path recursively for png and jpg files and returns the
+    path of a randomly chosen file for which calculate_ratio() is not
+    below 1. Raises IndexError (from random.choice) if there is none.
+    """
 
-    import random
-
-    candidates = os.listdir(image_path)
-
+    import random, glob
+    extensions = ("png", "jpg")
+    candidates = []
+    for extension in extensions:
+        path = f"{image_path}/**/*.{extension}"
+        candidates.extend(
+            glob.glob( path, recursive = True)
+        )
+    # Collect the matching images in a separate list: removing items from
+    # 'candidates' while iterating over it skips the item after each removal.
+    landscape_images = []
     for image in candidates:
-        if image == ".DS_Store":
-            candidates.remove(image)
-            continue
-        tmp_image = Image.open(image_path + "/" + str(image))
+        tmp_image = Image.open(str(image))
         ratio = calculate_ratio(tmp_image)
-        if ratio < 1:
-            candidates.remove(image)
-
-    chosen_image = random.choice(candidates)
-
+        if not ratio < 1:
+            landscape_images.append(image)
+    chosen_image = random.choice(landscape_images)
     return chosen_image
 # def get_cover_image ends here
 
@@ -209,7 +208,7 @@ def create_cover(metadata_dict, image_directory, cover_filename, image_is_file):
     text_draw.multiline_text((ptcenter,DIMENSIONS[1]-400), press_text_joined, font=small_font, align="center")
 
     if image_is_file == False:
-        image_on_cover = Image.open(os.path.join(image_directory, get_cover_image(image_directory)))
+        image_on_cover = Image.open(get_cover_image(image_directory))
     else:
         image_on_cover = Image.open(image_directory)
 

diff --git a/utils/bib2html.py b/utils/bib2html.py
@@ -21,6 +21,7 @@
 from lxml import etree
 from pathlib import Path
 import sys
+import textwrap
 
 BASE_DIR = Path( __file__ ).resolve().parent.parent
 SCRIPT_PATH = Path( __file__ )
@@ -33,6 +34,15 @@
 BIBLIOGRAPHY_CHAPTER_NO_KEYWORD = "BIBLIOGRAPHY"
 BIBLIOGRAPHY_CHAPTER = "BIBLIOGRAPHY {keyword}"
 
+def latex_escape_non_ascii( input_str ):
+    """Return input_str with every char above 0x7F as '\\entity{<ord>}'."""
+    # "\\entity": the unescaped "\e" is an invalid escape sequence, which
+    # newer Python versions warn about; the resulting string is unchanged.
+    return "".join(
+        "\\entity{{{}}}".format( ord(c) ) if ord(c) > 0x7F else c
+        for c in input_str
+    )
+
 def check_executables():
     check_executable( "htlatex" )
     check_executable( "tidy" )
@@ -42,7 +52,7 @@ def transform_reference(reference_element, dialect='html'):
     """Formatting transformation for reference element"""
 
     string_from_xml = etree.tostring(reference_element).decode('utf-8')
-    removed_linebreak = string_from_xml.replace("\n", "")
+    removed_linebreak = string_from_xml.replace("\n", " ")
     removed_namespace = removed_linebreak.replace('<p xmlns="http://www.w3.org/1999/xhtml" class="noindent">', '<p>')
     cleaned_element = etree.fromstring(removed_namespace)
 
@@ -82,16 +92,13 @@ def write_dummy_latex(
         tmp_filename
 ):
     """Prepare a latex file"""
+    tmp_dir = tmp_filename.parent
 
     allcitekeys = ""
-
+    allcitekeys += "\\begin{tabular}{l l l}\n"
     for key in citekeys:
-        allcitekeys += """
-\subsection*{%s}
-\subsubsection*{authoryear}
-\cite{%s}
-\subsubsection*{year}
-\cite*{%s}\n""" % (key, key, key)
+        allcitekeys += f"\\verb|{key}|  &\\cite{{{key}}}&\\cite*{{{key}}}\\\\\n"
+    allcitekeys += "\\end{tabular}\n"
 
     with open(template_path, "r") as tmp_template:
         template = tmp_template.read()
@@ -103,29 +110,60 @@ def write_dummy_latex(
         if keyword == "":
             chapter_heading = BIBLIOGRAPHY_CHAPTER_NO_KEYWORD
             bibliographies += \
-                f"""
-\chapter{{{chapter_heading}}}
-\printbibliography\n"""
+                textwrap.dedent(
+                    f"""
+                    \chapter{{{chapter_heading}}}
+                    \printbibliography
+                    """
+                )
         else:
             chapter_heading = BIBLIOGRAPHY_CHAPTER.format( keyword=keyword )
             bibliographies += \
-                f"""
-\chapter{{{chapter_heading}}}
-\printbibliography[keyword={{{keyword}}}]\n"""
+                textwrap.dedent(
+                    f"""
+                    \chapter{{{chapter_heading}}}
+                    \printbibliography[keyword={{{keyword}}}]
+                    """
+                )
+
+    bibfile_orig = (tmp_dir / (bibfile.stem + "_orig")) . with_suffix( ".bib" )
+    bibfile_local = tmp_dir / bibfile.name
+    shutil.copyfile(
+            bibfile,
+            bibfile_orig
+    )
+    import fileinput, unicodedata
+    with open( bibfile_local, "w") as out_file:
+        for line in fileinput.input(bibfile_orig):
+            out_file.write(
+                    latex_escape_non_ascii(
+                        line
+                    )
+            )
 
     bibfile_path = \
         bibfile if bibfile.is_absolute() else Path.cwd() / bibfile
     substitions = fill_in_template.substitute(
             language = language,
             # language = translations[language],
-            bibfile = bibfile_path,
+            bibfile = bibfile.name,
+            # bibfile = bibfile_path,
             # bibfile = '../' + bibfile,
             citations = allcitekeys,
             bibliographies = bibliographies
     )
+    # (just for debugging: save with unescaped non-ascii characters)
+    with open(tmp_dir / (tmp_filename.name + ".orig"), "w") as texfile:
+        texfile.write(
+            substitions
+        )
 
     with open(tmp_filename, "w") as texfile:
-        texfile.write(substitions)
+        texfile.write(
+            latex_escape_non_ascii(
+                substitions
+            )
+        )
 
     logging.info(f"Wrote {tmp_filename}")
 # def write_dummy_latex ends here
@@ -137,15 +175,17 @@ def run_htlatex(
 ):
     """Create HTML file from temporary LaTeX file"""
     exec_command(
-        f"htxelatex {tmp_filename}.tex 'xhtml,charset=utf-8' ' -cunihtf -utf8' '' '--interaction=nonstopmode'",
+        f"htlatex {tmp_filename}.tex 'xhtml,charset=utf-8,fn-in' ' -utf8' '' '--interaction=nonstopmode'",
+        # f"htxelatex {tmp_filename}.tex 'xhtml,charset=utf-8' ' -cunihtf -utf8' '' '--interaction=nonstopmode'",
         output_to = ToFile( Path(log_dir) / "htlatex1.log" )
     )
     exec_command(
         f"biber {tmp_filename}",
         output_to = ToFile( Path(log_dir) / "biber.log" )
     )
     exec_command(
-        f"htxelatex {tmp_filename}.tex 'xhtml,charset=utf-8' ' -cunihtf -utf8' '' '--interaction=nonstopmode'",
+        f"htlatex {tmp_filename}.tex 'xhtml,charset=utf-8,fn-in' ' -utf8' '' '--interaction=nonstopmode'",
+        # f"htxelatex {tmp_filename}.tex 'xhtml,charset=utf-8' ' -cunihtf -utf8' '' '--interaction=nonstopmode'",
         output_to = ToFile( Path(log_dir) / "htlatex2.log" )
     )
 # def run_htlatex ends here
@@ -165,11 +205,24 @@ def create_citations(citekeys, xml_tree, style):
     p_element = etree.Element("p")
 
     for citekey in citekeys:
-        logging.debug(f"working on citekey: {citekey}" )
+        logging.debug( f"working on citekey: '{citekey}', style: '{style}'" )
+        citation_el = None
         if style == "authoryear":
-            format_citation = xml_tree.xpath(f"//x:h4[text() = '{citekey}']/following-sibling::x:p[2]/text()", namespaces=NS_MAP)[0].strip()
+            citation_el = xml_tree.xpath(
+                f"//x:table/x:tr/x:td[.//x:span[text() = '{citekey}'] ]/following-sibling::x:td[1]/text()",
+                namespaces=NS_MAP
+            )
         else:
-            format_citation = xml_tree.xpath(f"//x:h4[text() = '{citekey}']/following-sibling::x:p[3]/text()", namespaces=NS_MAP)[0].strip()
+            citation_el = xml_tree.xpath(
+                f"//x:table/x:tr/x:td[.//x:span[text() = '{citekey}'] ]/following-sibling::x:td[2]/text()",
+                    namespaces=NS_MAP
+            )
+        if( len(citation_el) == 0 ):
+            logging.error( f"error parsing formatted citation: '{citekey}', style: '{style}'" )
+            sys.exit( 1 )
+
+        format_citation = citation_el[0].strip()
+        logging.debug( f"formatted: '{format_citation}'" )
         span_element = etree.fromstring(f"""<span class="citation" data-cites="{citekey}">{format_citation}</span>""")
 
         p_element.append(span_element)
@@ -230,22 +283,29 @@ def main(
     wd = Path.cwd()
     log_dir = log_dir.resolve()
     os.chdir( temp_dir )
+    logging.info(f"cd {temp_dir}")
     run_htlatex(
             tmp_filename . with_suffix( "" ),
             # tmp_filename,
             log_dir = log_dir
     )
+    logging.info(f"cd {wd}")
     os.chdir( wd )
 
     tmp_path_html = temp_dir / tmp_filename . with_suffix( ".html" )
+    tmp_path_html_utf8 = (temp_dir / (str(tmp_filename) + "-utf8")) . with_suffix( ".html" )
     tmp_path_html_fixed1 = temp_dir / tmp_filename . with_suffix( ".1.html" )
     tmp_path_html_fixed2 = temp_dir / tmp_filename . with_suffix( ".2.html" )
 
+    exec_command(
+        f"iconv -f ISO-8859-1 -t UTF-8 --output={tmp_path_html_utf8} {tmp_path_html}"
+    )
+
     # htlatex seems to produce incorrect xhtml.
     # We have to fix it
     # (this will e.g. replace '&' by '&amp;'):
     exec_command(
-            f"tidy -numeric -output {tmp_path_html_fixed1} {tmp_path_html}",
+            f"tidy -numeric -output {tmp_path_html_fixed1} {tmp_path_html_utf8}",
             exit_code_ok = lambda x: x in (0,1)
     )
     import fileinput, unicodedata
@@ -269,25 +329,29 @@ def main(
             chapter_heading = BIBLIOGRAPHY_CHAPTER_NO_KEYWORD
         else:
             chapter_heading = BIBLIOGRAPHY_CHAPTER.format( keyword = keyword )
+        # '<dl class="thebibliography"> ... </dl>
         bibliography_el = xml_tree.xpath(
                 f"//x:body/x:p[text() = '{chapter_heading}']/following-sibling::x:dl[1]",
                 namespaces = NS_MAP
         )
         if( len(bibliography_el) != 1 ):
             logging.error( f"error parsing bibliography with keyword '{keyword}'" )
             sys.exit( 1 )
-        bibliographies_dict[keyword] =  bibliography_el[0]
+        bibliography_el = bibliography_el[0]
 
-    xml_tree.xpath(f"//x:dl[@class='thebibliography']", namespaces=NS_MAP)[0]
-    reference_list = xml_tree.xpath(f"//x:dl[@class='thebibliography']", namespaces=NS_MAP)[0]
-    reference_div = create_reference_list(reference_list)
+        reference_div = create_reference_list(bibliography_el)
+        bibliographies_dict[keyword] = reference_div
 
     html_element = etree.Element("html")
     html_element.insert(0, citation_authoryear)
     html_element.insert(1, citation_year)
-    html_element.insert(2, reference_div)
-
-    # print(etree.tostring(html_element))
+    for keyword in keywords:
+        bibl_el = etree.SubElement(
+                html_element,
+                "div",
+                **({} if keyword == "" else { 'keyword': keyword } )
+        )
+        bibl_el.append( bibliographies_dict[keyword] )
 
     tree = etree.ElementTree(html_element)
     logging.info("writing '%s'" % output_file)