From 0a553d025324028bf00ec9b769c358a64169cbac Mon Sep 17 00:00:00 2001
From: Klaus Thoden <kthoden@mpiwg-berlin.mpg.de>
Date: Fri, 26 Jan 2018 16:30:43 +0100
Subject: [PATCH] Bibliography output added

---
 transform_xml.py | 136 ++++++++++++++++++++++++++++++-----------------
 1 file changed, 86 insertions(+), 50 deletions(-)

diff --git a/transform_xml.py b/transform_xml.py
index 8efe56f..87e0f17 100644
--- a/transform_xml.py
+++ b/transform_xml.py
@@ -14,6 +14,7 @@
 import shlex
 from bs4 import BeautifulSoup
 from lxml import etree, objectify
+from lxml.html import soupparser
 
 # things to be done
 # assign ids top to bottom for the following elements:
@@ -85,10 +86,17 @@ def format_citations(used_citekeys, bibdata):
         citation_formatter.write("\n# References\n")
 
     with open(OUTPUT_DIR + os.path.sep + "formatted_citations.html", "r") as ding:
-        dd = BeautifulSoup(ding, "html.parser")
+        cites = BeautifulSoup(ding, "html.parser")
+    with open(OUTPUT_DIR + os.path.sep + "formatted_citations.html", "r") as ding:
+        reference_list = soupparser.fromstring(ding, features="html.parser")
+
+    # references = dd.xpath("//div[@class='references']")
+    # with open("tmp_files/formatted_citations.html", "r") as ding:
 
-    full_paren_cites = dd.select("#full-parentheses ~ p > span")
-    year_paren_cites = dd.select("#year-parentheses ~ p > span")
+    references = reference_list.xpath("//div[@class='references']")[0]
+
+    # full_paren_cites = cites.select("#full-parentheses ~ p > span")
+    # year_paren_cites = cites.select("#year-parentheses ~ p > span")
 
     citation_dict = {}
 
@@ -99,11 +107,11 @@ def format_citations(used_citekeys, bibdata):
                 strTitle = entry_2["title"]
 
         title = strTitle
-        full_paren = dd.select("#citeauthoryear ~ p > span[data-cites='%s']" % entry)[0].text
-        year_paren = dd.select("#yearparen ~ p > span[data-cites='%s']" % entry)[0].text
+        full_paren = cites.select("#citeauthoryear ~ p > span[data-cites='%s']" % entry)[0].text
+        year_paren = cites.select("#yearparen ~ p > span[data-cites='%s']" % entry)[0].text
         citation_dict[entry] = (full_paren, year_paren, title)
 
-    return citation_dict
+    return citation_dict, references
 # def format_citations ends here
 
 def format_pagerange(pagerange_start, pagerange_end):
@@ -273,7 +281,6 @@ def transform_body(xml_tree, cited_data, publang):
     ##############
     # Hi-Element #
     ##############
-
     eoa_hi = xml_tree.xpath("//t:hi", namespaces=NS_MAP)
 
     for hi in eoa_hi:
@@ -290,20 +297,11 @@ def transform_body(xml_tree, cited_data, publang):
         else:
           logging.debug("The rend attribute in hi has the value %s. This is not supported" % rend_attribute)
 
-    """
-  <xsl:template match="tei:hi[@rend='math']">
-    <xsl:element name="em">
-      <xsl:apply-templates/>
-    </xsl:element>
-  </xsl:template>
-
-    """
-
     return xml_tree
 # def transform_body ends here
 
-def assign_ids(xml_tree):
-    """Walk the xml tree again and assign ids."""
+def assign_ids(xml_tree, data):
+    """Walk the xml tree again. Assign ids to xml and put them into dicts, as well."""
 
     chapterdict = {}
     figdict = {}
@@ -325,10 +323,11 @@ def assign_ids(xml_tree):
         table_counter = 1
         theorem_counter = 1
 
-        chapter.set("id-text", str(chapter_counter))
-        chapterdict[chapter.get("id")] = str(chapter_counter)
+        if chapter.get('rend') != "nonumber":
+            chapter.set("id-text", str(chapter_counter))
+            chapterdict[chapter.get("id")] = str(chapter_counter)
 
-        figure_anchors = chapter.xpath("//EOAfigure/anchor")
+        figure_anchors = chapter.findall(".//EOAfigure/anchor")
         figure_counter = 1
         for anchor in figure_anchors:
             figure_number = "%d.%d" % (chapter_counter, figure_counter)
@@ -340,11 +339,11 @@ def assign_ids(xml_tree):
             figure_element.set("id", anchor.get("id"))
             figdict[anchor.get("id")] = figure_number
 
-        footnotes = chapter.xpath("//note")
+        footnotes = chapter.findall(".//note")
         for footnote in footnotes:
             fndict[footnote.get("id")] = footnote.get("n")
 
-        sections = chapter.xpath("//div2")
+        sections = chapter.findall(".//div2")
         section_counter = 1
         for section in sections:
             section_number = "%d.%d" % (chapter_counter, section_counter)
@@ -352,7 +351,7 @@ def assign_ids(xml_tree):
             secdict[section.get("id")] = section_number
 
             subsection_counter = 1
-            subsections = section.xpath("//div3")
+            subsections = section.findall(".//div3")
             for subsection in subsections:
                 subsection_number = "%d.%d.%d" % (chapter_counter, section_counter, subsection_counter)
                 subsection.set("id-text", subsection_number)
@@ -364,11 +363,49 @@ def assign_ids(xml_tree):
 
     # not implemented yet: equation, list, pagelabel, tab, theorem
 
-    print(chapterdict, figdict, fndict, secdict)
-
-    return xml_tree
+    data["chapterdict"] = chapterdict
+    data["figdict"] = figdict
+    data["eqdict"] = eqdict
+    data["fndict"] = fndict
+    data["listdict"] = listdict
+    data["pagelabeldict"] = pagelabeldict
+    data["secdict"] = secdict
+    data["tabdict"] = tabdict
+    data["theoremdict"] = theoremdict
+
+    return xml_tree, data
 # def assign_ids ends here
 
+def add_bibliography(xml_tree, refs_for_bib_chapter):
+    """Add another chapter containing the bibliography.
+
+    Builds an unnumbered div1 headed "Bibliography", moves every div found
+    below refs_for_bib_chapter into it as <p class="bibliography"> (inner
+    p tags stripped, em renamed to i), inserts the chapter into the root
+    of xml_tree and returns that root element.
+    """
+    root_element = xml_tree.getroot()
+    # The insert position is derived from the number of div1 elements in the document.
+    number_of_chapters = len(root_element.xpath("//div1"))
+    bibliography_chapter = etree.Element("div1", rend="nonumber", language="english")
+    # this needs to be configurable by language
+    bib_head = etree.SubElement(bibliography_chapter, "head")
+    bib_head.text = "Bibliography"
+    bib_div_1 = etree.SubElement(bibliography_chapter, "div")
+    bib_div_2 = etree.SubElement(bib_div_1, "div")
+
+    # findall() returns a list, so moving entries out of the tree while looping is safe.
+    for entry in refs_for_bib_chapter.findall(".//div"):
+        entry.set("class", "bibliography")
+        etree.strip_tags(entry, "p")
+        entry.tag = "p"
+        for markup in entry.findall(".//em"):
+            markup.tag = "i"
+        bib_div_2.append(entry)
+
+    root_element.insert(number_of_chapters + 1, bibliography_chapter)
+    return root_element
+# def add_bibliography ends here
 
 if __name__ == '__main__':
     if len(sys.argv) == 1:
@@ -383,16 +420,6 @@ def assign_ids(xml_tree):
 
     used_citekeys = data["citekeys"]
 
-    dictChapters = data["chapterdict"]
-    dictEquations = data["eqdict"]
-    dictLists = data["listdict"]
-    dictTheorems = data["theoremdict"]
-    dictSections = data["secdict"]
-    dictFigures = data["figdict"]
-    dictFootnotes = data["fndict"]
-    dictTables = data["tabdict"]
-    dictPagelabels = data["pagelabeldict"]
-
     tei_document = sys.argv[-1]
     xml_tree = etree.parse(tei_document)
 
@@ -410,14 +437,13 @@ def assign_ids(xml_tree):
     # json
     interim_bib_json_file = "tmp-bib.json"
     citeproc_command = "pandoc-citeproc --bib2json  %s" % bib_data["source"]
-    logging.debug(citeproc_command)
     citeproc_arguments = shlex.split(citeproc_command)
     citeproc_process = subprocess.Popen(citeproc_arguments, stdout=subprocess.PIPE)
     citeproc_json = citeproc_process.stdout.read()
     citations_json = json.loads(citeproc_json)
 
-    # cited_dict = format_citations(set(used_citekeys), bibdata)
-    cited_dict = format_citations(set(used_citekeys), citations_json)
+    # refs for bib_chapter contains formatted reference entries
+    cited_dict, refs_for_bib_chapter = format_citations(set(used_citekeys), citations_json)
     # render_reference(all_references, cited_dict)
 
     assert(bib_data["type"] in ["monograph", "anthology", "monograph-numeric", "anthology-numeric"])
@@ -427,29 +453,39 @@ def assign_ids(xml_tree):
     body_transformed = transform_body(tei_body, cited_dict, publang=publication_language)
 
     resulting_tree = etree.ElementTree(body_transformed)
-    etree.strip_tags(resulting_tree, "tagtobestripped")
+    xml_add_bib = add_bibliography(resulting_tree, refs_for_bib_chapter)
+
+    etree.strip_tags(xml_add_bib, "tagtobestripped")
 
-    elements_with_ids = resulting_tree.xpath(".//div1 | //div2 | //div3 | //note | //item | //table | //EOAfigure/anchor | //EOAequation | //formula | //theorem")
+    elements_with_ids = xml_add_bib.xpath("//div1 | //div2 | //div3 | //note | //item | //table | //EOAfigure/anchor | //EOAequation | //formula | //theorem")
     element_counter = 1
     for element in elements_with_ids:
         element.set("id", "uid" + str(element_counter))
         element_counter += 1
 
-    assigned_ids = assign_ids(resulting_tree)
-
+    assigned_ids, data_to_pickle = assign_ids(resulting_tree, data)
     xml_root = assigned_ids.getroot()
+
     xml_root.tag = "Book"
 
-    etree.cleanup_namespaces(xml_root)
-    objectify.deannotate(resulting_tree, cleanup_namespaces=True)
+    final_tree = etree.ElementTree(xml_root)
+    # objectify.deannotate(final_tree, cleanup_namespaces=True)
+    # etree.cleanup_namespaces(xml_root)
 
-    if not os.path.exists(OUTPUT_DIR):
-        os.mkdir(os.path.expanduser(OUTPUT_DIR))
-    output_filename = OUTPUT_DIR + os.path.sep +  "IntermediateXMLFile.xml"
+    with open(OUTPUT_DIR + os.path.sep + 'data.pickle', 'wb') as f:
+        # Pickle the 'data' dictionary using the highest protocol available.
+        pickle.dump(data_to_pickle, f, pickle.HIGHEST_PROTOCOL)
 
     if not os.path.exists("CONVERT"):
         os.mkdir(os.path.expanduser("CONVERT"))
+    if not os.path.exists("debug"):
+        os.mkdir(os.path.expanduser("debug"))
+
+    if not os.path.exists(OUTPUT_DIR):
+        os.mkdir(os.path.expanduser(OUTPUT_DIR))
+    output_filename = OUTPUT_DIR + os.path.sep + "IntermediateXMLFile.xml"
 
-    resulting_tree.write(output_filename, pretty_print=True, xml_declaration=True,encoding="utf-8")
+    # resulting_tree.write(output_filename, pretty_print=True, xml_declaration=True,encoding="utf-8")
+    final_tree.write(output_filename, pretty_print=True, xml_declaration=True,encoding="utf-8")
     logging.debug("Wrote %s." % output_filename)
 # finis