From 0a553d025324028bf00ec9b769c358a64169cbac Mon Sep 17 00:00:00 2001 From: Klaus Thoden Date: Fri, 26 Jan 2018 16:30:43 +0100 Subject: [PATCH] Bibliography output added --- transform_xml.py | 136 ++++++++++++++++++++++++++++++----------------- 1 file changed, 86 insertions(+), 50 deletions(-) diff --git a/transform_xml.py b/transform_xml.py index 8efe56f..87e0f17 100644 --- a/transform_xml.py +++ b/transform_xml.py @@ -14,6 +14,7 @@ import shlex from bs4 import BeautifulSoup from lxml import etree, objectify +from lxml.html import soupparser # things to be done # assign ids top to bottom for the following elements: @@ -85,10 +86,17 @@ def format_citations(used_citekeys, bibdata): citation_formatter.write("\n# References\n") with open(OUTPUT_DIR + os.path.sep + "formatted_citations.html", "r") as ding: - dd = BeautifulSoup(ding, "html.parser") + cites = BeautifulSoup(ding, "html.parser") + with open(OUTPUT_DIR + os.path.sep + "formatted_citations.html", "r") as ding: + reference_list = soupparser.fromstring(ding, features="html.parser") + + # references = dd.xpath("//div[@class='references']") + # with open("tmp_files/formatted_citations.html", "r") as ding: - full_paren_cites = dd.select("#full-parentheses ~ p > span") - year_paren_cites = dd.select("#year-parentheses ~ p > span") + references = reference_list.xpath("//div[@class='references']")[0] + + # full_paren_cites = cites.select("#full-parentheses ~ p > span") + # year_paren_cites = cites.select("#year-parentheses ~ p > span") citation_dict = {} @@ -99,11 +107,11 @@ def format_citations(used_citekeys, bibdata): strTitle = entry_2["title"] title = strTitle - full_paren = dd.select("#citeauthoryear ~ p > span[data-cites='%s']" % entry)[0].text - year_paren = dd.select("#yearparen ~ p > span[data-cites='%s']" % entry)[0].text + full_paren = cites.select("#citeauthoryear ~ p > span[data-cites='%s']" % entry)[0].text + year_paren = cites.select("#yearparen ~ p > span[data-cites='%s']" % entry)[0].text citation_dict[entry] = (full_paren, year_paren, title) - return citation_dict + return citation_dict, references # def format_citations ends here def format_pagerange(pagerange_start, pagerange_end): @@ -273,7 +281,6 @@ def transform_body(xml_tree, cited_data, publang): ############## # Hi-Element # ############## - eoa_hi = xml_tree.xpath("//t:hi", namespaces=NS_MAP) for hi in eoa_hi: @@ -290,20 +297,11 @@ def transform_body(xml_tree, cited_data, publang): else: logging.debug("The rend attribute in hi has the value %s. This is not supported" % rend_attribute) - """ - - - - - - - """ - return xml_tree # def transform_body ends here -def assign_ids(xml_tree): - """Walk the xml tree again and assign ids.""" +def assign_ids(xml_tree, data): + """Walk the xml tree again. Assign ids to xml and put them into dicts, as well.""" chapterdict = {} figdict = {} @@ -325,10 +323,11 @@ def assign_ids(xml_tree): table_counter = 1 theorem_counter = 1 - chapter.set("id-text", str(chapter_counter)) - chapterdict[chapter.get("id")] = str(chapter_counter) + if chapter.get('rend') != "nonumber": + chapter.set("id-text", str(chapter_counter)) + chapterdict[chapter.get("id")] = str(chapter_counter) - figure_anchors = chapter.xpath("//EOAfigure/anchor") + figure_anchors = chapter.findall(".//EOAfigure/anchor") figure_counter = 1 for anchor in figure_anchors: figure_number = "%d.%d" % (chapter_counter, figure_counter) @@ -340,11 +339,11 @@ def assign_ids(xml_tree): figure_element.set("id", anchor.get("id")) figdict[anchor.get("id")] = figure_number - footnotes = chapter.xpath("//note") + footnotes = chapter.findall(".//note") for footnote in footnotes: fndict[footnote.get("id")] = footnote.get("n") - sections = chapter.xpath("//div2") + sections = chapter.findall(".//div2") section_counter = 1 for section in sections: section_number = "%d.%d" % (chapter_counter, section_counter) @@ -352,7 +351,7 @@ def assign_ids(xml_tree): secdict[section.get("id")] = section_number subsection_counter = 1 - subsections = section.xpath("//div3") + subsections = section.findall(".//div3") for subsection in subsections: subsection_number = "%d.%d.%d" % (chapter_counter, section_counter, subsection_counter) subsection.set("id-text", subsection_number) @@ -364,11 +363,49 @@ def assign_ids(xml_tree): # not implemented yet: equation, list, pagelabel, tab, theorem - print(chapterdict, figdict, fndict, secdict) - - return xml_tree + data["chapterdict"] = chapterdict + data["figdict"] = figdict + data["eqdict"] = eqdict + data["fndict"] = fndict + data["listdict"] = listdict + data["pagelabeldict"] = pagelabeldict + data["secdict"] = secdict + data["tabdict"] = tabdict + data["theoremdict"] = theoremdict + + return xml_tree, data # def assign_ids ends here +def add_bibliography(xml_tree, refs_for_bib_chapter): + """Add another chapter containing the bibliography.""" + + root_element = xml_tree.getroot() + + xml_chapters = root_element.xpath("//div1") + number_of_chapters = len(xml_chapters) + bibliography_chapter = etree.Element("div1", rend="nonumber", language="english") + # this needs to be configurable by language + bib_head = etree.SubElement(bibliography_chapter, "head").text = "Bibliography" + bib_div_1 = etree.SubElement(bibliography_chapter, "div") + bib_div_2 = etree.SubElement(bib_div_1, "div") + + entries = refs_for_bib_chapter.findall(".//div") + + for entry in entries: + entry_id = entry.get("id") + entry.set("class", "bibliography") + etree.strip_tags(entry, "p") + entry.tag = "p" + internal_markup = entry.findall(".//em") + for markup in internal_markup: + markup.tag = "i" + + bib_div_2.append(entry) + + root_element.insert(number_of_chapters + 1, bibliography_chapter) + + return root_element +# def add_bibliography ends here if __name__ == '__main__': if len(sys.argv) == 1: @@ -383,16 +420,6 @@ def assign_ids(xml_tree): used_citekeys = data["citekeys"] - dictChapters = data["chapterdict"] - dictEquations = data["eqdict"] - dictLists = data["listdict"] - dictTheorems = data["theoremdict"] - dictSections = data["secdict"] - dictFigures = data["figdict"] - dictFootnotes = data["fndict"] - dictTables = data["tabdict"] - dictPagelabels = data["pagelabeldict"] - tei_document = sys.argv[-1] xml_tree = etree.parse(tei_document) @@ -410,14 +437,13 @@ def assign_ids(xml_tree): # json interim_bib_json_file = "tmp-bib.json" citeproc_command = "pandoc-citeproc --bib2json %s" % bib_data["source"] - logging.debug(citeproc_command) citeproc_arguments = shlex.split(citeproc_command) citeproc_process = subprocess.Popen(citeproc_arguments, stdout=subprocess.PIPE) citeproc_json = citeproc_process.stdout.read() citations_json = json.loads(citeproc_json) - # cited_dict = format_citations(set(used_citekeys), bibdata) - cited_dict = format_citations(set(used_citekeys), citations_json) + # refs for bib_chapter contains formatted reference entries + cited_dict, refs_for_bib_chapter = format_citations(set(used_citekeys), citations_json) # render_reference(all_references, cited_dict) assert(bib_data["type"] in ["monograph", "anthology", "monograph-numeric", "anthology-numeric"]) @@ -427,29 +453,39 @@ def assign_ids(xml_tree): body_transformed = transform_body(tei_body, cited_dict, publang=publication_language) resulting_tree = etree.ElementTree(body_transformed) - etree.strip_tags(resulting_tree, "tagtobestripped") + xml_add_bib = add_bibliography(resulting_tree, refs_for_bib_chapter) + + etree.strip_tags(xml_add_bib, "tagtobestripped") - elements_with_ids = resulting_tree.xpath(".//div1 | //div2 | //div3 | //note | //item | //table | //EOAfigure/anchor | //EOAequation | //formula | //theorem") + elements_with_ids = xml_add_bib.xpath("//div1 | //div2 | //div3 | //note | //item | //table | //EOAfigure/anchor | //EOAequation | //formula | //theorem") element_counter = 1 for element in elements_with_ids: element.set("id", "uid" + str(element_counter)) element_counter += 1 - assigned_ids = assign_ids(resulting_tree) - + assigned_ids, data_to_pickle = assign_ids(resulting_tree, data) xml_root = assigned_ids.getroot() + xml_root.tag = "Book" - etree.cleanup_namespaces(xml_root) - objectify.deannotate(resulting_tree, cleanup_namespaces=True) + final_tree = etree.ElementTree(xml_root) + # objectify.deannotate(final_tree, cleanup_namespaces=True) + # etree.cleanup_namespaces(xml_root) - if not os.path.exists(OUTPUT_DIR): - os.mkdir(os.path.expanduser(OUTPUT_DIR)) - output_filename = OUTPUT_DIR + os.path.sep + "IntermediateXMLFile.xml" + with open(OUTPUT_DIR + os.path.sep + 'data.pickle', 'wb') as f: + # Pickle the 'data' dictionary using the highest protocol available. + pickle.dump(data_to_pickle, f, pickle.HIGHEST_PROTOCOL) if not os.path.exists("CONVERT"): os.mkdir(os.path.expanduser("CONVERT")) + if not os.path.exists("debug"): + os.mkdir(os.path.expanduser("debug")) + + if not os.path.exists(OUTPUT_DIR): + os.mkdir(os.path.expanduser(OUTPUT_DIR)) + output_filename = OUTPUT_DIR + os.path.sep + "IntermediateXMLFile.xml" - resulting_tree.write(output_filename, pretty_print=True, xml_declaration=True,encoding="utf-8") + # resulting_tree.write(output_filename, pretty_print=True, xml_declaration=True,encoding="utf-8") + final_tree.write(output_filename, pretty_print=True, xml_declaration=True,encoding="utf-8") logging.debug("Wrote %s." % output_filename) # finis