From dfe0b4d1bd5363e50c039d2f8ceb7a4a0b28f150 Mon Sep 17 00:00:00 2001 From: Klaus Thoden Date: Tue, 29 May 2018 17:06:57 +0200 Subject: [PATCH] Citations and bibliography --- tei2imxml.py | 106 +++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 85 insertions(+), 21 deletions(-) diff --git a/tei2imxml.py b/tei2imxml.py index 6edee7f..af60da6 100644 --- a/tei2imxml.py +++ b/tei2imxml.py @@ -222,12 +222,12 @@ def render_reference(list_of_xml_elements, cited_data): element.text = cited_data[citekey][2] # def render_reference ends here -def write_citation_markdown(used_citekeys): +def write_citation_markdown(used_citekeys, citations_filename): """Write markdown file with citekeys for bibliography rendering""" md_file_header = "---\nlang: en\ntitle: Citations\n...\n\n" - with open(TMP_DIR + os.path.sep + "used_citations.md", "w") as citation_formatter: + with open(TMP_DIR + os.path.sep + citations_filename, "w") as citation_formatter: citation_formatter.write(md_file_header) # citation_formatter.write("# Full parentheses\n") citation_formatter.write("# citeauthoryear\n") @@ -245,21 +245,25 @@ def write_citation_markdown(used_citekeys): logging.info("Wrote citation formatter.") # def write_citation_markdown ends here -def format_citations(used_citekeys, bibdata): - """Return a formatted entry of the used citations""" +def format_reference_list(used_citekeys, html_file): + """Create an HTML formatted list of references""" - with open(TMP_DIR + os.path.sep + "formatted_citations.html", "r") as ding: - cites = BeautifulSoup(ding, "html.parser") - with open(TMP_DIR + os.path.sep + "formatted_citations.html", "r") as ding: + # second part of function + with open(TMP_DIR + os.path.sep + html_file, "r") as ding: reference_list = soupparser.fromstring(ding, features="html.parser") - # references = dd.xpath("//div[@class='references']") - # with open("tmp_files/formatted_citations.html", "r") as ding: - references = reference_list.xpath("//div[@class='references']")[0] - # full_paren_cites = cites.select("#full-parentheses ~ p > span") - # year_paren_cites = cites.select("#year-parentheses ~ p > span") + return references +# def format_reference_list ends here + +def format_citations(used_citekeys, bibdata, html_file): + """Return a formatted entry of the used citations""" + + # print(used_citekeys) + + with open(TMP_DIR + os.path.sep + html_file, "r") as ding: + cites = BeautifulSoup(ding, "html.parser") citation_dict = {} @@ -267,14 +271,15 @@ def format_citations(used_citekeys, bibdata): for entry_2 in bibdata: if entry_2["id"] == entry: current_citation = entry + # logging.info("%s: The title %s" % (html_file, entry_2["title"])) strTitle = entry_2["title"] - title = strTitle - authoryear_citation = cites.select("#citeauthoryear ~ p > span[data-cites='%s']" % entry)[0].text - year_citation = cites.select("#citeyear ~ p > span[data-cites='%s']" % entry)[0].text - citation_dict[entry] = (authoryear_citation, year_citation, title) + title = strTitle + authoryear_citation = cites.select("#citeauthoryear ~ p > span[data-cites='%s']" % entry)[0].text[1:-1] + year_citation = cites.select("#citeyear ~ p > span[data-cites='%s']" % entry)[0].text[1:-1] + citation_dict[entry] = (authoryear_citation, year_citation, title) - return citation_dict, references + return citation_dict # def format_citations ends here def format_pagerange(pagerange_start, pagerange_end): @@ -378,9 +383,18 @@ def transform_body(xml_tree, cited_data, authors, publang): citation.set("data-placement", "bottom") if len(cited_range) > 0: - pagerange_start = cited_range[0].get("from") - pagerange_end = cited_range[0].get("to") - pagerange = ", " + format_pagerange(pagerange_start, pagerange_end) + if cited_range[0].text is not None and cited_range[0].get("from") is not None: + print("You must not use 'from' attribute and text in citedRange at the same time. Exiting.") + sys.exit() + elif cited_range[0].text is not None: + # might contain markup! + pagerange = ", {}".format(cited_range[0].text) + # clear the text + cited_range[0].text = "" + elif cited_range[0].get("from") is not None: + pagerange_start = cited_range[0].get("from") + pagerange_end = cited_range[0].get("to") + pagerange = ", " + format_pagerange(pagerange_start, pagerange_end) cited_range[0].tag = "tagtobestripped" if cite_render == 'inline': @@ -572,7 +586,57 @@ def assign_ids(xml_tree, data): return xml_tree, data # def assign_ids ends here -def add_bibliography(xml_tree, refs_for_bib_chapter): +def update_ids(xml_tree): + """Update the references in EOAref to the id value assigned in assign_ids""" + + xmlReferences = xml_tree.findall(".//EOAref") + + for xmlReference in xmlReferences: + eoa_reference = xmlReference.find("ref") + + label_text = xmlReference.find("Label").text[1:] + logging.debug("label text is %s" % label_text) + + # if label_text.endswith("-hi"): + # logging.debug("%s is a hyperimage reference. Leaving out for now." % label_text) + # pass + # else: + corresponding_eoa_id_element = xml_tree.xpath("//*[@xml:id='{}']".format(label_text)) + if len(corresponding_eoa_id_element) == 0: + print("There seems to be no corresponding xml:id for %s. Exiting." % label_text) + sys.exit() + elif len(corresponding_eoa_id_element) > 1: + print("The xml:id %s has been assigned more than once. This is not allowed. Exiting." % corresponding_eoa_id_element) + sys.exit() + else: + eoa_id_element = corresponding_eoa_id_element[0] + + eoa_id = eoa_id_element.get("id") + eoa_reference.set("target", eoa_id) + + return xml_tree +# def update_ids ends here + +def prepare_bibliography(bib_data): + """Create a JSON version of bibliography data, using pandoc-citeproc""" + + # json + interim_bib_json_file = TMP_DIR + os.path.sep + "tmp-bib.json" + citeproc_command = "pandoc-citeproc --bib2json %s" % bib_data["source"] + citeproc_arguments = shlex.split(citeproc_command) + citeproc_process = subprocess.Popen(citeproc_arguments, stdout=subprocess.PIPE) + citeproc_json = citeproc_process.stdout.read() + citations_json = json.loads(citeproc_json) + + with open(interim_bib_json_file, 'w') as json_file: + json_file.write(citeproc_json.decode('utf-8')) + + logging.info("Wrote json file") + + return citations_json +# def prepare_bibliography ends here + +def add_bibliography_monograph(xml_tree, refs_for_bib_chapter): """Add another chapter containing the bibliography.""" root_element = xml_tree.getroot()