Skip to content

Commit

Permalink
Bibliography output added
Browse files Browse the repository at this point in the history
  • Loading branch information
Klaus Thoden committed Jan 26, 2018
1 parent 91e3bd6 commit 0a553d0
Showing 1 changed file with 86 additions and 50 deletions.
136 changes: 86 additions & 50 deletions transform_xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import shlex
from bs4 import BeautifulSoup
from lxml import etree, objectify
from lxml.html import soupparser

# things to be done
# assign ids top to bottom for the following elements:
Expand Down Expand Up @@ -85,10 +86,17 @@ def format_citations(used_citekeys, bibdata):
citation_formatter.write("\n# References\n")

with open(OUTPUT_DIR + os.path.sep + "formatted_citations.html", "r") as ding:
dd = BeautifulSoup(ding, "html.parser")
cites = BeautifulSoup(ding, "html.parser")
with open(OUTPUT_DIR + os.path.sep + "formatted_citations.html", "r") as ding:
reference_list = soupparser.fromstring(ding, features="html.parser")

# references = dd.xpath("//div[@class='references']")
# with open("tmp_files/formatted_citations.html", "r") as ding:

full_paren_cites = dd.select("#full-parentheses ~ p > span")
year_paren_cites = dd.select("#year-parentheses ~ p > span")
references = reference_list.xpath("//div[@class='references']")[0]

# full_paren_cites = cites.select("#full-parentheses ~ p > span")
# year_paren_cites = cites.select("#year-parentheses ~ p > span")

citation_dict = {}

Expand All @@ -99,11 +107,11 @@ def format_citations(used_citekeys, bibdata):
strTitle = entry_2["title"]

title = strTitle
full_paren = dd.select("#citeauthoryear ~ p > span[data-cites='%s']" % entry)[0].text
year_paren = dd.select("#yearparen ~ p > span[data-cites='%s']" % entry)[0].text
full_paren = cites.select("#citeauthoryear ~ p > span[data-cites='%s']" % entry)[0].text
year_paren = cites.select("#yearparen ~ p > span[data-cites='%s']" % entry)[0].text
citation_dict[entry] = (full_paren, year_paren, title)

return citation_dict
return citation_dict, references
# def format_citations ends here

def format_pagerange(pagerange_start, pagerange_end):
Expand Down Expand Up @@ -273,7 +281,6 @@ def transform_body(xml_tree, cited_data, publang):
##############
# Hi-Element #
##############

eoa_hi = xml_tree.xpath("//t:hi", namespaces=NS_MAP)

for hi in eoa_hi:
Expand All @@ -290,20 +297,11 @@ def transform_body(xml_tree, cited_data, publang):
else:
logging.debug("The rend attribute in hi has the value %s. This is not supported" % rend_attribute)

"""
<xsl:template match="tei:hi[@rend='math']">
<xsl:element name="em">
<xsl:apply-templates/>
</xsl:element>
</xsl:template>
"""

return xml_tree
# def transform_body ends here

def assign_ids(xml_tree):
"""Walk the xml tree again and assign ids."""
def assign_ids(xml_tree, data):
"""Walk the xml tree again. Assign ids to xml and put them into dicts, as well."""

chapterdict = {}
figdict = {}
Expand All @@ -325,10 +323,11 @@ def assign_ids(xml_tree):
table_counter = 1
theorem_counter = 1

chapter.set("id-text", str(chapter_counter))
chapterdict[chapter.get("id")] = str(chapter_counter)
if chapter.get('rend') != "nonumber":
chapter.set("id-text", str(chapter_counter))
chapterdict[chapter.get("id")] = str(chapter_counter)

figure_anchors = chapter.xpath("//EOAfigure/anchor")
figure_anchors = chapter.findall(".//EOAfigure/anchor")
figure_counter = 1
for anchor in figure_anchors:
figure_number = "%d.%d" % (chapter_counter, figure_counter)
Expand All @@ -340,19 +339,19 @@ def assign_ids(xml_tree):
figure_element.set("id", anchor.get("id"))
figdict[anchor.get("id")] = figure_number

footnotes = chapter.xpath("//note")
footnotes = chapter.findall(".//note")
for footnote in footnotes:
fndict[footnote.get("id")] = footnote.get("n")

sections = chapter.xpath("//div2")
sections = chapter.findall(".//div2")
section_counter = 1
for section in sections:
section_number = "%d.%d" % (chapter_counter, section_counter)
section.set("id-text", section_number)
secdict[section.get("id")] = section_number

subsection_counter = 1
subsections = section.xpath("//div3")
subsections = section.findall(".//div3")
for subsection in subsections:
subsection_number = "%d.%d.%d" % (chapter_counter, section_counter, subsection_counter)
subsection.set("id-text", subsection_number)
Expand All @@ -364,11 +363,49 @@ def assign_ids(xml_tree):

# not implemented yet: equation, list, pagelabel, tab, theorem

print(chapterdict, figdict, fndict, secdict)

return xml_tree
data["chapterdict"] = chapterdict
data["figdict"] = figdict
data["eqdict"] = eqdict
data["fndict"] = fndict
data["listdict"] = listdict
data["pagelabeldict"] = pagelabeldict
data["secdict"] = secdict
data["tabdict"] = tabdict
data["theoremdict"] = theoremdict

return xml_tree, data
# def assign_ids ends here

def add_bibliography(xml_tree, refs_for_bib_chapter):
    """Add another chapter containing the bibliography.

    A new, unnumbered ``div1`` element with the heading "Bibliography" is
    built, filled with the formatted reference entries and inserted into
    the root element of ``xml_tree``.

    Args:
        xml_tree: tree object providing ``getroot()``; its root element
            receives the new chapter.
        refs_for_bib_chapter: element whose descendant ``div`` elements
            are the formatted reference entries. The entries are modified
            in place and appended to the new chapter (lxml moves an
            appended element out of its previous parent).

    Returns:
        The root element of ``xml_tree``, now containing the bibliography
        chapter.
    """

    root_element = xml_tree.getroot()

    # "//div1" is an absolute path: it counts every div1 in the document.
    number_of_chapters = len(root_element.xpath("//div1"))

    bibliography_chapter = etree.Element("div1", rend="nonumber", language="english")
    # this needs to be configurable by language
    bib_head = etree.SubElement(bibliography_chapter, "head")
    bib_head.text = "Bibliography"
    bib_div_1 = etree.SubElement(bibliography_chapter, "div")
    bib_div_2 = etree.SubElement(bib_div_1, "div")

    # findall returns a list, so moving the entries while looping is safe.
    for entry in refs_for_bib_chapter.findall(".//div"):
        entry.set("class", "bibliography")
        # Dissolve inner <p> wrappers (their content is kept) before the
        # entry itself is turned into the paragraph.
        etree.strip_tags(entry, "p")
        entry.tag = "p"
        for markup in entry.findall(".//em"):
            markup.tag = "i"

        bib_div_2.append(entry)

    # NOTE(review): the index is derived from the number of div1 elements,
    # not from the number of children of the root element - confirm that
    # this is the intended position for the bibliography chapter.
    root_element.insert(number_of_chapters + 1, bibliography_chapter)

    return root_element
# def add_bibliography ends here

if __name__ == '__main__':
if len(sys.argv) == 1:
Expand All @@ -383,16 +420,6 @@ def assign_ids(xml_tree):

used_citekeys = data["citekeys"]

dictChapters = data["chapterdict"]
dictEquations = data["eqdict"]
dictLists = data["listdict"]
dictTheorems = data["theoremdict"]
dictSections = data["secdict"]
dictFigures = data["figdict"]
dictFootnotes = data["fndict"]
dictTables = data["tabdict"]
dictPagelabels = data["pagelabeldict"]

tei_document = sys.argv[-1]
xml_tree = etree.parse(tei_document)

Expand All @@ -410,14 +437,13 @@ def assign_ids(xml_tree):
# json
interim_bib_json_file = "tmp-bib.json"
citeproc_command = "pandoc-citeproc --bib2json %s" % bib_data["source"]
logging.debug(citeproc_command)
citeproc_arguments = shlex.split(citeproc_command)
citeproc_process = subprocess.Popen(citeproc_arguments, stdout=subprocess.PIPE)
citeproc_json = citeproc_process.stdout.read()
citations_json = json.loads(citeproc_json)

# cited_dict = format_citations(set(used_citekeys), bibdata)
cited_dict = format_citations(set(used_citekeys), citations_json)
# refs_for_bib_chapter contains the formatted reference entries
cited_dict, refs_for_bib_chapter = format_citations(set(used_citekeys), citations_json)
# render_reference(all_references, cited_dict)

assert(bib_data["type"] in ["monograph", "anthology", "monograph-numeric", "anthology-numeric"])
Expand All @@ -427,29 +453,39 @@ def assign_ids(xml_tree):
body_transformed = transform_body(tei_body, cited_dict, publang=publication_language)

resulting_tree = etree.ElementTree(body_transformed)
etree.strip_tags(resulting_tree, "tagtobestripped")
xml_add_bib = add_bibliography(resulting_tree, refs_for_bib_chapter)

etree.strip_tags(xml_add_bib, "tagtobestripped")

elements_with_ids = resulting_tree.xpath(".//div1 | //div2 | //div3 | //note | //item | //table | //EOAfigure/anchor | //EOAequation | //formula | //theorem")
elements_with_ids = xml_add_bib.xpath("//div1 | //div2 | //div3 | //note | //item | //table | //EOAfigure/anchor | //EOAequation | //formula | //theorem")
element_counter = 1
for element in elements_with_ids:
element.set("id", "uid" + str(element_counter))
element_counter += 1

assigned_ids = assign_ids(resulting_tree)

assigned_ids, data_to_pickle = assign_ids(resulting_tree, data)
xml_root = assigned_ids.getroot()

xml_root.tag = "Book"

etree.cleanup_namespaces(xml_root)
objectify.deannotate(resulting_tree, cleanup_namespaces=True)
final_tree = etree.ElementTree(xml_root)
# objectify.deannotate(final_tree, cleanup_namespaces=True)
# etree.cleanup_namespaces(xml_root)

if not os.path.exists(OUTPUT_DIR):
os.mkdir(os.path.expanduser(OUTPUT_DIR))
output_filename = OUTPUT_DIR + os.path.sep + "IntermediateXMLFile.xml"
with open(OUTPUT_DIR + os.path.sep + 'data.pickle', 'wb') as f:
# Pickle the 'data' dictionary using the highest protocol available.
pickle.dump(data_to_pickle, f, pickle.HIGHEST_PROTOCOL)

if not os.path.exists("CONVERT"):
os.mkdir(os.path.expanduser("CONVERT"))
if not os.path.exists("debug"):
os.mkdir(os.path.expanduser("debug"))

if not os.path.exists(OUTPUT_DIR):
os.mkdir(os.path.expanduser(OUTPUT_DIR))
output_filename = OUTPUT_DIR + os.path.sep + "IntermediateXMLFile.xml"

resulting_tree.write(output_filename, pretty_print=True, xml_declaration=True,encoding="utf-8")
# resulting_tree.write(output_filename, pretty_print=True, xml_declaration=True,encoding="utf-8")
final_tree.write(output_filename, pretty_print=True, xml_declaration=True,encoding="utf-8")
logging.debug("Wrote %s." % output_filename)
# finis

0 comments on commit 0a553d0

Please sign in to comment.