Skip to content

Commit

Permalink
Browse files: browse the repository at this point in the history
Option for disabling citedRange parser
  • Loading branch information
kthoden committed Dec 21, 2018
1 parent 38ba854 commit 2c1d168
Showing 1 changed file with 31 additions and 19 deletions.
50 changes: 31 additions & 19 deletions fix_tei.py
Expand Up @@ -136,7 +136,7 @@ def convert_references(string):
return string
# def convert_references ends here

def convert_citations(string):
def convert_citations(string, citedrangetext):
"""Find citation shorthand using regex.
Return a tuple of the modified string and a list of found citations
Expand All @@ -155,12 +155,18 @@ def convert_citations(string):
year_citations_pattern = re.compile(r"(§|&#xA7;|&#167;)(§|&#xA7;|&#167;)(?P<citekey>.+?)(\!(?P<pages>.*?))?(§|&#xA7;|&#167;)(§|&#xA7;|&#167;)")
year_citations = re.findall(year_citations_pattern, string)
logging.info("Found %s year citations." % len(year_citations))
string = re.sub(year_citations_pattern, r"<bibl><ref type='inline' target='#\g<citekey>'/><citedRange from='\g<pages>'/></bibl>", string)
if citedrangetext:
string = re.sub(year_citations_pattern, r"<bibl><ref type='inline' target='#\g<citekey>'/><citedRange>\g<pages></citedRange></bibl>", string)
else:
string = re.sub(year_citations_pattern, r"<bibl><ref type='inline' target='#\g<citekey>'/><citedRange from='\g<pages>'/></bibl>", string)

authoryear_citation_pattern = re.compile(r"(§|&#xA7;|&#167;)(?P<citekey>.+?)(\!(?P<pages>.*?))?(§|&#xA7;|&#167;)")
authoryear_citations = re.findall(authoryear_citation_pattern, string)
logging.info("Found %s author/year citations." % len(authoryear_citations))
string = re.sub(authoryear_citation_pattern, r"<bibl><ref target='#\g<citekey>'/><citedRange from='\g<pages>'/></bibl>", string)
if citedrangetext:
string = re.sub(authoryear_citation_pattern, r"<bibl><ref target='#\g<citekey>'/><citedRange>\g<pages></citedRange></bibl>", string)
else:
string = re.sub(authoryear_citation_pattern, r"<bibl><ref target='#\g<citekey>'/><citedRange from='\g<pages>'/></bibl>", string)

for year_citation in year_citations:
citations.append(year_citation[2])
Expand All @@ -179,23 +185,27 @@ def parse_cited_range(list_of_xml_elements):
cited_range = reference.find("t:citedRange", namespaces=NS_MAP)
from_value = (cited_range.get("from"))

split_values = re.findall(r"[\w']+", from_value)
if len(from_value) == 0:
cited_range.tag = "tagtobestripped"
cited_range.attrib.pop("from")
elif len(split_values) == 1:
cited_range.set("from", split_values[0])
elif len(split_values) == 2:
cited_range.set("from", split_values[0])
cited_range.set("to", split_values[1])
elif len(split_values) == 3:
cited_range.set("from", split_values[0])
cited_range.set("to", split_values[2])
if from_value is None:
pass
else:
logging.info("Splitting the page range produced unexpected result. Tried to split %s. Wrote to text field." % from_value)
cited_range.text = from_value
cited_range.attrib.pop("from")
unsplittable_pageref.append(from_value)

split_values = re.findall(r"[\w']+", from_value)
if len(from_value) == 0:
cited_range.tag = "tagtobestripped"
cited_range.attrib.pop("from")
elif len(split_values) == 1:
cited_range.set("from", split_values[0])
elif len(split_values) == 2:
cited_range.set("from", split_values[0])
cited_range.set("to", split_values[1])
elif len(split_values) == 3:
cited_range.set("from", split_values[0])
cited_range.set("to", split_values[2])
else:
logging.info("Splitting the page range produced unexpected result. Tried to split %s. Wrote to text field." % from_value)
cited_range.text = from_value
cited_range.attrib.pop("from")
unsplittable_pageref.append(from_value)

return unsplittable_pageref
# def parse_cited_range ends here
Expand Down Expand Up @@ -551,9 +561,11 @@ def main():
parser.add_argument("-d", "--dochighestorder", default='chapter', help="Specify which divider is at the highest level, possible values: part, chapter. Default is chapter.")
parser.add_argument("-p", "--bibtexparserlog", help="Display logging output of bibtexparser", action="store_true")
parser.add_argument("-f", "--finalize", help="Finalize a publication.", action="store_true")
parser.add_argument("-t", "--citedrangetext", help="Do not try to parse cited range values.", action="store_true")
parser.add_argument("-b", "--bibtype", help="Specify the type of bibliography, possible values: anthology, monograph.", default="monograph")
parser.add_argument("-c", "--chapter", help="Treat the TEI as one chapter, discards header.", action="store_true")
parser.add_argument("-a", "--addbibliography", help="Add a section with bibliography PI.", action="store_true")
parser.add_argument("-l", "--printlog", help="Write logfile to stdout instead of writing to file.", action="store_true")
parser.add_argument("teifile", help="Output from oxgarage/metypeset, an TEI XML file.")
parser.add_argument("bibfile", help="The bibliography database of the publication.")
parser.add_argument("figdir", help="The directory that contains the figures belonging to the publication.")
Expand Down

0 comments on commit 2c1d168

Please sign in to comment.