From 2c1d168981496a12dcff6588cedf38baedde6502 Mon Sep 17 00:00:00 2001 From: kthoden Date: Fri, 21 Dec 2018 13:35:05 +0100 Subject: [PATCH] Option for disabling citedRange parser --- fix_tei.py | 50 +++++++++++++++++++++++++++++++------------------- 1 file changed, 31 insertions(+), 19 deletions(-) diff --git a/fix_tei.py b/fix_tei.py index b52c542..b05af46 100644 --- a/fix_tei.py +++ b/fix_tei.py @@ -136,7 +136,7 @@ def convert_references(string): return string # def convert_references ends here -def convert_citations(string): +def convert_citations(string, citedrangetext): """Find citation shorthand using regex. Return a tuple of the modified string and a list of found citations @@ -155,12 +155,18 @@ def convert_citations(string): year_citations_pattern = re.compile(r"(§|§|§)(§|§|§)(?P.+?)(\!(?P.*?))?(§|§|§)(§|§|§)") year_citations = re.findall(year_citations_pattern, string) logging.info("Found %s year citations." % len(year_citations)) - string = re.sub(year_citations_pattern, r"", string) + if citedrangetext: + string = re.sub(year_citations_pattern, r"\g", string) + else: + string = re.sub(year_citations_pattern, r"", string) authoryear_citation_pattern = re.compile(r"(§|§|§)(?P.+?)(\!(?P.*?))?(§|§|§)") authoryear_citations = re.findall(authoryear_citation_pattern, string) logging.info("Found %s author/year citations." % len(authoryear_citations)) - string = re.sub(authoryear_citation_pattern, r"", string) + if citedrangetext: + string = re.sub(authoryear_citation_pattern, r"\g", string) + else: + string = re.sub(authoryear_citation_pattern, r"", string) for year_citation in year_citations: citations.append(year_citation[2]) @@ -179,23 +185,27 @@ def parse_cited_range(list_of_xml_elements): cited_range = reference.find("t:citedRange", namespaces=NS_MAP) from_value = (cited_range.get("from")) - split_values = re.findall(r"[\w']+", from_value) - if len(from_value) == 0: - cited_range.tag = "tagtobestripped" - cited_range.attrib.pop("from") - elif len(split_values) == 1: - cited_range.set("from", split_values[0]) - elif len(split_values) == 2: - cited_range.set("from", split_values[0]) - cited_range.set("to", split_values[1]) - elif len(split_values) == 3: - cited_range.set("from", split_values[0]) - cited_range.set("to", split_values[2]) + if from_value is None: + pass else: - logging.info("Splitting the page range produced unexpected result. Tried to split %s. Wrote to text field." % from_value) - cited_range.text = from_value - cited_range.attrib.pop("from") - unsplittable_pageref.append(from_value) + + split_values = re.findall(r"[\w']+", from_value) + if len(from_value) == 0: + cited_range.tag = "tagtobestripped" + cited_range.attrib.pop("from") + elif len(split_values) == 1: + cited_range.set("from", split_values[0]) + elif len(split_values) == 2: + cited_range.set("from", split_values[0]) + cited_range.set("to", split_values[1]) + elif len(split_values) == 3: + cited_range.set("from", split_values[0]) + cited_range.set("to", split_values[2]) + else: + logging.info("Splitting the page range produced unexpected result. Tried to split %s. Wrote to text field." % from_value) + cited_range.text = from_value + cited_range.attrib.pop("from") + unsplittable_pageref.append(from_value) return unsplittable_pageref # def parse_cited_range ends here @@ -551,9 +561,11 @@ def main(): parser.add_argument("-d", "--dochighestorder", default='chapter', help="Specify which divider is at the highest level, possible values: part, chapter. Default is chapter.") parser.add_argument("-p", "--bibtexparserlog", help="Display logging output of bibtexparser", action="store_true") parser.add_argument("-f", "--finalize", help="Finalize a publication.", action="store_true") + parser.add_argument("-t", "--citedrangetext", help="Do not try to parse cited range values.", action="store_true") parser.add_argument("-b", "--bibtype", help="Specify the type of bibliography, possible values: anthology, monograph.", default="monograph") parser.add_argument("-c", "--chapter", help="Treat the TEI as one chapter, discards header.", action="store_true") parser.add_argument("-a", "--addbibliography", help="Add a section with bibliography PI.", action="store_true") + parser.add_argument("-l", "--printlog", help="Write logfile to stdout instead of writing to file.", action="store_true") parser.add_argument("teifile", help="Output from oxgarage/metypeset, an TEI XML file.") parser.add_argument("bibfile", help="The bibliography database of the publication.") parser.add_argument("figdir", help="The directory that contains the figures belonging to the publication.")