#!/usr/bin/env python3
# -*- coding: utf-8; mode: python -*-

"""
Gather some data for further conversion steps. This is originally part of fix_tei.

The script reads a TEI XML file and a bibliography file, collects the
citekeys referenced in the XML, lets fix_tei check them against the
bibliography and hands the result to fix_tei.pickle_data.
"""

__version__ = "1.0"
__date__ = "20190718"
__author__ = "kthoden@mpiwg-berlin.mpg.de"

import argparse
import logging
import pickle
import fix_tei
from lxml import etree

ns_tei = "http://www.tei-c.org/ns/1.0"
NS_MAP = {"t": ns_tei}

# Location used when no --picklefile option is given on the command line.
DEFAULT_PICKLEFILE = "output/imxml/tmp_files/data.pickle"

logging.basicConfig(level=logging.DEBUG, format=' %(asctime)s - %(levelname)s - %(message)s')

def main(argv=None):
    """Collect the citekeys used in a TEI file and pickle them.

    Command line arguments:
      teifile       the XML file from which data is pickled
      bibfile       the bibliography file for checking the references
      --picklefile  optional path of the pickle file to write
                    (defaults to DEFAULT_PICKLEFILE)

    :param argv: optional list of argument strings; when None, argparse
        falls back to the arguments of the running process.
    """

    parser = argparse.ArgumentParser()
    parser.add_argument("teifile", help="The XML file from which data is pickled.")
    parser.add_argument("bibfile", help="The bibliography file for checking the references.")
    parser.add_argument(
        "-p", "--picklefile",
        default=DEFAULT_PICKLEFILE,
        help="Path of the pickle file to write (default: %(default)s).",
    )
    args = parser.parse_args(argv)

    xml_tree = etree.parse(args.teifile)

    bibdata = fix_tei.parse_bibtex(args.bibfile)

    # Every target attribute of a <ref> that sits directly inside a <bibl>.
    cited = xml_tree.xpath("//t:bibl/t:ref/@target", namespaces=NS_MAP)

    # The first character of each target is dropped before unescaping;
    # presumably this is the leading "#" of a local pointer -- TODO confirm
    # against the TEI files that are fed into this script.
    used_citekeys = [fix_tei.unescape(c[1:]) for c in cited]
    citekeys_not_in_bib = fix_tei.validate_citations(used_citekeys, bibdata)

    logging.info("Writing citation data to %s", args.picklefile)
    fix_tei.pickle_data(citekeys_not_in_bib, used_citekeys, args.picklefile)
# def main ends here

if __name__ == '__main__':
    main()
# finis