Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
New script
  • Loading branch information
kthoden committed Jul 18, 2019
1 parent f537623 commit 6ab7319
Showing 1 changed file with 45 additions and 0 deletions.
45 changes: 45 additions & 0 deletions gather_pickledata.py
@@ -0,0 +1,45 @@
#!/usr/bin/env python3
# -*- coding: utf-8; mode: python -*-

"""
Gather some data for further conversion steps. This was originally part of fix_tei.
"""

__version__ = "1.0"
__date__ = "20190718"
__author__ = "kthoden@mpiwg-berlin.mpg.de"

import argparse
import logging
import pickle
import fix_tei
from lxml import etree

# TEI namespace URI, and the prefix-to-URI mapping passed as ``namespaces``
# to the XPath queries in this script (prefix ``t``).
ns_tei = "http://www.tei-c.org/ns/1.0"
NS_MAP = dict(t=ns_tei)

# Configure the root logger once at import time: DEBUG and above, with a
# timestamp and the level name in front of every message.
logging.basicConfig(
    format=' %(asctime)s - %(levelname)s - %(message)s',
    level=logging.DEBUG,
)

def main():
    """Collect citation data from a TEI file and pickle it.

    Command-line arguments:
        teifile: The XML file from which data is pickled.
        bibfile: The bibliography file for checking the references.
        -p / --picklefile: Optional path of the pickle file to write.
            Defaults to ``output/imxml/tmp_files/data.pickle``, the
            location that used to be hard-coded.

    The function parses ``teifile``, reads the ``target`` attribute of
    every ``t:bibl/t:ref`` element, passes the resulting citekeys together
    with the parsed bibliography to ``fix_tei.validate_citations`` and
    hands that result plus the citekeys to ``fix_tei.pickle_data``.

    Returns:
        None.
    """
    default_picklefile = "output/imxml/tmp_files/data.pickle"

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "teifile",
        help="The XML file from which data is pickled.",
    )
    parser.add_argument(
        "bibfile",
        help="The bibliography file for checking the references.",
    )
    # Optional, so existing two-argument invocations keep writing to the
    # same location as before.
    parser.add_argument(
        "-p",
        "--picklefile",
        default=default_picklefile,
        help="Where the pickled data is written (default: %(default)s).",
    )
    args = parser.parse_args()

    xml_tree = etree.parse(args.teifile)
    bibdata = fix_tei.parse_bibtex(args.bibfile)

    # Every <bibl><ref target="..."/> in the TEI namespace contributes one
    # citation target.
    cited = xml_tree.xpath("//t:bibl/t:ref/@target", namespaces=NS_MAP)

    # The first character of each target is dropped unconditionally.
    # NOTE(review): presumably a leading "#" of a fragment reference;
    # confirm that no target is stored without it.
    used_citekeys = [fix_tei.unescape(c[1:]) for c in cited]
    citekeys_not_in_bib = fix_tei.validate_citations(used_citekeys, bibdata)

    fix_tei.pickle_data(citekeys_not_in_bib, used_citekeys, args.picklefile)
# def main ends here

# Run the command-line entry point only when executed as a script, not
# when this module is imported.
if __name__ == "__main__":
    main()
# finis

0 comments on commit 6ab7319

Please sign in to comment.