Skip to content
Permalink
99107d1ad2
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
executable file 93 lines (77 sloc) 2.46 KB
#!/usr/bin/env python3
# -*- coding: utf-8; mode: python -*-
"""
Gather some data for further conversion steps. This was originally part of fix_tei.
"""
__version__ = "1.0"
__date__ = "20190718"
__author__ = "kthoden@mpiwg-berlin.mpg.de"
from utils.load_config import load_config
from pathlib import Path
import os
import shutil
import argparse
import logging
import pickle
import fix_tei
from lxml import etree
# Default input/output locations; the INPUT_DIR / OUTPUT_DIR environment
# variables take precedence over the relative fallbacks.
DEFAULT_INPUT_DIR = Path(os.environ.get('INPUT_DIR', './input'))
DEFAULT_OUTPUT_DIR = Path(os.environ.get('OUTPUT_DIR', './output'))

# TEI namespace URI and the prefix map ("t") handed to XPath queries.
ns_tei = "http://www.tei-c.org/ns/1.0"
NS_MAP = dict(t=ns_tei)

# Configure root logging once at import time: DEBUG level, timestamped lines.
logging.basicConfig(
    format=' %(asctime)s - %(levelname)s - %(message)s',
    level=logging.DEBUG,
)
def main(
        teifile,
        bibfile,
        output,
        ):
    """Collect the citation keys referenced in a TEI file and pickle them.

    The TEI document is parsed, every ``bibl/ref/@target`` attribute is
    gathered, and the resulting keys are checked against the bibliography
    via ``fix_tei.validate_citations``. Both the keys flagged by that check
    and the full list of used keys are passed to ``fix_tei.pickle_data``.

    Args:
        teifile: The TEI XML source, in any form ``lxml.etree.parse`` accepts.
        bibfile: The bibliography file handed to ``fix_tei.parse_bibtex``.
        output: Destination handed to ``fix_tei.pickle_data``
            (presumably the path of the pickle file -- confirm in fix_tei).
    """
    # Use the function's own parameters. The previous version read the
    # module-level ``args`` object, which only exists when this file is run
    # as a script, so importing and calling main() raised NameError.
    xml_tree = etree.parse(teifile)
    bibdata = fix_tei.parse_bibtex(bibfile)

    cited = xml_tree.xpath("//t:bibl/t:ref/@target", namespaces=NS_MAP)
    # Strip the first character of each target before unescaping
    # (presumably the leading "#" of a local pointer -- TODO confirm).
    used_citekeys = [fix_tei.unescape(c[1:]) for c in cited]

    citekeys_not_in_bib = fix_tei.validate_citations(used_citekeys, bibdata)
    fix_tei.pickle_data(citekeys_not_in_bib, used_citekeys, output)
# def main ends here
if __name__ == '__main__':
    # Command-line entry point: parse arguments, prepare a fresh output
    # directory, then run main() with the pickle written inside it.
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        "teifile",
        help="The XML file from which data is pickled."
    )
    parser.add_argument(
        "bibfile",
        help="The bibliography file for checking the references."
    )
    parser.add_argument(
        "-o", "--output-dir",
        required=True,
        metavar="OUTPUT_DIR",
        help="output directory"
    )
    # picklefile = "output/imxml/tmp_files/data.pickle"
    parser.add_argument(
        "-!", "--overwrite",
        action="store_true",
        default=False,
        help="overwrite files at OUTPUT_DIR"
    )
    args = parser.parse_args()

    output_dir = Path(args.output_dir)
    if output_dir.exists():
        if not args.overwrite:
            # Refuse to clobber existing results unless explicitly requested.
            # FileExistsError is still an Exception, so the failure mode is
            # unchanged apart from the more specific type.
            raise FileExistsError(
                f"output directory already existing: '{output_dir}'!"
            )
        shutil.rmtree(output_dir)

    # Create the directory including any missing parents; plain os.mkdir
    # failed for nested targets whose parent did not exist yet.
    output_dir.mkdir(parents=True, exist_ok=True)

    main(
        teifile=args.teifile,
        bibfile=args.bibfile,
        output=output_dir / "data.pickle",
    )
# finis