From 56a39f736ded10aaff56be993108d7507a3162e6 Mon Sep 17 00:00:00 2001 From: EsGeh Date: Tue, 7 Jan 2020 11:53:56 +0100 Subject: [PATCH] add process_tei.py script, adjust some scripts where needed --- README.md | 25 ++------ src/gather_pickledata.py | 93 --------------------------- src/process_eoa_latex.py | 7 --- src/process_tei.py | 92 +++++++++++++++++++++++++++ src/tei2html.py | 27 ++++---- src/tei_add_bibl.py | 21 +++++++ src/tei_pickle.py | 131 +++++++++++++++++++++++++++++++++++++++ 7 files changed, 262 insertions(+), 134 deletions(-) delete mode 100755 src/gather_pickledata.py create mode 100755 src/process_tei.py create mode 100755 src/tei_pickle.py diff --git a/README.md b/README.md index b98fb34..616a4cf 100644 --- a/README.md +++ b/README.md @@ -93,9 +93,9 @@ In order to apply the workflow to any other publication copy it into the `runtim $ ./scripts/run.py # run if not yet running $ ./scripts/exec_in_container.py # enter container -1. eoatex -> pdf +1. process eoatex: - $ process_eoa_latex.py -f input/example/124_eoatex/EOASample.tex + $ process_eoa_latex.py input/example/124_eoatex (adjust filename if necessary) @@ -111,26 +111,9 @@ In order to apply the workflow to any other publication copy it into the `input/ $ ./scripts/run.py # run if not yet running $ ./scripts/exec_in_container.py # enter container -1. eoaTEI -> eoaTEI with bibliography +1. process tei - $ tei_add_bibl.py input/example/125_tei_part - -1. eoaTEI -> eoaTEX - - $ tei2eoatex.py input/example/125_tei_part - -1. eoaTEX -> pdf - - $ eoatex2pdf.py -o output/125_tei_part/pdf output/125_tei_part/eoatex - -1. eoaTEI -> imxml (to intermediate xml) - - $ gather_pickledata.py -o output/125_tei_part/pickle output/with_bibl/125_tei_part/{tei_part_with_bibl.xml,texfiles/example.bib} - tei2imxml.py -f tei_part.xml output/with_bibl/125_tei_part - -1. eoaTEI -> html - - $ tei2html.py output/with_bibl/125_tei_part + $ process_tei.py input/example/125_tei_part ## The DocX workflow (DocX -> TEI -> ...) (TODO: describe how) diff --git a/src/gather_pickledata.py b/src/gather_pickledata.py deleted file mode 100755 index da43662..0000000 --- a/src/gather_pickledata.py +++ /dev/null @@ -1,93 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8; mode: python -*- - -""" -Gather some data for further conversion steps. This is originally part of fix_tei. -""" - -__version__ = "1.0" -__date__ = "20190718" -__author__ = "kthoden@mpiwg-berlin.mpg.de" - -from utils.load_config import load_config - -from pathlib import Path -import os -import shutil -import argparse -import logging -import pickle -import fix_tei -from lxml import etree - -DEFAULT_INPUT_DIR = \ - Path(os.environ['INPUT_DIR'] if 'INPUT_DIR' in os.environ else './input') -DEFAULT_OUTPUT_DIR = \ - Path(os.environ['OUTPUT_DIR'] if 'OUTPUT_DIR' in os.environ else './output') - - -ns_tei = "http://www.tei-c.org/ns/1.0" -NS_MAP = {"t" : ns_tei} - -logging.basicConfig(level=logging.DEBUG, format=' %(asctime)s - %(levelname)s - %(message)s') - -def main( - teifile, - bibfile, - output, -): - """The main bit""" - xml_tree = etree.parse(args.teifile) - - bibdata = fix_tei.parse_bibtex(args.bibfile) - - cited = xml_tree.xpath("//t:bibl/t:ref/@target", namespaces=NS_MAP) - used_citekeys = [fix_tei.unescape(c[1:]) for c in cited] - citekeys_not_in_bib = fix_tei.validate_citations(used_citekeys, bibdata) - - fix_tei.pickle_data(citekeys_not_in_bib, used_citekeys, output) -# def main ends here - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter - ) - parser.add_argument( - "teifile", - help="The XML file from which data is pickled." - ) - parser.add_argument( - "bibfile", - help="The bibliography file for checking the references." - ) - parser.add_argument( - "-o", "--output-dir", - required = True, - metavar = "OUTPUT_DIR", - help="output directory" - ) - # picklefile = "output/imxml/tmp_files/data.pickle" - parser.add_argument( - "-!", "--overwrite", - action = "store_true", - default = False, - help="overwrite files at OUTPUT_DIR" - ) - args = parser.parse_args() - - output_dir = Path( args.output_dir ) - - if output_dir.exists(): - if args.overwrite: - shutil.rmtree( output_dir ) - else: - raise( Exception( f"output directory already existing: '{output_dir}'!" ) ) - if not output_dir.exists(): - os.mkdir( output_dir ) - - main( - teifile = args.teifile, - bibfile = args.bibfile, - output = output_dir / "data.pickle", - ) -# finis diff --git a/src/process_eoa_latex.py b/src/process_eoa_latex.py index c604831..809a1fd 100755 --- a/src/process_eoa_latex.py +++ b/src/process_eoa_latex.py @@ -49,13 +49,6 @@ def main( parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter ) - ''' - parser.add_argument( - "-f", "--filename", - required = True, - help="Name of main EOATeX file (without suffix!)." - ) - ''' parser.add_argument( "-c", "--config", default = BASE_DIR / "config" / "eoaconvert.cfg", diff --git a/src/process_tei.py b/src/process_tei.py new file mode 100755 index 0000000..6b42cf5 --- /dev/null +++ b/src/process_tei.py @@ -0,0 +1,92 @@ +#!/usr/bin/env python3 + +from utils.load_config import load_config, exec_command + +# imports +import argparse +from pathlib import Path +import glob +import os +import subprocess +import shutil +import logging + +BASE_DIR = Path( __file__ ).resolve().parent +SCRIPT_PATH = Path( __file__ ) +SCRIPT_NAME = SCRIPT_PATH.stem + +DEFAULT_INPUT_DIR = \ + Path(os.environ['INPUT_DIR'] if 'INPUT_DIR' in os.environ else './input') + +DEFAULT_OUTPUT_DIR = \ + Path(os.environ['OUTPUT_DIR'] if 'OUTPUT_DIR' in os.environ else './output') + +DEFAULT_DEPENDENCIES_DIR = \ + Path(os.environ['DEPENDENCIES_DIR'] if 'DEPENDENCIES_DIR' in os.environ else './dependencies') + +def main( + publ_dir +): + PUBL_NAME = publ_dir.resolve().stem + exec_command( + f"tei_add_bibl.py -! \"{publ_dir}\"" + ) + exec_command( + f"tei2eoatex.py -! \"{publ_dir}\"" + ) + exec_command( + f"eoatex2pdf.py --output-dir \"{DEFAULT_OUTPUT_DIR}/{PUBL_NAME}/pdf\" \"{DEFAULT_OUTPUT_DIR}/{PUBL_NAME}/eoatex\"" + ) + exec_command( + f"tei_pickle.py -! \"{DEFAULT_OUTPUT_DIR}/with_bibl/{PUBL_NAME}\"" + ) + exec_command( + f"tei2imxml.py --filename no_bibl.xml \"{DEFAULT_OUTPUT_DIR}/with_bibl/{PUBL_NAME}\"" + ) + exec_command( + f"tei2html.py -! --filename \"with_bibl.xml\" \"{DEFAULT_OUTPUT_DIR}/with_bibl/{PUBL_NAME}\"" + ) + +if __name__ == "__main__": + + ##################### + # Parsing arguments # + ##################### + + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + parser.add_argument( + "-c", "--config", + default = BASE_DIR / "config" / "eoaconvert.cfg", + dest="CONFIG_FILE", + help="Name of configuration file", + metavar="CONFIGURATION" + ) + parser.add_argument( + "-l", "--log-file", + default = (DEFAULT_OUTPUT_DIR / 'logs' / SCRIPT_NAME).with_suffix(".log"), + help="logfile" + ) + parser.add_argument( + "--log-level", + default = "INFO", + help="log level: choose between DEBUG, INFO, WARNING, ERROR, CRITICAL" + ) + parser.add_argument( + "PUBLICATION_DIR", + help = "directory containing the publication (including resources like pictures, etc.)", + type = Path, +) + + args = parser.parse_args() + + CONFIG = load_config( + args.CONFIG_FILE, + args.log_level, + args.log_file, + ) + + main( + publ_dir = Path( args.PUBLICATION_DIR ) + ) diff --git a/src/tei2html.py b/src/tei2html.py index a13a1d5..67d3458 100755 --- a/src/tei2html.py +++ b/src/tei2html.py @@ -70,12 +70,14 @@ def copy_dir( default = "INFO", help="log level: choose between DEBUG, INFO, WARNING, ERROR, CRITICAL" ) + ''' parser.add_argument( "--root-dir", default = DEFAULT_OUTPUT_DIR / "html_from_tei", type = Path, help="" ) + ''' parser.add_argument( "-p", "--param", action = 'append', @@ -91,7 +93,7 @@ def copy_dir( ) parser.add_argument( "-f", "--filename", - default = Path("*_with_bibl.xml"), + default = Path("with_bibl.xml"), type = Path, help = "xml file inside PUBLICATION_DIR, or absolute path. Patterns like '*.xml' are also acceptable" ) @@ -150,11 +152,20 @@ def copy_dir( output_dir = \ args.output_dir if args.output_dir is not None else (DEFAULT_OUTPUT_DIR / publ_dir.resolve().stem) / "html" - CONFIG_FILE = args.config + if not tei_filename.is_file(): + raise( Exception( + f"not a valid input file: {tei_filename}" + ) ) + + if output_dir.exists(): + if args.overwrite: + rmtree( output_dir ) + else: + raise( Exception( f"output directory already existing: '{output_dir}'!" ) ) + CONFIG_FILE = args.config log_dir = output_dir / "log" log_file = (log_dir / SCRIPT_NAME) . with_suffix( ".log" ) - CONFIG = load_config( CONFIG_FILE, args.log_level, @@ -163,16 +174,6 @@ def copy_dir( logging.info( f"tei_file: {tei_filename}, publ_dir: {publ_dir}" ) - if not tei_filename.is_file(): - raise( Exception( - f"not a valid input file: {tei_filename}" - ) ) - - if output_dir.exists(): - if args.overwrite: - rmtree( output_dir ) - else: - raise( Exception( f"output directory already existing: '{output_dir}'!" ) ) if not output_dir.exists(): mkdir( output_dir ) ## copy webdesign: diff --git a/src/tei_add_bibl.py b/src/tei_add_bibl.py index caa0eb1..131af4a 100755 --- a/src/tei_add_bibl.py +++ b/src/tei_add_bibl.py @@ -220,3 +220,24 @@ def create_bibl( ) else: raise( Exception("unknown publication type!")) + + # create uniquely named links to + # original tei file and the one with added bibliography + orig_link = output_dir / "no_bibl.xml" + if orig_link.exists(): + logging.error( + f"error while creating unique link: file already exists '{orig_link}'" + ) + exit(1) + orig_link . symlink_to( + tei_file . name + ) + with_bibl_link = output_dir / "with_bibl.xml" + if with_bibl_link.exists(): + logging.error( + f"error while creating unique link: file already exists '{with_bibl_link}'" + ) + exit(1) + with_bibl_link . symlink_to( + tei_with_bibl_file . name + ) diff --git a/src/tei_pickle.py b/src/tei_pickle.py new file mode 100755 index 0000000..b162a59 --- /dev/null +++ b/src/tei_pickle.py @@ -0,0 +1,131 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8; mode: python -*- + +""" +Gather some data for further conversion steps. This is originally part of fix_tei. +""" + +__version__ = "1.0" +__date__ = "20190718" +__author__ = "kthoden@mpiwg-berlin.mpg.de" + +from utils.load_config import load_config + +from pathlib import Path +import os +import shutil +import argparse +import logging +import pickle +import fix_tei +from lxml import etree + + +BASE_DIR = Path( os.path.realpath(__file__) ).parent +SCRIPT_NAME = Path( __file__).stem + +DEFAULT_INPUT_DIR = \ + Path(os.environ['INPUT_DIR'] if 'INPUT_DIR' in os.environ else './input') +DEFAULT_OUTPUT_DIR = \ + Path(os.environ['OUTPUT_DIR'] if 'OUTPUT_DIR' in os.environ else './output') + + +ns_tei = "http://www.tei-c.org/ns/1.0" +NS_MAP = {"t" : ns_tei} + +def main( + tei_file, + bib_file, + output, +): + """The main bit""" + xml_tree = etree.parse(str(tei_file)) + + bibdata = fix_tei.parse_bibtex(bib_file) + + cited = xml_tree.xpath("//t:bibl/t:ref/@target", namespaces=NS_MAP) + used_citekeys = [fix_tei.unescape(c[1:]) for c in cited] + citekeys_not_in_bib = fix_tei.validate_citations(used_citekeys, bibdata) + + fix_tei.pickle_data(citekeys_not_in_bib, used_citekeys, output) +# def main ends here + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + parser.add_argument( + "-c", "--config", + dest="CONFIG_FILE", + default = BASE_DIR / "config" / "eoaconvert.cfg", + help="Name of configuration file", + metavar="CONFIGURATION" + ) + parser.add_argument( + "--log-level", + default = "INFO", + help="log level: choose between DEBUG, INFO, WARNING, ERROR, CRITICAL" + ) + parser.add_argument( + "-o", "--output-dir", + help = f"output directory. default: {DEFAULT_OUTPUT_DIR}/PUBLICATION_NAME/pickle", + type = Path, + ) + parser.add_argument( + "-f", "--filename", + default = Path("*.xml"), + type = Path, + help = "eoaTEI file inside PUBLICATION_DIR, or absolute path. Patterns like '*.xml' are also acceptable" + ) + parser.add_argument( + "-b", "--bib-file", + default = Path("**/*.bib"), + type = Path, + help = "bibtex file inside PUBLICATION_DIR, or absolute path. Patterns like '**/*.bib' are also acceptable" + ) + parser.add_argument( + "PUBLICATION_DIR", + help = "directory containing the publication (including resources like pictures, etc.)", + type = Path, + ) + parser.add_argument( + "-!", "--overwrite", + action = "store_true", + default = False, + help="overwrite files at OUTPUT_DIR" + ) + args = parser.parse_args() + + input_dir = args.PUBLICATION_DIR + tei_file = \ + args.filename if args.filename . is_absolute() else list( input_dir . glob( str(args.filename) ))[0] + bib_file = \ + args.bib_file if args.bib_file . is_absolute() else list( input_dir . glob( str(args.bib_file) ))[0] + + output_dir = \ + args.output_dir if args.output_dir is not None else (DEFAULT_OUTPUT_DIR / input_dir.resolve().stem) / "pickle" + log_dir = output_dir / "log" + + config_file = args.CONFIG_FILE + print("The config file is ", config_file) + + if output_dir.exists(): + if args.overwrite: + shutil.rmtree( output_dir ) + else: + raise( Exception( f"output directory already existing: '{output_dir}'!" ) ) + if not output_dir.exists(): + os.mkdir( output_dir ) + + CONFIG = load_config( + config_file, + args.log_level, + (log_dir / SCRIPT_NAME) . with_suffix( ".log" ), + ) + + main( + tei_file = tei_file, + bib_file = bib_file, + output = output_dir / "data.pickle", + ) +# finis