run.py

#!/usr/bin/env python3
import argparse
import sys

from pipeline.check.sanity import check_sanity_config, check_sanity_data
from pipeline.interpro import InterProPipeline
from pipeline.transcriptome import TranscriptomePipeline
from pipeline.orthology import OrthologyPipeline


def run_pipeline(args):
    """
    Runs pipeline based on settings in args.

    :param args: Parsed arguments from argparse
    """
    if check_sanity_config(args.config) and check_sanity_data(args.data):
        if args.transcriptomics:
            tp = TranscriptomePipeline(args.config,
                                       args.data,
                                       enable_log=args.enable_log,
                                       use_hisat2=args.use_hisat2)

            if args.indexing:
                tp.prepare_genome()
            else:
                print("Skipping Indexing", file=sys.stderr)

            if args.trim_fastq:
                tp.trim_fastq()
            else:
                print("Skipping Trimmomatic", file=sys.stderr)

            if args.alignment:
                    tp.run_alignment(keep_previous=args.keep_intermediate)
            else:
                print("Skipping Alignment", file=sys.stderr)

            if args.htseq:
                tp.run_htseq_count(keep_previous=args.keep_intermediate)
            else:
                print("Skipping htseq-counts", file=sys.stderr)

            if args.qc:
                tp.check_quality()
            else:
                print("Skipping quality control", file=sys.stderr)

            if args.exp_matrix:
                tp.htseq_to_matrix()
                tp.normalize_rpkm()
                tp.normalize_tpm()
            else:
                print("Skipping expression matrix", file=sys.stderr)

            if args.pcc:
                tp.run_pcc()
            else:
                print("Skipping PCC calculations", file=sys.stderr)

            if args.mcl:
                tp.cluster_pcc()
            else:
                print("Skipping MCL clustering of PCC values", file=sys.stderr)
        else:
            print("Skipping transcriptomics", file=sys.stderr)

        # Run InterPro Section
        if args.interpro:
            ip = InterProPipeline(args.config, args.data)
            ip.run_interproscan()
        else:
            print("Skipping Interpro", file=sys.stderr)

        # Run Orthology Section
        if args.orthology:
            op = OrthologyPipeline(args.config, args.data)
            if args.orthofinder:
                op.run_orthofinder()

            if args.mcl_families:
                op.run_mcl()
        else:
            print("Skipping Orthology", file=sys.stderr)

    else:
        print("Sanity check failed, cannot start pipeline", file=sys.stderr)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(prog="./run.py")

    parser.add_argument('config', help='path to config.ini')
    parser.add_argument('data', help='path to data.ini')

    # Optional arguments
    parser.add_argument('--skip-transcriptomics', dest='transcriptomics', action='store_false', help='add --skip-transcriptomics to skip the entire transcriptome step')
    parser.add_argument('--enable-interpro', dest='interpro', action='store_true', help='Runs InterProScan to detect protein domains and provide functional annotation')
    parser.add_argument('--enable-orthology', dest='orthology', action='store_true', help='Runs OrhtoFinder and MCL to detect orthogroups, gene families and orthologs')

    parser.add_argument('--use-hisat2', dest='use_hisat2', action='store_true', help='Use HISAT2 to build the index and align reads instead of BowTie2 and TopHat2')

    parser.add_argument('--skip-indexing', dest='indexing', action='store_false', help='add --skip-indexing to skip building an index (for read alignment) on the genome (BowTie2 or HISAT2)')
    parser.add_argument('--skip-trim-fastq', dest='trim_fastq', action='store_false', help='add --skip-trim-fastq to skip trimming fastq files using trimmomatic')
    parser.add_argument('--skip-alignment', dest='alignment', action='store_false', help='add --skip-alignment to skip the read alignment step (TopHat 2 or HISAT2)')
    parser.add_argument('--skip-htseq', dest='htseq', action='store_false', help='add --skip-htseq to skip counting reads per gene with htseq-count')
    parser.add_argument('--skip-qc', dest='qc', action='store_false', help='add --skip-qc to skip quality control of tophat and htseq output')
    parser.add_argument('--skip-exp-matrix', dest='exp_matrix', action='store_false', help='add --skip-exp-matrix to skip converting htseq files to an expression matrix')
    parser.add_argument('--skip-pcc', dest='pcc', action='store_false', help='add --skip-pcc to skip calculating PCC values')
    parser.add_argument('--skip-mcl', dest='mcl', action='store_false', help='add --skip-mcl to skip clustering PCC values using MCL')

    parser.add_argument('--skip-orthofinder', dest='orthofinder', action='store_false', help='add --skip-orthofinder to skip the orthology detection')
    parser.add_argument('--skip-mcl-families', dest='mcl_families', action='store_false', help='add --skip-mcl to skip clustering blast with MCL')

    parser.add_argument('--remove-intermediate', dest='keep_intermediate', action='store_false', help='add --remove-intermediate to clear trimmomatic and tophat files after completing those steps')
    parser.add_argument('--disable-log', dest='enable_log', action='store_false',
                        help='add --disable-log to disable writing additional statistics.')

    # Flags for the big sections of the pipeline
    parser.set_defaults(transcriptomics=True)
    parser.set_defaults(interpro=False)
    parser.set_defaults(orthology=False)

    parser.set_defaults(use_hisat2=False)

    # Flags for individual tools for transcriptomics
    parser.set_defaults(indexing=True)
    parser.set_defaults(trim_fastq=True)
    parser.set_defaults(alignment=True)
    parser.set_defaults(htseq=True)
    parser.set_defaults(qc=True)
    parser.set_defaults(exp_matrix=True)
    parser.set_defaults(pcc=True)
    parser.set_defaults(mcl=True)

    parser.set_defaults(orthofinder=True)
    parser.set_defaults(mcl_families=True)

    parser.set_defaults(keep_intermediate=True)
    parser.set_defaults(enable_log=True)

    # Parse arguments and start pipeline
    args = parser.parse_args()

    run_pipeline(args)