From a6cce89634be4694a5a5d56e416d8aa2bd397708 Mon Sep 17 00:00:00 2001 From: Sebastian Proost Date: Fri, 21 Jul 2017 10:07:10 +0200 Subject: [PATCH] Adding in HISAT2 support --- README.md | 17 +++----- config.template.ini | 84 ++++++++++++++++++++++++++++++--------- pipeline/base.py | 6 ++- pipeline/transcriptome.py | 9 +++-- run.py | 39 ++++++++++++------ 5 files changed, 106 insertions(+), 49 deletions(-) diff --git a/README.md b/README.md index 4135c52..259deea 100644 --- a/README.md +++ b/README.md @@ -36,7 +36,7 @@ Configure config.ini and data.ini using the guidelines below ## Configuration of LSTrAP After copying the templates, **config.ini** needs to be set up to run on your system. It requires the path to Trimmomatic's jar and the -modules where Bowtie, Tophat ... are installed in. +modules where Bowtie, Tophat ... are installed in (if the [modules](http://modules.sourceforge.net/) environment is used. The location of the transcriptome data, the refrence genome and a few per-species options need to be defined in **data.ini**. @@ -62,7 +62,7 @@ Options to enable InterProScan and/or OrthoFinder or to skip certain steps of th ## Quality report After running LSTrAP a log file (*lstrap.log*) is written, in which samples which failed a quality measure -are reported. Note that no samples are excluded from the final network. In case certain samples need to be excluded +are reported. Note that __no samples are excluded from the final network__. In case certain samples need to be excluded from the final network remove the htseq file for the sample you which to exclude and re-run the pipeline skipping all steps prior to building the network. @@ -94,16 +94,9 @@ for LSTrAP can be generated. ## Adapting LSTrAP to other cluster managers -LSTrAP is designed and tested on a cluster running the Oracle Grid Engine, though with minimal effort it can be adopted to run on PBS and Torque -based systems (and likely others). First, in the configuration file, check the qsub parameters (e.g. jobs that require multiple -CPUs to run *-pe cores 4*), that differ between systems are set up properly (the nodes and cores on Torque and PBS need to be -set using *-l nodes=4:ppn=2* to request 4 nodes with 2 processes per node). - -Furthermore the submission script might differ, these are located in **./cluster/templates.py** . For PBS based systems some -settings need to be included by adding *#PBS ...*. - -We strive to get LSTrAP running on as many systems as possible. Do not hesitate to contact us in case you experience difficulties -running LSTrAP on your system. +LSTrAP is designed and tested on a cluster running the Oracle Grid Engine (default), PBS/Torque is also supported. + +Though due to differences in parameters ## Contact diff --git a/config.template.ini b/config.template.ini index 302d7dc..824fab0 100644 --- a/config.template.ini +++ b/config.template.ini @@ -1,45 +1,44 @@ [TOOLS] -; In case there is no module load system on the system set the module name to None +; Tool Configuration +; +; Some tools require additional files or might require a hard coded path to the script. +; Please make sure these are set up correctly. + ; Trimmomatic Path +; ADJUST THIS trimmomatic_path=/home/sepro/tools/Trimmomatic-0.36/trimmomatic-0.36.jar -; Module names -bowtie_module=biotools/bowtie2-2.2.6 -samtools_module=biotools/samtools-1.3 -sratoolkit_module=biotools/sratoolkit-2.5.7 -tophat_module=biotools/tophat-2.1.0 - -interproscan_module=biotools/interproscan-5.16-55.0 - -blast_module=biotools/ncbi-blast-2.3.0+ -mcl_module=biotools/mcl-14.137 +; COMMANDS to run tools +; +; Here the commands used to start different steps are defined, ${name} are variables that will be set by LSTrAP for +; each job. -python_module=devel/Python-2.7.10 -python3_module=devel/Python-3.5.1 - -; commands to run tools +; Note that in some cases hard coded paths were required, adjust these to match the location of these files on +; your system bowtie_cmd=bowtie2-build ${in} ${out} +; ADJUST PATHS TO ADAPTERS trimmomatic_se_command=java -jar ${jar} SE -threads 1 ${in} ${out} ILLUMINACLIP:/home/sepro/tools/Trimmomatic-0.36/adapters/TruSeq3-SE.fa:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36 trimmomatic_pe_command=java -jar ${jar} PE -threads 1 ${ina} ${inb} ${outap} ${outau} ${outbp} ${outbu} ILLUMINACLIP:/home/sepro/tools/Trimmomatic-0.36/adapters/TruSeq3-PE.fa:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36 tophat_se_cmd=tophat -p 3 -o ${out} ${genome} ${fq} tophat_pe_cmd=tophat -p 3 -o ${out} ${genome} ${forward},${reverse} -htseq_count_cmd=htseq-count -s no -f bam -t ${feature} -i ${field} ${bam} ${gff} > ${out} +htseq_count_cmd=htseq-count -s no -f ${itype} -t ${feature} -i ${field} ${bam} ${gff} > ${out} interproscan_cmd=interproscan.sh -i ${in_dir}/${in_prefix}${SGE_TASK_ID} -o ${out_dir}/${out_prefix}${SGE_TASK_ID} -f tsv -dp -iprlookup -goterms --tempdir /tmp pcc_cmd=python3 ./scripts/pcc.py ${in} ${out} ${mcl_out} mcl_cmd=mcl ${in} --abc -o ${out} -te 4 +; ADJUST THIS mcxdeblast_cmd=perl /apps/biotools/mcl-14.137/bin/mcxdeblast --m9 --line-mode=abc ${blast_in} > ${abc_out} +; ADJUST THIS orthofinder_cmd=python /home/sepro/OrthoFinder-0.4/orthofinder.py -f ${fasta_dir} -t 8 - -; qsub parameters +; qsub parameters (OGE) qsub_bowtie='' qsub_trimmomatic='' @@ -49,4 +48,51 @@ qsub_interproscan='-pe cores 5' qsub_pcc='' qsub_mcl='-pe cores 4' qsub_orthofinder='-pe cores 8' -qsub_mcxdeblast='' \ No newline at end of file +qsub_mcxdeblast='' + +; qsub parameters (PBS/Torque) + +; qsub_bowtie='' +; qsub_trimmomatic='' +; qsub_tophat='-l nodes=1,ppn=4' +; qsub_htseq_count='' +; qsub_interproscan='-l nodes=1,ppn=5' +; qsub_pcc='' +; qsub_mcl='-l nodes=1,ppn=4' +; qsub_orthofinder='-l nodes=1,ppn=8' +; qsub_mcxdeblast='' + +; qsub parameters (PBS/Torque with walltimes) + +; qsub_bowtie='-l walltime=00:10:00' +; qsub_trimmomatic='-l walltime=00:10:00' +; qsub_tophat='-l nodes=1,ppn=4 -l walltime=00:10:00' +; qsub_htseq_count=' -l walltime=00:02:00' +; qsub_interproscan='-l nodes=1,ppn=5 -l walltime=00:10:00' +; qsub_pcc=' -l walltime=00:10:00' +; qsub_mcl='-l nodes=1,ppn=4 -l walltime=00:10:00' +; qsub_orthofinder='-l nodes=1,ppn=8 -l walltime=01:00:00' +; qsub_mcxdeblast='-l walltime=00:10:00' + +; Module names +; These need to be configured if the required tools are installed in the environment modules. +; You can find the modules installed on your system using +; +; module avail +; +; In case there is no module load system on the system set the module name to None + +bowtie_module=biotools/bowtie2-2.2.6 +samtools_module=biotools/samtools-1.3 +sratoolkit_module=biotools/sratoolkit-2.5.7 +tophat_module=biotools/tophat-2.1.0 + +hisat2_module= + +interproscan_module=biotools/interproscan-5.16-55.0 + +blast_module=biotools/ncbi-blast-2.3.0+ +mcl_module=biotools/mcl-14.137 + +python_module=devel/Python-2.7.10 +python3_module=devel/Python-3.5.1 \ No newline at end of file diff --git a/pipeline/base.py b/pipeline/base.py index 2da5288..89c2926 100644 --- a/pipeline/base.py +++ b/pipeline/base.py @@ -7,11 +7,11 @@ class PipelineBase: - def __init__(self, config, data, enable_log=False): + def __init__(self, config, data, enable_log=False, use_hisat2=False): """ Constructor run with path to ini file with settings - :param config: path to setttings ini file + :param config: path to settings ini file """ self.cp = configparser.ConfigParser() self.cp.read(config) @@ -24,6 +24,7 @@ def __init__(self, config, data, enable_log=False): self.blast_module = None if self.cp['TOOLS']['blast_module'] is 'None' else self.cp['TOOLS']['blast_module'] self.bowtie_module = None if self.cp['TOOLS']['bowtie_module'] is 'None' else self.cp['TOOLS']['bowtie_module'] self.tophat_module = '' if self.cp['TOOLS']['tophat_module'] is 'None' else self.cp['TOOLS']['tophat_module'] + self.hisat2_module = '' if self.cp['TOOLS']['hisat2_module'] is 'None' else self.cp['TOOLS']['hisat2_module'] self.samtools_module = None if self.cp['TOOLS']['samtools_module'] is 'None' else self.cp['TOOLS']['samtools_module'] self.python_module = None if self.cp['TOOLS']['python_module'] is 'None' else self.cp['TOOLS']['python_module'] self.python3_module = None if self.cp['TOOLS']['python3_module'] is 'None' else self.cp['TOOLS']['python3_module'] @@ -57,6 +58,7 @@ def __init__(self, config, data, enable_log=False): self.email = None if self.dp['GLOBAL']['email'] == 'None' else self.cp['DEFAULT']['email'] self.enable_log = enable_log + self.use_hisat2 = use_hisat2 if self.enable_log: self.log = open('lstrap.log', 'w') diff --git a/pipeline/transcriptome.py b/pipeline/transcriptome.py index a1dd12f..5a2aa15 100644 --- a/pipeline/transcriptome.py +++ b/pipeline/transcriptome.py @@ -248,7 +248,9 @@ def run_htseq_count(self, keep_previous=False): htseq_out = os.path.join(htseq_output, d + '.htseq') print(d, bam_file, htseq_out) - command = ["qsub"] + self.qsub_htseq_count + ["-v", "feature=%s,field=%s,bam=%s,gff=%s,out=%s" % (gff_feature, gff_id, bam_file, gff_file, htseq_out), filename] + command = ["qsub"] + self.qsub_htseq_count + ["-v", "itype=bam,feature=%s,field=%s,bam=%s,gff=%s,out=%s" + % (gff_feature, gff_id, bam_file, gff_file, htseq_out), + filename] subprocess.call(command) # wait for all jobs to complete @@ -426,9 +428,8 @@ def cluster_pcc(self): "cluster_pcc_%d.sh") for g in self.genomes: - # TODO naming convention here is confusing, improve this ! - mcl_out = self.dp[g]['pcc_mcl_output'] - mcl_clusters = self.dp[g]['mcl_cluster_output'] + mcl_out = self.dp[g]['pcc_mcl_output'] # This is the PCC table in mcl format + mcl_clusters = self.dp[g]['mcl_cluster_output'] # Desired path for the clusters command = ["qsub"] + self.qsub_mcl + ["-v", "in=%s,out=%s" % (mcl_out, mcl_clusters), filename] subprocess.call(command) diff --git a/run.py b/run.py index e73a4ec..a5a5407 100755 --- a/run.py +++ b/run.py @@ -16,22 +16,33 @@ def run_pipeline(args): """ if check_sanity_config(args.config) and check_sanity_data(args.data): if args.transcriptomics: - tp = TranscriptomePipeline(args.config, args.data, enable_log=args.enable_log) - - if args.bowtie_build: - tp.prepare_genome() + tp = TranscriptomePipeline(args.config, + args.data, + enable_log=args.enable_log, + use_hisat2=args.use_hisat2) + + if args.indexing: + if args.use_hisat2: + print("alignment using hisat2 not implemented yet!", file=sys.argv) + quit() + else: + tp.prepare_genome() else: - print("Skipping Bowtie-build", file=sys.stderr) + print("Skipping Indexing", file=sys.stderr) if args.trim_fastq: tp.trim_fastq() else: print("Skipping Trimmomatic", file=sys.stderr) - if args.tophat: - tp.run_tophat(keep_previous=args.keep_intermediate) + if args.alignment: + if args.use_hisat2: + print("alignment using hisat2 not implemented yet!", file=sys.argv) + quit() + else: + tp.run_tophat(keep_previous=args.keep_intermediate) else: - print("Skipping Tophat", file=sys.stderr) + print("Skipping Alignment", file=sys.stderr) if args.htseq: tp.run_htseq_count(keep_previous=args.keep_intermediate) @@ -95,9 +106,11 @@ def run_pipeline(args): parser.add_argument('--enable-interpro', dest='interpro', action='store_true', help='Runs InterProScan to detect protein domains and provide functional annotation') parser.add_argument('--enable-orthology', dest='orthology', action='store_true', help='Runs OrhtoFinder and MCL to detect orthogroups, gene families and orthologs') - parser.add_argument('--skip-bowtie-build', dest='bowtie_build', action='store_false', help='add --skip-bowtie-build to skip the step that indexes the genomes using bowtie-build') + parser.add_argument('--use-hisat2', dest='use_hisat2', action='store_true', help='Use HISAT2 to build the index and align reads instead of BowTie2 and TopHat2') + + parser.add_argument('--skip-indexing', dest='indexing', action='store_false', help='add --skip-indexing to skip building an index (for read alignment) on the genome (BowTie2 or HISAT2)') parser.add_argument('--skip-trim-fastq', dest='trim_fastq', action='store_false', help='add --skip-trim-fastq to skip trimming fastq files using trimmomatic') - parser.add_argument('--skip-tophat', dest='tophat', action='store_false', help='add --skip-tophat to skip read mapping with tophat') + parser.add_argument('--skip-alignment', dest='alignment', action='store_false', help='add --skip-alignment to skip the read alignment step (TopHat 2 or HISAT2)') parser.add_argument('--skip-htseq', dest='htseq', action='store_false', help='add --skip-htseq to skip counting reads per gene with htseq-count') parser.add_argument('--skip-qc', dest='qc', action='store_false', help='add --skip-qc to skip quality control of tophat and htseq output') parser.add_argument('--skip-exp-matrix', dest='exp_matrix', action='store_false', help='add --skip-exp-matrix to skip converting htseq files to an expression matrix') @@ -116,10 +129,12 @@ def run_pipeline(args): parser.set_defaults(interpro=False) parser.set_defaults(orthology=False) + parser.set_defaults(use_hisat2=False) + # Flags for individual tools for transcriptomics - parser.set_defaults(bowtie_build=True) + parser.set_defaults(indexing=True) parser.set_defaults(trim_fastq=True) - parser.set_defaults(tophat=True) + parser.set_defaults(alignment=True) parser.set_defaults(htseq=True) parser.set_defaults(qc=True) parser.set_defaults(exp_matrix=True)