Adding in HISAT2 support

proost · Jul 21, 2017 · a6cce89 · a6cce89
1 parent 446aaed
commit a6cce89
Show file tree

Hide file tree

Showing 5 changed files with 106 additions and 49 deletions.
diff --git a/README.md b/README.md
@@ -36,7 +36,7 @@ Configure config.ini and data.ini using the guidelines below
 ## Configuration of LSTrAP
 
 After copying the templates, **config.ini** needs to be set up to run on your system. It requires the path to Trimmomatic's jar and the
-modules where Bowtie, Tophat ... are installed in.
+modules where Bowtie, Tophat ... are installed in (if the [modules](http://modules.sourceforge.net/) environment is used.
 
 The location of the transcriptome data, the refrence genome and a few per-species options need to be defined in **data.ini**. 
 
@@ -62,7 +62,7 @@ Options to enable InterProScan and/or OrthoFinder or to skip certain steps of th
 ## Quality report
 
 After running LSTrAP a log file (*lstrap.log*) is written, in which samples which failed a quality measure
-are reported. Note that no samples are excluded from the final network. In case certain samples need to be excluded
+are reported. Note that __no samples are excluded from the final network__. In case certain samples need to be excluded
 from the final network remove the htseq file for the sample you which to exclude and re-run the pipeline skipping all
 steps prior to building the network.
 
@@ -94,16 +94,9 @@ for LSTrAP can be generated.
 
 ## Adapting LSTrAP to other cluster managers
 
-LSTrAP is designed and tested on a cluster running the Oracle Grid Engine, though with minimal effort it can be adopted to run on PBS and Torque
-based systems (and likely others). First, in the configuration file, check the qsub parameters (e.g. jobs that require multiple
-CPUs to run *-pe cores 4*), that differ between systems are set up properly (the nodes and cores on Torque and PBS need to be 
-set using *-l nodes=4:ppn=2* to request 4 nodes with 2 processes per node). 
-
-Furthermore the submission script might differ, these are located in **./cluster/templates.py** . For PBS based systems some
-settings need to be included by adding *#PBS ...*. 
-
-We strive to get LSTrAP running on as many systems as possible. Do not hesitate to contact us in case you experience difficulties 
-running LSTrAP on your system.
+LSTrAP is designed and tested on a cluster running the Oracle Grid Engine (default), PBS/Torque is also supported.
+
+Though due to differences in parameters 
 
 
 ## Contact

diff --git a/config.template.ini b/config.template.ini
@@ -1,45 +1,44 @@
 [TOOLS]
-; In case there is no module load system on the system set the module name to None
+; Tool Configuration
+;
+; Some tools require additional files or might require a hard coded path to the script.
+; Please make sure these are set up correctly.
+
 
 ; Trimmomatic Path
+; ADJUST THIS
 trimmomatic_path=/home/sepro/tools/Trimmomatic-0.36/trimmomatic-0.36.jar
 
-; Module names
-bowtie_module=biotools/bowtie2-2.2.6
-samtools_module=biotools/samtools-1.3
-sratoolkit_module=biotools/sratoolkit-2.5.7
-tophat_module=biotools/tophat-2.1.0
-
-interproscan_module=biotools/interproscan-5.16-55.0
-
-blast_module=biotools/ncbi-blast-2.3.0+
-mcl_module=biotools/mcl-14.137
+; COMMANDS to run tools
+;
+; Here the commands used to start different steps are defined, ${name} are variables that will be set by LSTrAP for
+; each job.
 
-python_module=devel/Python-2.7.10
-python3_module=devel/Python-3.5.1
-
-; commands to run tools
+; Note that in some cases hard coded paths were required, adjust these to match the location of these files on
+; your system
 bowtie_cmd=bowtie2-build ${in} ${out}
 
+; ADJUST PATHS TO ADAPTERS
 trimmomatic_se_command=java -jar ${jar} SE -threads 1  ${in} ${out}  ILLUMINACLIP:/home/sepro/tools/Trimmomatic-0.36/adapters/TruSeq3-SE.fa:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36
 trimmomatic_pe_command=java -jar ${jar} PE -threads 1  ${ina} ${inb} ${outap} ${outau} ${outbp} ${outbu} ILLUMINACLIP:/home/sepro/tools/Trimmomatic-0.36/adapters/TruSeq3-PE.fa:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36
 
 tophat_se_cmd=tophat -p 3 -o ${out} ${genome} ${fq}
 tophat_pe_cmd=tophat -p 3 -o ${out} ${genome} ${forward},${reverse}
 
-htseq_count_cmd=htseq-count -s no -f bam -t ${feature} -i ${field} ${bam} ${gff} > ${out}
+htseq_count_cmd=htseq-count -s no -f ${itype} -t ${feature} -i ${field} ${bam} ${gff} > ${out}
 
 interproscan_cmd=interproscan.sh -i ${in_dir}/${in_prefix}${SGE_TASK_ID} -o ${out_dir}/${out_prefix}${SGE_TASK_ID} -f tsv -dp -iprlookup -goterms --tempdir /tmp
 
 pcc_cmd=python3 ./scripts/pcc.py ${in} ${out} ${mcl_out}
 mcl_cmd=mcl ${in} --abc -o ${out} -te 4
 
+; ADJUST THIS
 mcxdeblast_cmd=perl /apps/biotools/mcl-14.137/bin/mcxdeblast --m9 --line-mode=abc ${blast_in} > ${abc_out}
 
+; ADJUST THIS
 orthofinder_cmd=python /home/sepro/OrthoFinder-0.4/orthofinder.py -f ${fasta_dir} -t 8
 
-
-; qsub parameters
+; qsub parameters (OGE)
 
 qsub_bowtie=''
 qsub_trimmomatic=''
@@ -49,4 +48,51 @@ qsub_interproscan='-pe cores 5'
 qsub_pcc=''
 qsub_mcl='-pe cores 4'
 qsub_orthofinder='-pe cores 8'
-qsub_mcxdeblast=''
+qsub_mcxdeblast=''
+
+; qsub parameters (PBS/Torque)
+
+; qsub_bowtie=''
+; qsub_trimmomatic=''
+; qsub_tophat='-l nodes=1,ppn=4'
+; qsub_htseq_count=''
+; qsub_interproscan='-l nodes=1,ppn=5'
+; qsub_pcc=''
+; qsub_mcl='-l nodes=1,ppn=4'
+; qsub_orthofinder='-l nodes=1,ppn=8'
+; qsub_mcxdeblast=''
+
+; qsub parameters (PBS/Torque with walltimes)
+
+; qsub_bowtie='-l walltime=00:10:00'
+; qsub_trimmomatic='-l walltime=00:10:00'
+; qsub_tophat='-l nodes=1,ppn=4  -l walltime=00:10:00'
+; qsub_htseq_count=' -l walltime=00:02:00'
+; qsub_interproscan='-l nodes=1,ppn=5  -l walltime=00:10:00'
+; qsub_pcc=' -l walltime=00:10:00'
+; qsub_mcl='-l nodes=1,ppn=4  -l walltime=00:10:00'
+; qsub_orthofinder='-l nodes=1,ppn=8  -l walltime=01:00:00'
+; qsub_mcxdeblast='-l walltime=00:10:00'
+
+; Module names
+; These need to be configured if the required tools are installed in the environment modules.
+; You can find the modules installed on your system using
+;
+;       module avail
+;
+; In case there is no module load system on the system set the module name to None
+
+bowtie_module=biotools/bowtie2-2.2.6
+samtools_module=biotools/samtools-1.3
+sratoolkit_module=biotools/sratoolkit-2.5.7
+tophat_module=biotools/tophat-2.1.0
+
+hisat2_module=
+
+interproscan_module=biotools/interproscan-5.16-55.0
+
+blast_module=biotools/ncbi-blast-2.3.0+
+mcl_module=biotools/mcl-14.137
+
+python_module=devel/Python-2.7.10
+python3_module=devel/Python-3.5.1
diff --git a/pipeline/base.py b/pipeline/base.py
@@ -7,11 +7,11 @@
 
 
 class PipelineBase:
-    def __init__(self, config, data, enable_log=False):
+    def __init__(self, config, data, enable_log=False, use_hisat2=False):
         """
         Constructor run with path to ini file with settings
 
-        :param config: path to setttings ini file
+        :param config: path to settings ini file
         """
         self.cp = configparser.ConfigParser()
         self.cp.read(config)
@@ -24,6 +24,7 @@ def __init__(self, config, data, enable_log=False):
         self.blast_module = None if self.cp['TOOLS']['blast_module'] is 'None' else self.cp['TOOLS']['blast_module']
         self.bowtie_module = None if self.cp['TOOLS']['bowtie_module'] is 'None' else self.cp['TOOLS']['bowtie_module']
         self.tophat_module = '' if self.cp['TOOLS']['tophat_module'] is 'None' else self.cp['TOOLS']['tophat_module']
+        self.hisat2_module = '' if self.cp['TOOLS']['hisat2_module'] is 'None' else self.cp['TOOLS']['hisat2_module']
         self.samtools_module = None if self.cp['TOOLS']['samtools_module'] is 'None' else self.cp['TOOLS']['samtools_module']
         self.python_module = None if self.cp['TOOLS']['python_module'] is 'None' else self.cp['TOOLS']['python_module']
         self.python3_module = None if self.cp['TOOLS']['python3_module'] is 'None' else self.cp['TOOLS']['python3_module']
@@ -57,6 +58,7 @@ def __init__(self, config, data, enable_log=False):
         self.email = None if self.dp['GLOBAL']['email'] == 'None' else self.cp['DEFAULT']['email']
 
         self.enable_log = enable_log
+        self.use_hisat2 = use_hisat2
 
         if self.enable_log:
             self.log = open('lstrap.log', 'w')

diff --git a/pipeline/transcriptome.py b/pipeline/transcriptome.py
@@ -248,7 +248,9 @@ def run_htseq_count(self, keep_previous=False):
                 htseq_out = os.path.join(htseq_output, d + '.htseq')
                 print(d, bam_file, htseq_out)
 
-                command = ["qsub"] + self.qsub_htseq_count + ["-v", "feature=%s,field=%s,bam=%s,gff=%s,out=%s" % (gff_feature, gff_id, bam_file, gff_file, htseq_out), filename]
+                command = ["qsub"] + self.qsub_htseq_count + ["-v", "itype=bam,feature=%s,field=%s,bam=%s,gff=%s,out=%s"
+                                                              % (gff_feature, gff_id, bam_file, gff_file, htseq_out),
+                                                              filename]
                 subprocess.call(command)
 
         # wait for all jobs to complete
@@ -426,9 +428,8 @@ def cluster_pcc(self):
                                                          "cluster_pcc_%d.sh")
 
         for g in self.genomes:
-            # TODO naming convention here is confusing, improve this !
-            mcl_out = self.dp[g]['pcc_mcl_output']
-            mcl_clusters = self.dp[g]['mcl_cluster_output']
+            mcl_out = self.dp[g]['pcc_mcl_output']          # This is the PCC table in mcl format
+            mcl_clusters = self.dp[g]['mcl_cluster_output'] # Desired path for the clusters
 
             command = ["qsub"] + self.qsub_mcl + ["-v", "in=%s,out=%s" % (mcl_out, mcl_clusters), filename]
             subprocess.call(command)

diff --git a/run.py b/run.py
@@ -16,22 +16,33 @@ def run_pipeline(args):
     """
     if check_sanity_config(args.config) and check_sanity_data(args.data):
         if args.transcriptomics:
-            tp = TranscriptomePipeline(args.config, args.data, enable_log=args.enable_log)
-
-            if args.bowtie_build:
-                tp.prepare_genome()
+            tp = TranscriptomePipeline(args.config,
+                                       args.data,
+                                       enable_log=args.enable_log,
+                                       use_hisat2=args.use_hisat2)
+
+            if args.indexing:
+                if args.use_hisat2:
+                    print("alignment using hisat2 not implemented yet!", file=sys.argv)
+                    quit()
+                else:
+                    tp.prepare_genome()
             else:
-                print("Skipping Bowtie-build", file=sys.stderr)
+                print("Skipping Indexing", file=sys.stderr)
 
             if args.trim_fastq:
                 tp.trim_fastq()
             else:
                 print("Skipping Trimmomatic", file=sys.stderr)
 
-            if args.tophat:
-                tp.run_tophat(keep_previous=args.keep_intermediate)
+            if args.alignment:
+                if args.use_hisat2:
+                    print("alignment using hisat2 not implemented yet!", file=sys.argv)
+                    quit()
+                else:
+                    tp.run_tophat(keep_previous=args.keep_intermediate)
             else:
-                print("Skipping Tophat", file=sys.stderr)
+                print("Skipping Alignment", file=sys.stderr)
 
             if args.htseq:
                 tp.run_htseq_count(keep_previous=args.keep_intermediate)
@@ -95,9 +106,11 @@ def run_pipeline(args):
     parser.add_argument('--enable-interpro', dest='interpro', action='store_true', help='Runs InterProScan to detect protein domains and provide functional annotation')
     parser.add_argument('--enable-orthology', dest='orthology', action='store_true', help='Runs OrhtoFinder and MCL to detect orthogroups, gene families and orthologs')
 
-    parser.add_argument('--skip-bowtie-build', dest='bowtie_build', action='store_false', help='add --skip-bowtie-build to skip the step that indexes the genomes using bowtie-build')
+    parser.add_argument('--use-hisat2', dest='use_hisat2', action='store_true', help='Use HISAT2 to build the index and align reads instead of BowTie2 and TopHat2')
+
+    parser.add_argument('--skip-indexing', dest='indexing', action='store_false', help='add --skip-indexing to skip building an index (for read alignment) on the genome (BowTie2 or HISAT2)')
     parser.add_argument('--skip-trim-fastq', dest='trim_fastq', action='store_false', help='add --skip-trim-fastq to skip trimming fastq files using trimmomatic')
-    parser.add_argument('--skip-tophat', dest='tophat', action='store_false', help='add --skip-tophat to skip read mapping with tophat')
+    parser.add_argument('--skip-alignment', dest='alignment', action='store_false', help='add --skip-alignment to skip the read alignment step (TopHat 2 or HISAT2)')
     parser.add_argument('--skip-htseq', dest='htseq', action='store_false', help='add --skip-htseq to skip counting reads per gene with htseq-count')
     parser.add_argument('--skip-qc', dest='qc', action='store_false', help='add --skip-qc to skip quality control of tophat and htseq output')
     parser.add_argument('--skip-exp-matrix', dest='exp_matrix', action='store_false', help='add --skip-exp-matrix to skip converting htseq files to an expression matrix')
@@ -116,10 +129,12 @@ def run_pipeline(args):
     parser.set_defaults(interpro=False)
     parser.set_defaults(orthology=False)
 
+    parser.set_defaults(use_hisat2=False)
+
     # Flags for individual tools for transcriptomics
-    parser.set_defaults(bowtie_build=True)
+    parser.set_defaults(indexing=True)
     parser.set_defaults(trim_fastq=True)
-    parser.set_defaults(tophat=True)
+    parser.set_defaults(alignment=True)
     parser.set_defaults(htseq=True)
     parser.set_defaults(qc=True)
     parser.set_defaults(exp_matrix=True)