From b8923d66edb0c336634c0bf17c467a83a95b9a70 Mon Sep 17 00:00:00 2001 From: Sebastian Proost Date: Fri, 21 Jul 2017 12:04:16 +0200 Subject: [PATCH] more gruntwork to support HISAT2 --- config.template.ini | 3 +++ pipeline/base.py | 10 +++++++-- pipeline/check/sanity.py | 3 ++- pipeline/transcriptome.py | 44 +++++++++++++++++++++++++++++++++++++-- run.py | 6 +----- 5 files changed, 56 insertions(+), 10 deletions(-) diff --git a/config.template.ini b/config.template.ini index ead1a8d..955a2ab 100644 --- a/config.template.ini +++ b/config.template.ini @@ -26,6 +26,9 @@ trimmomatic_pe_command=java -jar ${jar} PE -threads 1 ${ina} ${inb} ${outap} ${ tophat_se_cmd=tophat -p 3 -o ${out} ${genome} ${fq} tophat_pe_cmd=tophat -p 3 -o ${out} ${genome} ${forward},${reverse} +hisat2_se_cmd=hisat2 -p 3 -x ${genome} -U ${fq} -S ${out} 2> ${stats} +hisat2_pe_cmd=hisat2 -p 3 -x ${genome} -1 ${forward} -2 ${reverse} -S ${out} 2> ${stats} + htseq_count_cmd=htseq-count -s no -f ${itype} -t ${feature} -i ${field} ${bam} ${gff} > ${out} interproscan_cmd=interproscan.sh -i ${in_dir}/${in_prefix}${SGE_TASK_ID} -o ${out_dir}/${out_prefix}${SGE_TASK_ID} -f tsv -dp -iprlookup -goterms --tempdir /tmp diff --git a/pipeline/base.py b/pipeline/base.py index 5850781..e707d3f 100644 --- a/pipeline/base.py +++ b/pipeline/base.py @@ -33,18 +33,24 @@ def __init__(self, config, data, enable_log=False, use_hisat2=False): self.bowtie_build_cmd = self.cp['TOOLS']['bowtie_cmd'] self.hisat2_build_cmd = self.cp['TOOLS']['hisat2_build_cmd'] + self.trimmomatic_se_cmd = self.cp['TOOLS']['trimmomatic_se_command'] self.trimmomatic_pe_cmd = self.cp['TOOLS']['trimmomatic_pe_command'] + self.tophat_se_cmd = self.cp['TOOLS']['tophat_se_cmd'] self.tophat_pe_cmd = self.cp['TOOLS']['tophat_pe_cmd'] + self.hisat2_se_cmd = self.cp['TOOLS']['hisat2_se_cmd'] + self.hisat2_pe_cmd = self.cp['TOOLS']['hisat2_pe_cmd'] + self.htseq_count_cmd = self.cp['TOOLS']['htseq_count_cmd'] - self.interproscan_cmd = self.cp['TOOLS']['interproscan_cmd'] - self.orthofinder_cmd = self.cp['TOOLS']['orthofinder_cmd'] self.pcc_cmd = self.cp['TOOLS']['pcc_cmd'] self.mcl_cmd = self.cp['TOOLS']['mcl_cmd'] self.mcxdeblast_cmd = self.cp['TOOLS']['mcxdeblast_cmd'] + self.interproscan_cmd = self.cp['TOOLS']['interproscan_cmd'] + self.orthofinder_cmd = self.cp['TOOLS']['orthofinder_cmd'] + self.qsub_indexing = shlex.split(self.cp['TOOLS']['qsub_indexing'].strip('\'')) self.qsub_trimmomatic = shlex.split(self.cp['TOOLS']['qsub_trimmomatic'].strip('\'')) self.qsub_tophat = shlex.split(self.cp['TOOLS']['qsub_tophat'].strip('\'')) diff --git a/pipeline/check/sanity.py b/pipeline/check/sanity.py index 4267dcd..686acdc 100644 --- a/pipeline/check/sanity.py +++ b/pipeline/check/sanity.py @@ -70,7 +70,8 @@ def check_sanity_config(filename): 'trimmomatic_pe_command', 'tophat_se_cmd', 'tophat_pe_cmd', 'htseq_count_cmd', 'interproscan_cmd', 'pcc_cmd', 'mcl_cmd', 'orthofinder_cmd', 'mcxdeblast_cmd', 'trimmomatic_path', 'qsub_indexing', 'qsub_trimmomatic', 'qsub_tophat', 'qsub_htseq_count', - 'qsub_interproscan', 'qsub_pcc', 'qsub_mcl', 'qsub_orthofinder', 'qsub_mcxdeblast'] + 'qsub_interproscan', 'qsub_pcc', 'qsub_mcl', 'qsub_orthofinder', 'qsub_mcxdeblast', + 'hisat2_se_cmd', 'hisat2_pe_cmd'] required_paths = ['trimmomatic_path'] if 'TOOLS' in cp: diff --git a/pipeline/transcriptome.py b/pipeline/transcriptome.py index 926a447..0134284 100644 --- a/pipeline/transcriptome.py +++ b/pipeline/transcriptome.py @@ -141,7 +141,7 @@ def trim_fastq(self, overwrite=False): print("Done\n\n") - def run_tophat(self, overwrite=False, keep_previous=False): + def __run_tophat(self, overwrite=False, keep_previous=False): """ Maps the reads from the trimmed fastq files to the bowtie-indexed genome @@ -162,7 +162,7 @@ def run_tophat(self, overwrite=False, keep_previous=False): for g in self.genomes: tophat_output = self.dp[g]['tophat_output'] - bowtie_output = self.dp[g]['bowtie_output'] + bowtie_output = self.dp[g]['indexing_output'] trimmed_fastq_dir = self.dp[g]['trimmomatic_output'] os.makedirs(tophat_output, exist_ok=True) @@ -221,6 +221,46 @@ def run_tophat(self, overwrite=False, keep_previous=False): # remove OUT_ files PipelineBase.clean_out_files(jobname) + def __run_hisat2(self, overwrite=False, keep_previous=False): + """ + Maps the reads from the trimmed fastq files to the bowtie-indexed genome + + :param overwrite: when true the pipeline will start tophat even if the output exists + :param keep_previous: when true trimmed fastq files will not be removed after tophat completes + """ + filename_se, jobname = self.write_submission_script("hisat2_%d", + self.hisat2_module, + self.hisat2_se_cmd, + "hisat2_se_%d.sh") + + filename_pe, jobname = self.write_submission_script("hisat2_%d", + self.hisat2_module, + self.hisat2_pe_cmd, + "hisat2_pe_%d.sh") + + print('Mapping reads with HISAT2...') + + # remove the submission script + os.remove(filename_se) + os.remove(filename_pe) + + # remove OUT_ files + PipelineBase.clean_out_files(jobname) + + def run_alignment(self, overwrite=False, keep_previous=False): + """ + + Determine which aligner to use and align reads to indexed genome + + :param overwrite: will overwrite existing data when True, otherwise existing runs will be skipped + :param keep_previous: will keep trimmed reads upon completion when true, + otherwise the trimmed reads will be deleted + """ + if self.use_hisat2: + self.__run_hisat2(overwrite=overwrite, keep_previous=keep_previous) + else: + self.__run_tophat(overwrite=overwrite, keep_previous=keep_previous) + print("Done\n\n") def run_htseq_count(self, keep_previous=False): diff --git a/run.py b/run.py index 2d229dd..474e040 100755 --- a/run.py +++ b/run.py @@ -32,11 +32,7 @@ def run_pipeline(args): print("Skipping Trimmomatic", file=sys.stderr) if args.alignment: - if args.use_hisat2: - print("alignment using hisat2 not implemented yet!", file=sys.argv) - quit() - else: - tp.run_tophat(keep_previous=args.keep_intermediate) + tp.run_alignment(keep_previous=args.keep_intermediate) else: print("Skipping Alignment", file=sys.stderr)