import os
import subprocess

from cluster import wait_for_job
from pipeline.base import PipelineBase
from pipeline.check.quality import htseq_count_quality
from utils.matrix import read_matrix, write_matrix, normalize_matrix_counts, normalize_matrix_length


class TranscriptomePipeline(PipelineBase):
    """
    TranscriptomePipeline class. Reads a settings ini file and runs the
    transcriptome pipeline.
    """
    def prepare_genome(self):
        """
        Runs bowtie-build for each genome on the cluster. All settings are
        obtained from the settings ini file.
        """
        filename, jobname = self.write_submission_script("bowtie_build_%d",
                                                         self.bowtie_module,
                                                         self.bowtie_build_cmd,
                                                         "bowtie_build_%d.sh")

        for g in self.genomes:
            con_file = self.dp[g]['genome_fasta']
            output = self.dp[g]['bowtie_output']
            os.makedirs(os.path.dirname(output), exist_ok=True)

            subprocess.call(["qsub", "-v", "in=" + con_file + ",out=" + output, filename])

        print("Preparing the genomic fasta file...")

        # wait for all jobs to complete
        wait_for_job(jobname)

        # remove the submission script
        os.remove(filename)

        # remove OUT_ files
        PipelineBase.clean_out_files(jobname)

        print("Done\n\n")
    def trim_fastq(self):
        """
        Runs Trimmomatic on all fastq files.
        """
        filename_se, jobname = self.write_submission_script("trimmomatic_%d",
                                                            None,
                                                            self.trimmomatic_se_cmd,
                                                            "trimmomatic_se_%d.sh")

        filename_pe, jobname = self.write_submission_script("trimmomatic_%d",
                                                            None,
                                                            self.trimmomatic_pe_cmd,
                                                            "trimmomatic_pe_%d.sh")

        for g in self.genomes:
            fastq_input_dir = self.dp[g]['fastq_dir']
            trimmed_output = self.dp[g]['trimmomatic_output']
            os.makedirs(trimmed_output, exist_ok=True)

            fastq_files = []

            for file in os.listdir(fastq_input_dir):
                if file.endswith('.fq.gz') or file.endswith('.fastq.gz'):
                    fastq_files.append(file)

            # sort required to make sure _1 files are before _2
            fastq_files.sort()

            while len(fastq_files) > 0:
                file = fastq_files.pop(0)

                if '_1.' in file:
                    pair_file = file.replace('_1.', '_2.')
                    if pair_file in fastq_files:
                        fastq_files.remove(pair_file)

                        ina = os.path.join(fastq_input_dir, file)
                        inb = os.path.join(fastq_input_dir, pair_file)

                        outap = file.replace('.fq.gz', '.trimmed.paired.fq.gz') if file.endswith('.fq.gz') else file.replace('.fastq.gz', '.trimmed.paired.fastq.gz')
                        outau = file.replace('.fq.gz', '.trimmed.unpaired.fq.gz') if file.endswith('.fq.gz') else file.replace('.fastq.gz', '.trimmed.unpaired.fastq.gz')
                        outbp = pair_file.replace('.fq.gz', '.trimmed.paired.fq.gz') if pair_file.endswith('.fq.gz') else pair_file.replace('.fastq.gz', '.trimmed.paired.fastq.gz')
                        outbu = pair_file.replace('.fq.gz', '.trimmed.unpaired.fq.gz') if pair_file.endswith('.fq.gz') else pair_file.replace('.fastq.gz', '.trimmed.unpaired.fastq.gz')

                        outap = os.path.join(trimmed_output, outap)
                        outau = os.path.join(trimmed_output, outau)
                        outbp = os.path.join(trimmed_output, outbp)
                        outbu = os.path.join(trimmed_output, outbu)

                        print('Submitting pair %s, %s' % (file, pair_file))
                        subprocess.call(["qsub", "-v", "ina=%s,inb=%s,outap=%s,outau=%s,outbp=%s,outbu=%s" % (ina, inb, outap, outau, outbp, outbu), filename_pe])
                    else:
                        print('Submitting single %s' % file)
                        outfile = file.replace('.fq.gz', '.trimmed.fq.gz') if file.endswith('.fq.gz') else file.replace('.fastq.gz', '.trimmed.fastq.gz')
                        subprocess.call(["qsub", "-v", "in=" + os.path.join(fastq_input_dir, file) + ",out=" + os.path.join(trimmed_output, outfile), filename_se])
                else:
                    print('Submitting single %s' % file)
                    outfile = file.replace('.fq.gz', '.trimmed.fq.gz') if file.endswith('.fq.gz') else file.replace('.fastq.gz', '.trimmed.fastq.gz')
                    subprocess.call(["qsub", "-v", "in=" + os.path.join(fastq_input_dir, file) + ",out=" + os.path.join(trimmed_output, outfile), filename_se])

        print('Trimming fastq files...')

        # wait for all jobs to complete
        wait_for_job(jobname, sleep_time=1)

        # remove the submission scripts
        os.remove(filename_se)
        os.remove(filename_pe)

        # remove OUT_ files
        PipelineBase.clean_out_files(jobname)

        print("Done\n\n")
    def run_tophat(self):
        """
        Maps the reads from the trimmed fastq files to the bowtie-indexed genome.
        """
        filename_se, jobname = self.write_submission_script("tophat_%d",
                                                            self.bowtie_module + ' ' + self.tophat_module,
                                                            self.tophat_se_cmd,
                                                            "tophat_se_%d.sh")

        filename_pe, jobname = self.write_submission_script("tophat_%d",
                                                            self.bowtie_module + ' ' + self.tophat_module,
                                                            self.tophat_pe_cmd,
                                                            "tophat_pe_%d.sh")

        print('Mapping reads with tophat...')

        for g in self.genomes:
            tophat_output = self.dp[g]['tophat_output']
            bowtie_output = self.dp[g]['bowtie_output']
            trimmed_fastq_dir = self.dp[g]['trimmomatic_output']
            os.makedirs(tophat_output, exist_ok=True)

            pe_files = []
            se_files = []

            for file in os.listdir(trimmed_fastq_dir):
                if file.endswith('.paired.fq.gz') or file.endswith('.paired.fastq.gz'):
                    pe_files.append(file)
                elif not (file.endswith('.unpaired.fq.gz') or file.endswith('.unpaired.fastq.gz')):
                    se_files.append(file)

            # sort required to make sure _1 files are before _2
            pe_files.sort()
            se_files.sort()

            for pe_file in pe_files:
                if '_1.trimmed.paired.' in pe_file:
                    pair_file = pe_file.replace('_1.trimmed.paired.', '_2.trimmed.paired.')

                    output_dir = pe_file.replace('_1.trimmed.paired.fq.gz', '').replace('_1.trimmed.paired.fastq.gz', '')
                    output_dir = os.path.join(tophat_output, output_dir)

                    forward = os.path.join(trimmed_fastq_dir, pe_file)
                    reverse = os.path.join(trimmed_fastq_dir, pair_file)

                    print('Submitting pair %s, %s' % (pe_file, pair_file))
                    subprocess.call(["qsub", "-v", "out=%s,genome=%s,forward=%s,reverse=%s" % (output_dir, bowtie_output, forward, reverse), filename_pe])

            for se_file in se_files:
                print('Submitting single %s' % se_file)
                output_dir = se_file.replace('.trimmed.fq.gz', '').replace('.trimmed.fastq.gz', '')
                output_dir = os.path.join(tophat_output, output_dir)
                subprocess.call(["qsub", "-v", "out=%s,genome=%s,fq=%s" % (output_dir, bowtie_output, os.path.join(trimmed_fastq_dir, se_file)), filename_se])

        # wait for all jobs to complete
        wait_for_job(jobname, sleep_time=1)

        # remove the submission scripts
        os.remove(filename_se)
        os.remove(filename_pe)

        # remove OUT_ files
        PipelineBase.clean_out_files(jobname)

        print("Done\n\n")
    def run_samtools(self):
        """
        Converts tophat output (bam file) to a sam file.
        """
        filename, jobname = self.write_submission_script("samtools_%d",
                                                         self.samtools_module,
                                                         self.samtools_cmd,
                                                         "samtools_%d.sh")

        for g in self.genomes:
            tophat_output = self.dp[g]['tophat_output']
            samtools_output = self.dp[g]['samtools_output']
            os.makedirs(samtools_output, exist_ok=True)

            dirs = [o for o in os.listdir(tophat_output) if os.path.isdir(os.path.join(tophat_output, o))]

            for d in dirs:
                bam_file = os.path.join(tophat_output, d, 'accepted_hits.bam')
                if os.path.exists(bam_file):
                    sam_file = os.path.join(samtools_output, d + '.sam')
                    print("Converting %s to %s" % (bam_file, sam_file))
                    subprocess.call(["qsub", "-v", "out=%s,bam=%s" % (sam_file, bam_file), filename])

        # wait for all jobs to complete
        wait_for_job(jobname, sleep_time=1)

        # remove the submission script
        os.remove(filename)

        # remove OUT_ files
        PipelineBase.clean_out_files(jobname)

        print("Done\n\n")
    def run_htseq_count(self):
        """
        Counts, based on the gff file and the sam file, the number of reads
        that map to each gene.
        """
        filename, jobname = self.write_submission_script("htseq_count_%d",
                                                         self.python_module,
                                                         self.htseq_count_cmd,
                                                         "htseq_count_%d.sh")

        for g in self.genomes:
            samtools_output = self.dp[g]['samtools_output']
            htseq_output = self.dp[g]['htseq_output']
            os.makedirs(htseq_output, exist_ok=True)

            gff_file = self.dp[g]['gff_file']
            gff_feature = self.dp[g]['gff_feature']
            gff_id = self.dp[g]['gff_id']

            sam_files = []
            for file in os.listdir(samtools_output):
                if file.endswith('.sam'):
                    sam_files.append(file)

            for sam_file in sam_files:
                sam_in = os.path.join(samtools_output, sam_file)
                htseq_out = os.path.join(htseq_output, sam_file.replace('.sam', '.htseq'))
                subprocess.call(["qsub", "-v", "feature=%s,field=%s,sam=%s,gff=%s,out=%s" % (gff_feature, gff_id, sam_in, gff_file, htseq_out), filename])

        # wait for all jobs to complete
        wait_for_job(jobname, sleep_time=1)

        # remove the submission script
        os.remove(filename)

        # remove OUT_ files
        PipelineBase.clean_out_files(jobname)

        print("Done\n\n")
    def htseq_to_matrix(self):
        """
        Groups all htseq files into one expression matrix.
        """
        for g in self.genomes:
            htseq_output = self.dp[g]['htseq_output']
            os.makedirs(os.path.dirname(htseq_output), exist_ok=True)

            htseq_files = [f for f in os.listdir(htseq_output) if f.endswith('.htseq')]
            counts = {}

            for file in htseq_files:
                full_path = os.path.join(htseq_output, file)

                # only include counts from samples that pass the quality check
                if htseq_count_quality(full_path, 1):
                    with open(full_path, "r") as f:
                        for row in f:
                            gene_id, count = row.strip().split('\t')

                            if gene_id not in counts.keys():
                                counts[gene_id] = {}

                            counts[gene_id][file] = count

            output_file = self.dp[g]['exp_matrix_output']
            with open(output_file, "w") as f_out:
                header = '\t'.join(htseq_files)
                print('gene\t' + header, file=f_out)

                bad_fields = ['no_feature', 'ambiguous', 'too_low_aQual', 'not_aligned', 'alignment_not_unique']

                for gene_id in counts:
                    values = []
                    for f in htseq_files:
                        if f in counts[gene_id].keys():
                            values.append(counts[gene_id][f])
                        else:
                            values.append('0')

                    if all([x not in gene_id for x in bad_fields]):
                        print(gene_id + '\t' + '\t'.join(values), file=f_out)

        print("Done\n\n")
    def normalize_rpkm(self):
        """
        Normalizes the expression matrix to RPKM: counts are scaled to
        sequencing depth first, then corrected for gene length.
        """
        for g in self.genomes:
            data, conditions = read_matrix(self.dp[g]['exp_matrix_output'])
            normalized_data = normalize_matrix_counts(data, conditions)
            length_normalized_data = normalize_matrix_length(normalized_data, self.dp[g]['cds_fasta'])
            write_matrix(self.dp[g]['exp_matrix_rpkm_output'], conditions, length_normalized_data)
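    # Assuming the matrix helpers implement the standard definition, this
    # amounts to RPKM = (raw count * 10^9) / (total mapped reads * length in bp),
    # with CDS lengths from cds_fasta serving as a proxy for transcript length.
    # normalize_tpm below applies the same two steps in the opposite order,
    # which is the defining difference between RPKM and TPM.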
    def normalize_tpm(self):
        """
        Normalizes the expression matrix to TPM: counts are corrected for gene
        length first, then scaled to sequencing depth.
        """
        for g in self.genomes:
            data, conditions = read_matrix(self.dp[g]['exp_matrix_output'])
            length_normalized_data = normalize_matrix_length(data, self.dp[g]['cds_fasta'])
            normalized_data = normalize_matrix_counts(length_normalized_data, conditions)
            write_matrix(self.dp[g]['exp_matrix_tpm_output'], conditions, normalized_data)
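
# A minimal usage sketch, kept as comments because the constructor signature is
# defined in PipelineBase (not shown here) and may differ; the 'settings.ini'
# path is a hypothetical placeholder:
#
#   tp = TranscriptomePipeline('settings.ini')
#   tp.prepare_genome()
#   tp.trim_fastq()
#   tp.run_tophat()
#   tp.run_samtools()
#   tp.run_htseq_count()
#   tp.htseq_to_matrix()
#   tp.normalize_rpkm()
#   tp.normalize_tpm()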