Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
LSTrAP/pipeline/transcriptome.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
321 lines (239 sloc)
13.7 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import subprocess | |
import os | |
from cluster import wait_for_job | |
from utils.matrix import read_matrix, write_matrix, normalize_matrix_counts, normalize_matrix_length | |
from pipeline.base import PipelineBase | |
from pipeline.check.quality import htseq_count_quality | |
class TranscriptomePipeline(PipelineBase): | |
""" | |
TranscriptomePipeline class. Reads a settings ini file and runs the transcriptome pipeline | |
""" | |
def prepare_genome(self):
    """
    Build a bowtie index for each genome on the cluster.

    One bowtie-build job is submitted per genome (paths come from the
    data/settings files); the method blocks until all jobs finish, then
    removes the submission script and the cluster's OUT_ files.
    """
    script, job = self.write_submission_script("bowtie_build_%d",
                                               self.bowtie_module,
                                               self.bowtie_build_cmd,
                                               "bowtie_build_%d.sh")

    for genome in self.genomes:
        fasta_in = self.dp[genome]['genome_fasta']
        index_out = self.dp[genome]['bowtie_output']

        # make sure the directory that will hold the index exists
        os.makedirs(os.path.dirname(index_out), exist_ok=True)

        subprocess.call(["qsub", "-v", "in=%s,out=%s" % (fasta_in, index_out), script])

    print("Preparing the genomic fasta file...")

    # block until every bowtie-build job has completed
    wait_for_job(job)

    # clean up: submission script and cluster OUT_ files
    os.remove(script)
    PipelineBase.clean_out_files(job)

    print("Done\n\n")
def trim_fastq(self):
    """
    Run Trimmomatic on all fastq files.

    Files ending in .fq.gz / .fastq.gz are collected per genome. Mate
    pairs (sample_1.* / sample_2.*) are submitted together to the
    paired-end command; every other file is submitted to the single-end
    command. One cluster job is submitted per (pair of) file(s); the
    method blocks until all jobs finish, then removes the submission
    scripts and the cluster's OUT_ files.
    """
    filename_se, jobname = self.write_submission_script("trimmomatic_%d",
                                                        None,
                                                        self.trimmomatic_se_cmd,
                                                        "trimmomatic_se_%d.sh")

    filename_pe, jobname = self.write_submission_script("trimmomatic_%d",
                                                        None,
                                                        self.trimmomatic_pe_cmd,
                                                        "trimmomatic_pe_%d.sh")

    def insert_tag(name, tag):
        # sample.fq.gz -> sample.<tag>.fq.gz (and likewise for .fastq.gz)
        if name.endswith('.fq.gz'):
            return name.replace('.fq.gz', '.' + tag + '.fq.gz')
        return name.replace('.fastq.gz', '.' + tag + '.fastq.gz')

    def submit_single(input_dir, output_dir, name):
        # single-end job: one input file, one trimmed output file
        # (the original had this branch duplicated verbatim)
        print('Submitting single %s' % name)
        outfile = insert_tag(name, 'trimmed')
        subprocess.call(["qsub", "-v",
                         "in=" + os.path.join(input_dir, name) + ",out=" + os.path.join(output_dir, outfile),
                         filename_se])

    for g in self.genomes:
        fastq_input_dir = self.dp[g]['fastq_dir']
        trimmed_output = self.dp[g]['trimmomatic_output']
        os.makedirs(trimmed_output, exist_ok=True)

        fastq_files = [f for f in os.listdir(fastq_input_dir)
                       if f.endswith(('.fq.gz', '.fastq.gz'))]
        # sort required to make sure _1 files are before _2
        fastq_files.sort()

        while fastq_files:
            file = fastq_files.pop(0)
            pair_file = file.replace('_1.', '_2.')
            if '_1.' in file and pair_file in fastq_files:
                # paired-end: submit both mates in one job
                fastq_files.remove(pair_file)

                ina = os.path.join(fastq_input_dir, file)
                inb = os.path.join(fastq_input_dir, pair_file)

                outap = os.path.join(trimmed_output, insert_tag(file, 'trimmed.paired'))
                outau = os.path.join(trimmed_output, insert_tag(file, 'trimmed.unpaired'))
                outbp = os.path.join(trimmed_output, insert_tag(pair_file, 'trimmed.paired'))
                outbu = os.path.join(trimmed_output, insert_tag(pair_file, 'trimmed.unpaired'))

                print('Submitting pair %s, %s' % (file, pair_file))
                subprocess.call(["qsub", "-v",
                                 "ina=%s,inb=%s,outap=%s,outau=%s,outbp=%s,outbu=%s" %
                                 (ina, inb, outap, outau, outbp, outbu),
                                 filename_pe])
            else:
                submit_single(fastq_input_dir, trimmed_output, file)

    print('Trimming fastq files...')

    # wait for all jobs to complete
    wait_for_job(jobname, sleep_time=1)

    # remove the submission scripts
    os.remove(filename_se)
    os.remove(filename_pe)

    # remove OUT_ files
    PipelineBase.clean_out_files(jobname)

    print("Done\n\n")
def run_tophat(self):
    """
    Map trimmed reads to the bowtie-indexed genome with TopHat.

    Trimmed fastq files are split into paired (".paired.*") and
    single-end sets per genome; unpaired leftovers from Trimmomatic
    (".unpaired.*") are ignored. One TopHat job is submitted per sample;
    the method blocks until all jobs finish, then cleans up.
    """
    script_se, job = self.write_submission_script("tophat_%d",
                                                  self.bowtie_module + ' ' + self.tophat_module,
                                                  self.tophat_se_cmd,
                                                  "tophat_se_%d.sh")

    script_pe, job = self.write_submission_script("tophat_%d",
                                                  self.bowtie_module + ' ' + self.tophat_module,
                                                  self.tophat_pe_cmd,
                                                  "tophat_pe_%d.sh")

    print('Mapping reads with tophat...')

    for genome in self.genomes:
        tophat_output = self.dp[genome]['tophat_output']
        bowtie_output = self.dp[genome]['bowtie_output']
        trimmed_dir = self.dp[genome]['trimmomatic_output']
        os.makedirs(tophat_output, exist_ok=True)

        paired, single = [], []
        for fq in os.listdir(trimmed_dir):
            if fq.endswith(('.paired.fq.gz', '.paired.fastq.gz')):
                paired.append(fq)
            elif not fq.endswith(('.unpaired.fq.gz', '.unpaired.fastq.gz')):
                single.append(fq)

        # sort required to make sure _1 files are before _2
        paired.sort()
        single.sort()

        for fq in paired:
            # only the forward mate triggers a submission; the reverse
            # mate is derived from its name
            if '_1.trimmed.paired.' not in fq:
                continue
            mate = fq.replace('_1.trimmed.paired.', '_2.trimmed.paired.')
            sample = fq.replace('_1.trimmed.paired.fq.gz', '').replace('_1.trimmed.paired.fastq.gz', '')
            out_dir = os.path.join(tophat_output, sample)
            forward = os.path.join(trimmed_dir, fq)
            reverse = os.path.join(trimmed_dir, mate)

            print('Submitting pair %s, %s' % (fq, mate))
            subprocess.call(["qsub", "-v",
                             "out=%s,genome=%s,forward=%s,reverse=%s" %
                             (out_dir, bowtie_output, forward, reverse),
                             script_pe])

        for fq in single:
            print('Submitting single %s' % fq)
            sample = fq.replace('.trimmed.fq.gz', '').replace('.trimmed.fastq.gz', '')
            out_dir = os.path.join(tophat_output, sample)
            subprocess.call(["qsub", "-v",
                             "out=%s,genome=%s,fq=%s" %
                             (out_dir, bowtie_output, os.path.join(trimmed_dir, fq)),
                             script_se])

    # wait for all jobs to complete
    wait_for_job(job, sleep_time=1)

    # remove the submission scripts
    os.remove(script_se)
    os.remove(script_pe)

    # remove OUT_ files
    PipelineBase.clean_out_files(job)

    print("Done\n\n")
def run_samtools(self):
    """
    Convert TopHat output (accepted_hits.bam in each sample directory)
    to a sam file, one cluster job per sample.

    Samples without an accepted_hits.bam are skipped silently. Blocks
    until all jobs finish, then removes the submission script and the
    cluster's OUT_ files.
    """
    filename, jobname = self.write_submission_script("samtools_%d",
                                                     self.samtools_module,
                                                     self.samtools_cmd,
                                                     "samtools_%d.sh")

    for g in self.genomes:
        tophat_output = self.dp[g]['tophat_output']
        samtools_output = self.dp[g]['samtools_output']
        os.makedirs(samtools_output, exist_ok=True)

        # each subdirectory of the tophat output is one sample
        dirs = [o for o in os.listdir(tophat_output) if os.path.isdir(os.path.join(tophat_output, o))]

        for d in dirs:
            bam_file = os.path.join(tophat_output, d, 'accepted_hits.bam')
            if os.path.exists(bam_file):
                sam_file = os.path.join(samtools_output, d + '.sam')
                # bug fix: the conversion is bam -> sam; the original
                # message printed the two paths in the wrong order
                print("Converting %s to %s" % (bam_file, sam_file))
                subprocess.call(["qsub", "-v", "out=%s,bam=%s" % (sam_file, bam_file), filename])

    # wait for all jobs to complete
    wait_for_job(jobname, sleep_time=1)

    # remove the submission script
    os.remove(filename)

    # remove OUT_ files
    PipelineBase.clean_out_files(jobname)

    print("Done\n\n")
def run_htseq_count(self):
    """
    Count, per gene, the reads mapping to it using htseq-count.

    For every genome one cluster job is submitted per .sam file, passing
    the gff file plus the configured feature and id-field through to the
    htseq-count command. Blocks until all jobs finish, then cleans up.
    """
    script, job = self.write_submission_script("htseq_count_%d",
                                               self.python_module,
                                               self.htseq_count_cmd,
                                               "htseq_count_%d.sh")

    for genome in self.genomes:
        sam_dir = self.dp[genome]['samtools_output']
        out_dir = self.dp[genome]['htseq_output']
        os.makedirs(out_dir, exist_ok=True)

        gff_file = self.dp[genome]['gff_file']
        feature = self.dp[genome]['gff_feature']
        field = self.dp[genome]['gff_id']

        sam_names = [f for f in os.listdir(sam_dir) if f.endswith('.sam')]

        for sam_name in sam_names:
            sam_path = os.path.join(sam_dir, sam_name)
            out_path = os.path.join(out_dir, sam_name.replace('.sam', '.htseq'))
            subprocess.call(["qsub", "-v",
                             "feature=%s,field=%s,sam=%s,gff=%s,out=%s" %
                             (feature, field, sam_path, gff_file, out_path),
                             script])

    # wait for all jobs to complete
    wait_for_job(job, sleep_time=1)

    # remove the submission script
    os.remove(script)

    # remove OUT_ files
    PipelineBase.clean_out_files(job)

    print("Done\n\n")
def htseq_to_matrix(self):
    """
    Merge all .htseq count files of a genome into one expression matrix.

    Samples failing the quality check (htseq_count_quality) contribute
    no counts but still appear as columns; gene/sample combinations with
    no value are written as 0. htseq's summary rows (no_feature,
    ambiguous, ...) are excluded from the matrix.
    """
    skip_fields = ['no_feature', 'ambiguous', 'too_low_aQual', 'not_aligned', 'alignment_not_unique']

    for genome in self.genomes:
        htseq_dir = self.dp[genome]['htseq_output']
        os.makedirs(os.path.dirname(htseq_dir), exist_ok=True)

        htseq_files = [f for f in os.listdir(htseq_dir) if f.endswith('.htseq')]

        # counts[gene][sample] = count (as string, straight from the file)
        counts = {}
        for sample in htseq_files:
            path = os.path.join(htseq_dir, sample)
            # skip samples that fail the quality check
            if not htseq_count_quality(path, 1):
                continue
            with open(path, "r") as handle:
                for line in handle:
                    gene, value = line.strip().split('\t')
                    counts.setdefault(gene, {})[sample] = value

        matrix_path = self.dp[genome]['exp_matrix_output']
        with open(matrix_path, "w") as out:
            print('gene\t' + '\t'.join(htseq_files), file=out)
            for gene, per_sample in counts.items():
                # drop htseq's special summary counters
                if any(bad in gene for bad in skip_fields):
                    continue
                row = [per_sample.get(sample, '0') for sample in htseq_files]
                print(gene + '\t' + '\t'.join(row), file=out)

    print("Done\n\n")
def normalize_rpkm(self):
    """
    Write a normalized version of each genome's expression matrix:
    counts are normalized first (normalize_matrix_counts), then by
    sequence length against the CDS fasta (normalize_matrix_length).
    """
    for genome in self.genomes:
        matrix, conditions = read_matrix(self.dp[genome]['exp_matrix_output'])
        count_normalized = normalize_matrix_counts(matrix, conditions)
        fully_normalized = normalize_matrix_length(count_normalized, self.dp[genome]['cds_fasta'])
        write_matrix(self.dp[genome]['exp_matrix_rpkm_output'], conditions, fully_normalized)
def normalize_tpm(self):
    """
    Write a normalized version of each genome's expression matrix:
    values are normalized by sequence length first
    (normalize_matrix_length), then by counts (normalize_matrix_counts)
    — the reverse order of normalize_rpkm.
    """
    for genome in self.genomes:
        matrix, conditions = read_matrix(self.dp[genome]['exp_matrix_output'])
        length_normalized = normalize_matrix_length(matrix, self.dp[genome]['cds_fasta'])
        fully_normalized = normalize_matrix_counts(length_normalized, conditions)
        write_matrix(self.dp[genome]['exp_matrix_tpm_output'], conditions, fully_normalized)