Skip to content
Permalink
master
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
from Bio import SeqIO
import os
codon_table = {
'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M',
'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T',
'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K',
'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R',
'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L',
'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P',
'CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q',
'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R',
'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V',
'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A',
'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E',
'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G',
'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S',
'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L',
'TAC':'Y', 'TAT':'Y', 'TAA':'_', 'TAG':'_',
'TGC':'C', 'TGT':'C', 'TGA':'_', 'TGG':'W',
}
def determine_utrs(gene):
utr_file = open(snakemake.input[0],"r")
utr_lines = utr_file.readlines()
utr_file.close()
gene_utr = dict()
for line in utr_lines:
if (line.startswith(">")):
trans_id = line[1:].strip()
if (trans_id not in gene_utr):
gene_utr[trans_id] = []
else:
gene_utr[trans_id].append(line.strip())
return gene_utr
def score(seq,start):
kozak = {
"A":[0.25,0.61,0.27,0.15,1.00,0.00,0.00,0.23],
"C":[0.53,0.02,0.49,0.55,0.00,0.00,0.00,0.16],
"G":[0.15,0.36,0.13,0.21,0.00,0.00,1.00,0.46],
"T":[0.07,0.01,0.11,0.09,0.00,1.00,0.00,0.15]
}
score = 1.0
for i in range(start,len(seq)):
score *= kozak[seq[i]][i]
return score
def translate(seq, i, utr_regions):
translating = True
aa = ""
in_utr = False
for utr in utr_regions:
start,stop = utr
if ((start < i) and (i < stop)):
in_utr = True
while(translating):
if ((len(seq) < 3) or (in_utr)):
translating = False
aa = ""
else:
codon = seq[0:3]
if (codon_table[codon] == "_"):
translating = False
else:
aa += codon_table[codon]
seq = seq[3:]
i += 3
return aa,i
def find_utrs(seq,utr):
pos = seq.find(utr)
if (pos == -1):
if (len(utr) > 20):
for i in range(len(utr) - 1,len(utr)*5//10 - 1,-1):
pos = seq.find(utr[:i])
return pos
def translate_aa_seq(seq,enst,gene_utrs):
utr_regions = []
for utr in gene_utrs[enst]:
pos = find_utrs(seq,utr)
if (pos != -1):
utr_regions.append([pos,pos + len(utr)])
longest_aa_seq = "M"
longest_aa_seq_sc = 0
longest_aa_seq_sc_end = 0
for i in range(len(seq)):
if (seq[i:i+3] == "ATG"):
sc = score(seq[i-4:i+4],0)
aa,end = translate(seq[i:], i, utr_regions)
#print(i,seq[i-4:i+4],aa,sc, end)
if ((len(aa) > 20) and (sc > longest_aa_seq_sc) and (i > longest_aa_seq_sc_end)):
longest_aa_seq = aa
longest_aa_seq_sc = sc
longest_aa_seq_sc_end = end
return (longest_aa_seq,longest_aa_seq_sc)
def translate_aa_seq_length(seq,enst):
utr_regions = []
longest_aa_seq = "M"
for i in range(len(seq)):
if (seq[i:i+3] == "ATG"):
aa,end = translate(seq[i:], i, utr_regions)
#print(i,seq[i-4:i+4],aa, end)
if (len(aa) > len(longest_aa_seq)):
longest_aa_seq = aa
return longest_aa_seq
def find_all_aa_seqs(seq,enst,gene):
gene_utrs = determine_utrs(gene)
longest_aa_seq = translate_aa_seq_length(seq,enst)
if gene in gene_utrs:
for utr in gene_utrs[gene]:
if (find_utrs(seq,utr) != -1):
longest_aa_seq,longest_aa_seq_sc = translate_aa_seq(seq,enst,gene_utrs)
return longest_aa_seq
transcripts_filename = snakemake.input[1]
transcripts = SeqIO.index(transcripts_filename, "fasta")
output = []
gene = snakemake.params[0]
os.mkdir("/project/owlmayerTemporary/Sid/isoform_analysis/result/%s/transcripts" %(gene))
for transcript in transcripts:
seq = str(transcripts[transcript].seq).strip()
enst = str(transcripts[transcript].id).split("|")[-1].strip()
protein = find_all_aa_seqs(seq,enst,gene)
transcript_name = str(transcripts[transcript].id)
transcript_filename = transcript_name.replace("|","_")
transcript_filename = transcript_filename.replace("_","")
transcript_filename_path = "/project/owlmayerTemporary/Sid/isoform_analysis/result/%s/transcripts/%s_map_protein.fa" %(gene,transcript_filename)
transcript_file = open(transcript_filename_path, "w+")
transcript_file.write(">" + transcript_name + "\n" + protein + "\n")
transcript_file.close()