Skip to content
Permalink
master
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
import argparse
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
from matplotlib.backends.backend_pdf import PdfPages
filename = snakemake.input[0]
file = open(filename, "r")
lines = file.readlines()
file.close()
gene = snakemake.params[0]
class transcript:
def __init__(self, tcons, length, idr, ips, ss8, pss):
self.tcons = tcons
self.length = length
self.idr = idr
self.ips = ips
self.ss8 = ss8
self.pss = pss
transcripts = dict()
count = False
idr_lines = []
ips_lines = []
ss8_lines = []
pss_lines = dict()
for line in lines:
if (line.startswith(">")):
if (count):
if ((tcons not in transcripts) or ((len(idr_lines) - 1) > transcripts[tcons].length)):
transcripts[tcons] = transcript(tcons, len(idr_lines) - 1, idr_lines, ips_lines, ss8_lines, pss_lines)
count = True
new = True
idr_lines = []
ips_lines = []
ss8_lines = []
pss_lines = dict()
idr = False
ips = False
ss8 = False
pss = False
tcons = line[1:].strip().split("|")[0]
elif (line.startswith("#####IUPred2A Analysis")):
idr = True
elif (line.startswith("#####InterProScan")):
ips = True
idr = False
elif (line.startswith("#####BrewerySS8 Analysis")):
ss8 = True
ips = False
elif (line.startswith("#####PrositeScan Analysis")):
pss = True
ss8 = False
elif (idr):
idr_lines.append(line.strip().split("\t"))
elif (ips):
ips_lines.append(line.strip().split("\t"))
elif (ss8):
ss8_lines.append(line.strip().split("\t"))
elif (pss):
if (line.startswith("#")):
pss_id = line.strip().split(" ")[3]
pss_lines[pss_id] = []
else:
pss_lines[pss_id].append(line.strip().replace(" "," ").split(" "))
if ((tcons not in transcripts) or ((len(idr_lines) - 1) > transcripts[tcons].length)):
transcripts[tcons] = transcript(tcons, len(idr_lines) - 1, idr_lines, ips_lines, ss8_lines, pss_lines)
longest_length = 0
longest_tcon = ""
for ids in transcripts:
if (transcripts[ids].length > longest_length):
longest_length = transcripts[ids].length
longest_tcon = ids
# PLOTTING
colors=['#e6194b', '#3cb44b', '#ffe119', '#4363d8', '#f58231', '#911eb4', '#46f0f0',
'#f032e6', '#bcf60c', '#fabebe', '#008080', '#e6beff', '#9a6324', '#fffac8',
'#800000', '#aaffc3', '#808000', '#ffd8b1', '#000075', '#808080', '#ffffff']
ss3_abbvs = ["H","E","C"]
aa_abbvs = ["A","C","D","E","F","G","H","I","K","L","M","N","P","Q","R","S","T","V","W","Y"]
def preprocessArguments(args):
if args.geneNames != '':
gene_id='gene_name'
with open(args.geneNames,'r') as f:
targets=[i.strip() for i in f]
elif args.geneIDs != '':
gene_id='gene_id'
with open(args.geneIDs,'r') as f:
targets=[i.strip() for i in f]
annotation=pd.read_csv(args.gtf, delimiter='\t', header=None, usecols=[0,2,3,4,6,8],
names=['chrm','type','start','stop','strand','more'])
annotation['transcript_id']=annotation.apply(lambda x:
x['more'].split('transcript_id "')[1].split('"')[0],1)
annotation=annotation.drop(columns='more')
data=pd.read_csv(args.csv)
samples=data.columns[~(data.columns.str.startswith('feature')|data.columns.str.startswith('gene')|data.columns.str.startswith('transcript'))]
#conditions=list(set([x.split('_')[0] for x in samples]))
#conditions = ["0","3","5"]
conditions = ["day0","day3","day5"]
conditions.sort()
number_replicates={}
numerical=True
for cond in conditions:
number_replicates[cond]=samples.str.startswith(cond).sum()
try:
float(cond)
except:
numerical=False
x=np.arange(len(conditions))
if numerical:
x=[float(cond) for cond in conditions]
return gene_id, targets, annotation, data, samples, conditions, number_replicates, x
def calculateStatistics(df,conds,nreps):
for cond in conds:
df['mean'+cond]=df.filter(like=cond+'_').mean(1)
df['stdn'+cond]=df.filter(like=cond+'_').std(1)/np.sqrt(nreps[cond])
df=df.sort_index()
return df
def chooseIsoforms2Plot(df,minTPM,minPct,maxIso,annotation):
df['minimum']=df.filter(regex='^mean').min(axis=1)
df=df[df['minimum']>minTPM]
df['maximumPct']=df.filter(regex='^Pct').min(axis=1)
df=df[df['maximumPct']>minPct]
df['maximum']=df.filter(regex='^mean').max(axis=1)
df=df.sort_values('maximum',ascending=False)
df=df.head(maxIso)
return df
def plotProfiles(x, df, df_gene, ax, colors, total=True):
if total:
plt.errorbar(x,df_gene.filter(like='mean').iloc[0],yerr=df_gene.filter(like='stdn').iloc[0],color='black',linewidth=2, label = "Total Expression")
for j in range(df.shape[0]):
row=df.iloc[j]
plt.errorbar(x+np.random.normal(0, 0.03, len(x)),row.filter(regex='^mean'),yerr=row.filter(like='stdn'),color=colors[j],linewidth=2, label = "")
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_bounds(0,ax.get_yticks()[-2])
ax.spines['bottom'].set_bounds(min(x),max(x))
plt.xlabel("Day")
plt.ylabel("Normalized DeSeq2 TPM")
plt.legend(loc = "upper center", bbox_to_anchor=(0.5,1), frameon=False)
def plotStacked(x,df,ax,colors):
plt.stackplot(x,df.filter(regex='^Pct').values,colors=colors[:df.shape[0]])
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_bounds(0,100)
ax.spines['bottom'].set_bounds(min(x),max(x))
ax.axis([min(x),max(x),0,100])
plt.xlabel("Day")
plt.ylabel("TPM Percentage (out of 100%)")
def prepareAnnotation(annotation,df):
cut=annotation[annotation['transcript_id'].isin(df['transcript_id'].values)]
strand=cut.iloc[0]['strand']
if strand=='+':
start=cut['start'].min()
cut['plot_start']=cut['start']-start
cut['plot_stop']=cut['stop']-start
else:
start=cut['stop'].max()
cut['plot_start']=(cut['start']-start)*(-1)
cut['plot_stop']=(cut['stop']-start)*(-1)
return cut
def plotAnnotation(annotation, df, plt, colors, length):
transcripts_ids = []
transcripts_pos = []
chrm = ""
longest = annotation.loc[annotation['plot_stop'].idxmax()]["plot_stop"]
count = 3
panels = df_temp.shape[0] + 1
for j in range(df.shape[0]):
ax=plt.subplot(panels,2,(count,count+1))
ax.set_xlim((-50, length))
ax.set_ylim((-0.85, 1.85))
transcript_annotation=annotation[annotation['transcript_id']==df.iloc[j]["transcript_id"]]
transcripts_ids.append(df.iloc[j]["transcript_id"])
transcripts_pos.append(df.shape[0]-j)
if (transcript_annotation.shape[0] > 2):
for idx,row in transcript_annotation.iterrows():
chrm = row["chrm"]
if row['type']=='transcript':
plt.plot([row['plot_start']*length/longest,row['plot_stop']*length/longest],[1.75,1.75],color=colors[j],linewidth=2)
else:
plt.plot([row['plot_start']*length/longest,row['plot_stop']*length/longest],[1.75,1.75],color=colors[j],linewidth=10)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.spines['bottom'].set_bounds(0,ax.get_xticks()[-2])
#plt.yticks(transcripts_pos,transcripts_ids)
#plt.xlabel(chrm)
count += 2
def plotCodingPotential(plt, panels, df_temp):
count = 3
for ids in list(df_temp["transcript_id"]):
ax = plt.subplot(panels,2,(count,count+1))
flip = True
tcons_curr = transcripts[ids]
# Pfam domain
for i in tcons_curr.ips:
start = int(i[3])
stop = int(i[4])
if (i[1] == "Pfam"):
name = i[8].split(";")[3].split("=")[-1]
plt.gca().add_patch(Rectangle((start,1.05),stop-start,0.2,edgecolor="#3cb44b",facecolor='#3cb44b'))
if (flip):
ax.annotate(name,(start + (stop-start)/2.0,1.2), fontsize = 14, color = "#e6194b", ha = "center", va = "center")
else:
ax.annotate(name,(start + (stop-start)/2.0,1.1), fontsize = 14, color = "#e6194b", ha = "center", va = "center")
flip = not flip
ax.annotate("Pfam Domain",(-50,1.15), fontsize = 12, color = "#3cb44b", ha = "center", va = "center")
# Secondary structure prediction
ss8_df = pd.DataFrame(tcons_curr.ss8[1:], columns = tcons_curr.ss8[0])
ss = list(ss8_df["SS"])
for i in range(tcons_curr.length):
plt.gca().add_patch(Rectangle((i,-0.4),1,0.35,edgecolor=colors[ss3_abbvs.index(ss[i])],facecolor=colors[ss3_abbvs.index(ss[i])]))
plt.plot((0,longest_length),(1,1),color = "black")
plt.plot((0,longest_length),(0,0),color = "black")
plt.plot((-1,-1),(1.5,-0.9),color = "black")
ax.annotate("SS Prediction",(-50,-0.2), fontsize = 12, color = "#3cb44b", ha = "center", va = "center")
# Amino acid sequence
aa = list(ss8_df["AA"])
for i in range(tcons_curr.length):
plt.gca().add_patch(Rectangle((i,-0.8),1,0.35,edgecolor=colors[aa_abbvs.index(aa[i])],facecolor=colors[aa_abbvs.index(aa[i])]))
ax.annotate("AA Sequence",(-50,-0.6), fontsize = 12, color = "#3cb44b", ha = "center", va = "center")
#plt.plot((0,longest_length),(1,1),color = "black")
#plt.plot((0,longest_length),(0,0),color = "black")
#plt.plot((-1,-1),(1.5,-0.5),color = "black")
# IDR prediction
idr_df = pd.DataFrame(tcons_curr.idr[1:], columns = tcons_curr.idr[0])
idr_df = idr_df.astype({'# POS': 'int32',"IUPRED2":"float"})
plt.plot(idr_df["# POS"],idr_df["IUPRED2"])
ax.annotate("IDR Prediction",(-50,0.5), fontsize = 12, color = "#3cb44b", ha = "center", va = "center")
# Phosphorlyation Site
buffer = 0
for site_type in tcons_curr.pss:
for i in tcons_curr.pss[site_type]:
start = int(i[0])
stop = int(i[2])
plt.gca().add_patch(Rectangle((start,1.3+buffer),stop-start,0.025,edgecolor="black",facecolor='black'))
ax.annotate(site_type,(-50,1.3 + buffer), fontsize = 9, color = "#e6194b", ha = "center", va = "center")
#plt.gca().add_artist(plt.Circle((start + (stop - start)/2,1.35),0.25,color="black"))
buffer += 0.075
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.set_yticks([], [])
ax.title.set_text(tcons_curr.tcons)
count += 2
class Parser(object):
def __init__(self, csv, gtf, geneIDs, geneNames, outDir, minTPM, maxIso, minPct):
self.csv = csv
self.gtf = gtf
self.geneIDs = geneIDs
self.geneNames = geneNames
self.outDir = outDir
self.minTPM = minTPM
self.maxIso = maxIso
self.minPct = minPct
args = Parser('/project/owlmayerTemporary/Sid/nanopore-analysis/Results_5_1/Quantification/all_counts_deseq2norm.txt',
'/project/owlmayerTemporary/Sid/nanopore-analysis/Results_5_1/GffCompare/nanopore.combined.gtf',
'', '/home/annaldas/projects/isoform_differentiation/test/list.txt',
'/home/annaldas/projects/isoform_differentiation/test/',0,18,0)
outdir=args.outDir
minimumTPM = args.minTPM
minimumPct = args.minPct
maximumIso = args.maxIso
(identifier, targets, annotation, data, samples, conditions, number_replicates, x) = preprocessArguments(args)
df=data[data[identifier]==gene]
df_temp = df
for j in range(df.shape[0]):
transcript_annotation=annotation[annotation['transcript_id']==df.iloc[j]["transcript_id"]]
if (transcript_annotation.shape[0] < 3):
df_temp = df_temp[df_temp["transcript_id"] != df.iloc[j]["transcript_id"]]
# total gene expression calculation
data_gene=df_temp[samples].sum().to_frame().transpose()
data_gene=calculateStatistics(data_gene,conditions,number_replicates)
# mean transcript expression calculation
df_temp=calculateStatistics(df_temp,conditions,number_replicates)
# isoform percentage calculation
df_temp=(df_temp.filter(like='mean').div(data_gene.filter(like='mean').values[0],1)*100).add_prefix('Pct_').join(df_temp)
#choose isoforms to plot
df_temp=chooseIsoforms2Plot(df_temp,minimumTPM,minimumPct,maximumIso,annotation)
x = [0,3,5]
if df_temp.shape[0]:
panels = df_temp.shape[0] + 1
fig,axes = plt.subplots(panels,2,figsize = (18,24))
fig.subplots_adjust(top = 0.95)
fig.suptitle(gene,fontsize=16)
#plot isoform expression
axg=plt.subplot(panels,2,1)
plotProfiles(x, df_temp, data_gene, axg, colors)
#plot isoform expression percentage
axt=plt.subplot(panels,2,2)
plotStacked(x,df_temp,axt,colors)
#prepare annotation
annotation_cut=prepareAnnotation(annotation,df_temp)
#plot annotation
#axa=plt.subplot(panels,2,(3,4))
plotAnnotation(annotation_cut, df_temp, plt, colors,longest_length)
plotCodingPotential(plt,panels,df_temp)
fig.savefig(snakemake.output[0])