Skip to content
Permalink
master
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import argparse
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
from matplotlib.patches import Rectangle
from matplotlib.backends.backend_pdf import PdfPages
colors=['#e6194b', '#3cb44b', '#ffe119', '#4363d8', '#f58231', '#911eb4', '#46f0f0',
'#f032e6', '#bcf60c', '#fabebe', '#008080', '#e6beff', '#9a6324', '#fffac8',
'#800000', '#aaffc3', '#808000', '#ffd8b1', '#000075', '#808080', '#ffffff']
def calculateStatistics(df,conds,nreps):
for cond in conds:
df['mean'+cond]=df.filter(like=cond+'_').mean(1)
df['stdn'+cond]=df.filter(like=cond+'_').std(1)/np.sqrt(nreps[cond])
df=df.sort_index()
return df
def chooseIsoforms2Plot(df,minTPM,minPct,maxIso,annotation):
df['minimum']=df.filter(regex='^mean').min(axis=1)
df=df[df['minimum']>minTPM]
df['maximumPct']=df.filter(regex='^Pct').min(axis=1)
df=df[df['maximumPct']>minPct]
df['maximum']=df.filter(regex='^mean').max(axis=1)
df=df.sort_values('maximum',ascending=False)
df=df.head(maxIso)
return df
def prepareAnnotation(annotation,df):
cut=annotation[annotation['transcript_id'].isin(df['transcript_id'].values)]
strand=cut.iloc[0]['strand']
if strand=='+':
start=cut['start'].min()
cut['plot_start']=cut['start']-start
cut['plot_stop']=cut['stop']-start
else:
start=cut['stop'].max()
cut['plot_start']=(cut['start']-start)*(-1)
cut['plot_stop']=(cut['stop']-start)*(-1)
return cut
def plotProfiles(x, df, df_gene, ax, colors, total=True):
if total:
plt.errorbar(x,df_gene.filter(like='mean').iloc[0],yerr=df_gene.filter(like='stdn').iloc[0],color='black',linewidth=2, label = "Total Expression")
for j in range(df.shape[0]):
row=df.iloc[j]
plt.errorbar(x+np.random.normal(0, 0.03, len(x)),row.filter(regex='^mean'),yerr=row.filter(like='stdn'),color=colors[j],linewidth=2, label = "")
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_bounds(0,ax.get_yticks()[-2])
ax.spines['bottom'].set_bounds(min(x),max(x))
plt.xlabel("Day")
plt.ylabel("Normalized DeSeq2 TPM")
plt.legend(loc = "upper center", bbox_to_anchor=(0.5,1), frameon=False)
def plotStacked(x,df,ax,colors):
plt.stackplot(x,df.filter(regex='^Pct').values,colors=colors[:df.shape[0]])
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_bounds(0,100)
ax.spines['bottom'].set_bounds(min(x),max(x))
ax.axis([min(x),max(x),0,100])
plt.xlabel("Day")
plt.ylabel("TPM Percentage (out of 100%)")
def plotAnnotation(annotation, df, ax, colors):
transcripts_ids = []
transcripts_pos = []
chrm = ""
for j in range(df.shape[0]):
transcript_annotation=annotation[annotation['transcript_id']==df.iloc[j]["transcript_id"]]
transcripts_ids.append(df.iloc[j]["transcript_id"])
transcripts_pos.append(df.shape[0]-j)
if (transcript_annotation.shape[0] > 2):
for idx,row in transcript_annotation.iterrows():
chrm = row["chrm"]
if row['type']=='transcript':
plt.plot([row['plot_start'],row['plot_stop']],[df.shape[0]-j,df.shape[0]-j],color=colors[j],linewidth=2)
else:
plt.plot([row['plot_start'],row['plot_stop']],[df.shape[0]-j,df.shape[0]-j],color=colors[j],linewidth=10)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.spines['bottom'].set_bounds(0,ax.get_xticks()[-2])
plt.yticks(transcripts_pos,transcripts_ids)
plt.xlabel(chrm)
#def getarg():
# parser = argparse.ArgumentParser(description = "To be added")
# parser.add_argument("-i", "--inFile", help = "Example: /path/nameofthefile without the suffix [pos|neg].sig", required = True)
# parser.add_argument("-l", "--targetList", help = "Example: /path/nameofthefile without the suffix [pos|neg].sig")
# parser.add_argument("-t", "--targetType", help = "Example: /path/nameofthefile without the suffix [pos|neg].sig")
# parser.add_argument("-m", "--tpmMin", help = "Example: /path/nameofthefile without the suffix [pos|neg].sig")
# parser.add_argument("-o", "--outFile", help = "Example: /path/", required = True)
# arg = parser.parse_args()
# return arg.inFile, arg.targetList, arg.targetType, arg.outFile, arg.tpmMin
def preprocessArguments(args):
if args.geneNames != '':
gene_id='gene_name'
with open(args.geneNames,'r') as f:
targets=[i.strip() for i in f]
elif args.geneIDs != '':
gene_id='gene_id'
with open(args.geneIDs,'r') as f:
targets=[i.strip() for i in f]
annotation=pd.read_csv(args.gtf, delimiter='\t', header=None, usecols=[0,2,3,4,6,8],
names=['chrm','type','start','stop','strand','more'])
annotation['transcript_id']=annotation.apply(lambda x:
x['more'].split('transcript_id "')[1].split('"')[0],1)
annotation=annotation.drop(columns='more')
data=pd.read_csv(args.csv)
samples=data.columns[~(data.columns.str.startswith('feature')|data.columns.str.startswith('gene')|data.columns.str.startswith('transcript'))]
#conditions=list(set([x.split('_')[0] for x in samples]))
#conditions = ["0","3","5"]
conditions = ["day0","day3","day5"]
conditions.sort()
number_replicates={}
numerical=True
for cond in conditions:
number_replicates[cond]=samples.str.startswith(cond).sum()
try:
float(cond)
except:
numerical=False
x=np.arange(len(conditions))
if numerical:
x=[float(cond) for cond in conditions]
return gene_id, targets, annotation, data, samples, conditions, number_replicates, x
class Parser(object):
def __init__(self, csv, gtf, geneIDs, geneNames, outDir, minTPM, maxIso, minPct):
self.csv = csv
self.gtf = gtf
self.geneIDs = geneIDs
self.geneNames = geneNames
self.outDir = outDir
self.minTPM = minTPM
self.maxIso = maxIso
self.minPct = minPct
#args = Parser('/home/annaldas/projects/nanopore-transcriptome-analysis/Results/Quantification/all_counts.txt',
# '//home/annaldas/projects/nanopore-transcriptome-analysis/Results/GffCompare/nanopore.combined.gtf',
# '', '/home/annaldas/projects/isoform_differentiation/test_old/list.txt',
# '/home/annaldas/projects/isoform_differentiation/test_old/',0,18,1)
args = Parser('/project/owlmayerTemporary/Sid/nanopore-analysis/Results_5_1/Quantification/all_counts_deseq2norm.txt',
'/project/owlmayerTemporary/Sid/nanopore-analysis/Results_5_1/GffCompare/nanopore.combined.gtf',
'', '/home/annaldas/projects/isoform_differentiation/test/list.txt',
'/home/annaldas/projects/isoform_differentiation/test/',0,18,0)
#Parameters
outdir=args.outDir
minimumTPM = args.minTPM
minimumPct = args.minPct
maximumIso = args.maxIso
(identifier, targets, annotation, data, samples, conditions, number_replicates, x) = preprocessArguments(args)
with PdfPages(outdir+'test.pdf') as pdf:
for gene_ensembl in targets:
gene = gene_ensembl.split("_")[0]
print(gene)
df=data[data[identifier]==gene]
if not df.shape[0]:
continue
df_temp = df
for j in range(df.shape[0]):
transcript_annotation=annotation[annotation['transcript_id']==df.iloc[j]["transcript_id"]]
if (transcript_annotation.shape[0] < 3):
df_temp = df_temp[df_temp["transcript_id"] != df.iloc[j]["transcript_id"]]
# total gene expression calculation
data_gene=df_temp[samples].sum().to_frame().transpose()
data_gene=calculateStatistics(data_gene,conditions,number_replicates)
# mean transcript expression calculation
df_temp=calculateStatistics(df_temp,conditions,number_replicates)
# isoform percentage calculation
df_temp=(df_temp.filter(like='mean').div(data_gene.filter(like='mean').values[0],1)*100).add_prefix('Pct_').join(df_temp)
#choose isoforms to plot
print(df_temp)
df_temp=chooseIsoforms2Plot(df_temp,minimumTPM,minimumPct,maximumIso,annotation)
x = [0,3,5]
if df_temp.shape[0]:
panels = 2
if (df_temp.shape[0] > 9):
panels += 1
fig,axes = plt.subplots(panels,2,figsize = (12,9))
fig.subplots_adjust(top = 0.9)
fig.suptitle(gene_ensembl,fontsize=16)
#plot isoform expression
axg=plt.subplot(panels,2,1)
plotProfiles(x, df_temp, data_gene, axg, colors)
#plot isoform expression percentage
axt=plt.subplot(panels,2,2)
plotStacked(x,df_temp,axt,colors)
#prepare annotation
annotation_cut=prepareAnnotation(annotation,df_temp)
#plot annotation
if (panels == 2): axa=plt.subplot(panels,2,(3,4))
else: axa=plt.subplot(panels,2,(3,6))
plotAnnotation(annotation_cut, df_temp, axa, colors)
pdf.savefig()