interproscan_analysis.py

import argparse
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
from matplotlib.backends.backend_pdf import PdfPages


filename = snakemake.input[0]
file = open(filename, "r")
lines = file.readlines()
file.close()

gene = snakemake.params[0]

class transcript:
    def __init__(self, tcons, length, idr, ips, ss8, pss):
        self.tcons = tcons
        self.length = length
        self.idr = idr
        self.ips = ips
        self.ss8 = ss8
        self.pss = pss

transcripts = dict()

count = False
idr_lines = []
ips_lines = []
ss8_lines = []
pss_lines = dict()

for line in lines:
    if (line.startswith(">")):
        if (count):
            if ((tcons not in transcripts) or ((len(idr_lines) - 1) > transcripts[tcons].length)):
                transcripts[tcons] = transcript(tcons, len(idr_lines) - 1, idr_lines, ips_lines, ss8_lines, pss_lines)
        count = True
        new = True
        idr_lines = []
        ips_lines = []
        ss8_lines = []
        pss_lines = dict()
        idr = False
        ips = False
        ss8 = False
        pss = False
        tcons = line[1:].strip().split("|")[0]      
    elif (line.startswith("#####IUPred2A Analysis")):
        idr = True
    elif (line.startswith("#####InterProScan")):
        ips = True
        idr = False
    elif (line.startswith("#####BrewerySS8 Analysis")):
        ss8 = True
        ips = False
    elif (line.startswith("#####PrositeScan Analysis")):
        pss = True
        ss8 = False
    elif (idr):
        idr_lines.append(line.strip().split("\t"))
    elif (ips):
        ips_lines.append(line.strip().split("\t"))
    elif (ss8):
        ss8_lines.append(line.strip().split("\t"))
    elif (pss):
        if (line.startswith("#")):
            pss_id = line.strip().split(" ")[3]
            pss_lines[pss_id] = []
        else:
            pss_lines[pss_id].append(line.strip().replace("  "," ").split(" "))
            
if ((tcons not in transcripts) or ((len(idr_lines) - 1) > transcripts[tcons].length)):       
    transcripts[tcons] = transcript(tcons, len(idr_lines) - 1, idr_lines, ips_lines, ss8_lines, pss_lines)

longest_length = 0
longest_tcon = ""
for ids in transcripts:
    if (transcripts[ids].length > longest_length):
        longest_length = transcripts[ids].length
        longest_tcon = ids

# PLOTTING

colors=['#e6194b', '#3cb44b', '#ffe119', '#4363d8', '#f58231', '#911eb4', '#46f0f0',
        '#f032e6', '#bcf60c', '#fabebe', '#008080', '#e6beff', '#9a6324', '#fffac8', 
        '#800000', '#aaffc3', '#808000', '#ffd8b1', '#000075', '#808080', '#ffffff']
ss3_abbvs = ["H","E","C"]
aa_abbvs = ["A","C","D","E","F","G","H","I","K","L","M","N","P","Q","R","S","T","V","W","Y"]

def preprocessArguments(args):
    if args.geneNames != '':
        gene_id='gene_name'
        with open(args.geneNames,'r') as f:
            targets=[i.strip() for i in f]
    elif args.geneIDs != '':
        gene_id='gene_id'
        with open(args.geneIDs,'r') as f:
            targets=[i.strip() for i in f]
    annotation=pd.read_csv(args.gtf, delimiter='\t', header=None, usecols=[0,2,3,4,6,8],
                           names=['chrm','type','start','stop','strand','more'])
    annotation['transcript_id']=annotation.apply(lambda x:
        x['more'].split('transcript_id "')[1].split('"')[0],1)
    annotation=annotation.drop(columns='more')
    data=pd.read_csv(args.csv)
    samples=data.columns[~(data.columns.str.startswith('feature')|data.columns.str.startswith('gene')|data.columns.str.startswith('transcript'))]
    #conditions=list(set([x.split('_')[0] for x in samples]))
    #conditions = ["0","3","5"]
    conditions = ["day0","day3","day5"]
    conditions.sort()
    number_replicates={}
    numerical=True
    for cond in conditions:
        number_replicates[cond]=samples.str.startswith(cond).sum()
        try:
            float(cond)
        except:
            numerical=False
    x=np.arange(len(conditions))
    if numerical:
        x=[float(cond) for cond in conditions]
    return gene_id, targets, annotation, data, samples, conditions, number_replicates, x  

def calculateStatistics(df,conds,nreps):
    for cond in conds: 
        df['mean'+cond]=df.filter(like=cond+'_').mean(1)
        df['stdn'+cond]=df.filter(like=cond+'_').std(1)/np.sqrt(nreps[cond])
    df=df.sort_index()
    return df

def chooseIsoforms2Plot(df,minTPM,minPct,maxIso,annotation):
    df['minimum']=df.filter(regex='^mean').min(axis=1)
    df=df[df['minimum']>minTPM]
    df['maximumPct']=df.filter(regex='^Pct').min(axis=1)
    df=df[df['maximumPct']>minPct]
    df['maximum']=df.filter(regex='^mean').max(axis=1)
    df=df.sort_values('maximum',ascending=False)
    df=df.head(maxIso)
    return df

def plotProfiles(x, df, df_gene, ax, colors, total=True):
    if total:
        plt.errorbar(x,df_gene.filter(like='mean').iloc[0],yerr=df_gene.filter(like='stdn').iloc[0],color='black',linewidth=2, label = "Total Expression")
    for j in range(df.shape[0]):
        row=df.iloc[j]
        plt.errorbar(x+np.random.normal(0, 0.03, len(x)),row.filter(regex='^mean'),yerr=row.filter(like='stdn'),color=colors[j],linewidth=2, label = "")
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['left'].set_bounds(0,ax.get_yticks()[-2])
    ax.spines['bottom'].set_bounds(min(x),max(x))
    plt.xlabel("Day")
    plt.ylabel("Normalized DeSeq2 TPM")
    plt.legend(loc = "upper center", bbox_to_anchor=(0.5,1), frameon=False)
    
def plotStacked(x,df,ax,colors):
    plt.stackplot(x,df.filter(regex='^Pct').values,colors=colors[:df.shape[0]])
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['left'].set_bounds(0,100)
    ax.spines['bottom'].set_bounds(min(x),max(x))
    ax.axis([min(x),max(x),0,100])
    plt.xlabel("Day")
    plt.ylabel("TPM Percentage (out of 100%)")
    
def prepareAnnotation(annotation,df):
    cut=annotation[annotation['transcript_id'].isin(df['transcript_id'].values)]
    strand=cut.iloc[0]['strand']
    if strand=='+':
        start=cut['start'].min()
        cut['plot_start']=cut['start']-start
        cut['plot_stop']=cut['stop']-start
    else:
        start=cut['stop'].max()
        cut['plot_start']=(cut['start']-start)*(-1)
        cut['plot_stop']=(cut['stop']-start)*(-1) 
    return cut

def plotAnnotation(annotation, df, plt, colors, length): 
    transcripts_ids = []
    transcripts_pos = []
    chrm = ""
    longest = annotation.loc[annotation['plot_stop'].idxmax()]["plot_stop"]
    count = 3
    panels = df_temp.shape[0] + 1
    for j in range(df.shape[0]):
        ax=plt.subplot(panels,2,(count,count+1)) 
        ax.set_xlim((-50, length))
        ax.set_ylim((-0.85, 1.85))
    
        transcript_annotation=annotation[annotation['transcript_id']==df.iloc[j]["transcript_id"]] 
        transcripts_ids.append(df.iloc[j]["transcript_id"])
        transcripts_pos.append(df.shape[0]-j) 
        if (transcript_annotation.shape[0] > 2): 
            for idx,row in transcript_annotation.iterrows(): 
                chrm = row["chrm"] 
                if row['type']=='transcript': 
                    plt.plot([row['plot_start']*length/longest,row['plot_stop']*length/longest],[1.75,1.75],color=colors[j],linewidth=2) 
                else:
                    plt.plot([row['plot_start']*length/longest,row['plot_stop']*length/longest],[1.75,1.75],color=colors[j],linewidth=10) 
            ax.spines['top'].set_visible(False)
            ax.spines['right'].set_visible(False)
            ax.spines['left'].set_visible(False)
            ax.spines['bottom'].set_bounds(0,ax.get_xticks()[-2]) 
            #plt.yticks(transcripts_pos,transcripts_ids)
            #plt.xlabel(chrm)
        count += 2
        
def plotCodingPotential(plt, panels, df_temp):
    count = 3
    for ids in list(df_temp["transcript_id"]):
        ax = plt.subplot(panels,2,(count,count+1)) 
        flip = True
        tcons_curr = transcripts[ids]
        # Pfam domain
        for i in tcons_curr.ips:
            
            start = int(i[3])
            stop = int(i[4])
            
            if (i[1] == "Pfam"):
                name = i[8].split(";")[3].split("=")[-1]
                plt.gca().add_patch(Rectangle((start,1.05),stop-start,0.2,edgecolor="#3cb44b",facecolor='#3cb44b'))
                if (flip):
                    ax.annotate(name,(start + (stop-start)/2.0,1.2), fontsize = 14, color = "#e6194b", ha = "center", va = "center")
                else:
                    ax.annotate(name,(start + (stop-start)/2.0,1.1), fontsize = 14, color = "#e6194b", ha = "center", va = "center")
                flip = not flip
                
        ax.annotate("Pfam Domain",(-50,1.15), fontsize = 12, color = "#3cb44b", ha = "center", va = "center")

        # Secondary structure prediction
        ss8_df = pd.DataFrame(tcons_curr.ss8[1:], columns = tcons_curr.ss8[0])
        ss = list(ss8_df["SS"])
        for i in range(tcons_curr.length):
            plt.gca().add_patch(Rectangle((i,-0.4),1,0.35,edgecolor=colors[ss3_abbvs.index(ss[i])],facecolor=colors[ss3_abbvs.index(ss[i])]))

        plt.plot((0,longest_length),(1,1),color = "black")
        plt.plot((0,longest_length),(0,0),color = "black")
        plt.plot((-1,-1),(1.5,-0.9),color = "black")
        ax.annotate("SS Prediction",(-50,-0.2), fontsize = 12, color = "#3cb44b", ha = "center", va = "center")
        
         # Amino acid sequence
        aa = list(ss8_df["AA"])
        for i in range(tcons_curr.length):
            plt.gca().add_patch(Rectangle((i,-0.8),1,0.35,edgecolor=colors[aa_abbvs.index(aa[i])],facecolor=colors[aa_abbvs.index(aa[i])]))
        ax.annotate("AA Sequence",(-50,-0.6), fontsize = 12, color = "#3cb44b", ha = "center", va = "center")
        
        #plt.plot((0,longest_length),(1,1),color = "black")
        #plt.plot((0,longest_length),(0,0),color = "black")
        #plt.plot((-1,-1),(1.5,-0.5),color = "black")

        # IDR prediction
        idr_df = pd.DataFrame(tcons_curr.idr[1:], columns = tcons_curr.idr[0])
        idr_df = idr_df.astype({'# POS': 'int32',"IUPRED2":"float"})
        plt.plot(idr_df["# POS"],idr_df["IUPRED2"])
        ax.annotate("IDR Prediction",(-50,0.5), fontsize = 12, color = "#3cb44b", ha = "center", va = "center")

        # Phosphorlyation Site
        buffer = 0
        for site_type in tcons_curr.pss:
            for i in tcons_curr.pss[site_type]:
                start = int(i[0])
                stop = int(i[2])
                plt.gca().add_patch(Rectangle((start,1.3+buffer),stop-start,0.025,edgecolor="black",facecolor='black'))
            ax.annotate(site_type,(-50,1.3 + buffer), fontsize = 9, color = "#e6194b", ha = "center", va = "center")

                #plt.gca().add_artist(plt.Circle((start + (stop - start)/2,1.35),0.25,color="black"))
            buffer += 0.075

        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)
        ax.set_yticks([], [])

        ax.title.set_text(tcons_curr.tcons) 
        
        count += 2

class Parser(object):
    def __init__(self, csv, gtf, geneIDs, geneNames, outDir, minTPM, maxIso, minPct):
        self.csv = csv
        self.gtf = gtf
        self.geneIDs = geneIDs
        self.geneNames = geneNames
        self.outDir = outDir
        self.minTPM = minTPM
        self.maxIso = maxIso
        self.minPct = minPct

args = Parser('/project/owlmayerTemporary/Sid/nanopore-analysis/Results_5_1/Quantification/all_counts_deseq2norm.txt',
              '/project/owlmayerTemporary/Sid/nanopore-analysis/Results_5_1/GffCompare/nanopore.combined.gtf',
              '', '/home/annaldas/projects/isoform_differentiation/test/list.txt',
              '/home/annaldas/projects/isoform_differentiation/test/',0,18,0)

outdir=args.outDir
minimumTPM = args.minTPM
minimumPct = args.minPct
maximumIso = args.maxIso
(identifier, targets, annotation, data, samples, conditions, number_replicates, x) = preprocessArguments(args)        


df=data[data[identifier]==gene]
df_temp = df
for j in range(df.shape[0]):
    transcript_annotation=annotation[annotation['transcript_id']==df.iloc[j]["transcript_id"]] 
    if (transcript_annotation.shape[0] < 3):
        df_temp = df_temp[df_temp["transcript_id"] != df.iloc[j]["transcript_id"]]

# total gene expression calculation
data_gene=df_temp[samples].sum().to_frame().transpose()
data_gene=calculateStatistics(data_gene,conditions,number_replicates)
# mean transcript expression calculation
df_temp=calculateStatistics(df_temp,conditions,number_replicates)
# isoform percentage calculation
df_temp=(df_temp.filter(like='mean').div(data_gene.filter(like='mean').values[0],1)*100).add_prefix('Pct_').join(df_temp)
#choose isoforms to plot
df_temp=chooseIsoforms2Plot(df_temp,minimumTPM,minimumPct,maximumIso,annotation)
x = [0,3,5]
if df_temp.shape[0]:
    panels = df_temp.shape[0] + 1
    fig,axes = plt.subplots(panels,2,figsize = (18,24)) 
    fig.subplots_adjust(top = 0.95) 
    fig.suptitle(gene,fontsize=16) 
    
    #plot isoform expression 
    axg=plt.subplot(panels,2,1) 
    plotProfiles(x, df_temp, data_gene, axg, colors) 
    
    #plot isoform expression percentage 
    axt=plt.subplot(panels,2,2) 
    plotStacked(x,df_temp,axt,colors) 
    
    #prepare annotation
    annotation_cut=prepareAnnotation(annotation,df_temp) 
    
    #plot annotation 
    #axa=plt.subplot(panels,2,(3,4)) 
    plotAnnotation(annotation_cut, df_temp, plt, colors,longest_length) 
    
    plotCodingPotential(plt,panels,df_temp)

fig.savefig(snakemake.output[0])