diff --git a/Snakefile b/Snakefile
index 6b6bd58..e86c99f 100644
--- a/Snakefile
+++ b/Snakefile
@@ -10,17 +10,16 @@ Transcripts = config["polished_reads"]
 
 rule all:
     input:
-        expand("/home/annaldas/projects/result/{gene}/{gene}_transcripts_filtered_coding_potential_analysis.txt", gene = GENES),
-        #expand("/home/annaldas/projects/result/{gene}/{gene}_blastx_protein_analysis.txt", gene = GENES)
-        #expand("/home/annaldas/projects/result/{gene}/{gene}_sashimi.pdf", gene = GENES)
+        #expand("/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/{gene}_transcripts_filtered_coding_potential_analysis.pdf", gene = GENES),
+        #expand("//project/owlmayerTemporary/Sid/isoform_analysis//result/{gene}/{gene}_blastx_protein_analysis.txt", gene = GENES)
+        expand("/project/owlmayerTemporary/Sid/isoform_analysis/result/all/{gene}_protein_sequences.txt", gene = GENES)
 
 rule gene_transcript:
     input:
         NanoporeGTF,
         Transcripts
     output:
-        "/home/annaldas/projects/result/{gene}/{gene}_seq.fa"#,
-        #"/home/annaldas/projects/result/{gene}/{gene}_sashimi.sh"
+        "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/{gene}_seq.fa"
     params:
         gene = "{gene}"
     script:
@@ -28,10 +27,10 @@ rule gene_transcript:
         
 rule blastx:
     input:
-        gene_fa = "/home/annaldas/projects/result/{gene}/{gene}_seq.fa",
+        gene_fa = "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/{gene}_seq.fa",
         db = config["human_protein"]
     output:
-        "/home/annaldas/projects/result/{gene}/{gene}_blastx.out"
+        "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/{gene}_blastx.out"
     threads:
         4
     shell:
@@ -40,10 +39,10 @@ rule blastx:
         
 rule protein_sequence:
     input:
-        blastx = "/home/annaldas/projects/result/{gene}/{gene}_blastx.out",
+        blastx = "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/{gene}_blastx.out",
         db = config["human_protein"]
     output:
-        "/home/annaldas/projects/result/{gene}/{gene}_blastx_protein.fa"
+        "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/{gene}_blastx_protein.fa"
     shell:
         "sh protein_transcript_sequences.sh {input.blastx} {input.db} {output}"
 
@@ -53,53 +52,70 @@ rule utr_regions:
     params:
         gene = "{gene}"
     output:
-        "/home/annaldas/projects/result/{gene}/{gene}_utr_regions.bed"
+        "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/{gene}_utr_regions.bed"
     script:
         "utr_regions.py"
     
 rule utr_sequences:
     input:
         hg = config["human_genome"],
-        utr = "/home/annaldas/projects/result/{gene}/{gene}_utr_regions.bed"
+        utr = "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/{gene}_utr_regions.bed"
     output:
-        "/home/annaldas/projects/result/{gene}/{gene}_utr_regions.fa"
+        "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/{gene}_utr_regions.fa"
     shell:
         "bedtools getfasta -fi {input.hg} -bed {input.utr} -fo {output} -name"
     
-rule transcript_filter_utr:
-    input:
-        utr = "/home/annaldas/projects/result/{gene}/{gene}_utr_regions.fa",
-        seq = "/home/annaldas/projects/result/{gene}/{gene}_seq.fa"
-    output:
-        "/home/annaldas/projects/result/{gene}/{gene}_seq_filt.fa"
-    script:
-        "filter_utr.py"
+#rule transcript_filter_utr:
+#    input:
+#        utr = "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/{gene}_utr_regions.fa",
+#        seq = "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/{gene}_seq.fa"
+#    output:
+#        "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/{gene}_seq_filt.fa"
+#    script:
+#        "filter_utr.py"
 
 checkpoint mapping:
     input:
-        "/home/annaldas/projects/result/{gene}/{gene}_seq_filt.fa"
+        utr = "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/{gene}_utr_regions.fa",
+        seq = "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/{gene}_seq.fa"
     output:
-        directory("/home/annaldas/projects/result/{gene}/transcripts") #/{transcript}/{transcript}_map_protein.fa"
+        directory("/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/transcripts") #/{transcript}/{transcript}_map_protein.fa"
     params:
         gene = "{gene}"
     script:
         "translation_protein.py"
         
+def aggregate_mapping(wildcards):
+    checkpoint_output = checkpoints.mapping.get(**wildcards).output[0]
+    return expand("/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/transcripts/{transcript}_map_protein.fa",
+           gene=wildcards.gene,
+           transcript=glob_wildcards(os.path.join(checkpoint_output,"{transcript}_map_protein.fa")).transcript)
+
+rule aggregate_mapping:
+    input:
+        aggregate_mapping
+    output:
+        "/project/owlmayerTemporary/Sid/isoform_analysis/result/all/{gene}_protein_sequences.txt"
+    params:
+        gene = "{gene}"
+    shell:
+        "cat {input} > {output}"
+        
 rule iupred2a_analysis:
     input:
-        "/home/annaldas/projects/result/{gene}/transcripts/{transcript}_map_protein.fa",
+        "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/transcripts/{transcript}_map_protein.fa",
         config["iupred2a"]
     output:
-        "/home/annaldas/projects/result/{gene}/transcripts/{transcript}_transcript_sequence.txt",
-        "/home/annaldas/projects/result/{gene}/transcripts/{transcript}_map_protein_iupred2a.txt"
+        "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/transcripts/{transcript}_transcript_sequence.txt",
+        "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/transcripts/{transcript}_map_protein_iupred2a.txt"
     script:
         "iupred2a_analysis.py"
         
 rule interpro_scan:
     input:
-        "/home/annaldas/projects/result/{gene}/transcripts/{transcript}_map_protein.fa"
+        "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/transcripts/{transcript}_map_protein.fa"
     output:
-        "/home/annaldas/projects/result/{gene}/transcripts/{transcript}_map.gff3"
+        "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/transcripts/{transcript}_map.gff3"
     params:
         db = "Pfam,ProDom,Gene3D,CDD,Coils,MobiDBLite,SMART"
     shell:
@@ -107,10 +123,10 @@ rule interpro_scan:
         
 rule brewery_analysis:
     input:
-        "/home/annaldas/projects/result/{gene}/transcripts/{transcript}_map_protein.fa"
+        "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/transcripts/{transcript}_map_protein.fa"
     output:
-        "/home/annaldas/projects/result/{gene}/transcripts/{transcript}_map_protein.fa.ss3"
-        #"/home/annaldas/projects/result/{gene}/transcripts/{transcript}_map_protein.fa.ss8"
+        "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/transcripts/{transcript}_map_protein.fa.ss3"
+        #"/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/transcripts/{transcript}_map_protein.fa.ss8"
     params:
         brewery = config["brewery"]
     shell:
@@ -118,9 +134,9 @@ rule brewery_analysis:
 
 rule functional_site_analysis:
     input:
-        "/home/annaldas/projects/result/{gene}/transcripts/{transcript}_map_protein.fa"
+        "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/transcripts/{transcript}_map_protein.fa"
     output:
-        "/home/annaldas/projects/result/{gene}/transcripts/{transcript}_map_protein_sites.txt"
+        "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/transcripts/{transcript}_map_protein_sites.txt"
     params:
         ps_scan = config["prosite_scan"],
         prosite_dat = config["prosite_dat"]
@@ -129,18 +145,18 @@ rule functional_site_analysis:
 
 rule individual_transcript_analysis:
     input:
-        "/home/annaldas/projects/result/{gene}/transcripts/{transcript}_map_protein_iupred2a.txt",
-        "/home/annaldas/projects/result/{gene}/transcripts/{transcript}_map.gff3",
-        "/home/annaldas/projects/result/{gene}/transcripts/{transcript}_map_protein.fa.ss3",
-        "/home/annaldas/projects/result/{gene}/transcripts/{transcript}_map_protein_sites.txt"
+        "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/transcripts/{transcript}_map_protein_iupred2a.txt",
+        "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/transcripts/{transcript}_map.gff3",
+        "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/transcripts/{transcript}_map_protein.fa.ss3",
+        "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/transcripts/{transcript}_map_protein_sites.txt"
     output:
-        "/home/annaldas/projects/result/{gene}/transcripts/{transcript}_map_protein_analysis.txt"
+        "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/transcripts/{transcript}_map_protein_analysis.txt"
     script:
         "transcript_analysis.py"
         
 def aggregate_input(wildcards):
     checkpoint_output = checkpoints.mapping.get(**wildcards).output[0]
-    return expand("/home/annaldas/projects/result/{gene}/transcripts/{transcript}_map_protein_analysis.txt",
+    return expand("/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/transcripts/{transcript}_map_protein_analysis.txt",
            gene=wildcards.gene,
            transcript=glob_wildcards(os.path.join(checkpoint_output,"{transcript}_map_protein.fa")).transcript)
 
@@ -148,7 +164,7 @@ rule aggregate:
     input:
         aggregate_input
     output:
-        "/home/annaldas/projects/result/{gene}/{gene}_transcripts_filtered_analysis.txt"
+        "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/{gene}_transcripts_filtered_analysis.txt"
     params:
         gene = "{gene}"
     shell:
@@ -156,26 +172,28 @@ rule aggregate:
         
 #rule filter_transcripts:
 #    input:
-#        "/home/annaldas/projects/result/{gene}/{gene}_transcripts_analysis.txt"
+#        "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/{gene}_transcripts_analysis.txt"
 #    output:
-#        "/home/annaldas/projects/result/{gene}/{gene}_transcripts_filtered_analysis.txt"
+#        "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/{gene}_transcripts_filtered_analysis.txt"
 #    script:
 #        "filter_transcripts.py"
         
 rule protein_coding_potential_analysis:
     input:
-        "/home/annaldas/projects/result/{gene}/{gene}_transcripts_filtered_analysis.txt"
+        "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/{gene}_transcripts_filtered_analysis.txt"
     output:
-        "/home/annaldas/projects/result/{gene}/{gene}_transcripts_filtered_coding_potential_analysis.txt"
+        "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/{gene}_transcripts_filtered_coding_potential_analysis.pdf"
+    params:
+         gene = "{gene}"
     script:
         "interproscan_analysis.py"
         
 #rule protein_domain_analysis:
 #    input:
-#        "/home/annaldas/projects/result/{gene}/transcripts/{transcript}/{transcript}_map.gff3",
-#        "/home/annaldas/projects/result/{gene}/transcripts/{transcript}/{transcript}_map_protein_iupred2a.txt"
+#        "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/transcripts/{transcript}/{transcript}_map.gff3",
+#        "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/transcripts/{transcript}/{transcript}_map_protein_iupred2a.txt"
 #    output:
-#        "/home/annaldas/projects/result/{gene}/{gene}_map_protein_analysis.txt"
+#        "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/{gene}_map_protein_analysis.txt"
 #    params:
 #        gene = "{gene}"
 #    script:
@@ -183,11 +201,11 @@ rule protein_coding_potential_analysis:
         
 #rule sashimi_plot:
 #    input:
-#        sashimi_sh = "/home/annaldas/projects/result/{gene}/{gene}_sashimi.sh",
+#        sashimi_sh = "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/{gene}_sashimi.sh",
 #        sashimi_py = config["sashimi"],
 #        bams = config["input_bams"],
 #        gtf = NanoporeGTF
 #    output:
-#        "/home/annaldas/projects/result/{gene}/{gene}_sashimi.pdf"
+#        "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/{gene}_sashimi.pdf"
 #    shell:
 #        "sh {input.sashimi_sh} {input.sashimi_py} {input.bams} {input.gtf} {output}"
\ No newline at end of file
diff --git a/gene_transcripts.py b/gene_transcripts.py
index ef2b0f8..27cbc00 100644
--- a/gene_transcripts.py
+++ b/gene_transcripts.py
@@ -18,8 +18,6 @@
 # Mapping oID to transcript id
 # Mapping transcript id to exons
 
-ABI2_info = []
-
 gene_oID = dict()
 oID_tID = dict()
 gene_pos = dict()
@@ -57,7 +55,7 @@
 # Extracting isoforms from related genes
 
 output = []
-gene = snakemake.params[0].upper()
+gene = snakemake.params[0]
 for oID in gene_oID[gene]:
     tID = oID_tID[transcripts[oID].id]
     transID = tID_pos[tID]
diff --git a/genes.tab b/genes.tab
index ba1c398..b8a6956 100644
--- a/genes.tab
+++ b/genes.tab
@@ -1,2 +1,68 @@
 gene_symbol
-RPS24
\ No newline at end of file
+AC112178.1
+AL133395.1
+AL590617.2
+ANO7
+APOOL
+BAK1
+BCKDHB
+CADM2
+CHRNA1
+CUZD1
+DACT3
+DENND5B
+DLEU7
+EBF2
+EPB41L5
+ERBB2
+ERGIC3
+FAIM
+FAM78B
+GDF1
+GLS
+HAGHL
+HOMEZ
+IKBIP
+KCNH3
+KCNJ6
+KIF1B
+KIF7
+KLHL35
+KTN1-AS1
+LAGE3
+LARGE2
+LINC00467
+MIR302CHG
+MIR4787
+NCAN
+NECTIN2
+NKAIN3
+NKAIN4
+NRF1
+NRG1
+ONECUT2
+PDE4DIP
+PIWIL2
+PPP1R1C
+PRKCZ
+RAC3
+REEP1
+RENBP
+RGS9
+RIPOR2
+RPS24
+RRP7BP
+SGK3
+SLC17A7
+SLC44A3-AS1
+SLC6A15
+SNTG2
+SWSAP1
+SYNGR1
+THAP7-AS1
+THRA
+TLN1
+TPM2
+TYW1
+VSIG10
+WFDC2
\ No newline at end of file
diff --git a/interproscan_analysis.py b/interproscan_analysis.py
index 62037c5..0d639c6 100644
--- a/interproscan_analysis.py
+++ b/interproscan_analysis.py
@@ -1,11 +1,23 @@
+import argparse
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+from matplotlib.patches import Rectangle
+from matplotlib.backends.backend_pdf import PdfPages
+
+
+
 filename = snakemake.input[0]
 file = open(filename, "r")
 lines = file.readlines()
 file.close()
 
+gene = snakemake.params[0]
+
 class transcript:
-    def __init__(self, tcons, idr, ips, ss8):
+    def __init__(self, tcons, length, idr, ips, ss8, pss):
         self.tcons = tcons
+        self.length = length
         self.idr = idr
         self.ips = ips
         self.ss8 = ss8
@@ -13,29 +25,29 @@ def __init__(self, tcons, idr, ips, ss8):
 
 transcripts = dict()
 
+count = False
 idr_lines = []
 ips_lines = []
 ss8_lines = []
-pss_lines = []
+pss_lines = dict()
 
 for line in lines:
     if (line.startswith(">")):
-        if (len(ips_lines) > 0):
-            transcripts[tcons] = transcript(tcons, idr, ips, ss8)
-            
+        if (count):
+            if ((tcons not in transcripts) or ((len(idr_lines) - 1) > transcripts[tcons].length)):
+                transcripts[tcons] = transcript(tcons, len(idr_lines) - 1, idr_lines, ips_lines, ss8_lines, pss_lines)
+        count = True
         new = True
         idr_lines = []
         ips_lines = []
         ss8_lines = []
-        pss_lines = []
+        pss_lines = dict()
         idr = False
         ips = False
         ss8 = False
         pss = False
-        tcons = line[1:].strip().split("|")[0]    
-        
-        
-    if (line.startswith("#####IUPred2A Analysis")):
+        tcons = line[1:].strip().split("|")[0]      
+    elif (line.startswith("#####IUPred2A Analysis")):
         idr = True
     elif (line.startswith("#####InterProScan")):
         ips = True
@@ -43,17 +55,288 @@ def __init__(self, tcons, idr, ips, ss8):
     elif (line.startswith("#####BrewerySS8 Analysis")):
         ss8 = True
         ips = False
-    
-    if (idr):
-        idr_lines.append(line.strip())
+    elif (line.startswith("#####PrositeScan Analysis")):
+        pss = True
+        ss8 = False
+    elif (idr):
+        idr_lines.append(line.strip().split("\t"))
     elif (ips):
-        ips_lines.append(line.strip())
+        ips_lines.append(line.strip().split("\t"))
     elif (ss8):
-        ss8_lines.append(line.strip())
+        ss8_lines.append(line.strip().split("\t"))
+    elif (pss):
+        if (line.startswith("#")):
+            pss_id = line.strip().split(" ")[3]
+            pss_lines[pss_id] = []
+        else:
+            pss_lines[pss_id].append(line.strip().replace("  "," ").split(" "))
+            
+if ((tcons not in transcripts) or ((len(idr_lines) - 1) > transcripts[tcons].length)):       
+    transcripts[tcons] = transcript(tcons, len(idr_lines) - 1, idr_lines, ips_lines, ss8_lines, pss_lines)
+
+longest_length = 0
+longest_tcon = ""
+for ids in transcripts:
+    if (transcripts[ids].length > longest_length):
+        longest_length = transcripts[ids].length
+        longest_tcon = ids
+
+# PLOTTING
+
+colors=['#e6194b', '#3cb44b', '#ffe119', '#4363d8', '#f58231', '#911eb4', '#46f0f0',
+        '#f032e6', '#bcf60c', '#fabebe', '#008080', '#e6beff', '#9a6324', '#fffac8', 
+        '#800000', '#aaffc3', '#808000', '#ffd8b1', '#000075', '#808080', '#ffffff']
+ss3_abbvs = ["H","E","C"]
+aa_abbvs = ["A","C","D","E","F","G","H","I","K","L","M","N","P","Q","R","S","T","V","W","Y"]
+
+def preprocessArguments(args):
+    if args.geneNames != '':
+        gene_id='gene_name'
+        with open(args.geneNames,'r') as f:
+            targets=[i.strip() for i in f]
+    elif args.geneIDs != '':
+        gene_id='gene_id'
+        with open(args.geneIDs,'r') as f:
+            targets=[i.strip() for i in f]
+    annotation=pd.read_csv(args.gtf, delimiter='\t', header=None, usecols=[0,2,3,4,6,8],
+                           names=['chrm','type','start','stop','strand','more'])
+    annotation['transcript_id']=annotation.apply(lambda x:
+        x['more'].split('transcript_id "')[1].split('"')[0],1)
+    annotation=annotation.drop(columns='more')
+    data=pd.read_csv(args.csv)
+    samples=data.columns[~(data.columns.str.startswith('feature')|data.columns.str.startswith('gene')|data.columns.str.startswith('transcript'))]
+    #conditions=list(set([x.split('_')[0] for x in samples]))
+    #conditions = ["0","3","5"]
+    conditions = ["day0","day3","day5"]
+    conditions.sort()
+    number_replicates={}
+    numerical=True
+    for cond in conditions:
+        number_replicates[cond]=samples.str.startswith(cond).sum()
+        try:
+            float(cond)
+        except:
+            numerical=False
+    x=np.arange(len(conditions))
+    if numerical:
+        x=[float(cond) for cond in conditions]
+    return gene_id, targets, annotation, data, samples, conditions, number_replicates, x  
+
+def calculateStatistics(df,conds,nreps):
+    for cond in conds: 
+        df['mean'+cond]=df.filter(like=cond+'_').mean(1)
+        df['stdn'+cond]=df.filter(like=cond+'_').std(1)/np.sqrt(nreps[cond])
+    df=df.sort_index()
+    return df
+
+def chooseIsoforms2Plot(df,minTPM,minPct,maxIso,annotation):
+    df['minimum']=df.filter(regex='^mean').min(axis=1)
+    df=df[df['minimum']>minTPM]
+    df['maximumPct']=df.filter(regex='^Pct').min(axis=1)
+    df=df[df['maximumPct']>minPct]
+    df['maximum']=df.filter(regex='^mean').max(axis=1)
+    df=df.sort_values('maximum',ascending=False)
+    df=df.head(maxIso)
+    return df
+
+def plotProfiles(x, df, df_gene, ax, colors, total=True):
+    if total:
+        plt.errorbar(x,df_gene.filter(like='mean').iloc[0],yerr=df_gene.filter(like='stdn').iloc[0],color='black',linewidth=2, label = "Total Expression")
+    for j in range(df.shape[0]):
+        row=df.iloc[j]
+        plt.errorbar(x+np.random.normal(0, 0.03, len(x)),row.filter(regex='^mean'),yerr=row.filter(like='stdn'),color=colors[j],linewidth=2, label = "")
+    ax.spines['top'].set_visible(False)
+    ax.spines['right'].set_visible(False)
+    ax.spines['left'].set_bounds(0,ax.get_yticks()[-2])
+    ax.spines['bottom'].set_bounds(min(x),max(x))
+    plt.xlabel("Day")
+    plt.ylabel("Normalized DeSeq2 TPM")
+    plt.legend(loc = "upper center", bbox_to_anchor=(0.5,1), frameon=False)
+    
+def plotStacked(x,df,ax,colors):
+    plt.stackplot(x,df.filter(regex='^Pct').values,colors=colors[:df.shape[0]])
+    ax.spines['top'].set_visible(False)
+    ax.spines['right'].set_visible(False)
+    ax.spines['left'].set_bounds(0,100)
+    ax.spines['bottom'].set_bounds(min(x),max(x))
+    ax.axis([min(x),max(x),0,100])
+    plt.xlabel("Day")
+    plt.ylabel("TPM Percentage (out of 100%)")
+    
+def prepareAnnotation(annotation,df):
+    cut=annotation[annotation['transcript_id'].isin(df['transcript_id'].values)]
+    strand=cut.iloc[0]['strand']
+    if strand=='+':
+        start=cut['start'].min()
+        cut['plot_start']=cut['start']-start
+        cut['plot_stop']=cut['stop']-start
+    else:
+        start=cut['stop'].max()
+        cut['plot_start']=(cut['start']-start)*(-1)
+        cut['plot_stop']=(cut['stop']-start)*(-1) 
+    return cut
+
+def plotAnnotation(annotation, df, plt, colors, length): 
+    transcripts_ids = []
+    transcripts_pos = []
+    chrm = ""
+    longest = annotation.loc[annotation['plot_stop'].idxmax()]["plot_stop"]
+    count = 3
+    panels = df_temp.shape[0] + 1
+    for j in range(df.shape[0]):
+        ax=plt.subplot(panels,2,(count,count+1)) 
+        ax.set_xlim((-50, length))
+        ax.set_ylim((-0.85, 1.85))
+    
+        transcript_annotation=annotation[annotation['transcript_id']==df.iloc[j]["transcript_id"]] 
+        transcripts_ids.append(df.iloc[j]["transcript_id"])
+        transcripts_pos.append(df.shape[0]-j) 
+        if (transcript_annotation.shape[0] > 2): 
+            for idx,row in transcript_annotation.iterrows(): 
+                chrm = row["chrm"] 
+                if row['type']=='transcript': 
+                    plt.plot([row['plot_start']*length/longest,row['plot_stop']*length/longest],[1.75,1.75],color=colors[j],linewidth=2) 
+                else:
+                    plt.plot([row['plot_start']*length/longest,row['plot_stop']*length/longest],[1.75,1.75],color=colors[j],linewidth=10) 
+            ax.spines['top'].set_visible(False)
+            ax.spines['right'].set_visible(False)
+            ax.spines['left'].set_visible(False)
+            ax.spines['bottom'].set_bounds(0,ax.get_xticks()[-2]) 
+            #plt.yticks(transcripts_pos,transcripts_ids)
+            #plt.xlabel(chrm)
+        count += 2
         
-transcripts[tcons] = transcript(tcons, idr, ips, ss8)
+def plotCodingPotential(plt, panels, df_temp):
+    count = 3
+    for ids in list(df_temp["transcript_id"]):
+        ax = plt.subplot(panels,2,(count,count+1)) 
+        flip = True
+        tcons_curr = transcripts[ids]
+        # Pfam domain
+        for i in tcons_curr.ips:
+            
+            start = int(i[3])
+            stop = int(i[4])
+            
+            if (i[1] == "Pfam"):
+                name = i[8].split(";")[3].split("=")[-1]
+                plt.gca().add_patch(Rectangle((start,1.05),stop-start,0.2,edgecolor="#3cb44b",facecolor='#3cb44b'))
+                if (flip):
+                    ax.annotate(name,(start + (stop-start)/2.0,1.2), fontsize = 14, color = "#e6194b", ha = "center", va = "center")
+                else:
+                    ax.annotate(name,(start + (stop-start)/2.0,1.1), fontsize = 14, color = "#e6194b", ha = "center", va = "center")
+                flip = not flip
+                
+        ax.annotate("Pfam Domain",(-50,1.15), fontsize = 12, color = "#3cb44b", ha = "center", va = "center")
+
+        # Secondary structure prediction
+        ss8_df = pd.DataFrame(tcons_curr.ss8[1:], columns = tcons_curr.ss8[0])
+        ss = list(ss8_df["SS"])
+        for i in range(tcons_curr.length):
+            plt.gca().add_patch(Rectangle((i,-0.4),1,0.35,edgecolor=colors[ss3_abbvs.index(ss[i])],facecolor=colors[ss3_abbvs.index(ss[i])]))
+
+        plt.plot((0,longest_length),(1,1),color = "black")
+        plt.plot((0,longest_length),(0,0),color = "black")
+        plt.plot((-1,-1),(1.5,-0.9),color = "black")
+        ax.annotate("SS Prediction",(-50,-0.2), fontsize = 12, color = "#3cb44b", ha = "center", va = "center")
+        
+         # Amino acid sequence
+        aa = list(ss8_df["AA"])
+        for i in range(tcons_curr.length):
+            plt.gca().add_patch(Rectangle((i,-0.8),1,0.35,edgecolor=colors[aa_abbvs.index(aa[i])],facecolor=colors[aa_abbvs.index(aa[i])]))
+        ax.annotate("AA Sequence",(-50,-0.6), fontsize = 12, color = "#3cb44b", ha = "center", va = "center")
+        
+        #plt.plot((0,longest_length),(1,1),color = "black")
+        #plt.plot((0,longest_length),(0,0),color = "black")
+        #plt.plot((-1,-1),(1.5,-0.5),color = "black")
+
+        # IDR prediction
+        idr_df = pd.DataFrame(tcons_curr.idr[1:], columns = tcons_curr.idr[0])
+        idr_df = idr_df.astype({'# POS': 'int32',"IUPRED2":"float"})
+        plt.plot(idr_df["# POS"],idr_df["IUPRED2"])
+        ax.annotate("IDR Prediction",(-50,0.5), fontsize = 12, color = "#3cb44b", ha = "center", va = "center")
+
+        # Phosphorlyation Site
+        buffer = 0
+        for site_type in tcons_curr.pss:
+            for i in tcons_curr.pss[site_type]:
+                start = int(i[0])
+                stop = int(i[2])
+                plt.gca().add_patch(Rectangle((start,1.3+buffer),stop-start,0.025,edgecolor="black",facecolor='black'))
+            ax.annotate(site_type,(-50,1.3 + buffer), fontsize = 9, color = "#e6194b", ha = "center", va = "center")
+
+                #plt.gca().add_artist(plt.Circle((start + (stop - start)/2,1.35),0.25,color="black"))
+            buffer += 0.075
+
+        ax.spines['top'].set_visible(False)
+        ax.spines['right'].set_visible(False)
+        ax.set_yticks([], [])
+
+        ax.title.set_text(tcons_curr.tcons) 
+        
+        count += 2
+
+class Parser(object):
+    def __init__(self, csv, gtf, geneIDs, geneNames, outDir, minTPM, maxIso, minPct):
+        self.csv = csv
+        self.gtf = gtf
+        self.geneIDs = geneIDs
+        self.geneNames = geneNames
+        self.outDir = outDir
+        self.minTPM = minTPM
+        self.maxIso = maxIso
+        self.minPct = minPct
+
+args = Parser('/project/owlmayerTemporary/Sid/nanopore-analysis/Results_5_1/Quantification/all_counts_deseq2norm.txt',
+              '/project/owlmayerTemporary/Sid/nanopore-analysis/Results_5_1/GffCompare/nanopore.combined.gtf',
+              '', '/home/annaldas/projects/isoform_differentiation/test/list.txt',
+              '/home/annaldas/projects/isoform_differentiation/test/',0,18,0)
+
+outdir=args.outDir
+minimumTPM = args.minTPM
+minimumPct = args.minPct
+maximumIso = args.maxIso
+(identifier, targets, annotation, data, samples, conditions, number_replicates, x) = preprocessArguments(args)        
 
 
+df=data[data[identifier]==gene]
+df_temp = df
+for j in range(df.shape[0]):
+    transcript_annotation=annotation[annotation['transcript_id']==df.iloc[j]["transcript_id"]] 
+    if (transcript_annotation.shape[0] < 3):
+        df_temp = df_temp[df_temp["transcript_id"] != df.iloc[j]["transcript_id"]]
 
+# total gene expression calculation
+data_gene=df_temp[samples].sum().to_frame().transpose()
+data_gene=calculateStatistics(data_gene,conditions,number_replicates)
+# mean transcript expression calculation
+df_temp=calculateStatistics(df_temp,conditions,number_replicates)
+# isoform percentage calculation
+df_temp=(df_temp.filter(like='mean').div(data_gene.filter(like='mean').values[0],1)*100).add_prefix('Pct_').join(df_temp)
+#choose isoforms to plot
+df_temp=chooseIsoforms2Plot(df_temp,minimumTPM,minimumPct,maximumIso,annotation)
+x = [0,3,5]
+if df_temp.shape[0]:
+    panels = df_temp.shape[0] + 1
+    fig,axes = plt.subplots(panels,2,figsize = (18,24)) 
+    fig.subplots_adjust(top = 0.95) 
+    fig.suptitle(gene,fontsize=16) 
+    
+    #plot isoform expression 
+    axg=plt.subplot(panels,2,1) 
+    plotProfiles(x, df_temp, data_gene, axg, colors) 
+    
+    #plot isoform expression percentage 
+    axt=plt.subplot(panels,2,2) 
+    plotStacked(x,df_temp,axt,colors) 
+    
+    #prepare annotation
+    annotation_cut=prepareAnnotation(annotation,df_temp) 
+    
+    #plot annotation 
+    #axa=plt.subplot(panels,2,(3,4)) 
+    plotAnnotation(annotation_cut, df_temp, plt, colors,longest_length) 
+    
+    plotCodingPotential(plt,panels,df_temp)
 
-print(transcripts)
\ No newline at end of file
+fig.savefig(snakemake.output[0])
\ No newline at end of file
diff --git a/isoform_transcripts.ipynb b/isoform_transcripts.ipynb
index 0c4f65c..c0882b9 100644
--- a/isoform_transcripts.ipynb
+++ b/isoform_transcripts.ipynb
@@ -23,7 +23,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 25,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -34,7 +34,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 24,
+   "execution_count": 11,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -71,28 +71,28 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 51,
+   "execution_count": 32,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "['4e6fca82-c640-4aac-be00-57223afed52e|41',\n",
-       " '0807443e-5f7f-4b1b-b59f-b6a737ad89a9|16']"
+       "['6356d404-48e6-4c7a-8fbd-5f6e7f9b42bc|5',\n",
+       " 'a5e2697c-74a1-4ac9-a5b2-d974af83137d|4']"
       ]
      },
-     "execution_count": 51,
+     "execution_count": 32,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "gene_oID[\"MLLT1\"]"
+    "gene_oID[\"KTN1-AS1\"]"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 26,
+   "execution_count": 13,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -109,7 +109,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 27,
+   "execution_count": 14,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -153,28 +153,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 27,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "ename": "KeyError",
-     "evalue": "'142e462f-f586-42fc-97b9-b2e3bfa1fd0d'",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mKeyError\u001b[0m                                  Traceback (most recent call last)",
-      "\u001b[0;32m<ipython-input-27-ad5dd960a8c2>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0moID_tID\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"142e462f-f586-42fc-97b9-b2e3bfa1fd0d\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
-      "\u001b[0;31mKeyError\u001b[0m: '142e462f-f586-42fc-97b9-b2e3bfa1fd0d'"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "oID_tID[\"142e462f-f586-42fc-97b9-b2e3bfa1fd0d\"]"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -186,7 +174,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -208,7 +196,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -234,36 +222,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 35,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "92b2df50-a034-43a9-a75a-ad52cbccf075|6 AKAP8L\n",
-      "78c3fe18-5c83-402c-8269-c07aedbd40a4|3 c1554cca-de07-418f-9818-eca090c80b38 TCONS_00030383\n",
-      "9a2337b7-740a-4725-838a-8847f42a61bc|6 392ae6cc-c2f2-47a3-bb89-3fdf6caa6719 TCONS_00030384\n",
-      "fd563dd9-374b-4439-a696-517bcfe44fad|3 79b8683e-5bbc-4bdd-a545-b464ff3564e3 TCONS_00030385\n",
-      "e172e6af-1c34-43dd-bdb6-3656d795a229|3 4a88e4c4-0501-49dc-b89a-1990cdbaf23c TCONS_00030386\n",
-      "fc150c74-c6c6-4e61-b4d5-d3639c0dee91|33 89d79286-527c-427b-bb19-83b110feb370 TCONS_00030387\n",
-      "84a1fcd2-ec02-488e-b1e0-706944cb8fc5|3 9f9ce2f8-92d1-4a3c-90f7-68fcda8fb148 TCONS_00030388\n",
-      "e0325806-c4fd-4d16-a8a3-3cd3ee13cdfc|6 388b7956-d60f-42cc-948c-f607b14cab9d TCONS_00030389\n",
-      "b655aec1-864d-418b-b78a-f19b975b80fb|21 6be26262-4cff-4aca-8f54-9c7b3e5bdf22 TCONS_00030390\n",
-      "bdc78543-c0d2-4ef3-b74e-e60e1c97e9d0|8 ab1a0ddb-1b97-47bc-b004-6c34f2f09b6a TCONS_00030392\n",
-      "db5576fd-1921-46ce-a0d2-412728ab9db8|3 c62aeaa1-b7c1-47ad-b0a3-9815ad439460 TCONS_00032730\n",
-      "3fe42f15-a4d1-49ae-a646-64d71a8c43e6|3 36436437-fd4b-4802-b9da-0f048cb4c048 TCONS_00032731\n",
-      "287bdde2-eb53-4bbc-b96a-5b26773c3dd8|9 36436437-fd4b-4802-b9da-0f048cb4c048 TCONS_00032733\n",
-      "5a5a062a-2f6c-46d2-aea4-a5e80bbab920|3 21c301c2-0b56-4ee7-8b6b-27f35c89cb08 TCONS_00032734\n",
-      "6464d1df-7486-4008-a0bb-40d6227027a5|5 fb469fcf-d099-4192-a71a-d5c4c4ae37ba TCONS_00032735\n",
-      "1a232563-721a-4f76-83af-18a01a9cf221|10 fb469fcf-d099-4192-a71a-d5c4c4ae37ba TCONS_00032736\n",
-      "43396b69-e18d-4578-903b-782887dd7340|4 fb469fcf-d099-4192-a71a-d5c4c4ae37ba TCONS_00032737\n",
-      "8004538e-da37-43d1-b8cc-2e8cffc77383|3 fb469fcf-d099-4192-a71a-d5c4c4ae37ba TCONS_00032738\n",
-      "3debe6b9-8564-4b1f-a8a5-8d3587e3a789|3 fb469fcf-d099-4192-a71a-d5c4c4ae37ba TCONS_00032739\n",
-      "7ccaec48-68b4-416b-b2b8-79d9e200e14b|11 fb469fcf-d099-4192-a71a-d5c4c4ae37ba TCONS_00032740\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "print(tID_oID[\"TCONS_00032753\"],oID_gene[tID_oID[\"TCONS_00032753\"]])\n",
     "\n",
@@ -277,7 +238,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 42,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -298,30 +259,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 60,
+   "execution_count": null,
    "metadata": {
     "scrolled": true
    },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "ABI2\n",
-      "            OJ32       OJ33       OJ34 class_code             gene_id  \\\n",
-      "31847  12.061978  19.252866  14.095533          =  ENSG00000138443.16   \n",
-      "22086  16.082637  13.730726  17.263069          j  ENSG00000138443.16   \n",
-      "79409  11.200408   5.671387   5.384810          =  ENSG00000138443.16   \n",
-      "64102   3.159089   2.387952   1.583768          =  ENSG00000138443.16   \n",
-      "\n",
-      "      gene_name      ref_transcript   transcript_id  \n",
-      "31847      ABI2   ENST00000261017.9  TCONS_00035868  \n",
-      "22086      ABI2  ENST00000295851.10  TCONS_00035869  \n",
-      "79409      ABI2  ENST00000261018.11  TCONS_00035870  \n",
-      "64102      ABI2   ENST00000424558.5  TCONS_00035871  \n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "group = df_rep1.groupby([\"gene_name\"])\n",
     "\n",
@@ -337,44 +279,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 56,
+   "execution_count": null,
    "metadata": {
     "scrolled": true
    },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "FUS\n",
-      "             OJ40       OJ41        OJ42 class_code             gene_id  \\\n",
-      "45283   14.890654   7.486137   11.079592          j  ENSG00000089280.18   \n",
-      "53160   17.810391   4.277792    8.897248          =  ENSG00000089280.18   \n",
-      "68181  131.388127  31.816081  114.992736          =  ENSG00000089280.18   \n",
-      "55151   93.431557  28.073012   56.405196          j  ENSG00000089280.18   \n",
-      "58630   61.606433  41.173751   59.426903          =  ENSG00000089280.18   \n",
-      "64460   13.722760   3.208344    5.707669          j  ENSG00000089280.18   \n",
-      "47855    6.131446   2.138896    6.043414          c  ENSG00000089280.18   \n",
-      "84426   14.014734   6.951413   14.101299          c  ENSG00000089280.18   \n",
-      "36211   16.058549   6.416689    8.393630          c  ENSG00000089280.18   \n",
-      "35581    0.875921   0.267362    1.175108          x  ENSG00000089280.18   \n",
-      "9268     0.000000   0.267362    0.671490          s  ENSG00000089280.18   \n",
-      "\n",
-      "      gene_name      ref_transcript   transcript_id  \n",
-      "45283       FUS  ENST00000254108.11  TCONS_00022528  \n",
-      "53160       FUS   ENST00000566605.5  TCONS_00022529  \n",
-      "68181       FUS  ENST00000254108.11  TCONS_00022530  \n",
-      "55151       FUS  ENST00000254108.11  TCONS_00022531  \n",
-      "58630       FUS   ENST00000487509.6  TCONS_00022532  \n",
-      "64460       FUS  ENST00000254108.11  TCONS_00022533  \n",
-      "47855       FUS   ENST00000487045.6  TCONS_00022534  \n",
-      "84426       FUS   ENST00000487509.6  TCONS_00022535  \n",
-      "36211       FUS  ENST00000254108.11  TCONS_00022536  \n",
-      "35581       FUS  ENST00000254108.11  TCONS_00023974  \n",
-      "9268        FUS  ENST00000254108.11  TCONS_00023975  \n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "group = df_rep2.groupby([\"gene_name\"])\n",
     "count = 0\n",
@@ -392,21 +301,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "ename": "NameError",
-     "evalue": "name 'df' is not defined",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
-      "\u001b[0;32m<ipython-input-3-c66790573a00>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mgroup\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgroupby\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"gene_name\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      2\u001b[0m \u001b[0mcount\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      3\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mgene\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mitem\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mgroup\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      4\u001b[0m     \u001b[0;32mif\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mgene\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m\"TLL3\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      5\u001b[0m         \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mgene\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;31mNameError\u001b[0m: name 'df' is not defined"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "group = df.groupby([\"gene_name\"])\n",
     "count = 0\n",
@@ -431,7 +328,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 76,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -455,7 +352,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -467,7 +364,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -483,20 +380,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 302,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "['gene_id \"ENSG00000203667.10\"', ' transcript_id \"ENST00000391839.6\"', ' gene_type \"protein_coding\"', ' gene_name \"COX20\"', ' transcript_type \"lncRNA\"', ' transcript_name \"COX20-202\"', ' level 2', ' transcript_support_level \"1\"', ' hgnc_id \"HGNC:26970\"', ' havana_gene \"OTTHUMG00000040401.2\"', ' havana_transcript \"OTTHUMT00000097176.1\"', ''] transcript\n",
-      "['gene_id \"ENSG00000203667.10\"', ' transcript_id \"ENST00000391839.6\"', ' gene_type \"protein_coding\"', ' gene_name \"COX20\"', ' transcript_type \"lncRNA\"', ' transcript_name \"COX20-202\"', ' exon_number 1', ' exon_id \"ENSE00001899074.1\"', ' level 2', ' transcript_support_level \"1\"', ' hgnc_id \"HGNC:26970\"', ' havana_gene \"OTTHUMG00000040401.2\"', ' havana_transcript \"OTTHUMT00000097176.1\"', ''] exon\n",
-      "['gene_id \"ENSG00000203667.10\"', ' transcript_id \"ENST00000391839.6\"', ' gene_type \"protein_coding\"', ' gene_name \"COX20\"', ' transcript_type \"lncRNA\"', ' transcript_name \"COX20-202\"', ' exon_number 2', ' exon_id \"ENSE00003677820.1\"', ' level 2', ' transcript_support_level \"1\"', ' hgnc_id \"HGNC:26970\"', ' havana_gene \"OTTHUMG00000040401.2\"', ' havana_transcript \"OTTHUMT00000097176.1\"', ''] exon\n",
-      "['gene_id \"ENSG00000203667.10\"', ' transcript_id \"ENST00000391839.6\"', ' gene_type \"protein_coding\"', ' gene_name \"COX20\"', ' transcript_type \"lncRNA\"', ' transcript_name \"COX20-202\"', ' exon_number 3', ' exon_id \"ENSE00001889952.1\"', ' level 2', ' transcript_support_level \"1\"', ' hgnc_id \"HGNC:26970\"', ' havana_gene \"OTTHUMG00000040401.2\"', ' havana_transcript \"OTTHUMT00000097176.1\"', ''] exon\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "info = list(pd_aline[\"info\"])\n",
     "types = list(pd_aline[\"type\"])\n",
@@ -512,7 +398,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": null,
    "metadata": {
     "scrolled": true
    },
@@ -523,28 +409,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": null,
    "metadata": {
     "scrolled": true
    },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "chr8\t26044860\t26045413\tENST00000520164\n",
-      "chr8\t25841725\t25844611\tENST00000520164\n",
-      "chr8\t26041288\t26041488\tENST00000408929\n",
-      "chr8\t26040939\t26041002\tENST00000408929\n",
-      "chr8\t26040616\t26040671\tENST00000408929\n",
-      "chr8\t26040066\t26040101\tENST00000408929\n",
-      "chr8\t25844362\t25844611\tENST00000408929\n",
-      "chr8\t25858059\t25858083\tENST00000535548\n",
-      "chr8\t25850594\t25850761\tENST00000535548\n",
-      "chr8\t25844609\t25844640\tENST00000535548\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "df_utr_regions = pd.read_csv(\"/project/owlmayerTemporary/Sid/nanopore-analysis/ReferenceData/df_utr_regions.csv\")\n",
     "\n",
@@ -567,7 +436,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 32,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -589,19 +458,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 33,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "1 TCTTTTCCTCCTTGGCTGTCTGAAGATAGATCGCCATC\n",
-      "392 GAGCTGGAGATTGGATCACAG\n",
-      "-1 CGAAGGAGTAAAGGTGCTGCAATGATGTTAGCTGTGGCCACTGTGGATTTTTCGCAAGAACATTAATAAACTAAAAACTTCATGTGTCTGGTTGTTTGAAA\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "if transcript_id in trans_utr:\n",
     "    seq = s\n",
@@ -614,43 +473,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 30,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "-1 GGCATCGGCGCGGTCAGCCTCGTGGCGCGCCCACGCCCCCACGCCGGCTCTTCCCGGGGTCCTTCCGTGCGCGTTGATATGATTGGCCGGCGAATCGTGGTTCTCTTTTCCTCCTTGGCTGTCTGAAGATAGATCGCCATC\n",
-      "430 GAGCTGGAGATTGGATCACAG\n",
-      "-1 CGAAGGAGTAAAGGTGCTGCAATGATGTTAGCTGTGGCCACTGTGGATTTTTCGCAAGAACATTAATAAACTAAAAACTTCATGTGTCTGGTTGTTTG\n",
-      "-1 GGCATCGGCGCGGTCAGCCTCGTGGCGCGCCCACGCCCCCACGCCGGCTCTTCCCGGGGTCCTTCCGTGCGCGTTGATATGATTGGCCGGCGAATCGTGGTTCTCTTTTCCTCCTTGGCTGTCTGAAGATAGATCGCCATC\n",
-      "-1 AAAGGTGCTGCAATGATGTTAGCTGTGGCCACTGTGGATTTTTCGCAAGAACATTAATAAACTAAAAACTTCATGTGTCTGGTTGTTTGAAA\n",
-      "-1 ATATGATTGGCCGGCGAATCGTGGTTCTCTTTTCCTCCTTGGCTGTCTGAAGATAGATCGCCATC\n",
-      "-1 GAAGTGTCTAGCAG\n",
-      "430 GAGCTGGAGATTGGATCACAG\n",
-      "-1 CGAAGGAGTAAAGGTGCTGCAATGATGTTAGCTGTGGCCACTGTGGATTTTTCGCAAGAACATTAATAAACTAAAAACTTCATGTG\n",
-      "1 TCTTTTCCTCCTTGGCTGTCTGAAGATAGATCGCCATC\n",
-      "-1 GAAGTGTCTAGCAG\n",
-      "392 GAGCTGGAGATTGGATCACAG\n",
-      "-1 CGAAGGAGTAAAGGTGCTGCAATGATGTTAGCTGTGGCCACTGTGGATTTTTCGCAAGAACATTAATAAACTAAAAACTTC\n",
-      "1 TCTTTTCCTCCTTGGCTGTCTGAAGATAGATCGCCATC\n",
-      "392 GAGCTGGAGATTGGATCACAG\n",
-      "-1 CGAAGGAGTAAAGGTGCTGCAATGATGTTAGCTGTGGCCACTGTGGATTTTTCGCAAGAACATTAATAAACTAAAAACTTCA\n",
-      "1 TCTTTTCCTCCTTGGCTGTCTGAAGATAGATCGCCATC\n",
-      "392 GAGCTGGAGATTGGATCACAG\n",
-      "-1 CGAAGGAGTAAAGGTGCTGCAATGATGTTAGCTGTGGCCACTGTGGATTTTTCGCAAGAACATTAATAAACTAAAAACTTCATGTGTCTGGTTGTTTGAAA\n",
-      "1 TCTTTTCCTCCTTGGCTGTCTGAAGATAGATCGCCATC\n",
-      "-1 AATGTCACTGCCATGGCCGCCTTGCTGCATTTCTGAGGATGCTTCATCTCTCCACCTTCTTCTCCACTCAGCAGCCAGCAGGGCACTGTGGAAATCGGAGTCACATGAGCTGGCACCTCTGTTCAGAACCCTCCAGGGCTCCACATCTCTCTCACCCAAATGCCAAAGACCTCCCCACGCCCCCACAATCCCCCACGACCTGGCCACTGGCCTCCCACCACCTTCCAGCTCCAGCGGCTCCTACCACATTTAAGGCTTTCCTTCCTAGTTTTAATTTTTCCTCGTCAGCAGTTGATTTTATTATTTTCTTGTTTATTGGTATTTTCCCACTAGAAATGAAGCTGCGTGAAGTTAGAGATTTTTTTTTTTGGTCTGTGTTCCTAATTAGCTCATTGCTATACCCCTGGCGCCCAGAACAATGCCTTGGACACAGTACGCAGTAGACTAAATAAATACTTGTTGAATGACTGACTGACGGAATGACGGCTGTGTGGGGAGTGGATTGGGTCGTGAGGCAGAGGCTGCGGTGGAAACTCAGGCAGGAGGTGATGGTGGTTCTTGGGGCTGCGGAATGCCAAGTTTAGAAGCTCTTCCTCTGCTGTGGCACATGAACCGGTCACTCGAGAAGGCTTTTAGATTTACTTTGCCTAATCCCCTCTTAGTGCATGTGGGGAAACTGAGGTACACAAAAGGAATTCCCCACCAAGTTAGGGGCAGAACCTAGCCCCCTTGTCTCCCAGATGGATATCTTCTTTTTTTTTTGAGACGGAGTCTTGCTCTGTTGCCCAGGCTGGAGTGCAGTGGTACCATCTTGGCTCACTGCAACCTCTGCTTCCCAGGTTCAAGCGATTCTCCTGCCTCAGCCTCCTGAGTGTCTGCGATTACAGGTGCACACAACCACGCCTGGCTAATTTTTGTATTTTTAGTAGAGACGGGGTTTCACCGTGTTGGTCAGGGTGACCTCAAACTCCTGACCTCATGATCCACCCAGCTCAGCCTCCCAACGTGCTGGGATTACAGGCATGAGCCACCGTGCCTGGCTGGACATCTTGTTATTAAAGCTTCTTCTCTCTTTGTAGGGGAGGGGGAGATGCCTCTGGTGGAGAAGACCAGTGTGGCAGTGACTGTGTCTGTTAGTGAACCTGGTGGCTGGTTGAGGGTCTGTCGTGGTGACTGAGGACACATACAAAGTGCTTTTCTCAGTGGTCACCTTGGTGTTGGTGAATAAGGGTCAGAAGATGGCTCCTGTCCTAGGGCACTGCCAGTCGGTTTGGAAGCTGAAATGCCTGCTTAGCAGTTTGAGGAAACACAGACCTTGGAGGATCTTCTGGTTGCCTCTTCAAGAATTCATTCTATTCCCCTTCTGCTCCCCAAATTTGCTTTTCTTGGGGTGGGTCTTGGTTGGCCTAAGCCAAGAAAGTATGGCATCTACTCCTTCCATAGCAATAGCTCAGGAATAGGCAGTGACCCAGACCTGAACCAATCAGTGCATGGAATTACCCCTGGCCAAAGTGGTTGATTGAGGCTGGGTGCAAGCAGAGTTGTGAGAAGGCTCCCATTTGGTGGTTGGAGAGATCGCACTTGCTCCAGAGGTCATAATGTGCAGATCTGAGGCTTGGAACTGCTGCAGACATTTTGCTACCACAAGTGAAGCCACCCTGACGACACAGTTGACAATTTGGAGCAGGGCAGAGCTGAGAGAACAGCAGGGAAACAGCCAGAGTCTTGCTCAAGCCTCCCTGAAGTATCTATACCCCTGGACTCTAGTTATGGGGGCTAATAAATGTTATATACTGTTTAAGGTA\n",
-      "15 GCTGTCTGAAGATAGATCGCCATC\n",
-      "-1 AGATGACTTGCCCCAAGTCCTTCAGCTCATTCATGCTGGGGAAAGGAGTAAGCTTCAGGCGTCTTCCCCTGGAGTTCACGCCACCTCTGACAGCAAGTGAGCCGTTTGCTACTCAAGTGCTGTTTCTTGCTTTTTTAAG\n",
-      "-1 TGAGGGAATTGGGGCTTGGAGTGCAAGCATTGGGAAGAATTTCCCAGGAAGAGAGATGCACAGATGTGAAGAACTCGAAGGCAAGAGAAAGCCGGGGGGTTGTGTGGCAGGTAGAAGTGCCAGGACCGTGGAGCGTGTGGACATG\n",
-      "-1 GAAGTGTCTAGCAGGTACTGAGATT\n",
-      "430 GAGCTGGAGATTGGATCACAG\n",
-      "-1 CGAAGGAGTAAAGGTGCTGCAATGATGTTAGCTGTGGCCACTGTGGATTTTTCGCAAGAACATTAATAAACTAAAAACTTCA\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "for t in trans_utr:\n",
     "    seq = s\n",
@@ -665,7 +490,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -678,27 +503,18 @@
     "\n",
     "a = '''GCTCAGTCCTCCAGGCGTCGGTACTCAGCGGTGTTGGAACTTCGTTGCTTGCTTGCCTGTGCGCGCGTGCGCGGACATGGCCTCAAACGATTATACCCAACAAGCAACCCAAAGCTATGGGGCCTACCCCACCCAGCCCGGGCAGGGCTATTCCCAGCAGAGCAGTCAGCCCTACGGACAGCAGAGTTACAGTGGTTATAGCCAGTCCACGGACACTTCAGGCTATGGCCAGAGCAGCTATTCTTCTTATGGCCAGAGCCAGAACACAGGCTATGGAACTCAGTCAACTCCCCAGGGATATGGCTCGACTGGCGGCTATGGCAGTAGCCAGAGCTCCCAATCGTCTTACGGGCAGCAGTCCTCCTACCCTGGCTATGGCCAGCAGCCAGCTCCCAGCAGCACCTCGGGAAGTTACGGTAGCAGTTCTCAGAGCAGCAGCTATGGGCAGCCCCAGAGTGGGAGCTACAGCCAGCAGCCTAGCTATGGTGGACAGCAGCAAAGCTATGGACAGCAGCAAAGCTATAATCCCCCTCAGGGCTATGGACAGCAGAACCAGTACAACAGCAGCAGTGGTGGTGGAGGTGGAGGTGGAGGTGGAGGTAACTATGGCCAAGATCAATCCTCCATGAGTAGTGGTGGTGGCAGTGGTGGCGGTTATGGCAATCAAGACCAGAGTGGTGGAGGTGGCAGCGGTGGCTATGGACAGCAGGACCGTGGAGGCCGCGGCAGGGGTGGCAGTGGTGGCGGCGGCGGCGGCGGCGGTGGTGGTTACAACCGCAGCAGTGGTGGCTATGAACCCAGAGGTCGTGGAGGTGGCCGTGGAGGCAGAGGTGGCATGGGGTAGGTGTCTCATGAGCCAGGGAGTATCTTTGGTGGGGAGTGTGGAGGATTGCATGAATCTCCCTGAAGCCAGTCCCTAGTGCATGGTTTAGTATTCTTGTTGTCTAGGGATCTGTGAGGGCTTTGATTTGGGGGCAGTGACTTTCTTTTTACATCCCCATTTTATTTTTGTGAGAACTTGGGAGCCTGAACTCCCATCCATACCACTGAATAGAGATTTTGAGTAATGATACTTGTTTCCAAAAAAAAAGAAACCATACATAGATACGTATGGATTGGAGTCATTAATATCCTAGGCAAGAAACATGGAAGTGAAGACTTCTTTCTCTGCAAGGGAAACCGATGATCCCACTCCTGGGAAATAGTAGGGAAACTTGGTATGTGTATTCCCATGTGTCCTCTAGGGAGTTGGTAATGGTTAACCTGACTTCAGCTTCCAGGAATTGGCTACTCTTCCCGTTTTCTATAGTCATTTGAATCCACGAGCTTGATTTGCACTAATTTGACCGACATTGATTTTGTGTGTGACTTGGTTTATGGGGCCAGCTGACTGAAGTAAGCAGACCTTTTGGGCAAAAATATGCTTTGACAGTGGTCTCCCACCTATTTGTTCCACTGTCTGCCTTCCCCTGGTTACTTAAAATTCATCAGCTTGTCCAACTGGACCTTCTTTCCTTCCTGCTGAAGTTGATTTGAAGTAAAACCTTAGATTTGATGTTAAAACAGTTGTCAAATCTGTTGGTAAATAAGATTTGAAGGACCCTACTCTGTCTCCCTTGAAAAAGGGGAGGAATGTCAGTGTTACTGTTTTTGGAAAAAGTAGATTTTTAAACCGAGTTTGGAAATGGTAAGTATGCAGAGGTGGGTGGGGGCAATCTCAAAAACGTGCAAAAATGAGGAAAACAAAAATGAGGAAATGTGTGCGTGTGTTTAATGCAAAACTTTAAAAAGAAAAACAACTGTTATGTGACTGTTAACTTGCTCTGCATTTTATGTGCCACAGGTATGAAAGGTGACATTGCAAAATACTCCGCTCTTCTCGCAGTGTAGAAGGGGTGACCCCGGGGGTTGGGGGAGATCAAAAACAGCTCAGTAGTTAGGACAGAGCTTAGCTAAGTTTGTCTTGCTTTAAGGGGAAGTTGCCTTT
GGTTTTGACTTTTTATGGAATGGGGTTGGGTCTGCTTGCTGCTTTCAAAGCAAAAACCACAAAAATGTGTTCAAGGCTACCCCAGCCTGGTGTGAAATGTCTTCTGGGTAAATTGGGGTAGGGTTTTTAAACCAACTACTTGGTTGTCAACCACTTGCGACAAGAGGAAAAAAAAACATCTGCTCCATCGGAAGAACGACCAAGGAAAATGGGTTATTTTTTTTCCAGAGGAAATAGATAACGTAACCTTTTAAAGCAAAATCTTTATAAACTGTGTCTGAGAAATTGCACACGTGTGTGTGACATGCTCAAAGGTCAGACAAGGGGTGGTCAGGAAGGGATGTATTTTAGTAGCCACTTGTATCTTTTTCCAAAAACACCTACCCATGTTTGGGGAATGTTAAACAAAATCAAAAAACAACCTTTTGTAGCCGTTGGAAGCTTCATGTCCTTTCTTCTAACTTGTCTTCTCCAGCGGAAGTGACCGTGGTGGCTTCAATAAATTTGGTGGTAAGTGAACAGAGTTTCCAAAATTCCCAACTCCCAGCAATGCTTTGTCTGATTGTTCATTTGCAGATGTCTTAGCGTGTTAATTTAAATGTCAAAGGTTTTGAGGTGTCCAGAACCACCTCCAGAAAGGGGTAGGGTAGAATGCCACCTGTTGCCTGGTGTGTGCTAACCTGGAGCAGGTAGGGGTAAGACTCAATAGTCATCTTTTACCAAATGGGTTTGCCCCAGGTTAATAAGAGGGGTCTAGTAGGCCTTGGACTGGGCCGTTGCCACACCTGGCACTTAGTGACCATCATCATGAGAAACTGGAGAGTGCGTGCTGGAACACGTGGTGCCATCTTGGCTTTAGGATCCTTTTGATCGTTGTGTCCAAGGCTTGTGTGTGTGTGAGTGTGTGGGAGACAACTCCGAATGTTTAATTCTGGAAGAGGGATGTAACATTGCCCTGAGGATGGTGAAGTTGGTATACATTTATAAAGTACGGAATGGTGTCAATGAATGCAATTCTATGTATATGGACTTAACTGAGATGGGCAAATAGAAACTAGCTCTGGGAAGGAACATGTGCACTACTTCAAGAAAGATTGGAAGCATGTGTGGCTCATGGGAAATAACCAGGTCTTAAACAGCACAAACTGAATTCGTGGACCAGGAAGGTCTTAAACAGCACAAACTGAATTCATGGAAAAATGACAAATTTGAGAAGTCTCCCAGTAAGCTGGAACTTTTCTGGTTTGGTTAACAAAAGGTTTCTTGATTTGTTTCAAGATTTAAAGCCAAAGGTGTGGGTTCATGACTTAGGTGTCATTGCGTGTGGGTACAATATTTATATATGGCGAATTCAGATAAACATTGGTCAAAGATGGTCTCTGGAAAAACAAAATAGAGGCTGCATTACGGAAATAAGATTTCTGGTCTGTTCCCTGGGACATGCTTAAAAAATACAATAGCTATTATGTATGGTTTTTATTTTCATGTGGTTTCGGGGAAACAACACGGTTTTAAGGATGGTTTCTAAAGATGAAATTAAAAATTGTTCCACAAGGGTTAAGTGTCTGGTGGTAAAGTTGGGAGAAACTGGATGGATGCACATCGCATGGCTGGTGGCGAGCCCATCTCTCTTCTCTCGGGTGAGAGAACCGGGCCAAGCTGAGTTGGTTTGTTCACTTTAATGGGTCTCCGTTTCCCCTGCCACCTGTGCTGAGGACATTTCCCAGCCTGAGCTGGGGGAGGCAGCATTTGCTGAAGTGTGGAGTTGTCTCTGTGGAGACTCAAGTTACAGATCTTAAGGGGCCTGCCTAGAATTTTCTCCTCTGGGCAGGCGACCCAGGAAAGGGTTTGGAGTGAGGCTGTGAGCACTTACTTGATATTTTACAAGTTTGGATTTGGTGTTAATTTTTTTCCTTGTCCGTTTTTTCCTGTTGACTAACGGCTCATCTTTTCCTTGTTTTTGTTTTTTTTTTGTTCTTTTTTTCCATGTCACTAAAGGC
CCTCGGGACCAAGGATCACGTCATGACTCCGAACAGGATAATTCAGACAACAACACCATCTTTGTGCAAGGCCTGGGTGAGAATGTTACAATTGAGTCTGTGGCTGATTACTTCAAGCAGATTGGTATTATTAAGACAAACAAGAAAACGGGACAGCCCATGATTAATTTGTACACAGACAGGGAAACTGGCAAGCTGAAGGGAGAGGCAACGGTCTCTTTTGATGACCCACCTTCAGCTAAAGCAGCTATTGACTGGTTTGATGGTAAAGAATTCTCCGGAAATCCTATCAAGGTCTCATTTGCTACTCGCCGGGCAGACTTTAATCGGGGTGGTGGCAATGGTCGTGGAGGCCGAGGGCGAGGAGGACCCATGGGCCGTGGAGGCTATGGAGGTGGTGGCAGTGGTGGTGGTGGCCGAGGAGGATTTCCCAGTGGAGGTGGTGGCGGTGGAGGACAGCAGCGAGCTGGTGACTGGAAGTGTCCTAATCCCACCTGTGAGAATATGAACTTCTCTTGGAGGAATGAATGCAACCAGTGTAAGGCCCCTAAACCAGATGGCCCAGGAGGGGGACCAGGTGGCTCTCACATGGGGGGTAACTACGGGGATGATCGTCGTGGTGGCAGAGGAGGCTATGATCGAGGCGGCTACCGGGGCCGCGGCGGGGACCGTGGAGGCTTCCGAGGGGGCCGGGGTGGTGGGGACAGAGGTGGCTTTGGCCCTGGCAAGATGGATTCCAGGGGTGAGCACAGACAGGATCGCAGGGAGAGGCCGTATTAATTAGCCTGGCTCCCCAGGTTCTGGAACAGCTTTTTGTCCTGTACCCAGTGTTACCCTCGTTATTTTGTAACCTTCCAATTCCTGATCACCCAAGGGTTTTTTTGTGTCGGACTATGTAATTGTAACTATACCTCTGGTTCCCATTAAAAGTGACCATTTTAGTTAAA'''\n",
     "\n",
-    "a = '''AGTGCGGAGCCTTAGGCGGAGCGAAGAGAACCGGTCGCGGCAATCCTAGCGCGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCACCCGCATCCGCTGCGGGAGTCCGAGCCGGAACCACACCCAAGTAGCTGCCCTTTCCTCTTCTGTCATCTCACCGCCCCACCACAGACCGCGTTCCCCGAGGAAACCGGCCGCCCACGCCCGGAGCATCCTCCCCTGTTGAGCGGGCGCTGACGGACCCGGCGGCATGATGCGGCTGCGAGGCTCGGGGATGCTGCGGGACCTGCTCCTGCGGTCGCCCGCCGGCGTGAGCGCGACTCTGCGGCGGGCACAGCCCTTGGTCACCCTGTGCCGGCGTCCCCGAGGCGGGGGACGGCCGGCCGCGGGCCCGGCTGCCGCCGCGCGACTCCACCCGTGGTGGGGCGGGGGCGGCTGGCCGGCGGAGCCCCTCGCGCGGGGCCTGTCCAGCTCTCCTTCGGAGATCTTGCAGGAGCTGGGCAAGGGGAGCACGCATCCGCAGCCCGGGGTGTCGCCACCCGCTGCCCCGGCGGCGCCCGGCCCCAAGGACGGCCCCGGGGAGACGGACGCGTTTGGCAACAGCGAGGGCAAAGAGCTGGTGGCCTCAGGTGAAAATAAAATAAAACAGGGTCTGTTACCTAGCTTGGAAGATTTGCTGTTCTATACAATTGCTGAAGGACAAGAGAAAATACCTGTTCATAAATTTATTACAGCACTCAAATCTACAGGATTGCGAACGTCTGATCCCAGGTTGAAAGAGTGTATGGATATGTTAAGATTAACTCTTCAAACAACATCAGATGGTGTCATGCTAGACAAAGATCTTTTTAAAAAATGTGTTCAGAGCAACATTGTTTTGTTGACACAAGCATTTAGAAGAAAGTTTGTGATTCCTGACTTTATGTCTTTTACCTCACACATTGATGAGTTATATGAAAGTGCTAAAAAGCAGTCTGGAGGAAAGGTTGCAGATTATATTCCTCAACTGGCCAAATTCAGTCCCGATTTGTGGGGTGTGTCTGTTTGTACAGTAGATGGACAGAGGCATTCTACTGGAGATACCAAAGTTCCCTTCTGTCTTCAGTCCTGTGTAAAACCTTTGAAATATGCCATTGCTGTTAATGATCTTGGAACTGAATATGTGCATCGATATGTTGGAAAAGAGCCGAGTGGACTAAGATTCAACAAACTATTTTTGAATGAAGATGATAAACCACATAATCCTATGGTAAATGCTGGAGCAATTGTTGTGACTTCACTAATAAAGCAAGGAGTAAATAATGCTGAAAAATTTGACTATGTCATGCAGTTTTTGAATAAGATGGCTGGTAATGAATATGTTGGATTCAGTAATGCAACGTTTCAGTCTGAAAGAGAAAGTGGAGATCGAAATTTTGCAATAGGATATTACTTAAAAGAAAAGAAGTGTTTTCCAGAAGGCACAGACATGGTTGGTATATTAGACTTCTACTTCCAGCTGTGCTCCATTGAAGTGACTTGTGAATCAGCCAGTGTGATGGCTGCGACACTGGCTAATGGTGGTTTCTGCCCAATTACTGGTGAAAGAGTACTGAGCCCTGAAGCAGTTCGAAATACATTGAGTTTGATGCATTCCTGTGGCATGTATGACTTCTCAGGGCAGTTTGCTTTCCATGTTGGTCTTCCTGCAAAATCTGGAGTTGCTGGGGGCATTCTTTTAGTTGTCCCCAATGTTATGGGTATGATGTGCTGGTCTCCTCCTCTGGATAAGATGGGCAACAGTGTTAAGGGAATTCACTTTTGTCACGATCTTGTTTCTCTGTGTAATTTCCATAACTATGATAATTTGAGACACTTTGCAAAAAAACTTGATCCTCGAAGAGAAGGTGGTGATCAAAGGGTAAAGTCAGTGATAAATCTTTTGTTTGCTGCATATACTGGAGATGTGTCTGCACTTCGAAGATTTGCTTTGT
CAGCTATGGACATGGAACAGCGGGACTATGATTCTAGAACAGCACTCCATGTAGCTGCTGCAGAGGGTCATGTTGAAGTTGTTAAATTTTTGCTGGAAGCCTGCAAAGTAAACCCTTTCCCCAAGGACAGGTGGAATAACACTCCCATGGATGAAGCACTGCACTTTGGACACCATGATGTATTTAAAATTCTCCAAGAATACCAAGTCCAGTACACACCTCAAGGAGATTCTGACAACGGGAAGGAAAATCAAACCGTCCATAAGAATCTTGATGGATTGTTGTAATGGTCTCAAATCCCAAGATTTAAATCACTTACCTATTTAATTGTGGAAAATGATTATGAAGAACATGTGTATTTCTATCTGGTAGTGATGTATATTTTACATTTGTCATTTCAGTGTTACTGGAGTTTTCTTCATTGTGCACACAGGACAAATCTGATCTCTTTGGGAAAAAATAGAAATAAAACAATCTCCCTCCATAATGTGAGCAATATTACCTCGTGCATTGTATAATTTGATGTAAAAGAAATAGTTACCAATGCTAGCTTGTGTGGTCTTCCATGATTTATTTGTGTTTTGTGAATTTTCAATTTATGGTGATGATCTGCTGATATGCATTTATAAAGTAAGCTCTGTTGTACAGTCTGTCCAAATGGGTCAAGGTTGCCTTTAGAAGCAAATAGTGTGATTTTCAAGACTTCAAATACAAATTTAGTTTAAGTGTTTGAACAACTATATGCACTTACGGTTGTGTGTTTAAAATGTCTCTCTCACCCCCTAGCTTCATGATGTGACTCTTAAAAAACTATAATAGTTAACAACTGTTAGTAAGATAGACCAATTCTGATTAGACTTTATCAGGGAATCTGTTTAAGATATGTTTGGTGACCAAAACGTATGTGTGAATGTAGTTATAATGCTTTTGAAAAATTTTCCTTTTTCTATATCCCCTTAGTCCAGCCTCTCTTCTCAGACATTTAGCTATCTGCCTCTTTCCTTTAGCTGGGAAAGTGAGAGCTGGCATACTATGCAGTTTTTATGTTTTCCATAGTAAGTCAGAAAATGCCTCCTATTTCTGGCATCAGAACTTTGCCATTTGTCTACAGAAGACGAACCAGAGACAAAATTACTAAGTATAAATTAGTCAAGTTTATCAGTCTAAAAAACGAAGGGATGTGCAACTGCAGCTCTTTAAGAAGTTTTTTTTTTTTAGCTTCTAGGGTAAAGATAAATTCAGAAATGCTCTAAGCTACCAAAGTTATTCTGAAAGTATGGGAACTGCTACAACTAACAAACATTTGTTTCCAAGCCTGTCATTAAGAGTCTGCATCAAGAGATTTGTCCTCCTTGGGGGACCACTGGATCATTCCAGATTTCTTGTGATTTTTCTATTGTGTAATTCTTGGTGGGCTCTGTAGTTTAATAATAAGAAAAAGGCCATTTCATTTTAAATTGTGACCTATAATTCTTTGTCTTGGGTTGGTAATTCAGGATTCATTTGGAAAGTGGGTAAAAGGGGCTTCAAAAAACGGATAGAACAGGATTTTCTAGGAGTTACACATACATTTTATCCTGTCATACCTCGAGATAAAGTGGCATGTTAGTGAGGAGTTCTGATATTAAGCACACACACACATGCACACAAATGGACTTCTCTGAAGCTGTGTTTAGTGAAATGAGCTCAAGTACATGAATGTTAGTTGTTATCACATACAGCAAATTCCTTTTTTTTTCTTTTTCTATGAGCACACTCTGCTGCTTCTAAACTTTACATGCCTGATGGCACCTTACTCCAGCAGCCTCCAGGTGCTTTCATTTTCACTTCCAGTCTAAGCCAGTGGCTCCTGCCACTGCCCTCCCATTACCTAGATGGCACCTCCTTTGGTGAAACCACGGCCAATGTTCCTTAGCTGCACCAGGCCCGAAGCTGTTCCCATGCTTGAGCTTCCATGGGGAGGATGCTGAGTGAGCAGTTTCCTACCCCGTGGATCTAGC
AAGCCATGGAGACAGGTAGCATTTGTAAGATGCTGCACAGGAGCAGCATTATCCCCAAAGATATTACAGGGTAGACACGTTTTAACTGAAATCAATCAAGATAACTTTATTCAAAGAGCAGCCCGCTTTGTGTGACTAAAATGAAACAAGACAGTTGAATTGTGTGACTTGAAGATTACCAATGATTTTGAGGCTTTTCTATAATAAAAAGAGGTTCTAACCATTATTTGGGAACAAAGAGAGTTTTCATCTTTTTTCAGATCAAAACCATTCTGTAAAATCTTTGTTGTTTAATTAAATGTGCCGTTATTTACCCCTGATGTTATTTATGACTATGTGCCGATTCCTGCTCGGGCTGTTTGCTGTTGGCTGGTAATAATATATTTGATTTAAATGCTGTTGACTGTGCTATTAACTGCTGCCGTCAGTAAACTCCAAAGATCTTTTTGTTTTGGCTTTAGTATCATATGTGCTTTTTCTGTATCCTGAGCGCTCTATATGATCATGTTAATTTAAAGCTTTATACACATTGTTGTTTTTGCTGGTCTCATCTTTGGTAATATGCTATACCCCACTGCTGCCCGACACTGCCCTTTAGCTGCAGAGCTGGATTAGCTGTTGACCATTTGATGCTGTTGTCTGTCTGGCAGGGACTGAATGACCTGATGTCAGATTTAGATTCTTCCTGGGGATTACACAGCTATGAATGTATTTGCTTCTAAAACCTCCCAAAGTGAATCTAATCTTAAAACTACAAGTTGTAAGTATTCTGAAATTGGGAAACATTTATTTTAAATGCAATCAGGTAGTGTTGCTTTTTACAGCATAATAAATATATGTATCAAAAAAAAAA'''"
+    "a = '''GGCTGCAGCCGGGCTCCGTGGCGCTCGCAGCCACCGCCTCCTCTCGGCTCCAGGTCTTCCCCTTCTTTTTACAACTGATCCTGTTGGGGATTTTTTTTTTTTCTAAATTGGAACGGTGGGGAGGAGCAGGGAGGGGGGACCTGGAGGAAGGGGAGAGATTAGGCAGCCATCAATTTCCTCCAGTTTCTCCCAGAACAGGTGATGCTTCTAAATTGTGATCACTTTCAGGAGGCAGCACTGCAGCTGGAAGGATGCGAGCGACCTAGGGTGGAGTGGCTGAGGCGGCAGATCTGAACTTGCGGAGGATAAGAACCCAAACTTTGACTACATCAGTCCGCACCTCGCCAGTGAAGCAAAGGACGGGTTATCTTTTTTTTTTTTCTAAGACTCAAACTTGGGCACTTGATCCCTTTTCTTGGATTGCTTTGGAGGAGACGATTTGCTGGCAACGTTGGGAACAGTCAGGACTGTGTTGTAACTCTTACTTTTAAAGCGACAGTAGAGGATCAGACTTTTTAAATGTTTGGAATTCAAGATACTTTAGGAAGAGGACCAACTCTGAAAGAGAAATCGCTGGGCGCGGAGATGGATTCGGTCAGGTCCTGGGTCCGGAATGTCGGAGTGGTGGACGCTAATGTCGCCGCGCAGAGCGGGGTCGCCCTGTCCCGGGCCCACTTTGAGAAACAGCCTCCTTCCAACTTGAGGAAATCCAACTTCTTTCACTTCGTCCTGGCGCTCTATGACAGGCAGGGCCAGCCGGTGGAGATCGAGCGGACGGCCTTCGTGGACTTTGTGGAGAATGACAAAGAACAAGGCAACGAGAAGACCAACAACGGCACTCACTACAAGTTACAGCTCCTCTACAGCAACGGTGTCCGCACGGAACAGGACCTCTATGTCAGGCTCATCGACTCGGTCACCAAGCAGCCCATCGCTTACGAGGGACAGAATAAGAATCCGGAAATGTGCCGAGTTCTCCTGACGCACGAAGTGATGTGTAGTCGATGCTGCGAAAAGAAAAGCTGTGGAAACCGAAATGAGACTCCATCGGACCCAGTCATAATTGACAGATTCTTTTTAAAATTTTTCCTCAAGTGCAATCAGAATTGTTTGAAAACAGCAGGAAACCCAAGGGACATGAGACGGTTTCAGGTTGTGTTGTCAACAACGGTGAATGTGGATGGACACGTCCTGGCTGTTTCTGACAACATGTTTGTTCATAACAACTCCAAGCATGGACGGAGAGCAAGAAGACTCGATCCATCGGAAGCTACCCCCTGCATCAAAGCCATTAGCCCGAGTGAAGGCTGGACCACAGGAGGAGCCATGGTCATCATCATCGGGGACAACTTCTTTGATGGTCTCCAAGTGGTGTTTGGGACTATGCTTGTATGGAGCGAGCTAATAACCCCTCATGCCATCAGAGTACAGACTCCTCCCCGGCACATCCCAGGCGTGGTAGAGGTGACATTATCTTATAAATCTAAACAGTTCTGCAAAGGAGCCCCAGGAAGGTTCATTTACACAGCATTAAATGAACCCACCATAGACTATGGCTTCCAGAGACTGCAGAAGGTCATCCCTAGGCATCCTGGAGATCCTGAGAGATTAGCTAAGGAGATGCTGTTGAAAAGAGCTGCAGATCTAGTGGAAGCTCTTTATGGCACACCACACAATAACCAGGACATCATTTTGAAGCGAGCCGCAGACATTGCTGAAGCTCTCTACAGCGTCCCCAGGAATCCCAGCCAGCTTCCAGCCCTCTCTAGCTCCCCAGCGCACAGTGGCATGATGGGAATCAACTCCTATGGCAGCCAGCTTGGGGTCAGCATCTCAGAGTCAACACAAGGAAATAATCAAGGGTACATCCGCAACACAAGCAGCATCTCTCCGCGGGGATACTCTTCCAGCTCCACGCCTCAACAGTCTAATTACAGTACCTCCAGCAACAGTATGAATGGCTACAGCAATGTCCCCATGGCCAACT
TGGGTGTTCCAGGTTCACCAGGATTTCTAAATGGCTCACCCACCGGCTCTCCTTATGGAATCATGTCATCAAGTCCCACCGTTGGGTCTTCCAGCACATCCTCCATCCTCCCATTTTCCTCTTCAGTTTTTCCTGCTGTCAAACAGAAGAGTGCCTTTGCCCCTGTCATCAGGCCCCAAGGCTCCCCTTCACCTGCCTGCTCCAGCGGCAATGGAAATGGATTCAGAGCCATGACCGGACTTGTTGTACCCCCGATGTAAAGAAGAACTGCTTTCTTATAGCACAAAACTACTTACTCTGATGGACCAATAATGAAGAAAGCACTAGGAGCTCTTTTGGGGGTGTAGTGGTGCCCCCACATGAACATGATGGACACCCTTGGGTCTGCAAGGAGCCAGCATCTTACTTGGTCCCACGTCCTCCTATAGCTCTGATGGTGGCTACACAAACTGACCCTCTTGGGACAAGGACAAAAGATGTCATTGACGTAGTCAGTGCTAAGAGCAGAAATGCAATTCTTTGTTATGAACATTATGAAAACCACCTTCCTATGTTTGTAAAATATTTAAGAAAAAATTGGCAAACAATTAATGCTTAATATTTTGGATACTATTTGTTTTTCTTTGTAGGAAAAAAAAGTTGAAAGTTTCTATTTTCTATGAAGCCTTTCAGATACCAATTTAGTTTATGCAGAAAAAAATTGAACAAAACAGGGTACCAGCACGGAAGACTTTCTTAAAACGCAACCTGAATTGAATGATGAAATGTTGTATGTGTGTTTGCTTATAGCTTAATCTCTTTAAAAAATGAACAAAAAAAA'''"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": null,
    "metadata": {
     "scrolled": true
    },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "M+ANLGVPGSPGFLNGSPTGSPYGIM+SSSPTVGSSSTSSILPFSSSVFPAVKQKSAFAPVIRPQGSPSPACSSGNGNGFRAM+TGLVVPPM+_RRTAFL_HKTTYSDGPIM+KKALGALLGV_WCPHM+NM+M+DTLGSARSQHLTWSHVLL_L_WWLHKLTLLGQGQKM+SLT_SVLRAEM+QFFVM+NIM+KTTFLCL_NI_EKIGKQLM+LNILDTICFSL_EKKVESFYFL_SLSDTNLVYAEKN_TKQGTSTEDFLKTQPELNDEM+LYVCLLIA_SL_KM+NKKREKF_NVLQLFK_INV_LIVSRGRN_DLFLEERGISLPDVK_K_CKHVVTSFKGV_LLLQLLTRFLSPVLHPLLFHHFIDPRARTCRFPQNTLM+QKQTL_HM+HKYSNPSIHSQLRLISK_STLL_ILFKIPIISFQRSPLSSPENSHV_VCLRLSASQGKKHLTF_QSQNHRLS_PRDPLGIPPHQNPTTPSALGAWFSTEVHSPIELQQHGLEKITLFDGLFRVSLRTSKSFPLPVNFPFYGIKLS_KT_LCHSTVSSQE_DGM+WERDIKSPNPLF_CSKKGKTRFLGDPEIRGICVLCSTVTQPFDGLFIVL_LALRGEEPCLSC_QWLILESLCNPRDISRHIGIANKDVTCFHVGVLLIDS_APQ_SLYTDLVWGSWLGFQH_K_M+QHSWKNIM+SASEPILDDFINSAYLLFDPLM+QAHIYPAPTVLFAFNPFWNYPV_LILFLCT_ESKVLETWHAYNYC_QSLIVRA_SAKIHGNIREKVPQ_ET_LM+EGEECVEYECYLPKVKAEQN_ACLALDLSQSNIRKFCFF_VALLLEPVNFPFA_R_VQYCLQNDCRISW_LYTEKCV_LNTRHLDHSARTLAATELFNCTNVCKDYF_CTNKLKTKLPCPR_SLLFNHSSPKLFLCHLEYEYFYW_LQTLSCVCKRSCVRFFSFKGTIYFIIYLSFVIAL_FFSVFLCTFQYAWFKNH_LLYLLYSKVSFSHCIVSFYCFALSVIYSIKLLCTELFVKTAFLFTVFNGLT_RISCL_NEYVYFSFKL_IILTKK_NFCTVK\n",
-      "WPTWVFQVHQDF_M+AHPPALLM+ESCHQVPPLGLPAHPPSSHFPLQFFLLSNRRVPLPLSSGPKAPLHLPAPAAM+EM+DSEP_PDLLYPRCKEELLSYSTKLLTLM+DQ__RKH_ELFWGCSGAPT_T_WTPLGLQGASILLGPTSSYSSDGGYTN_PSWDKDKRCH_RSQC_EQKCNSLL_TL_KPPSYVCKIFKKKLANN_CLIFWILFVFLCRKKKLKVSIFYEAFQIPI_FM+QKKIEQNRVPARKTFLKRNLN_M+M+KCCM+CVCL_LNLFKK_TKKGKNFKM+FYNYLNK_M+CNLL_VEVEIKTFFWKREGFLFLM+_NENDANM+__QVLKVCDYYCSYLLDSYPLSCTPSFSITLLTQGQEHVDSHRTP_CRNRHSSTCTNTPTHQFIPSSV_FLNSPHSCEYFLKFLLLAFNVLP_AVQRIPM+FESVSGFQHPKERSTSPSSRAKTTDFLNQGTPWEFLLTRTPPLPPLWVPGSVQRYTAP_NFSNTVWRR_LCLM+DSLESVLEHPSPFLSLSISPFM+ALNSLEKHSYATRQFLVKNETECGRGILKALIPSSDVRRKARPDSWEIQRLGASAFYAAQLPSPLM+GSSSFCD_LLEGKNLALAANSG_FLKAYVILETYPGILGLPIKM+SLVSTLGFSLLTHEHHNRVYTQT_FGVLGWDFSIRNRCSTHGKTS_VHLSQF_M+ISLILHTFSLTH_CRPTSIQPQLSYLPLTHSGITQYDLYYFYAHENPRF_KPGM+LITTADSPS_SGLKAPKYTET_EKKSPSEKPD_WKGKNVWNM+NVTFQK_RQSKIKHV_PWI_ANPT_GNFVFFK_HCF_NL_IFLLHEDECSTVFKM+IVEFLGSFTPKNACN_IPDILTIQLEPWQQQSYLIVQM+CVRIIFSVLIN_KQSYPVLVSHCYSIIPVPSYFCATWNM+SISIGNYKLYPVFVRGAV_DFFHLKGQFTSLFIFLLL_RCNSSQFFFAHFNM+HGLKTINFSTFCTVRFHFHTV_FHFIVLLCQLYTV_SCYAQSFL_RQLFCLLFLM+VLLKEYLVCKM+NM+STSVLNFKLS_QKNKIFVL_K\n",
-      "GQLGCSRFTRISKWLTHRLSLWNHVIKSHRWVFQHILHPPIFLFSFSCCQTEECLCPCHQAPRLPFTCLLQRQWKWIQSHDRTCCTPDVKKNCFLIAQNYLL_WTNNEESTRSSFGGVVVPPHEHDGHPWVCKEPASYLVPRPPIALM+VATQTDPLGTRTKDVIDVVSAKSRNAILCYEHYENHLPM+FVKYLRKNWQTINA_YFGYYLFFFVGKKS_KFLFSM+KPFRYQFSLCRKKLNKTGYQHGRLS_NAT_IE__NVVCVFAYSLISLKNEQKKGKILKCFTII_INKCVTYCK_R_KLRPFFGRERDFSS_CKM+KM+M+QTCSNKF_RCVIITAVTY_ILIPCPAPPPFPSLY_PKGKNM+_IPTEHPNAETDTLAHAQILQPINSFPAPSDF_IVHTLVNTF_NSYY_LSTFSPEQSREFPCLSLSQAFSIPRKEAPHLLAEPKPQTFLTKGPLGNSSSPEPHHSLRFGCLVQYRGTQPHRTSATRFGEDNSV_WTL_SQS_NIQVLSSPCQFPLLWH_TLLKNIAM+PLDSF_SRM+RRNVGEGY_KP_SPLLM+FEERQDQILGRSRD_GHLRSM+QHSYPAL_WALHRFVTSS_RGRTLP_LLTVVDS_KLM+_S_RHIQAYWDCQ_RCHLFPRWGSPY_LM+STTIESIHRLSLGFLVGISALEIDAALM+EKHHECI_ANFR_FH_FCIPSL_PIDAGPHLSSPNCLICL_PILELPSM+TYIISM+HM+RIQGFRNLACL_LLLTVPHSQGLKRQNTRKHKRKSPPVRNLIDGRGRM+CGI_M+LPSKSEGRAKLSM+FSPGFEPIQHKEILFFLSSIAFRTCEFSFCM+KM+SAVLSSK_L_NFLVALHRKM+RVTKYQTS_PFS_NPGSNRAI_LYKCV_GLFLVY__IKNKATLSSLVIAIQSFQSQVIFVPLGI_VFLLVTTNSILCL_EELCKIFFI_RDNLLHYLSFFCYSVVILLSFSLHISICM+V_KPLTSLPFVQ_GFIFTLYSFILLFCFVSYIQYKVAM+HRAFCKDSFFVYCF_WSYLKNILFVK_ICLLQF_TLNYPNKKIKFLYCKK\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
+    "a = \"TGGACAGCTCCCGCTCACCCAAACAGAAGACGTCGGCGCCGGAGCGGGCTCGGACATGGCGAGGCTGCGAGCCGGCCCGAGCGGCGGGGCCCGGTGATCCCTCCCTCCCTCCCCGTCCCCTCCCCTCTCCCGCACGCACGCCCCGTCCGCCCCCACCCCGCCCCCACCCCGGGCGAGCCCGCCCGCAGCCCGGGGCGCACACCCGCACGCGCACTCCTCTCCACTCACTCCCGCGCCCGCCCCCACTCCCGCAGCCGAGCCCCGCCACGCGCGCCTTGCCCGCCCGCCGGCCGCCCCCGCCGCCCCCGCCGCCCCCGGGCCCTGATGGACTGAATGAAGGCTGCCTACACCGCCTATCGATGCCTCACCAAAGACCTAGAAGGCTGCGCCATGAACCCGGAGCTGACAATGGAAAGTCTGGGCACTTTGCACGGGCCGGCCGGCGGCGGCAGTGGCGGGGGCGGCGGCGGGGGCGGCGGGGGCGGCGGCGGGGGCCCGGGCCATGAGCAGGAGCTGCTGGCCAGCCCCAGCCCCCACCACGCGGGCCGCGGCGCCGCTGGCTCGCTGCGGGGCCCTCCGCCGCCTCCAACCGCGCACCAGGAGCTGGGCACGGCGGCAGCGGCGGCAGCGGCGGCGTCGCGCTCGGCCATGGTCACCAGCATGGCCTCGATCCTGGACGGCGGCGACTACCGGCCCGAGCTCTCCATCCCGCTGCACCACGCCATGAGCATGTCCTGCGACTCGTCTCCGCCTGGCATGGGCATGAGCAACACCTACACCACGCTGACACCGCTCCAGCCGCTGCCACCCATCTCCACCGTGTCTGACAAGTTCCACCACCCTCACCCGCACCACCATCCGCACCACCACCACCACCACCACCACCAGCGCCTGTCCGGCAACGTCAGCGGCAGCTTCACCCTCATGCGCGACGAGCGCGGGCTCCCGGCCATGAACAACCTCTACAGTCCCTACAAGGAGATGCCCGGCATGAGCCAGAGCCTGTCCCCGCTGGCCGCCACGCCGCTGGGCAACGGGCTAGGCGGCCTCCACAACGCGCAGCAGAGTCTGCCCAACTACGGTCCGCCGGGCCACGACAAAATGCTCAGCCCCAACTTCGACGCGCACCACACTGCCATGCTGACCCGCGGTGAGCAACACCTGTCCCGCGGCCTGGGCACCCCACCTGCGGCCATGATGTCGCACCTGAACGGCCTGCACCACCCGGGCCACACTCAGTCTCACGGGCCGGTGCTGGCACCCAGTCGCGAGCGGCCACCCTCGTCCTCATCGGGCTCGCAGGTGGCCACGTCGGGCCAGCTGGAAGAAATCAACACCAAAGAGGTGGCCCAGCGCATCACAGCGGAGCTGAAGCGCTACAGTATCCCCCAGGCGATCTTTGCGCAGAGGGTGCTGTGCCGGTCTCAGGGGACTCTCTCCGACCTGCTCCGGAATCCAAAACCGTGGAGTAAACTCAAATCTGGCAGGGAGACCTTCCGCAGGATGTGGAAGTGGCTTCAGGAGCCCGAGTTCCAGCGCATGTCCGCCTTACGCCTGGCAGCGTGCAAACGCAAAGAGCAAGAACCAAACAAAGACAGGAACAATTCCCAGAAGAAGTCCCGCCTGGTGTTCACTGACCTCCAACGCCGAACACTCTTCGCCATCTTCAAGGAGAACAAACGCCCGTCAAAGGAGATGCAGATCACCATTTCCCAGCAGCTGGGCCTGGAGCTCACAACCGTCAGCAACTTCTTCATGAACGCCCGGCGCCGCAGCCTGGAGAAGTGGCAAGACGATCTGAGCACAGGGGGCTCCTCGTCCACCTCCAGCACGTGTACCAAAGCATGATGGAAGGACTCTCACTTGGGCACAAGTCACCTCCAAATGAGGACAACAGATACCAAAAGAAAACAAAGGAAAAAGACACCGGATTCCTAGCTGGGGCCCTTCACTGGTGATTTGAAAGCACAATTCTCTTGCAAAGAAAC
TTATATTCTAGCTGTAATCATAGGCCAGGTGTTCTTCTTTTGTTTTTAATGGCTATGGAGTCCAAGTGCAAGCTGAAAAATTAATCTCTTAGAACCAGACACTGTTCTCTGAGCATGCTAAGCATCCCAGAAACCCAAATGGGGCCTTCCTGGAGCGAGTTAATTCCAGTATGGTGTCAACCAAGCTCGGGATTGCTTAAAATATCATCCATCCCACTTCAGGTCCTGTCAGCTTCTTGCAGTCAGAGTTCCTATGAGTAACAATAGGAGTTTGGCCTATGTAAGGACTCTGAGTTTAGGCTTCCAAGATACAACAATAAGAGAAGAATCTAGCAACGAGAATGACCTCATTTGCTTTCCACATGCTTAGCCTCATTATACCATGTTATGTCCAAGTTCACAGCCACAACATCAGAATGGTAATTACTGAGCACAAGTTTTAAATATGGACGTTAAAAAAAAAAATCCAAGGACCTGTTTTTCCAACCCAGACATCTTTTCATTGAATGATTTAGAAAGCTTTAAGTTGATCCAGCTTACAATTTTTTTTTTCTTTACCTCCTGGAAATCTCATATGGTCTTGGATCCGTCAAAAAAACCAGTCAGTTCACTTGCGCTCAAAGTATCAAGCACAACAAAGATAAACAGAAGTGAGGAAGGTTCTGGGTTCACTACATCTGGATTTTCAAGACACCTATTGTGAAGTCATTAGGGAATTGATGAGAATATGGCTTCAAGCACATTTTGCAGTTTGCTACAAATTCTGTTGTACATAATGCAGACGCACACTCAGGAGGCCAATTTAACTGTTAACAGTGCATGGAGCGAATGCAGCATTTTAAAAGATCTAGGTTTTTTTAGGTCATTAATGTGTCCTTGGTTGATCAGTCATCTGGTCCCTCCTACTGTGTGTTATGACCACCACGTAATCCATTCTCGCTCTTTCTGATTTGGGGTTTTTCCTCATCCATCCCATTAGTAGGGATGTTTTCTGTGTTTTCTAGCAAGAAAAAAAAATCAATCAATCAAACCTGCATACATGTTACTCATGACTGTCATCTAGTCCTAAATCTCTTCTGTTGTTGAATCATCCTTGCAAAACAGCTGAATACATCTGGAGAAAACACAGCACACCAAAGAAGCAGAATACTGCAAACCAAAGACATTTATGACTTGTCATTTTCTAGCCTAAAAATACTGTGATTACTTTTAGAAATCAGAAAACCTCTGCAACTCCGAATGGCATTCAGCTCTTGCATTTGGCGCATCATCGGGCTGAGCGGACCAGCTACACCAAGGACATTAGCCAAGCCACCCAGAGGGGTGGCTTTGCCACACCAGTTGTCACCTTCCCATAGCAAGTGGAAGAGCGCCCACAGAACTCTGGGAGATTGCAAAGGTCACAATGTGCATATTTACCAGTGAATGGCCCCGGGTGGGGCCACGTGGGGGTGTTCAAAGCAAGCCAAACGCTGCAATCATTCTTTACAGACACTTGAGACTGACTTTTTTATGAATTACTTAGTCGAAACCAAAGAAACTTTTTCTGCACCTACTTCTGCAACAAACAAAACTGTCCCATTAAAATGAATAAATAAATCCGTAAATCAATGGAAATCACCACCAATAAGAAGGAAGCACGCCAGAAAATAAACGAAAACAAAAACAGGGAGACACACTGTGTTCAAACAGACCTCTTGGGACATTTTTTGGAAGCAGATTTTAAAGAAAGGGTTGAGACAAAGATAGAAATAAGGAAGAGCCTCAGTGGCTGCTGCTTCATTTGACAACTCACACGGTAATCTTAAAGCTGAAGATTGTCTTTAATTTGTGCCTATGCAGTTTTTCAAAAGAACACGGAACAGAGCAACAGAAACCTCAACAGCTACAATACCAAAGATGAGGATTTCTCACACCTTTTGTTTCAGTTCATTATCTCCTCTTGCCTGGCTAAAATACTAATAGCGCCATTGAACTGTATAAAGGTAATCAATTATGTT
TCTCTGAGCAACAAAAGGAAAGGGCCATTTATTTGATTTTATTGTTTCATTTCAATTTTGTCTTATGGTTTTTTGCCCCAACATGGAATCTCTCAAAAGTTTCCATGGACTCCAAGTTTAAGATGTTGGGATATTGAACAGTTCTCTCTGCTCAGCAGAGGGTAGGGAATAACATTATCACTTGAATGTTCTTTGCTTAACCCTTAGACTTGGTTCCTTCTATGTTCAGAGTCTCATCATCAGGGGAAGGAAAGGGAGTGAGGGTCAGGGATAGGGGTCTTGGTGATGCATCCTCTCCCGAGCCACAGAACCAAAGAGTTTATAGAGGAATTTACAGCCTCGTTTTCATGTGATTGCTACATCCTAACAGGGCTTCATTTGGGGGTGGGGGGAAACATGTAAAAATAATTGCCAGTTTCTACTTTTCTATTAGCTTTTTAAAAATCAGCTGTAAAGTTGCATTTCTAAAGAAAGATATATATAATATATAAAATACATATATAGATCAACTTGACATTGGTGATAACCAAAATTATTGCTGTCCAAATTCATGTCTTGTTTTGGTCCAGTGCTTCATTTGCTAAGTATTCGGTTCAGAATTTTTCTCATTTCTCATGCCATTCCAGAGTTAATTTGCCACTGTGGATGATTTGAAGTATTCAGATCTCTATGGAAGTTTCTGGGACAGGTTTAAAGTCAAGATCAAGCATTTTAGCATTTAACCTGTTGATAAATGGATCCATGGTGTACATGAGTTTTATTTGTATTCGGAGTCATCTCTATTCTATCCCTCAGCCTCGATTAAGGTGGTGAGTGAAGTGCATCCAACAGACTCGGCCCAGAACTGGGTCCTGACAGTGGGGTGCTCATCTTCTGTAACTGTTGGGAAGGCTCGGTGGTCCATTTTCACCAGTTAAAGAATATGAGGCCAGCCCAGAAATCTGTTCTCCAGGAGCTGCCCTGTCCCATCTGGGTGTGCCAGACCCCCTCAGTGAGCAGGTCCACCAAAGGGACTTCTCACAGGGGAAGCCCAACTCCTGTTGCAATGGGTTGATAGATTTCCTCAGGGTGGTAATTACCAATTCGTATTTTGACAAGCCTATGTGCAACCACAGCTGGCACTGGGGTGGGCAGTGGTGTTGGGTGGGATGGGGGAGAGTGTCTCAATCCTGAAGAGAAAATATAAAGCAGGTTTTGGGGAGACTTCTGGAGTCCTGCCCCTAGAGAGCCCCATTGTTGTTCTTTGTGCCCCCTCCTCATTCCCCCTATGTGGGTCTCCCTATGCAGGAGCTGTGAGAGAATGTGACTCTCCACAATTTTTATAATTCATCCTTCCTAGGAGATTGTTCATTGGCTCTTCCCTTGTGTCCCTTTGTCCCTTGCTCATACTCCATGTTTCCTTTGTCAAAGGACTAAGAAAAGAGCATATTTCAGCAGAGGAGTGTTCCCATGTGGGTTGATTTCAACTTGGGTATTTCTAAAAGAGTCCTTGTGACATGTGTCCAGTGGAAATGGTTGCTCTTTTCCAGACTGGATTGAGGAATGGAGCCTGTTTGATTTGGTTAGTGATTCTTTGACATACTAATCTCAGCGTTTGGGTCTCCAGCATCCTCTGAAGATGTCTAGACTAGTAGAGGCTGCCTTTGTGACCTGACATTACAACATTGGTCAAACCAGTCCTCTGATAATCAGAAGAACATGTCATAATTGTTTAAAAAAAAAAAAAAGGCAAGAATTTCTCTCCAAGGAGCTTTAATAAATGTCTCATTCCAGATAATGTCATACCAGAGAAAAGTGCTTGCTTTTAGAAAATTATTTACATACATATATAAATATATATGTGTATCTATACAGTTATGTATCAAAATTTTAAGCCCTGCAGAATTTCAATTTGTTAGAAATCTAACAGAAAAAAATTTCTATATTGAAAGGTAATAGAATTTAACCCAGTGAGTTTACTCAAGGATTTTTAAATTTAAGTTAATAATTTCAGAGAAAAT
AACCATTTGGGTGTGGTTATAGTTTAGTATCCATTACCTCAATCCAAGGAAAATTCCAGGCATTCCTCAACCATCAGGAAAAGGTACAGTGTGAAGGAACAGTTCTCAGCCAAATTTCACATTCTTGAGGCAACAGAAATCAAAACACTCAGAGCCATTGAGTGGAAAAACAATTTACTTTATTCCTTTACACAAATAGGCTTGCATTGTTTTTGTTTTAATGTGATTTTGGTACTAGGGATATAATTATTTCATTCCAGGAAATAATAAAAAAAAACAGACAGAGCCAATACATTTCTTTTTTTAAAGGAAACAGCAACAACAATAAAAACTCAGCACCAATATTTAAAAGCTTTTCCAAAATGTAAAAGAAGTGTTTAGCTTGCACCATGCATAAAGGTGCAGGCTAGTTGAACCAGGAAGCATGGCACTTCCTCTGGAGAAATCCAGAAAGAGTTGCTTCTAAGCTCCCTTTTCCCCCTGCAGGCTCTTGGCAATTGTAGGCTTTAGCAAATCCAGAATAATTTTCAATTCAAGCTAAAATAAAATCAACATTTGGAATGTAAATCTGATACACACACACTTTTCTAAGTCAAACAACATATTTCAAAACCAAAAATAAATACCTTTTAGATAATCAGTTATTTTCTTTGTCTATACTGGGCACCCACCTACTAGTGCCAGTAAATTCAAGTTGAACAGATTTTTAAAATCACTATTATCTGGGTATGGGGGAAACTTCCCCACTTTTGAAAATGTTGGTAGAATTATAGGAATGTCTGTTTGATTATCATTACCAAAGTGTCATGACAGTATGCCTTTGTAGTGAACTCGGATTTTCAGGAGTTTGAATAGTTGGATATTTTAAAATCTAAGAAGAAAAGGCCTGTTTCCAATGTTGTTGAAGAATAATGAACTCTATTAAAAAGTGGAGAAAAAGATAATACATGTGGTCAAGGTTGACCACAAGGCCCAGGCACAACTACCTTGGCGATAATCTTCTAGATTCGTAACAGGTTAGAGCTGACTTTTTGTTTTTGTTGTTGCTGATGCTGTGTGATTCAGACTTCTCAGCCTAACCAGGAAGAGTAAGTGGAAATGGTAGATGAAGAAGGGGTAGAGCTGGTGTATCTATAACTTTCTGATATTTGTCTGCCAAACTTGATATATTAGTAATTTTTTTATCTTTAGCTAAGATCAAGTCACCCCTGAAACAACAGGAGATTCTAGTTTTAAAATAAGGCCACAAAAATCCTTACGGAATGAAGAATGGCACCCCAGTTGGTTGTATAAGTCTCATAAGATAATGATGTTGATTTTAAATATGGATGTCTCAATGCCTGTTTTCTATCAATGATTTGTTTGTTTCCAAGGTCGGGGAGGGAAAGAGGGGAGGGTTTATCTGTTTTAGAAAGTCTCAGAATACTTATAAAATACAGAAGTAGTTATTAAAATATATAGGACCTCACATAGGTAGATACAGAACTTACCATTGAGGCTGATGGGCTGTTGTGTGAATCACACAGGACCTTAAATGAGGCTCATTATTCTCACACACCAAAATGACTCTGACAGCCTGAAGCAGTTATTGCTAGAGCCCAAGCTTTCCTTGGAGGTTTTGGAGTTAGGTTGATTGGAAGTAACCAGCTAATACCTTTTCTAGTGGAGAAAAAGACATTGCTACCAGCTTGTTCATCCCATAGAAGTCTTCCACTCTGCTCCATTTTTAGCAGCAAGCATTTCATGTAGCATAAACCTTGGCAGATAAGTGTGCCTAAGGTTTATACAGTCTGTCCGCTTGGATGTATACAAATTTAGATACATATTTTAACATGTGTTCTCATAGATGACTTTATAACAACACACATTACCTATAGGTGTCTAGACTGTGTACATACAAGTGTGTACAGACAAGCTTCATACGTATATACTGTAATCCGTTACAACAAATAAATTTTAAATCATCGTTTAACATGTATGTGGTACTTCTACAGTGTACA
TTGTTTTCATTATTTATTGTAACATTGAAAACCACAGTGCAGGGAAAACAAAAGTATCCCAGCATCTTCATCCTGTACACTTGGAATTAATTTCATTTGGGCATATCCAAGATAAACTCAACTTTCAAGAAATCTTGTATATTATTTAATCATCTGTGTTAGGATGACACCTATGATTGATGACTTCGGTTGAATAGCTTTATTCTGGATTTTTCATAACTAAAGCTAAATCCAAAGACCTGAAAAAGGACAAAAAGAAAAAAAAAAAAAGAAA\"\n",
     "a = a.upper()\n",
     "a = a.replace(\"\\n\",\"\")\n",
     "       \n",
@@ -743,19 +559,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 118,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "MMRLRGSGMLRDLLLRSPAGVSATLRRAQPLVTLCRRPRGGGRPAAGPAAAARLHPWWGGGGWPAEPLARGLSSSPSEILQELGKGSTHPQPGVSPPAAPAAPGPKDGPGETDAFGNSEGKELVASGENKIKQGLLPSLEDLLFYTIAEGQEKIPVHKFITALKSTGLRTSDPRLKECMDMLRLTLQTTSDGVMLDKDLFKKCVQSNIVLLTQAFRRKFVIPDFMSFTSHIDELYESAKKQSGGKVADYIPQLAKFSPDLWGVSVCTVDGQRHSTGDTKVPFCLQSCVKPLKYAIAVNDLGTEYVHRYVGKEPSGLRFNKLFLNEDDKPHNPMVNAGAIVVTSLIKQGVNNAEKFDYVMQFLNKMAGNEYVGFSNATFQSERESGDRNFAIGYYLKEKKCFPEGTDMVGILDFYFQLCSIEVTCESASVMAATLANGGFCPITGERVLSPEAVRNTLSLMHSCGMYDFSGQFAFHVGLPAKSGVAGGILLVVPNVMGMMCWSPPLDKMGNSVKGIHFCHDLVSLCNFHNYDNLRHFAKKLDPRREGGDQRVKSVINLLFAAYTGDVSALRRFALSAMDMEQRDYDSRTALHVAAAEGHVEVVKFLLEACKVNPFPKDRWNNTPMDEALHFGHHDVFKILQEYQVQYTPQGDSDNGKENQTVHKNLDGLL\n",
-      "MVSC\n",
-      "MCSEQHCFVDTSI\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "def translate(seq,orf):\n",
     "    seq = seq.upper()\n",
@@ -806,7 +612,7 @@
     "            \n",
     "    return protein\n",
     "\n",
-    "a = \"AGTGCGGAGCCTTAGGCGGAGCGAAGAGAACCGGTCGCGGCAATCCTAGCGCGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCACCCGCATCCGCTGCGGGAGTCCGAGCCGGAACCACACCCAAGTAGCTGCCCTTTCCTCTTCTGTCATCTCACCGCCCCACCACAGACCGCGTTCCCCGAGGAAACCGGCCGCCCACGCCCGGAGCATCCTCCCCTGTTGAGCGGGCGCTGACGGACCCGGCGGCATGATGCGGCTGCGAGGCTCGGGGATGCTGCGGGACCTGCTCCTGCGGTCGCCCGCCGGCGTGAGCGCGACTCTGCGGCGGGCACAGCCCTTGGTCACCCTGTGCCGGCGTCCCCGAGGCGGGGGACGGCCGGCCGCGGGCCCGGCTGCCGCCGCGCGACTCCACCCGTGGTGGGGCGGGGGCGGCTGGCCGGCGGAGCCCCTCGCGCGGGGCCTGTCCAGCTCTCCTTCGGAGATCTTGCAGGAGCTGGGCAAGGGGAGCACGCATCCGCAGCCCGGGGTGTCGCCACCCGCTGCCCCGGCGGCGCCCGGCCCCAAGGACGGCCCCGGGGAGACGGACGCGTTTGGCAACAGCGAGGGCAAAGAGCTGGTGGCCTCAGGTGAAAATAAAATAAAACAGGGTCTGTTACCTAGCTTGGAAGATTTGCTGTTCTATACAATTGCTGAAGGACAAGAGAAAATACCTGTTCATAAATTTATTACAGCACTCAAATCTACAGGATTGCGAACGTCTGATCCCAGGTTGAAAGAGTGTATGGATATGTTAAGATTAACTCTTCAAACAACATCAGATGGTGTCATGCTAGACAAAGATCTTTTTAAAAAATGTGTTCAGAGCAACATTGTTTTGTTGACACAAGCATTTAGAAGAAAGTTTGTGATTCCTGACTTTATGTCTTTTACCTCACACATTGATGAGTTATATGAAAGTGCTAAAAAGCAGTCTGGAGGAAAGGTTGCAGATTATATTCCTCAACTGGCCAAATTCAGTCCCGATTTGTGGGGTGTGTCTGTTTGTACAGTAGATGGACAGAGGCATTCTACTGGAGATACCAAAGTTCCCTTCTGTCTTCAGTCCTGTGTAAAACCTTTGAAATATGCCATTGCTGTTAATGATCTTGGAACTGAATATGTGCATCGATATGTTGGAAAAGAGCCGAGTGGACTAAGATTCAACAAACTATTTTTGAATGAAGATGATAAACCACATAATCCTATGGTAAATGCTGGAGCAATTGTTGTGACTTCACTAATAAAGCAAGGAGTAAATAATGCTGAAAAATTTGACTATGTCATGCAGTTTTTGAATAAGATGGCTGGTAATGAATATGTTGGATTCAGTAATGCAACGTTTCAGTCTGAAAGAGAAAGTGGAGATCGAAATTTTGCAATAGGATATTACTTAAAAGAAAAGAAGTGTTTTCCAGAAGGCACAGACATGGTTGGTATATTAGACTTCTACTTCCAGCTGTGCTCCATTGAAGTGACTTGTGAATCAGCCAGTGTGATGGCTGCGACACTGGCTAATGGTGGTTTCTGCCCAATTACTGGTGAAAGAGTACTGAGCCCTGAAGCAGTTCGAAATACATTGAGTTTGATGCATTCCTGTGGCATGTATGACTTCTCAGGGCAGTTTGCTTTCCATGTTGGTCTTCCTGCAAAATCTGGAGTTGCTGGGGGCATTCTTTTAGTTGTCCCCAATGTTATGGGTATGATGTGCTGGTCTCCTCCTCTGGATAAGATGGGCAACAGTGTTAAGGGAATTCACTTTTGTCACGATCTTGTTTCTCTGTGTAATTTCCATAACTATGATAATTTGAGACACTTTGCAAAAAAACTTGATCCTCGAAGAGAAGGTGGTGATCAAAGGGTAAAGTCAGTGATAAATCTTTTGTTTGCTGCATATACTGGAGATGTGTCTGCACTTCGAAGATTTGCTTTGTC
AGCTATGGACATGGAACAGCGGGACTATGATTCTAGAACAGCACTCCATGTAGCTGCTGCAGAGGGTCATGTTGAAGTTGTTAAATTTTTGCTGGAAGCCTGCAAAGTAAACCCTTTCCCCAAGGACAGGTGGAATAACACTCCCATGGATGAAGCACTGCACTTTGGACACCATGATGTATTTAAAATTCTCCAAGAATACCAAGTCCAGTACACACCTCAAGGAGATTCTGACAACGGGAAGGAAAATCAAACCGTCCATAAGAATCTTGATGGATTGTTGT\"\n",
+    "a = \"TGGACAGCTCCCGCTCACCCAAACAGAAGACGTCGGCGCCGGAGCGGGCTCGGACATGGCGAGGCTGCGAGCCGGCCCGAGCGGCGGGGCCCGGTGATCCCTCCCTCCCTCCCCGTCCCCTCCCCTCTCCCGCACGCACGCCCCGTCCGCCCCCACCCCGCCCCCACCCCGGGCGAGCCCGCCCGCAGCCCGGGGCGCACACCCGCACGCGCACTCCTCTCCACTCACTCCCGCGCCCGCCCCCACTCCCGCAGCCGAGCCCCGCCACGCGCGCCTTGCCCGCCCGCCGGCCGCCCCCGCCGCCCCCGCCGCCCCCGGGCCCTGATGGACTGAATGAAGGCTGCCTACACCGCCTATCGATGCCTCACCAAAGACCTAGAAGGCTGCGCCATGAACCCGGAGCTGACAATGGAAAGTCTGGGCACTTTGCACGGGCCGGCCGGCGGCGGCAGTGGCGGGGGCGGCGGCGGGGGCGGCGGGGGCGGCGGCGGGGGCCCGGGCCATGAGCAGGAGCTGCTGGCCAGCCCCAGCCCCCACCACGCGGGCCGCGGCGCCGCTGGCTCGCTGCGGGGCCCTCCGCCGCCTCCAACCGCGCACCAGGAGCTGGGCACGGCGGCAGCGGCGGCAGCGGCGGCGTCGCGCTCGGCCATGGTCACCAGCATGGCCTCGATCCTGGACGGCGGCGACTACCGGCCCGAGCTCTCCATCCCGCTGCACCACGCCATGAGCATGTCCTGCGACTCGTCTCCGCCTGGCATGGGCATGAGCAACACCTACACCACGCTGACACCGCTCCAGCCGCTGCCACCCATCTCCACCGTGTCTGACAAGTTCCACCACCCTCACCCGCACCACCATCCGCACCACCACCACCACCACCACCACCAGCGCCTGTCCGGCAACGTCAGCGGCAGCTTCACCCTCATGCGCGACGAGCGCGGGCTCCCGGCCATGAACAACCTCTACAGTCCCTACAAGGAGATGCCCGGCATGAGCCAGAGCCTGTCCCCGCTGGCCGCCACGCCGCTGGGCAACGGGCTAGGCGGCCTCCACAACGCGCAGCAGAGTCTGCCCAACTACGGTCCGCCGGGCCACGACAAAATGCTCAGCCCCAACTTCGACGCGCACCACACTGCCATGCTGACCCGCGGTGAGCAACACCTGTCCCGCGGCCTGGGCACCCCACCTGCGGCCATGATGTCGCACCTGAACGGCCTGCACCACCCGGGCCACACTCAGTCTCACGGGCCGGTGCTGGCACCCAGTCGCGAGCGGCCACCCTCGTCCTCATCGGGCTCGCAGGTGGCCACGTCGGGCCAGCTGGAAGAAATCAACACCAAAGAGGTGGCCCAGCGCATCACAGCGGAGCTGAAGCGCTACAGTATCCCCCAGGCGATCTTTGCGCAGAGGGTGCTGTGCCGGTCTCAGGGGACTCTCTCCGACCTGCTCCGGAATCCAAAACCGTGGAGTAAACTCAAATCTGGCAGGGAGACCTTCCGCAGGATGTGGAAGTGGCTTCAGGAGCCCGAGTTCCAGCGCATGTCCGCCTTACGCCTGGCAGCGTGCAAACGCAAAGAGCAAGAACCAAACAAAGACAGGAACAATTCCCAGAAGAAGTCCCGCCTGGTGTTCACTGACCTCCAACGCCGAACACTCTTCGCCATCTTCAAGGAGAACAAACGCCCGTCAAAGGAGATGCAGATCACCATTTCCCAGCAGCTGGGCCTGGAGCTCACAACCGTCAGCAACTTCTTCATGAACGCCCGGCGCCGCAGCCTGGAGAAGTGGCAAGACGATCTGAGCACAGGGGGCTCCTCGTCCACCTCCAGCACGTGTACCAAAGCATGATGGAAGGACTCTCACTTGGGCACAAGTCACCTCCAAATGAGGACAACAGATACCAAAAGAAAACAAAGGAAAAAGACACCGGATTCCTAGCTGGGGCCCTTCACTGGTGATTTGAAAGCACAATTCTCTTGCAAAGAAAC
TTATATTCTAGCTGTAATCATAGGCCAGGTGTTCTTCTTTTGTTTTTAATGGCTATGGAGTCCAAGTGCAAGCTGAAAAATTAATCTCTTAGAACCAGACACTGTTCTCTGAGCATGCTAAGCATCCCAGAAACCCAAATGGGGCCTTCCTGGAGCGAGTTAATTCCAGTATGGTGTCAACCAAGCTCGGGATTGCTTAAAATATCATCCATCCCACTTCAGGTCCTGTCAGCTTCTTGCAGTCAGAGTTCCTATGAGTAACAATAGGAGTTTGGCCTATGTAAGGACTCTGAGTTTAGGCTTCCAAGATACAACAATAAGAGAAGAATCTAGCAACGAGAATGACCTCATTTGCTTTCCACATGCTTAGCCTCATTATACCATGTTATGTCCAAGTTCACAGCCACAACATCAGAATGGTAATTACTGAGCACAAGTTTTAAATATGGACGTTAAAAAAAAAAATCCAAGGACCTGTTTTTCCAACCCAGACATCTTTTCATTGAATGATTTAGAAAGCTTTAAGTTGATCCAGCTTACAATTTTTTTTTTCTTTACCTCCTGGAAATCTCATATGGTCTTGGATCCGTCAAAAAAACCAGTCAGTTCACTTGCGCTCAAAGTATCAAGCACAACAAAGATAAACAGAAGTGAGGAAGGTTCTGGGTTCACTACATCTGGATTTTCAAGACACCTATTGTGAAGTCATTAGGGAATTGATGAGAATATGGCTTCAAGCACATTTTGCAGTTTGCTACAAATTCTGTTGTACATAATGCAGACGCACACTCAGGAGGCCAATTTAACTGTTAACAGTGCATGGAGCGAATGCAGCATTTTAAAAGATCTAGGTTTTTTTAGGTCATTAATGTGTCCTTGGTTGATCAGTCATCTGGTCCCTCCTACTGTGTGTTATGACCACCACGTAATCCATTCTCGCTCTTTCTGATTTGGGGTTTTTCCTCATCCATCCCATTAGTAGGGATGTTTTCTGTGTTTTCTAGCAAGAAAAAAAAATCAATCAATCAAACCTGCATACATGTTACTCATGACTGTCATCTAGTCCTAAATCTCTTCTGTTGTTGAATCATCCTTGCAAAACAGCTGAATACATCTGGAGAAAACACAGCACACCAAAGAAGCAGAATACTGCAAACCAAAGACATTTATGACTTGTCATTTTCTAGCCTAAAAATACTGTGATTACTTTTAGAAATCAGAAAACCTCTGCAACTCCGAATGGCATTCAGCTCTTGCATTTGGCGCATCATCGGGCTGAGCGGACCAGCTACACCAAGGACATTAGCCAAGCCACCCAGAGGGGTGGCTTTGCCACACCAGTTGTCACCTTCCCATAGCAAGTGGAAGAGCGCCCACAGAACTCTGGGAGATTGCAAAGGTCACAATGTGCATATTTACCAGTGAATGGCCCCGGGTGGGGCCACGTGGGGGTGTTCAAAGCAAGCCAAACGCTGCAATCATTCTTTACAGACACTTGAGACTGACTTTTTTATGAATTACTTAGTCGAAACCAAAGAAACTTTTTCTGCACCTACTTCTGCAACAAACAAAACTGTCCCATTAAAATGAATAAATAAATCCGTAAATCAATGGAAATCACCACCAATAAGAAGGAAGCACGCCAGAAAATAAACGAAAACAAAAACAGGGAGACACACTGTGTTCAAACAGACCTCTTGGGACATTTTTTGGAAGCAGATTTTAAAGAAAGGGTTGAGACAAAGATAGAAATAAGGAAGAGCCTCAGTGGCTGCTGCTTCATTTGACAACTCACACGGTAATCTTAAAGCTGAAGATTGTCTTTAATTTGTGCCTATGCAGTTTTTCAAAAGAACACGGAACAGAGCAACAGAAACCTCAACAGCTACAATACCAAAGATGAGGATTTCTCACACCTTTTGTTTCAGTTCATTATCTCCTCTTGCCTGGCTAAAATACTAATAGCGCCATTGAACTGTATAAAGGTAATCAATTATGTT
TCTCTGAGCAACAAAAGGAAAGGGCCATTTATTTGATTTTATTGTTTCATTTCAATTTTGTCTTATGGTTTTTTGCCCCAACATGGAATCTCTCAAAAGTTTCCATGGACTCCAAGTTTAAGATGTTGGGATATTGAACAGTTCTCTCTGCTCAGCAGAGGGTAGGGAATAACATTATCACTTGAATGTTCTTTGCTTAACCCTTAGACTTGGTTCCTTCTATGTTCAGAGTCTCATCATCAGGGGAAGGAAAGGGAGTGAGGGTCAGGGATAGGGGTCTTGGTGATGCATCCTCTCCCGAGCCACAGAACCAAAGAGTTTATAGAGGAATTTACAGCCTCGTTTTCATGTGATTGCTACATCCTAACAGGGCTTCATTTGGGGGTGGGGGGAAACATGTAAAAATAATTGCCAGTTTCTACTTTTCTATTAGCTTTTTAAAAATCAGCTGTAAAGTTGCATTTCTAAAGAAAGATATATATAATATATAAAATACATATATAGATCAACTTGACATTGGTGATAACCAAAATTATTGCTGTCCAAATTCATGTCTTGTTTTGGTCCAGTGCTTCATTTGCTAAGTATTCGGTTCAGAATTTTTCTCATTTCTCATGCCATTCCAGAGTTAATTTGCCACTGTGGATGATTTGAAGTATTCAGATCTCTATGGAAGTTTCTGGGACAGGTTTAAAGTCAAGATCAAGCATTTTAGCATTTAACCTGTTGATAAATGGATCCATGGTGTACATGAGTTTTATTTGTATTCGGAGTCATCTCTATTCTATCCCTCAGCCTCGATTAAGGTGGTGAGTGAAGTGCATCCAACAGACTCGGCCCAGAACTGGGTCCTGACAGTGGGGTGCTCATCTTCTGTAACTGTTGGGAAGGCTCGGTGGTCCATTTTCACCAGTTAAAGAATATGAGGCCAGCCCAGAAATCTGTTCTCCAGGAGCTGCCCTGTCCCATCTGGGTGTGCCAGACCCCCTCAGTGAGCAGGTCCACCAAAGGGACTTCTCACAGGGGAAGCCCAACTCCTGTTGCAATGGGTTGATAGATTTCCTCAGGGTGGTAATTACCAATTCGTATTTTGACAAGCCTATGTGCAACCACAGCTGGCACTGGGGTGGGCAGTGGTGTTGGGTGGGATGGGGGAGAGTGTCTCAATCCTGAAGAGAAAATATAAAGCAGGTTTTGGGGAGACTTCTGGAGTCCTGCCCCTAGAGAGCCCCATTGTTGTTCTTTGTGCCCCCTCCTCATTCCCCCTATGTGGGTCTCCCTATGCAGGAGCTGTGAGAGAATGTGACTCTCCACAATTTTTATAATTCATCCTTCCTAGGAGATTGTTCATTGGCTCTTCCCTTGTGTCCCTTTGTCCCTTGCTCATACTCCATGTTTCCTTTGTCAAAGGACTAAGAAAAGAGCATATTTCAGCAGAGGAGTGTTCCCATGTGGGTTGATTTCAACTTGGGTATTTCTAAAAGAGTCCTTGTGACATGTGTCCAGTGGAAATGGTTGCTCTTTTCCAGACTGGATTGAGGAATGGAGCCTGTTTGATTTGGTTAGTGATTCTTTGACATACTAATCTCAGCGTTTGGGTCTCCAGCATCCTCTGAAGATGTCTAGACTAGTAGAGGCTGCCTTTGTGACCTGACATTACAACATTGGTCAAACCAGTCCTCTGATAATCAGAAGAACATGTCATAATTGTTTAAAAAAAAAAAAAAGGCAAGAATTTCTCTCCAAGGAGCTTTAATAAATGTCTCATTCCAGATAATGTCATACCAGAGAAAAGTGCTTGCTTTTAGAAAATTATTTACATACATATATAAATATATATGTGTATCTATACAGTTATGTATCAAAATTTTAAGCCCTGCAGAATTTCAATTTGTTAGAAATCTAACAGAAAAAAATTTCTATATTGAAAGGTAATAGAATTTAACCCAGTGAGTTTACTCAAGGATTTTTAAATTTAAGTTAATAATTTCAGAGAAAAT
AACCATTTGGGTGTGGTTATAGTTTAGTATCCATTACCTCAATCCAAGGAAAATTCCAGGCATTCCTCAACCATCAGGAAAAGGTACAGTGTGAAGGAACAGTTCTCAGCCAAATTTCACATTCTTGAGGCAACAGAAATCAAAACACTCAGAGCCATTGAGTGGAAAAACAATTTACTTTATTCCTTTACACAAATAGGCTTGCATTGTTTTTGTTTTAATGTGATTTTGGTACTAGGGATATAATTATTTCATTCCAGGAAATAATAAAAAAAAACAGACAGAGCCAATACATTTCTTTTTTTAAAGGAAACAGCAACAACAATAAAAACTCAGCACCAATATTTAAAAGCTTTTCCAAAATGTAAAAGAAGTGTTTAGCTTGCACCATGCATAAAGGTGCAGGCTAGTTGAACCAGGAAGCATGGCACTTCCTCTGGAGAAATCCAGAAAGAGTTGCTTCTAAGCTCCCTTTTCCCCCTGCAGGCTCTTGGCAATTGTAGGCTTTAGCAAATCCAGAATAATTTTCAATTCAAGCTAAAATAAAATCAACATTTGGAATGTAAATCTGATACACACACACTTTTCTAAGTCAAACAACATATTTCAAAACCAAAAATAAATACCTTTTAGATAATCAGTTATTTTCTTTGTCTATACTGGGCACCCACCTACTAGTGCCAGTAAATTCAAGTTGAACAGATTTTTAAAATCACTATTATCTGGGTATGGGGGAAACTTCCCCACTTTTGAAAATGTTGGTAGAATTATAGGAATGTCTGTTTGATTATCATTACCAAAGTGTCATGACAGTATGCCTTTGTAGTGAACTCGGATTTTCAGGAGTTTGAATAGTTGGATATTTTAAAATCTAAGAAGAAAAGGCCTGTTTCCAATGTTGTTGAAGAATAATGAACTCTATTAAAAAGTGGAGAAAAAGATAATACATGTGGTCAAGGTTGACCACAAGGCCCAGGCACAACTACCTTGGCGATAATCTTCTAGATTCGTAACAGGTTAGAGCTGACTTTTTGTTTTTGTTGTTGCTGATGCTGTGTGATTCAGACTTCTCAGCCTAACCAGGAAGAGTAAGTGGAAATGGTAGATGAAGAAGGGGTAGAGCTGGTGTATCTATAACTTTCTGATATTTGTCTGCCAAACTTGATATATTAGTAATTTTTTTATCTTTAGCTAAGATCAAGTCACCCCTGAAACAACAGGAGATTCTAGTTTTAAAATAAGGCCACAAAAATCCTTACGGAATGAAGAATGGCACCCCAGTTGGTTGTATAAGTCTCATAAGATAATGATGTTGATTTTAAATATGGATGTCTCAATGCCTGTTTTCTATCAATGATTTGTTTGTTTCCAAGGTCGGGGAGGGAAAGAGGGGAGGGTTTATCTGTTTTAGAAAGTCTCAGAATACTTATAAAATACAGAAGTAGTTATTAAAATATATAGGACCTCACATAGGTAGATACAGAACTTACCATTGAGGCTGATGGGCTGTTGTGTGAATCACACAGGACCTTAAATGAGGCTCATTATTCTCACACACCAAAATGACTCTGACAGCCTGAAGCAGTTATTGCTAGAGCCCAAGCTTTCCTTGGAGGTTTTGGAGTTAGGTTGATTGGAAGTAACCAGCTAATACCTTTTCTAGTGGAGAAAAAGACATTGCTACCAGCTTGTTCATCCCATAGAAGTCTTCCACTCTGCTCCATTTTTAGCAGCAAGCATTTCATGTAGCATAAACCTTGGCAGATAAGTGTGCCTAAGGTTTATACAGTCTGTCCGCTTGGATGTATACAAATTTAGATACATATTTTAACATGTGTTCTCATAGATGACTTTATAACAACACACATTACCTATAGGTGTCTAGACTGTGTACATACAAGTGTGTACAGACAAGCTTCATACGTATATACTGTAATCCGTTACAACAAATAAATTTTAAATCATCGTTTAACATGTATGTGGTACTTCTACAGTGTACA
TTGTTTTCATTATTTATTGTAACATTGAAAACCACAGTGCAGGGAAAACAAAAGTATCCCAGCATCTTCATCCTGTACACTTGGAATTAATTTCATTTGGGCATATCCAAGATAAACTCAACTTTCAAGAAATCTTGTATATTATTTAATCATCTGTGTTAGGATGACACCTATGATTGATGACTTCGGTTGAATAGCTTTATTCTGGATTTTTCATAACTAAAGCTAAATCCAAAGACCTGAAAAAGGACAAAAAGAAAAAAAAAAAAAGAAA\"\n",
     "print(translate(a,0))\n",
     "print(translate(a,1))\n",
     "print(translate(a,2))"
@@ -814,25 +620,23 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 160,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [
     {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "264 CGGCATGA MMRLRGSGMLRDLLLRSPAGVSATLRRAQPLVTLCRRPRGGGRPAAGPAAAARLHPWWGGGGWPAEPLARGLSSSPSEILQELGKGSTHPQPGVSPPAAPAAPGPKDGPGETDAFGNSEGKELVASGENKIKQGLLPSLEDLLFYTIAEGQEKIPVHKFITALKSTGLRTSDPRLKECMDMLRLTLQTTSDGVMLDKDLFKKCVQSNIVLLTQAFRRKFVIPDFMSFTSHIDELYESAKKQSGGKVADYIPQLAKFSPDLWGVSVCTVDGQRHSTGDTKVPFCLQSCVKPLKYAIAVNDLGTEYVHRYVGKEPSGLRFNKLFLNEDDKPHNPMVNAGAIVVTSLIKQGVNNAEKFDYVMQFLNKMAGNEYVGFSNATFQSERESGDRNFAIGYYLKEKKCFPEGTDMVGILDFYFQLCSIEVTCESASVMAATLANGGFCPITGERVLSPEAVRNTLSLMHSCGMYDFSGQFAFHVGLPAKSGVAGGILLVVPNVMGMMCWSPPLDKMGNSVKGIHFCHDLVSLCNFHNYDNLRHFAKKLDPRREGGDQRVKSVINLLFAAYTGDVSALRRFALSAMDMEQRDYDSRTALHVAAAEGHVEVVKFLLEACKVNPFPKDRWNNTPMDEALHFGHHDVFKILQEYQVQYTPQGDSDNGKENQTVHKNLDGLL 0.003137706\n"
+     "ename": "FileNotFoundError",
+     "evalue": "[Errno 2] No such file or directory: '/home/annaldas/projects/result/ONECUT2/ONECUT2_seq.fa'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mFileNotFoundError\u001b[0m                         Traceback (most recent call last)",
+      "\u001b[0;32m<ipython-input-9-28170ef6fa91>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m    125\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    126\u001b[0m \u001b[0mtranscripts_filename\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"/home/annaldas/projects/result/ONECUT2/ONECUT2_seq.fa\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 127\u001b[0;31m \u001b[0mtranscripts\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mSeqIO\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtranscripts_filename\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"fasta\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    128\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    129\u001b[0m \u001b[0mgene\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"ONECUT2\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m/pkg/python-3.7.4-0/lib/python3.7/site-packages/Bio/SeqIO/__init__.py\u001b[0m in \u001b[0;36mindex\u001b[0;34m(filename, format, alphabet, key_function)\u001b[0m\n\u001b[1;32m    951\u001b[0m     \u001b[0mrepr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"SeqIO.index(%r, %r, alphabet=%r, key_function=%r)\"\u001b[0m\u001b[0;31m \u001b[0m\u001b[0;31m\\\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    952\u001b[0m         \u001b[0;34m%\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mfilename\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mformat\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0malphabet\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkey_function\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 953\u001b[0;31m     return _IndexedSeqFileDict(proxy_class(filename, format, alphabet),\n\u001b[0m\u001b[1;32m    954\u001b[0m                                key_function, repr, \"SeqRecord\")\n\u001b[1;32m    955\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m/pkg/python-3.7.4-0/lib/python3.7/site-packages/Bio/SeqIO/_index.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, filename, format, alphabet)\u001b[0m\n\u001b[1;32m    183\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0m__init__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfilename\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mformat\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0malphabet\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    184\u001b[0m         \u001b[0;34m\"\"\"Initialize the class.\"\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 185\u001b[0;31m         \u001b[0mSeqFileRandomAccess\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__init__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfilename\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mformat\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0malphabet\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    186\u001b[0m         marker = {\"ace\": b\"CO \",\n\u001b[1;32m    187\u001b[0m                   \u001b[0;34m\"embl\"\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34mb\"ID \"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m/pkg/python-3.7.4-0/lib/python3.7/site-packages/Bio/SeqIO/_index.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, filename, format, alphabet)\u001b[0m\n\u001b[1;32m     44\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0m__init__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfilename\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mformat\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0malphabet\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     45\u001b[0m         \u001b[0;34m\"\"\"Initialize the class.\"\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 46\u001b[0;31m         \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_handle\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_open_for_random_access\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilename\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     47\u001b[0m         \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_alphabet\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0malphabet\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     48\u001b[0m         \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_format\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mformat\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m/pkg/python-3.7.4-0/lib/python3.7/site-packages/Bio/File.py\u001b[0m in \u001b[0;36m_open_for_random_access\u001b[0;34m(filename)\u001b[0m\n\u001b[1;32m    130\u001b[0m     \u001b[0mIf\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mfile\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0mgzipped\u001b[0m \u001b[0mbut\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mBGZF\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0ma\u001b[0m \u001b[0mspecific\u001b[0m \u001b[0mValueError\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0mraised\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    131\u001b[0m     \"\"\"\n\u001b[0;32m--> 132\u001b[0;31m     \u001b[0mhandle\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilename\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"rb\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    133\u001b[0m     \u001b[0mmagic\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mhandle\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    134\u001b[0m     \u001b[0mhandle\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mseek\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '/home/annaldas/projects/result/ONECUT2/ONECUT2_seq.fa'"
      ]
-    },
-    {
-     "data": {
-      "text/plain": [
-       "'MMRLRGSGMLRDLLLRSPAGVSATLRRAQPLVTLCRRPRGGGRPAAGPAAAARLHPWWGGGGWPAEPLARGLSSSPSEILQELGKGSTHPQPGVSPPAAPAAPGPKDGPGETDAFGNSEGKELVASGENKIKQGLLPSLEDLLFYTIAEGQEKIPVHKFITALKSTGLRTSDPRLKECMDMLRLTLQTTSDGVMLDKDLFKKCVQSNIVLLTQAFRRKFVIPDFMSFTSHIDELYESAKKQSGGKVADYIPQLAKFSPDLWGVSVCTVDGQRHSTGDTKVPFCLQSCVKPLKYAIAVNDLGTEYVHRYVGKEPSGLRFNKLFLNEDDKPHNPMVNAGAIVVTSLIKQGVNNAEKFDYVMQFLNKMAGNEYVGFSNATFQSERESGDRNFAIGYYLKEKKCFPEGTDMVGILDFYFQLCSIEVTCESASVMAATLANGGFCPITGERVLSPEAVRNTLSLMHSCGMYDFSGQFAFHVGLPAKSGVAGGILLVVPNVMGMMCWSPPLDKMGNSVKGIHFCHDLVSLCNFHNYDNLRHFAKKLDPRREGGDQRVKSVINLLFAAYTGDVSALRRFALSAMDMEQRDYDSRTALHVAAAEGHVEVVKFLLEACKVNPFPKDRWNNTPMDEALHFGHHDVFKILQEYQVQYTPQGDSDNGKENQTVHKNLDGLL'"
-      ]
-     },
-     "execution_count": 160,
-     "metadata": {},
-     "output_type": "execute_result"
     }
    ],
    "source": [
@@ -888,13 +692,13 @@
     "    translating = True\n",
     "    aa = \"\"\n",
     "    \n",
+    "    in_utr = False\n",
+    "    for utr in utr_regions:\n",
+    "        start,stop = utr\n",
+    "        if ((start < i) and (i < stop)):\n",
+    "            in_utr = True\n",
+    "    \n",
     "    while(translating): \n",
-    "        in_utr = False\n",
-    "        for utr in utr_regions:\n",
-    "            start,stop = utr\n",
-    "            if ((start < i) and (i < stop)):\n",
-    "                in_utr = True\n",
-    "                    \n",
     "        if ((len(seq) < 3) or (in_utr)):\n",
     "            translating = False\n",
     "            aa = \"\"\n",
@@ -906,47 +710,80 @@
     "                aa += codon_table[codon]\n",
     "            seq = seq[3:]\n",
     "            i += 3\n",
-    "    return aa\n",
+    "    return aa,i\n",
     "\n",
+    "def find_utrs(seq,utr):\n",
+    "    pos = seq.find(utr)\n",
+    "    if (pos == -1):\n",
+    "        if (len(utr) > 20): \n",
+    "            for i in range(len(utr) - 1,len(utr)*5//10 - 1,-1):\n",
+    "                pos = seq.find(utr[:i])\n",
+    "    return pos\n",
     "\n",
     "def translate_aa_seq(seq,enst,gene_utrs):\n",
     "    utr_regions = []\n",
     "    for utr in gene_utrs[enst]:\n",
-    "        pos = seq.find(utr)\n",
+    "        pos = find_utrs(seq,utr)\n",
     "        if (pos != -1):\n",
     "            utr_regions.append([pos,pos + len(utr)])\n",
     "    \n",
-    "    longest_aa_seq = \"\"\n",
+    "    longest_aa_seq = \"M\"\n",
     "    longest_aa_seq_sc = 0\n",
+    "    longest_aa_seq_sc_end = 0\n",
     "    for i in range(len(seq)):\n",
     "        if (seq[i:i+3] == \"ATG\"):\n",
     "            sc = score(seq[i-4:i+4],0)\n",
-    "            aa = translate(seq[i:], i, utr_regions)\n",
-    "            if ((aa != \"\") and (sc > longest_aa_seq_sc) and (aa not in longest_aa_seq)):\n",
-    "                print(i,seq[i-4:i+4],aa,sc)\n",
+    "            aa,end = translate(seq[i:], i, utr_regions)\n",
+    "            #print(i,seq[i-4:i+4],aa,sc, end)\n",
+    "            if ((len(aa) > 20) and (sc > longest_aa_seq_sc) and (i > longest_aa_seq_sc_end)):\n",
     "                longest_aa_seq = aa\n",
     "                longest_aa_seq_sc = sc\n",
+    "                longest_aa_seq_sc_end = end\n",
     "    return (longest_aa_seq,longest_aa_seq_sc)\n",
     "\n",
-    "def find_all_aa_seqs(seq,enst):\n",
-    "    gene_utrs = determine_utrs(\"GLS\")\n",
+    "\n",
+    "\n",
+    "def translate_aa_seq_length(seq,enst,gene_utrs):\n",
+    "    utr_regions = []\n",
     "    \n",
-    "    if (enst in gene_utrs):\n",
-    "        longest_aa_seq,longest_aa_seq_sc = translate_aa_seq(seq,enst,gene_utrs)\n",
-    "    else:\n",
-    "        longest_aa_seq = \"\"\n",
-    "        longest_aa_seq_sc = 0\n",
-    "        for enst_id in gene_utr:\n",
-    "            aa_seq,aa_seq_sc = translate_aa_seq(seq,enst_id,gene_utrs)\n",
-    "            if (aa_seq_sc > longest_aa_seq_sc):\n",
-    "                longest_aa_seq = aa_seq\n",
-    "                longest_aa_seq_sc = aa_seq_sc\n",
+    "    longest_aa_seq = \"M\"\n",
+    "    for i in range(len(seq)):\n",
+    "        if (seq[i:i+3] == \"ATG\"):\n",
+    "            aa,end = translate(seq[i:], i, utr_regions)\n",
+    "            #print(i,seq[i-4:i+4],aa, end)\n",
+    "            if (len(aa) > len(longest_aa_seq)):\n",
+    "                longest_aa_seq  = aa\n",
+    "    return longest_aa_seq\n",
+    "\n",
+    "def find_all_aa_seqs(seq,enst,gene):\n",
+    "    gene_utrs = determine_utrs(gene)\n",
+    "    \n",
+    "    longest_aa_seq = translate_aa_seq_length(seq,enst,gene_utrs)\n",
+    "    if gene in gene_utrs:\n",
+    "        for utr in gene_utrs[gene]:\n",
+    "            if (find_utrs(seq,utr) != -1):\n",
+    "                longest_aa_seq,longest_aa_seq_sc = translate_aa_seq(seq,enst,gene_utrs)\n",
     "            \n",
     "    return longest_aa_seq\n",
     "            \n",
-    "enst = \"ENST00000320717\"\n",
-    "a=\"AGTGCGGAGCCTTAGGCGGAGCGAAGAGAACCGGTCGCGGCAATCCTAGCGCGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCACCCGCATCCGCTGCGGGAGTCCGAGCCGGAACCACACCCAAGTAGCTGCCCTTTCCTCTTCTGTCATCTCACCGCCCCACCACAGACCGCGTTCCCCGAGGAAACCGGCCGCCCACGCCCGGAGCATCCTCCCCTGTTGAGCGGGCGCTGACGGACCCGGCGGCATGATGCGGCTGCGAGGCTCGGGGATGCTGCGGGACCTGCTCCTGCGGTCGCCCGCCGGCGTGAGCGCGACTCTGCGGCGGGCACAGCCCTTGGTCACCCTGTGCCGGCGTCCCCGAGGCGGGGGACGGCCGGCCGCGGGCCCGGCTGCCGCCGCGCGACTCCACCCGTGGTGGGGCGGGGGCGGCTGGCCGGCGGAGCCCCTCGCGCGGGGCCTGTCCAGCTCTCCTTCGGAGATCTTGCAGGAGCTGGGCAAGGGGAGCACGCATCCGCAGCCCGGGGTGTCGCCACCCGCTGCCCCGGCGGCGCCCGGCCCCAAGGACGGCCCCGGGGAGACGGACGCGTTTGGCAACAGCGAGGGCAAAGAGCTGGTGGCCTCAGGTGAAAATAAAATAAAACAGGGTCTGTTACCTAGCTTGGAAGATTTGCTGTTCTATACAATTGCTGAAGGACAAGAGAAAATACCTGTTCATAAATTTATTACAGCACTCAAATCTACAGGATTGCGAACGTCTGATCCCAGGTTGAAAGAGTGTATGGATATGTTAAGATTAACTCTTCAAACAACATCAGATGGTGTCATGCTAGACAAAGATCTTTTTAAAAAATGTGTTCAGAGCAACATTGTTTTGTTGACACAAGCATTTAGAAGAAAGTTTGTGATTCCTGACTTTATGTCTTTTACCTCACACATTGATGAGTTATATGAAAGTGCTAAAAAGCAGTCTGGAGGAAAGGTTGCAGATTATATTCCTCAACTGGCCAAATTCAGTCCCGATTTGTGGGGTGTGTCTGTTTGTACAGTAGATGGACAGAGGCATTCTACTGGAGATACCAAAGTTCCCTTCTGTCTTCAGTCCTGTGTAAAACCTTTGAAATATGCCATTGCTGTTAATGATCTTGGAACTGAATATGTGCATCGATATGTTGGAAAAGAGCCGAGTGGACTAAGATTCAACAAACTATTTTTGAATGAAGATGATAAACCACATAATCCTATGGTAAATGCTGGAGCAATTGTTGTGACTTCACTAATAAAGCAAGGAGTAAATAATGCTGAAAAATTTGACTATGTCATGCAGTTTTTGAATAAGATGGCTGGTAATGAATATGTTGGATTCAGTAATGCAACGTTTCAGTCTGAAAGAGAAAGTGGAGATCGAAATTTTGCAATAGGATATTACTTAAAAGAAAAGAAGTGTTTTCCAGAAGGCACAGACATGGTTGGTATATTAGACTTCTACTTCCAGCTGTGCTCCATTGAAGTGACTTGTGAATCAGCCAGTGTGATGGCTGCGACACTGGCTAATGGTGGTTTCTGCCCAATTACTGGTGAAAGAGTACTGAGCCCTGAAGCAGTTCGAAATACATTGAGTTTGATGCATTCCTGTGGCATGTATGACTTCTCAGGGCAGTTTGCTTTCCATGTTGGTCTTCCTGCAAAATCTGGAGTTGCTGGGGGCATTCTTTTAGTTGTCCCCAATGTTATGGGTATGATGTGCTGGTCTCCTCCTCTGGATAAGATGGGCAACAGTGTTAAGGGAATTCACTTTTGTCACGATCTTGTTTCTCTGTGTAATTTCCATAACTATGATAATTTGAGACACTTTGCAAAAAAACTTGATCCTCGAAGAGAAGGTGGTGATCAAAGGGTAAAGTCAGTGATAAATCTTTTGTTTGCTGCATATACTGGAGATGTGTCTGCACTTCGAAGATTTGCTTTGTCAG
CTATGGACATGGAACAGCGGGACTATGATTCTAGAACAGCACTCCATGTAGCTGCTGCAGAGGGTCATGTTGAAGTTGTTAAATTTTTGCTGGAAGCCTGCAAAGTAAACCCTTTCCCCAAGGACAGGTGGAATAACACTCCCATGGATGAAGCACTGCACTTTGGACACCATGATGTATTTAAAATTCTCCAAGAATACCAAGTCCAGTACACACCTCAAGGAGATTCTGACAACGGGAAGGAAAATCAAACCGTCCATAAGAATCTTGATGGATTGTTGTAATGGTCTCAAATCCCAAGATTTAAATCACTTACCTATTTAATTGTGGAAAATGATTATGAAGAACATGTGTATTTCTATCTGGTAGTGATGTATATTTTACATTTGTCATTTCAGTGTTACTGGAGTTTTCTTCATTGTGCACACAGGACAAATCTGATCTCTTTGGGAAAAAATAGAAATAAAACAATCTCCCTCCATAATGTGAGCAATATTACCTCGTGCATTGTATAATTTGATGTAAAAGAAATAGTTACCAATGCTAGCTTGTGTGGTCTTCCATGATTTATTTGTGTTTTGTGAATTTTCAATTTATGGTGATGATCTGCTGATATGCATTTATAAAGTAAGCTCTGTTGTACAGTCTGTCCAAATGGGTCAAGGTTGCCTTTAGAAGCAAATAGTGTGATTTTCAAGACTTCAAATACAAATTTAGTTTAAGTGTTTGAACAACTATATGCACTTACGGTTGTGTGTTTAAAATGTCTCTCTCACCCCCTAGCTTCATGATGTGACTCTTAAAAAACTATAATAGTTAACAACTGTTAGTAAGATAGACCAATTCTGATTAGACTTTATCAGGGAATCTGTTTAAGATATGTTTGGTGACCAAAACGTATGTGTGAATGTAGTTATAATGCTTTTGAAAAATTTTCCTTTTTCTATATCCCCTTAGTCCAGCCTCTCTTCTCAGACATTTAGCTATCTGCCTCTTTCCTTTAGCTGGGAAAGTGAGAGCTGGCATACTATGCAGTTTTTATGTTTTCCATAGTAAGTCAGAAAATGCCTCCTATTTCTGGCATCAGAACTTTGCCATTTGTCTACAGAAGACGAACCAGAGACAAAATTACTAAGTATAAATTAGTCAAGTTTATCAGTCTAAAAAACGAAGGGATGTGCAACTGCAGCTCTTTAAGAAGTTTTTTTTTTTTAGCTTCTAGGGTAAAGATAAATTCAGAAATGCTCTAAGCTACCAAAGTTATTCTGAAAGTATGGGAACTGCTACAACTAACAAACATTTGTTTCCAAGCCTGTCATTAAGAGTCTGCATCAAGAGATTTGTCCTCCTTGGGGGACCACTGGATCATTCCAGATTTCTTGTGATTTTTCTATTGTGTAATTCTTGGTGGGCTCTGTAGTTTAATAATAAGAAAAAGGCCATTTCATTTTAAATTGTGACCTATAATTCTTTGTCTTGGGTTGGTAATTCAGGATTCATTTGGAAAGTGGGTAAAAGGGGCTTCAAAAAACGGATAGAACAGGATTTTCTAGGAGTTACACATACATTTTATCCTGTCATACCTCGAGATAAAGTGGCATGTTAGTGAGGAGTTCTGATATTAAGCACACACACACATGCACACAAATGGACTTCTCTGAAGCTGTGTTTAGTGAAATGAGCTCAAGTACATGAATGTTAGTTGTTATCACATACAGCAAATTCCTTTTTTTTTCTTTTTCTATGAGCACACTCTGCTGCTTCTAAACTTTACATGCCTGATGGCACCTTACTCCAGCAGCCTCCAGGTGCTTTCATTTTCACTTCCAGTCTAAGCCAGTGGCTCCTGCCACTGCCCTCCCATTACCTAGATGGCACCTCCTTTGGTGAAACCACGGCCAATGTTCCTTAGCTGCACCAGGCCCGAAGCTGTTCCCATGCTTGAGCTTCCATGGGGAGGATGCTGAGTGAGCAGTTTCCTACCCCGTGGATCTAGCAAG
CCATGGAGACAGGTAGCATTTGTAAGATGCTGCACAGGAGCAGCATTATCCCCAAAGATATTACAGGGTAGACACGTTTTAACTGAAATCAATCAAGATAACTTTATTCAAAGAGCAGCCCGCTTTGTGTGACTAAAATGAAACAAGACAGTTGAATTGTGTGACTTGAAGATTACCAATGATTTTGAGGCTTTTCTATAATAAAAAGAGGTTCTAACCATTATTTGGGAACAAAGAGAGTTTTCATCTTTTTTCAGATCAAAACCATTCTGTAAAATCTTTGTTGTTTAATTAAATGTGCCGTTATTTACCCCTGATGTTATTTATGACTATGTGCCGATTCCTGCTCGGGCTGTTTGCTGTTGGCTGGTAATAATATATTTGATTTAAATGCTGTTGACTGTGCTATTAACTGCTGCCGTCAGTAAACTCCAAAGATCTTTTTGTTTTGGCTTTAGTATCATATGTGCTTTTTCTGTATCCTGAGCGCTCTATATGATCATGTTAATTTAAAGCTTTATACACATTGTTGTTTTTGCTGGTCTCATCTTTGGTAATATGCTATACCCCACTGCTGCCCGACACTGCCCTTTAGCTGCAGAGCTGGATTAGCTGTTGACCATTTGATGCTGTTGTCTGTCTGGCAGGGACTGAATGACCTGATGTCAGATTTAGATTCTTCCTGGGGATTACACAGCTATGAATGTATTTGCTTCTAAAACCTCCCAAAGTGAATCTAATCTTAAAACTACAAGTTGTAAGTATTCTGAAATTGGGAAACATTTATTTTAAATGCAATCAGGTAGTGTTGCTTTTTACAGCATAATAAATATATGTATCAAAAAAAAAA\"\n",
-    "find_all_aa_seqs(a,enst)"
+    "transcripts_filename = \"/home/annaldas/projects/result/ONECUT2/ONECUT2_seq.fa\"\n",
+    "transcripts = SeqIO.index(transcripts_filename, \"fasta\")\n",
+    "\n",
+    "gene = \"ONECUT2\"\n",
+    "\n",
+    "for transcript in transcripts:\n",
+    "    seq = str(transcripts[transcript].seq).strip()\n",
+    "    enst = str(transcripts[transcript].id).split(\"|\")[-1].strip()\n",
+    "    protein = find_all_aa_seqs(seq,enst,gene)\n",
+    "    transcript_name =  str(transcripts[transcript].id)\n",
+    "    transcript_name =  str(transcripts[transcript].id)\n",
+    "    transcript_filename = transcript_name.replace(\"|\",\"_\")\n",
+    "    transcript_filename = transcript_filename.replace(\"_\",\"\")\n",
+    "    print(transcript_name,transcript_filename,protein)\n",
+    "    \n",
+    "#enst = \"enst\"\n",
+    "#a=\"TGGACAGCTCCCGCTCACCCAAACAGAAGACGTCGGCGCCGGAGCGGGCTCGGACATGGCGAGGCTGCGAGCCGGCCCGAGCGGCGGGGCCCGGTGATCCCTCCCTCCCTCCCCGTCCCCTCCCCTCTCCCGCACGCACGCCCCGTCCGCCCCCACCCCGCCCCCACCCCGGGCGAGCCCGCCCGCAGCCCGGGGCGCACACCCGCACGCGCACTCCTCTCCACTCACTCCCGCGCCCGCCCCCACTCCCGCAGCCGAGCCCCGCCACGCGCGCCTTGCCCGCCCGCCGGCCGCCCCCGCCGCCCCCGCCGCCCCCGGGCCCTGATGGACTGAATGAAGGCTGCCTACACCGCCTATCGATGCCTCACCAAAGACCTAGAAGGCTGCGCCATGAACCCGGAGCTGACAATGGAAAGTCTGGGCACTTTGCACGGGCCGGCCGGCGGCGGCAGTGGCGGGGGCGGCGGCGGGGGCGGCGGGGGCGGCGGCGGGGGCCCGGGCCATGAGCAGGAGCTGCTGGCCAGCCCCAGCCCCCACCACGCGGGCCGCGGCGCCGCTGGCTCGCTGCGGGGCCCTCCGCCGCCTCCAACCGCGCACCAGGAGCTGGGCACGGCGGCAGCGGCGGCAGCGGCGGCGTCGCGCTCGGCCATGGTCACCAGCATGGCCTCGATCCTGGACGGCGGCGACTACCGGCCCGAGCTCTCCATCCCGCTGCACCACGCCATGAGCATGTCCTGCGACTCGTCTCCGCCTGGCATGGGCATGAGCAACACCTACACCACGCTGACACCGCTCCAGCCGCTGCCACCCATCTCCACCGTGTCTGACAAGTTCCACCACCCTCACCCGCACCACCATCCGCACCACCACCACCACCACCACCACCAGCGCCTGTCCGGCAACGTCAGCGGCAGCTTCACCCTCATGCGCGACGAGCGCGGGCTCCCGGCCATGAACAACCTCTACAGTCCCTACAAGGAGATGCCCGGCATGAGCCAGAGCCTGTCCCCGCTGGCCGCCACGCCGCTGGGCAACGGGCTAGGCGGCCTCCACAACGCGCAGCAGAGTCTGCCCAACTACGGTCCGCCGGGCCACGACAAAATGCTCAGCCCCAACTTCGACGCGCACCACACTGCCATGCTGACCCGCGGTGAGCAACACCTGTCCCGCGGCCTGGGCACCCCACCTGCGGCCATGATGTCGCACCTGAACGGCCTGCACCACCCGGGCCACACTCAGTCTCACGGGCCGGTGCTGGCACCCAGTCGCGAGCGGCCACCCTCGTCCTCATCGGGCTCGCAGGTGGCCACGTCGGGCCAGCTGGAAGAAATCAACACCAAAGAGGTGGCCCAGCGCATCACAGCGGAGCTGAAGCGCTACAGTATCCCCCAGGCGATCTTTGCGCAGAGGGTGCTGTGCCGGTCTCAGGGGACTCTCTCCGACCTGCTCCGGAATCCAAAACCGTGGAGTAAACTCAAATCTGGCAGGGAGACCTTCCGCAGGATGTGGAAGTGGCTTCAGGAGCCCGAGTTCCAGCGCATGTCCGCCTTACGCCTGGCAGCGTGCAAACGCAAAGAGCAAGAACCAAACAAAGACAGGAACAATTCCCAGAAGAAGTCCCGCCTGGTGTTCACTGACCTCCAACGCCGAACACTCTTCGCCATCTTCAAGGAGAACAAACGCCCGTCAAAGGAGATGCAGATCACCATTTCCCAGCAGCTGGGCCTGGAGCTCACAACCGTCAGCAACTTCTTCATGAACGCCCGGCGCCGCAGCCTGGAGAAGTGGCAAGACGATCTGAGCACAGGGGGCTCCTCGTCCACCTCCAGCACGTGTACCAAAGCATGATGGAAGGACTCTCACTTGGGCACAAGTCACCTCCAAATGAGGACAACAGATACCAAAAGAAAACAAAGGAAAAAGACACCGGATTCCTAGCTGGGGCCCTTCACTGGTGATTTGAAAGCACAATTCTCTTGCAAAGAAACT
TATATTCTAGCTGTAATCATAGGCCAGGTGTTCTTCTTTTGTTTTTAATGGCTATGGAGTCCAAGTGCAAGCTGAAAAATTAATCTCTTAGAACCAGACACTGTTCTCTGAGCATGCTAAGCATCCCAGAAACCCAAATGGGGCCTTCCTGGAGCGAGTTAATTCCAGTATGGTGTCAACCAAGCTCGGGATTGCTTAAAATATCATCCATCCCACTTCAGGTCCTGTCAGCTTCTTGCAGTCAGAGTTCCTATGAGTAACAATAGGAGTTTGGCCTATGTAAGGACTCTGAGTTTAGGCTTCCAAGATACAACAATAAGAGAAGAATCTAGCAACGAGAATGACCTCATTTGCTTTCCACATGCTTAGCCTCATTATACCATGTTATGTCCAAGTTCACAGCCACAACATCAGAATGGTAATTACTGAGCACAAGTTTTAAATATGGACGTTAAAAAAAAAAATCCAAGGACCTGTTTTTCCAACCCAGACATCTTTTCATTGAATGATTTAGAAAGCTTTAAGTTGATCCAGCTTACAATTTTTTTTTTCTTTACCTCCTGGAAATCTCATATGGTCTTGGATCCGTCAAAAAAACCAGTCAGTTCACTTGCGCTCAAAGTATCAAGCACAACAAAGATAAACAGAAGTGAGGAAGGTTCTGGGTTCACTACATCTGGATTTTCAAGACACCTATTGTGAAGTCATTAGGGAATTGATGAGAATATGGCTTCAAGCACATTTTGCAGTTTGCTACAAATTCTGTTGTACATAATGCAGACGCACACTCAGGAGGCCAATTTAACTGTTAACAGTGCATGGAGCGAATGCAGCATTTTAAAAGATCTAGGTTTTTTTAGGTCATTAATGTGTCCTTGGTTGATCAGTCATCTGGTCCCTCCTACTGTGTGTTATGACCACCACGTAATCCATTCTCGCTCTTTCTGATTTGGGGTTTTTCCTCATCCATCCCATTAGTAGGGATGTTTTCTGTGTTTTCTAGCAAGAAAAAAAAATCAATCAATCAAACCTGCATACATGTTACTCATGACTGTCATCTAGTCCTAAATCTCTTCTGTTGTTGAATCATCCTTGCAAAACAGCTGAATACATCTGGAGAAAACACAGCACACCAAAGAAGCAGAATACTGCAAACCAAAGACATTTATGACTTGTCATTTTCTAGCCTAAAAATACTGTGATTACTTTTAGAAATCAGAAAACCTCTGCAACTCCGAATGGCATTCAGCTCTTGCATTTGGCGCATCATCGGGCTGAGCGGACCAGCTACACCAAGGACATTAGCCAAGCCACCCAGAGGGGTGGCTTTGCCACACCAGTTGTCACCTTCCCATAGCAAGTGGAAGAGCGCCCACAGAACTCTGGGAGATTGCAAAGGTCACAATGTGCATATTTACCAGTGAATGGCCCCGGGTGGGGCCACGTGGGGGTGTTCAAAGCAAGCCAAACGCTGCAATCATTCTTTACAGACACTTGAGACTGACTTTTTTATGAATTACTTAGTCGAAACCAAAGAAACTTTTTCTGCACCTACTTCTGCAACAAACAAAACTGTCCCATTAAAATGAATAAATAAATCCGTAAATCAATGGAAATCACCACCAATAAGAAGGAAGCACGCCAGAAAATAAACGAAAACAAAAACAGGGAGACACACTGTGTTCAAACAGACCTCTTGGGACATTTTTTGGAAGCAGATTTTAAAGAAAGGGTTGAGACAAAGATAGAAATAAGGAAGAGCCTCAGTGGCTGCTGCTTCATTTGACAACTCACACGGTAATCTTAAAGCTGAAGATTGTCTTTAATTTGTGCCTATGCAGTTTTTCAAAAGAACACGGAACAGAGCAACAGAAACCTCAACAGCTACAATACCAAAGATGAGGATTTCTCACACCTTTTGTTTCAGTTCATTATCTCCTCTTGCCTGGCTAAAATACTAATAGCGCCATTGAACTGTATAAAGGTAATCAATTATGTTT
CTCTGAGCAACAAAAGGAAAGGGCCATTTATTTGATTTTATTGTTTCATTTCAATTTTGTCTTATGGTTTTTTGCCCCAACATGGAATCTCTCAAAAGTTTCCATGGACTCCAAGTTTAAGATGTTGGGATATTGAACAGTTCTCTCTGCTCAGCAGAGGGTAGGGAATAACATTATCACTTGAATGTTCTTTGCTTAACCCTTAGACTTGGTTCCTTCTATGTTCAGAGTCTCATCATCAGGGGAAGGAAAGGGAGTGAGGGTCAGGGATAGGGGTCTTGGTGATGCATCCTCTCCCGAGCCACAGAACCAAAGAGTTTATAGAGGAATTTACAGCCTCGTTTTCATGTGATTGCTACATCCTAACAGGGCTTCATTTGGGGGTGGGGGGAAACATGTAAAAATAATTGCCAGTTTCTACTTTTCTATTAGCTTTTTAAAAATCAGCTGTAAAGTTGCATTTCTAAAGAAAGATATATATAATATATAAAATACATATATAGATCAACTTGACATTGGTGATAACCAAAATTATTGCTGTCCAAATTCATGTCTTGTTTTGGTCCAGTGCTTCATTTGCTAAGTATTCGGTTCAGAATTTTTCTCATTTCTCATGCCATTCCAGAGTTAATTTGCCACTGTGGATGATTTGAAGTATTCAGATCTCTATGGAAGTTTCTGGGACAGGTTTAAAGTCAAGATCAAGCATTTTAGCATTTAACCTGTTGATAAATGGATCCATGGTGTACATGAGTTTTATTTGTATTCGGAGTCATCTCTATTCTATCCCTCAGCCTCGATTAAGGTGGTGAGTGAAGTGCATCCAACAGACTCGGCCCAGAACTGGGTCCTGACAGTGGGGTGCTCATCTTCTGTAACTGTTGGGAAGGCTCGGTGGTCCATTTTCACCAGTTAAAGAATATGAGGCCAGCCCAGAAATCTGTTCTCCAGGAGCTGCCCTGTCCCATCTGGGTGTGCCAGACCCCCTCAGTGAGCAGGTCCACCAAAGGGACTTCTCACAGGGGAAGCCCAACTCCTGTTGCAATGGGTTGATAGATTTCCTCAGGGTGGTAATTACCAATTCGTATTTTGACAAGCCTATGTGCAACCACAGCTGGCACTGGGGTGGGCAGTGGTGTTGGGTGGGATGGGGGAGAGTGTCTCAATCCTGAAGAGAAAATATAAAGCAGGTTTTGGGGAGACTTCTGGAGTCCTGCCCCTAGAGAGCCCCATTGTTGTTCTTTGTGCCCCCTCCTCATTCCCCCTATGTGGGTCTCCCTATGCAGGAGCTGTGAGAGAATGTGACTCTCCACAATTTTTATAATTCATCCTTCCTAGGAGATTGTTCATTGGCTCTTCCCTTGTGTCCCTTTGTCCCTTGCTCATACTCCATGTTTCCTTTGTCAAAGGACTAAGAAAAGAGCATATTTCAGCAGAGGAGTGTTCCCATGTGGGTTGATTTCAACTTGGGTATTTCTAAAAGAGTCCTTGTGACATGTGTCCAGTGGAAATGGTTGCTCTTTTCCAGACTGGATTGAGGAATGGAGCCTGTTTGATTTGGTTAGTGATTCTTTGACATACTAATCTCAGCGTTTGGGTCTCCAGCATCCTCTGAAGATGTCTAGACTAGTAGAGGCTGCCTTTGTGACCTGACATTACAACATTGGTCAAACCAGTCCTCTGATAATCAGAAGAACATGTCATAATTGTTTAAAAAAAAAAAAAAGGCAAGAATTTCTCTCCAAGGAGCTTTAATAAATGTCTCATTCCAGATAATGTCATACCAGAGAAAAGTGCTTGCTTTTAGAAAATTATTTACATACATATATAAATATATATGTGTATCTATACAGTTATGTATCAAAATTTTAAGCCCTGCAGAATTTCAATTTGTTAGAAATCTAACAGAAAAAAATTTCTATATTGAAAGGTAATAGAATTTAACCCAGTGAGTTTACTCAAGGATTTTTAAATTTAAGTTAATAATTTCAGAGAAAATA
ACCATTTGGGTGTGGTTATAGTTTAGTATCCATTACCTCAATCCAAGGAAAATTCCAGGCATTCCTCAACCATCAGGAAAAGGTACAGTGTGAAGGAACAGTTCTCAGCCAAATTTCACATTCTTGAGGCAACAGAAATCAAAACACTCAGAGCCATTGAGTGGAAAAACAATTTACTTTATTCCTTTACACAAATAGGCTTGCATTGTTTTTGTTTTAATGTGATTTTGGTACTAGGGATATAATTATTTCATTCCAGGAAATAATAAAAAAAAACAGACAGAGCCAATACATTTCTTTTTTTAAAGGAAACAGCAACAACAATAAAAACTCAGCACCAATATTTAAAAGCTTTTCCAAAATGTAAAAGAAGTGTTTAGCTTGCACCATGCATAAAGGTGCAGGCTAGTTGAACCAGGAAGCATGGCACTTCCTCTGGAGAAATCCAGAAAGAGTTGCTTCTAAGCTCCCTTTTCCCCCTGCAGGCTCTTGGCAATTGTAGGCTTTAGCAAATCCAGAATAATTTTCAATTCAAGCTAAAATAAAATCAACATTTGGAATGTAAATCTGATACACACACACTTTTCTAAGTCAAACAACATATTTCAAAACCAAAAATAAATACCTTTTAGATAATCAGTTATTTTCTTTGTCTATACTGGGCACCCACCTACTAGTGCCAGTAAATTCAAGTTGAACAGATTTTTAAAATCACTATTATCTGGGTATGGGGGAAACTTCCCCACTTTTGAAAATGTTGGTAGAATTATAGGAATGTCTGTTTGATTATCATTACCAAAGTGTCATGACAGTATGCCTTTGTAGTGAACTCGGATTTTCAGGAGTTTGAATAGTTGGATATTTTAAAATCTAAGAAGAAAAGGCCTGTTTCCAATGTTGTTGAAGAATAATGAACTCTATTAAAAAGTGGAGAAAAAGATAATACATGTGGTCAAGGTTGACCACAAGGCCCAGGCACAACTACCTTGGCGATAATCTTCTAGATTCGTAACAGGTTAGAGCTGACTTTTTGTTTTTGTTGTTGCTGATGCTGTGTGATTCAGACTTCTCAGCCTAACCAGGAAGAGTAAGTGGAAATGGTAGATGAAGAAGGGGTAGAGCTGGTGTATCTATAACTTTCTGATATTTGTCTGCCAAACTTGATATATTAGTAATTTTTTTATCTTTAGCTAAGATCAAGTCACCCCTGAAACAACAGGAGATTCTAGTTTTAAAATAAGGCCACAAAAATCCTTACGGAATGAAGAATGGCACCCCAGTTGGTTGTATAAGTCTCATAAGATAATGATGTTGATTTTAAATATGGATGTCTCAATGCCTGTTTTCTATCAATGATTTGTTTGTTTCCAAGGTCGGGGAGGGAAAGAGGGGAGGGTTTATCTGTTTTAGAAAGTCTCAGAATACTTATAAAATACAGAAGTAGTTATTAAAATATATAGGACCTCACATAGGTAGATACAGAACTTACCATTGAGGCTGATGGGCTGTTGTGTGAATCACACAGGACCTTAAATGAGGCTCATTATTCTCACACACCAAAATGACTCTGACAGCCTGAAGCAGTTATTGCTAGAGCCCAAGCTTTCCTTGGAGGTTTTGGAGTTAGGTTGATTGGAAGTAACCAGCTAATACCTTTTCTAGTGGAGAAAAAGACATTGCTACCAGCTTGTTCATCCCATAGAAGTCTTCCACTCTGCTCCATTTTTAGCAGCAAGCATTTCATGTAGCATAAACCTTGGCAGATAAGTGTGCCTAAGGTTTATACAGTCTGTCCGCTTGGATGTATACAAATTTAGATACATATTTTAACATGTGTTCTCATAGATGACTTTATAACAACACACATTACCTATAGGTGTCTAGACTGTGTACATACAAGTGTGTACAGACAAGCTTCATACGTATATACTGTAATCCGTTACAACAAATAAATTTTAAATCATCGTTTAACATGTATGTGGTACTTCTACAGTGTACAT
TGTTTTCATTATTTATTGTAACATTGAAAACCACAGTGCAGGGAAAACAAAAGTATCCCAGCATCTTCATCCTGTACACTTGGAATTAATTTCATTTGGGCATATCCAAGATAAACTCAACTTTCAAGAAATCTTGTATATTATTTAATCATCTGTGTTAGGATGACACCTATGATTGATGACTTCGGTTGAATAGCTTTATTCTGGATTTTTCATAACTAAAGCTAAATCCAAAGACCTGAAAAAGGACAAAAAGAAAAAAAAAAAAAGAAA\"\n",
+    "#find_all_aa_seqs(a,enst,\"ONECUT2\")"
    ]
   },
   {
@@ -954,18 +791,25 @@
    "execution_count": null,
    "metadata": {},
    "outputs": [],
-   "source": []
+   "source": [
+    "test = determine_utrs(\"ONECUT2\")\n",
+    "for i in test:\n",
+    "    for j in test[i]:\n",
+    "        print(i,find_utrs(a,j), j)"
+   ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
-   "source": []
+   "source": [
+    "len(\"MALNGAEVDDFSWEPPTEAETKVLQARRERQDRISRLMGDYLLRGYRMLGETCADCGTILLQDKQRKIYCVACQELDSDVDKDNPALRDVVPQPLPF\") *3"
+   ]
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1010,25 +854,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "{'TCONS_00010063|ENST00000533115': {'IPR039499'},\n",
-       " 'TCONS_00010061|TCONS_00010060': {'IPR009563'},\n",
-       " 'TCONS_00010064|ENST00000533115': {'IPR009563'},\n",
-       " 'TCONS_00011857|ENST00000531405': {'IPR009563'},\n",
-       " 'TCONS_00010062|ENST00000533115': {'IPR009563'},\n",
-       " 'TCONS_00010060|ENST00000533115': {'IPR009563'}}"
-      ]
-     },
-     "execution_count": 20,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "domains"
    ]
diff --git a/test/list.txt b/test/list.txt
index a6957de..5e478ac 100644
--- a/test/list.txt
+++ b/test/list.txt
@@ -1,38 +1,27 @@
-BRD4
-BRD3
-BRD2
-PAF1 
-CTR9 
-CDC73 
-LEO1
-RTF1
-WDR61
-SPT5
-SPT4
-SPT6
-TCEA1
-TCEA2
-TCEA3
-TCEANC
-TCEANC2
-CDK9
-TRIM28
-SUPT16H
-SSRP1
-ELF1
-CDK12
-SUPT16H
-SSRP1
-ELL2
-AFF4
-SKI
-CCNT1
-NELFA
-NELFB
-NELFC
-NELFD
-NELFE
-TCEAL
-SNUPN
-MYC
-MLLT1
\ No newline at end of file
+SMARCC2
+SMARCB1
+SMARCE1
+SMARCD1
+SMARCD2
+SMARCD3
+BRD7
+SMARCA4
+SMARCA2
+ARID1A
+ARID1B
+ARID2
+PBRM1
+ACTL6A
+ACTL6B
+PHF10
+DPF1
+DPF2
+DPF3
+BCL7B
+BCL7A
+BCL7C
+BRD9
+BCL7C
+SS18
+SS18L1
+GLTSCR1L
diff --git a/translation_protein.py b/translation_protein.py
index 1dabda4..a754d89 100644
--- a/translation_protein.py
+++ b/translation_protein.py
@@ -1,11 +1,7 @@
 from Bio import SeqIO
 import os
 
-def translate(seq,orf):
-    seq = seq.upper()
-    seq = seq.replace("\n","")
-
-    table = { 
+codon_table = {
         'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M', 
         'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T', 
         'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K', 
@@ -22,80 +18,133 @@ def translate(seq,orf):
         'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L', 
         'TAC':'Y', 'TAT':'Y', 'TAA':'_', 'TAG':'_', 
         'TGC':'C', 'TGT':'C', 'TGA':'_', 'TGG':'W', 
-    } 
+        }
 
-    protein = ""
-    exon = False
-    translating = True
-    i = orf
+def determine_utrs(gene):
+    # Parse the FASTA-style file at snakemake.input[0] into {header: [sequence lines]}.
+    # NOTE(review): `gene` is never read here -- every record in the file is returned.
+    gene_utr = dict()
+    # `with` closes the handle even if parsing raises part-way through
+    with open(snakemake.input[0],"r") as utr_file:
+        for line in utr_file:
+            if (line.startswith(">")):
+                trans_id = line[1:].strip()
+                gene_utr.setdefault(trans_id, [])
+            elif (line.strip()):
+                # a blank line used to be stored as "", which str.find() matches at index 0
+                gene_utr[trans_id].append(line.strip())
+    return gene_utr
 
-    while (translating):
-        codon = seq[i:i+3]
+def score(seq,start):
+    # Multiply the table weight of each base seq[i] at column i, for i = start .. len(seq)-1.
+    kozak = {
+        "A":[0.25,0.61,0.27,0.15,1.00,0.00,0.00,0.23],
+        "C":[0.53,0.02,0.49,0.55,0.00,0.00,0.00,0.16],
+        "G":[0.15,0.36,0.13,0.21,0.00,0.00,1.00,0.46],
+        "T":[0.07,0.01,0.11,0.09,0.00,1.00,0.00,0.15]
+    }
+    product = 1.0  # an empty range leaves this at 1.0
+    for column in range(start,len(seq)):
+        product *= kozak[seq[column]][column]
+    return product
         
-        try: table[codon]
-        except: break
-
-        if (table[codon] == "M"):
-            exon = True
 
-        if (exon):
-            if (table[codon] == "_"):
-                exon = False
+def translate(seq, i, utr_regions):  # seq is read from its first base; i is compared against utr_regions and returned advanced
+    translating = True
+    aa = ""
+    
+    in_utr = False
+    for utr in utr_regions:
+        start,stop = utr
+        if ((start < i) and (i < stop)):  # strictly inside: i == start or i == stop does not count
+            in_utr = True
+    
+    while(translating):  # each pass consumes one codon from the front of seq
+        if ((len(seq) < 3) or (in_utr)):  # ran out of codons before a stop, or i lies inside a UTR region
+            translating = False
+            aa = ""  # discard whatever was translated so far
+        else:
+            codon = seq[0:3]
+            if (codon_table[codon] == "_"):  # "_" is the stop marker; a codon absent from codon_table raises KeyError
                 translating = False
             else:
-                protein += table[codon]
-                i += 3
-        else:
+                aa += codon_table[codon]
+            seq = seq[3:]
             i += 3
-            
-    return protein
+    return aa,i  # i was advanced by 3 per codon consumed, stop codon included
 
+def find_utrs(seq,utr):  # index of utr in seq; else (utr longer than 20 only) of its longest prefix >= half its length; else -1
+    pos = seq.find(utr)
+    if ((pos == -1) and (len(utr) > 20)):
+        # try ever shorter prefixes, longest first, down to half of the UTR
+        for i in range(len(utr) - 1,len(utr)*5//10 - 1,-1):
+            pos = seq.find(utr[:i])
+            if (pos != -1): break  # keep the longest hit; without this the shortest prefix always overwrote pos
+    return pos
 
+def translate_aa_seq(seq,enst,gene_utrs):
+    # Scan ATGs left to right; a candidate replaces the current pick when its ORF is >20 aa,
+    # it outscores the pick and it starts past the pick's end. Returns (protein, score); ("M", 0) if none.
+    utr_regions = []
+    for utr in gene_utrs[enst]:  # NOTE(review): raises KeyError when enst has no record in gene_utrs
+        pos = find_utrs(seq,utr)
+        if (pos != -1):
+            utr_regions.append([pos,pos + len(utr)])
+    longest_aa_seq = "M"
+    longest_aa_seq_sc = 0
+    longest_aa_seq_sc_end = 0
+    # start at 4: for i < 4 the slice seq[i-4:i+4] has a negative start and came back empty -> score() == 1.0
+    for i in range(4,len(seq)):
+        if (seq[i:i+3] == "ATG"):
+            sc = score(seq[i-4:i+4],0)
+            aa,end = translate(seq[i:], i, utr_regions)
+            if ((len(aa) > 20) and (sc > longest_aa_seq_sc) and (i > longest_aa_seq_sc_end)):
+                longest_aa_seq = aa
+                longest_aa_seq_sc = sc
+                longest_aa_seq_sc_end = end
+    return (longest_aa_seq,longest_aa_seq_sc)
 
-transcripts_filename = snakemake.input[0]
+
+
+def translate_aa_seq_length(seq,enst):
+    # Return the longest protein translate() yields from any ATG in seq ("M" when none is longer).
+    # No UTR regions are passed to translate(); `enst` is accepted but not read.
+    best = "M"
+    start = seq.find("ATG")
+    while (start != -1):
+        candidate,_ = translate(seq[start:], start, [])
+        if (len(candidate) > len(best)):
+            best = candidate
+        start = seq.find("ATG", start + 1)  # +1 so overlapping ATGs are visited too
+    return best
+
+def find_all_aa_seqs(seq,enst,gene):
+    # Protein for one transcript: the longest ORF, replaced by translate_aa_seq()'s pick
+    # when any sequence stored under the key `gene` is located in seq by find_utrs().
+    gene_utrs = determine_utrs(gene)
+    protein = translate_aa_seq_length(seq,enst)
+    # one hit is enough: translate_aa_seq() receives the same arguments whichever UTR matched
+    if any(find_utrs(seq,utr) != -1 for utr in gene_utrs.get(gene, [])):
+        protein,_ = translate_aa_seq(seq,enst,gene_utrs)
+    
+    return protein
+
+
+transcripts_filename = snakemake.input[1]
 transcripts = SeqIO.index(transcripts_filename, "fasta")
 output = []
 
-gene = snakemake.input[0].split("/")[5]
-os.mkdir("/home/annaldas/projects/result/%s/transcripts" %(gene))
+gene = snakemake.params[0]
+os.makedirs("/project/owlmayerTemporary/Sid/isoform_analysis/result/%s/transcripts" %(gene), exist_ok=True)  # os.mkdir raised FileExistsError on re-runs
 
 for transcript in transcripts:
-    protein = translate(str(transcripts[transcript].seq),0)
-    if (protein != ""):
-        transcript_name =  str(transcripts[transcript].id) + "_1"
-        transcript_filename = transcript_name.replace("|","_")
-        transcript_filename = transcript_filename.replace("_","")
-        transcript_filename_path = "/home/annaldas/projects/result/%s/transcripts/%s_map_protein.fa" %(gene,transcript_filename)
-        #os.mkdir("/home/annaldas/projects/result/%s/transcripts/%s" %(gene,transcript_filename))
-        transcript_file = open(transcript_filename_path, "w+")
-        transcript_file.write(">" + transcript_name + "\n" + protein)
-        transcript_file.close()
-        
-    protein = translate(str(transcripts[transcript].seq),1)
-    if (protein != ""):
-        transcript_name =  str(transcripts[transcript].id) + "_2"
-        transcript_filename = transcript_name.replace("|","_")
-        transcript_filename = transcript_filename.replace("_","")
-        transcript_filename_path = "/home/annaldas/projects/result/%s/transcripts/%s_map_protein.fa" %(gene,transcript_filename)
-        #os.mkdir("/home/annaldas/projects/result/%s/transcripts/%s" %(gene,transcript_filename))
-        transcript_file = open(transcript_filename_path, "w+")
-        transcript_file.write(">" + transcript_name + "\n" + protein)
-        transcript_file.close()
-        
-    protein = translate(str(transcripts[transcript].seq),2)
-    if (protein != ""):
-        transcript_name =  str(transcripts[transcript].id) + "_3"
-        transcript_filename = transcript_name.replace("|","_")
-        transcript_filename = transcript_filename.replace("_","")
-        transcript_filename_path = "/home/annaldas/projects/result/%s/transcripts/%s_map_protein.fa" %(gene,transcript_filename)
-        #os.mkdir("/home/annaldas/projects/result/%s/transcripts/%s" %(gene,transcript_filename))
-        transcript_file = open(transcript_filename_path, "w+")
-        transcript_file.write(">" + transcript_name + "\n" + protein)
-        transcript_file.close()
+    seq = str(transcripts[transcript].seq).strip()
+    enst = str(transcripts[transcript].id).split("|")[-1].strip()
+    protein = find_all_aa_seqs(seq,enst,gene)
     
-#output_filename = snakemake.output[0]
-#output_file = open(output_filename,"w+")
-#output_file.write("\n".join(output))
-#output_file.close()
-
-    
\ No newline at end of file
+    transcript_name = str(transcripts[transcript].id)
+    # file name is the record id with both "|" and "_" removed
+    transcript_filename = transcript_name.replace("|","").replace("_","")
+    transcript_filename_path = "/project/owlmayerTemporary/Sid/isoform_analysis/result/%s/transcripts/%s_map_protein.fa" %(gene,transcript_filename)
+    # `with` closes the file even when the write fails
+    with open(transcript_filename_path, "w+") as transcript_file:
+        transcript_file.write(">" + transcript_name + "\n" + protein + "\n")
+