From 0eae1fd78153cfa0fb0dba48bd49b99b66e9eaf9 Mon Sep 17 00:00:00 2001 From: Siddharth Annaldasula Date: Fri, 13 Mar 2020 18:25:02 +0100 Subject: [PATCH] adding all before you know --- Snakefile | 114 +++++----- gene_transcripts.py | 4 +- genes.tab | 68 +++++- interproscan_analysis.py | 317 ++++++++++++++++++++++++++-- isoform_transcripts.ipynb | 430 ++++++++++++-------------------------- test/list.txt | 65 +++--- translation_protein.py | 181 ++++++++++------ 7 files changed, 705 insertions(+), 474 deletions(-) diff --git a/Snakefile b/Snakefile index 6b6bd58..e86c99f 100644 --- a/Snakefile +++ b/Snakefile @@ -10,17 +10,16 @@ Transcripts = config["polished_reads"] rule all: input: - expand("/home/annaldas/projects/result/{gene}/{gene}_transcripts_filtered_coding_potential_analysis.txt", gene = GENES), - #expand("/home/annaldas/projects/result/{gene}/{gene}_blastx_protein_analysis.txt", gene = GENES) - #expand("/home/annaldas/projects/result/{gene}/{gene}_sashimi.pdf", gene = GENES) + #expand("/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/{gene}_transcripts_filtered_coding_potential_analysis.pdf", gene = GENES), + #expand("//project/owlmayerTemporary/Sid/isoform_analysis//result/{gene}/{gene}_blastx_protein_analysis.txt", gene = GENES) + expand("/project/owlmayerTemporary/Sid/isoform_analysis/result/all/{gene}_protein_sequences.txt", gene = GENES) rule gene_transcript: input: NanoporeGTF, Transcripts output: - "/home/annaldas/projects/result/{gene}/{gene}_seq.fa"#, - #"/home/annaldas/projects/result/{gene}/{gene}_sashimi.sh" + "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/{gene}_seq.fa" params: gene = "{gene}" script: @@ -28,10 +27,10 @@ rule gene_transcript: rule blastx: input: - gene_fa = "/home/annaldas/projects/result/{gene}/{gene}_seq.fa", + gene_fa = "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/{gene}_seq.fa", db = config["human_protein"] output: - "/home/annaldas/projects/result/{gene}/{gene}_blastx.out" + "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/{gene}_blastx.out" threads: 4 shell: @@ -40,10 +39,10 @@ rule blastx: rule protein_sequence: input: - blastx = "/home/annaldas/projects/result/{gene}/{gene}_blastx.out", + blastx = "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/{gene}_blastx.out", db = config["human_protein"] output: - "/home/annaldas/projects/result/{gene}/{gene}_blastx_protein.fa" + "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/{gene}_blastx_protein.fa" shell: "sh protein_transcript_sequences.sh {input.blastx} {input.db} {output}" @@ -53,53 +52,70 @@ rule utr_regions: params: gene = "{gene}" output: - "/home/annaldas/projects/result/{gene}/{gene}_utr_regions.bed" + "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/{gene}_utr_regions.bed" script: "utr_regions.py" rule utr_sequences: input: hg = config["human_genome"], - utr = "/home/annaldas/projects/result/{gene}/{gene}_utr_regions.bed" + utr = "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/{gene}_utr_regions.bed" output: - "/home/annaldas/projects/result/{gene}/{gene}_utr_regions.fa" + "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/{gene}_utr_regions.fa" shell: "bedtools getfasta -fi {input.hg} -bed {input.utr} -fo {output} -name" -rule transcript_filter_utr: - input: - utr = "/home/annaldas/projects/result/{gene}/{gene}_utr_regions.fa", - seq = "/home/annaldas/projects/result/{gene}/{gene}_seq.fa" - output: - "/home/annaldas/projects/result/{gene}/{gene}_seq_filt.fa" - script: - "filter_utr.py" +#rule transcript_filter_utr: +# input: +# utr = "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/{gene}_utr_regions.fa", +# seq = "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/{gene}_seq.fa" +# output: +# "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/{gene}_seq_filt.fa" +# script: +# "filter_utr.py" checkpoint mapping: input: - "/home/annaldas/projects/result/{gene}/{gene}_seq_filt.fa" + utr = "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/{gene}_utr_regions.fa", + seq = "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/{gene}_seq.fa" output: - directory("/home/annaldas/projects/result/{gene}/transcripts") #/{transcript}/{transcript}_map_protein.fa" + directory("/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/transcripts") #/{transcript}/{transcript}_map_protein.fa" params: gene = "{gene}" script: "translation_protein.py" +def aggregate_mapping(wildcards): + checkpoint_output = checkpoints.mapping.get(**wildcards).output[0] + return expand("/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/transcripts/{transcript}_map_protein.fa", + gene=wildcards.gene, + transcript=glob_wildcards(os.path.join(checkpoint_output,"{transcript}_map_protein.fa")).transcript) + +rule aggregate_mapping: + input: + aggregate_mapping + output: + "/project/owlmayerTemporary/Sid/isoform_analysis/result/all/{gene}_protein_sequences.txt" + params: + gene = "{gene}" + shell: + "cat {input} > {output}" + rule iupred2a_analysis: input: - "/home/annaldas/projects/result/{gene}/transcripts/{transcript}_map_protein.fa", + "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/transcripts/{transcript}_map_protein.fa", config["iupred2a"] output: - "/home/annaldas/projects/result/{gene}/transcripts/{transcript}_transcript_sequence.txt", - "/home/annaldas/projects/result/{gene}/transcripts/{transcript}_map_protein_iupred2a.txt" + "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/transcripts/{transcript}_transcript_sequence.txt", + "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/transcripts/{transcript}_map_protein_iupred2a.txt" script: "iupred2a_analysis.py" rule interpro_scan: input: - "/home/annaldas/projects/result/{gene}/transcripts/{transcript}_map_protein.fa" + "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/transcripts/{transcript}_map_protein.fa" output: - "/home/annaldas/projects/result/{gene}/transcripts/{transcript}_map.gff3" + "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/transcripts/{transcript}_map.gff3" params: db = "Pfam,ProDom,Gene3D,CDD,Coils,MobiDBLite,SMART" shell: @@ -107,10 +123,10 @@ rule interpro_scan: rule brewery_analysis: input: - "/home/annaldas/projects/result/{gene}/transcripts/{transcript}_map_protein.fa" + "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/transcripts/{transcript}_map_protein.fa" output: - "/home/annaldas/projects/result/{gene}/transcripts/{transcript}_map_protein.fa.ss3" - #"/home/annaldas/projects/result/{gene}/transcripts/{transcript}_map_protein.fa.ss8" + "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/transcripts/{transcript}_map_protein.fa.ss3" + #"/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/transcripts/{transcript}_map_protein.fa.ss8" params: brewery = config["brewery"] shell: @@ -118,9 +134,9 @@ rule brewery_analysis: rule functional_site_analysis: input: - "/home/annaldas/projects/result/{gene}/transcripts/{transcript}_map_protein.fa" + "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/transcripts/{transcript}_map_protein.fa" output: - "/home/annaldas/projects/result/{gene}/transcripts/{transcript}_map_protein_sites.txt" + "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/transcripts/{transcript}_map_protein_sites.txt" params: ps_scan = config["prosite_scan"], prosite_dat = config["prosite_dat"] @@ -129,18 +145,18 @@ rule functional_site_analysis: rule individual_transcript_analysis: input: - "/home/annaldas/projects/result/{gene}/transcripts/{transcript}_map_protein_iupred2a.txt", - "/home/annaldas/projects/result/{gene}/transcripts/{transcript}_map.gff3", - "/home/annaldas/projects/result/{gene}/transcripts/{transcript}_map_protein.fa.ss3", - "/home/annaldas/projects/result/{gene}/transcripts/{transcript}_map_protein_sites.txt" + "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/transcripts/{transcript}_map_protein_iupred2a.txt", + "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/transcripts/{transcript}_map.gff3", + "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/transcripts/{transcript}_map_protein.fa.ss3", + "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/transcripts/{transcript}_map_protein_sites.txt" output: - "/home/annaldas/projects/result/{gene}/transcripts/{transcript}_map_protein_analysis.txt" + "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/transcripts/{transcript}_map_protein_analysis.txt" script: "transcript_analysis.py" def aggregate_input(wildcards): checkpoint_output = checkpoints.mapping.get(**wildcards).output[0] - return expand("/home/annaldas/projects/result/{gene}/transcripts/{transcript}_map_protein_analysis.txt", + return expand("/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/transcripts/{transcript}_map_protein_analysis.txt", gene=wildcards.gene, transcript=glob_wildcards(os.path.join(checkpoint_output,"{transcript}_map_protein.fa")).transcript) @@ -148,7 +164,7 @@ rule aggregate: input: aggregate_input output: - "/home/annaldas/projects/result/{gene}/{gene}_transcripts_filtered_analysis.txt" + "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/{gene}_transcripts_filtered_analysis.txt" params: gene = "{gene}" shell: @@ -156,26 +172,28 @@ rule aggregate: #rule filter_transcripts: # input: -# "/home/annaldas/projects/result/{gene}/{gene}_transcripts_analysis.txt" +# "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/{gene}_transcripts_analysis.txt" # output: -# "/home/annaldas/projects/result/{gene}/{gene}_transcripts_filtered_analysis.txt" +# "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/{gene}_transcripts_filtered_analysis.txt" # script: # "filter_transcripts.py" rule protein_coding_potential_analysis: input: - "/home/annaldas/projects/result/{gene}/{gene}_transcripts_filtered_analysis.txt" + "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/{gene}_transcripts_filtered_analysis.txt" output: - "/home/annaldas/projects/result/{gene}/{gene}_transcripts_filtered_coding_potential_analysis.txt" + "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/{gene}_transcripts_filtered_coding_potential_analysis.pdf" + params: + gene = "{gene}" script: "interproscan_analysis.py" #rule protein_domain_analysis: # input: -# "/home/annaldas/projects/result/{gene}/transcripts/{transcript}/{transcript}_map.gff3", -# "/home/annaldas/projects/result/{gene}/transcripts/{transcript}/{transcript}_map_protein_iupred2a.txt" +# "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/transcripts/{transcript}/{transcript}_map.gff3", +# "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/transcripts/{transcript}/{transcript}_map_protein_iupred2a.txt" # output: -# "/home/annaldas/projects/result/{gene}/{gene}_map_protein_analysis.txt" +# "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/{gene}_map_protein_analysis.txt" # params: # gene = "{gene}" # script: @@ -183,11 +201,11 @@ rule protein_coding_potential_analysis: #rule sashimi_plot: # input: -# sashimi_sh = "/home/annaldas/projects/result/{gene}/{gene}_sashimi.sh", +# sashimi_sh = "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/{gene}_sashimi.sh", # sashimi_py = config["sashimi"], # bams = config["input_bams"], # gtf = NanoporeGTF # output: -# "/home/annaldas/projects/result/{gene}/{gene}_sashimi.pdf" +# "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/{gene}_sashimi.pdf" # shell: # "sh {input.sashimi_sh} {input.sashimi_py} {input.bams} {input.gtf} {output}" \ No newline at end of file diff --git a/gene_transcripts.py b/gene_transcripts.py index ef2b0f8..27cbc00 100644 --- a/gene_transcripts.py +++ b/gene_transcripts.py @@ -18,8 +18,6 @@ # Mapping oID to transcript id # Mapping transcript id to exons -ABI2_info = [] - gene_oID = dict() oID_tID = dict() gene_pos = dict() @@ -57,7 +55,7 @@ # Extracting isoforms from related genes output = [] -gene = snakemake.params[0].upper() +gene = snakemake.params[0] for oID in gene_oID[gene]: tID = oID_tID[transcripts[oID].id] transID = tID_pos[tID] diff --git a/genes.tab b/genes.tab index ba1c398..b8a6956 100644 --- a/genes.tab +++ b/genes.tab @@ -1,2 +1,68 @@ gene_symbol -RPS24 \ No newline at end of file +AC112178.1 +AL133395.1 +AL590617.2 +ANO7 +APOOL +BAK1 +BCKDHB +CADM2 +CHRNA1 +CUZD1 +DACT3 +DENND5B +DLEU7 +EBF2 +EPB41L5 +ERBB2 +ERGIC3 +FAIM +FAM78B +GDF1 +GLS +HAGHL +HOMEZ +IKBIP +KCNH3 +KCNJ6 +KIF1B +KIF7 +KLHL35 +KTN1-AS1 +LAGE3 +LARGE2 +LINC00467 +MIR302CHG +MIR4787 +NCAN +NECTIN2 +NKAIN3 +NKAIN4 +NRF1 +NRG1 +ONECUT2 +PDE4DIP +PIWIL2 +PPP1R1C +PRKCZ +RAC3 +REEP1 +RENBP +RGS9 +RIPOR2 +RPS24 +RRP7BP +SGK3 +SLC17A7 +SLC44A3-AS1 +SLC6A15 +SNTG2 +SWSAP1 +SYNGR1 +THAP7-AS1 +THRA +TLN1 +TPM2 +TYW1 +VSIG10 +WFDC2 \ No newline at end of file diff --git a/interproscan_analysis.py b/interproscan_analysis.py index 62037c5..0d639c6 100644 --- a/interproscan_analysis.py +++ b/interproscan_analysis.py @@ -1,11 +1,23 @@ +import argparse +import pandas as pd +import numpy as np +import matplotlib.pyplot as plt +from matplotlib.patches import Rectangle +from matplotlib.backends.backend_pdf import PdfPages + + + filename = snakemake.input[0] file = open(filename, "r") lines = file.readlines() file.close() +gene = snakemake.params[0] + class transcript: - def __init__(self, tcons, idr, ips, ss8): + def __init__(self, tcons, length, idr, ips, ss8, pss): self.tcons = tcons + self.length = length self.idr = idr self.ips = ips self.ss8 = ss8 @@ -13,29 +25,29 @@ def __init__(self, tcons, idr, ips, ss8): transcripts = dict() +count = False idr_lines = [] ips_lines = [] ss8_lines = [] -pss_lines = [] +pss_lines = dict() for line in lines: if (line.startswith(">")): - if (len(ips_lines) > 0): - transcripts[tcons] = transcript(tcons, idr, ips, ss8) - + if (count): + if ((tcons not in transcripts) or ((len(idr_lines) - 1) > transcripts[tcons].length)): + transcripts[tcons] = transcript(tcons, len(idr_lines) - 1, idr_lines, ips_lines, ss8_lines, pss_lines) + count = True new = True idr_lines = [] ips_lines = [] ss8_lines = [] - pss_lines = [] + pss_lines = dict() idr = False ips = False ss8 = False pss = False - tcons = line[1:].strip().split("|")[0] - - - if (line.startswith("#####IUPred2A Analysis")): + tcons = line[1:].strip().split("|")[0] + elif (line.startswith("#####IUPred2A Analysis")): idr = True elif (line.startswith("#####InterProScan")): ips = True @@ -43,17 +55,288 @@ def __init__(self, tcons, idr, ips, ss8): elif (line.startswith("#####BrewerySS8 Analysis")): ss8 = True ips = False - - if (idr): - idr_lines.append(line.strip()) + elif (line.startswith("#####PrositeScan Analysis")): + pss = True + ss8 = False + elif (idr): + idr_lines.append(line.strip().split("\t")) elif (ips): - ips_lines.append(line.strip()) + ips_lines.append(line.strip().split("\t")) elif (ss8): - ss8_lines.append(line.strip()) + ss8_lines.append(line.strip().split("\t")) + elif (pss): + if (line.startswith("#")): + pss_id = line.strip().split(" ")[3] + pss_lines[pss_id] = [] + else: + pss_lines[pss_id].append(line.strip().replace(" "," ").split(" ")) + +if ((tcons not in transcripts) or ((len(idr_lines) - 1) > transcripts[tcons].length)): + transcripts[tcons] = transcript(tcons, len(idr_lines) - 1, idr_lines, ips_lines, ss8_lines, pss_lines) + +longest_length = 0 +longest_tcon = "" +for ids in transcripts: + if (transcripts[ids].length > longest_length): + longest_length = transcripts[ids].length + longest_tcon = ids + +# PLOTTING + +colors=['#e6194b', '#3cb44b', '#ffe119', '#4363d8', '#f58231', '#911eb4', '#46f0f0', + '#f032e6', '#bcf60c', '#fabebe', '#008080', '#e6beff', '#9a6324', '#fffac8', + '#800000', '#aaffc3', '#808000', '#ffd8b1', '#000075', '#808080', '#ffffff'] +ss3_abbvs = ["H","E","C"] +aa_abbvs = ["A","C","D","E","F","G","H","I","K","L","M","N","P","Q","R","S","T","V","W","Y"] + +def preprocessArguments(args): + if args.geneNames != '': + gene_id='gene_name' + with open(args.geneNames,'r') as f: + targets=[i.strip() for i in f] + elif args.geneIDs != '': + gene_id='gene_id' + with open(args.geneIDs,'r') as f: + targets=[i.strip() for i in f] + annotation=pd.read_csv(args.gtf, delimiter='\t', header=None, usecols=[0,2,3,4,6,8], + names=['chrm','type','start','stop','strand','more']) + annotation['transcript_id']=annotation.apply(lambda x: + x['more'].split('transcript_id "')[1].split('"')[0],1) + annotation=annotation.drop(columns='more') + data=pd.read_csv(args.csv) + samples=data.columns[~(data.columns.str.startswith('feature')|data.columns.str.startswith('gene')|data.columns.str.startswith('transcript'))] + #conditions=list(set([x.split('_')[0] for x in samples])) + #conditions = ["0","3","5"] + conditions = ["day0","day3","day5"] + conditions.sort() + number_replicates={} + numerical=True + for cond in conditions: + number_replicates[cond]=samples.str.startswith(cond).sum() + try: + float(cond) + except: + numerical=False + x=np.arange(len(conditions)) + if numerical: + x=[float(cond) for cond in conditions] + return gene_id, targets, annotation, data, samples, conditions, number_replicates, x + +def calculateStatistics(df,conds,nreps): + for cond in conds: + df['mean'+cond]=df.filter(like=cond+'_').mean(1) + df['stdn'+cond]=df.filter(like=cond+'_').std(1)/np.sqrt(nreps[cond]) + df=df.sort_index() + return df + +def chooseIsoforms2Plot(df,minTPM,minPct,maxIso,annotation): + df['minimum']=df.filter(regex='^mean').min(axis=1) + df=df[df['minimum']>minTPM] + df['maximumPct']=df.filter(regex='^Pct').min(axis=1) + df=df[df['maximumPct']>minPct] + df['maximum']=df.filter(regex='^mean').max(axis=1) + df=df.sort_values('maximum',ascending=False) + df=df.head(maxIso) + return df + +def plotProfiles(x, df, df_gene, ax, colors, total=True): + if total: + plt.errorbar(x,df_gene.filter(like='mean').iloc[0],yerr=df_gene.filter(like='stdn').iloc[0],color='black',linewidth=2, label = "Total Expression") + for j in range(df.shape[0]): + row=df.iloc[j] + plt.errorbar(x+np.random.normal(0, 0.03, len(x)),row.filter(regex='^mean'),yerr=row.filter(like='stdn'),color=colors[j],linewidth=2, label = "") + ax.spines['top'].set_visible(False) + ax.spines['right'].set_visible(False) + ax.spines['left'].set_bounds(0,ax.get_yticks()[-2]) + ax.spines['bottom'].set_bounds(min(x),max(x)) + plt.xlabel("Day") + plt.ylabel("Normalized DeSeq2 TPM") + plt.legend(loc = "upper center", bbox_to_anchor=(0.5,1), frameon=False) + +def plotStacked(x,df,ax,colors): + plt.stackplot(x,df.filter(regex='^Pct').values,colors=colors[:df.shape[0]]) + ax.spines['top'].set_visible(False) + ax.spines['right'].set_visible(False) + ax.spines['left'].set_bounds(0,100) + ax.spines['bottom'].set_bounds(min(x),max(x)) + ax.axis([min(x),max(x),0,100]) + plt.xlabel("Day") + plt.ylabel("TPM Percentage (out of 100%)") + +def prepareAnnotation(annotation,df): + cut=annotation[annotation['transcript_id'].isin(df['transcript_id'].values)] + strand=cut.iloc[0]['strand'] + if strand=='+': + start=cut['start'].min() + cut['plot_start']=cut['start']-start + cut['plot_stop']=cut['stop']-start + else: + start=cut['stop'].max() + cut['plot_start']=(cut['start']-start)*(-1) + cut['plot_stop']=(cut['stop']-start)*(-1) + return cut + +def plotAnnotation(annotation, df, plt, colors, length): + transcripts_ids = [] + transcripts_pos = [] + chrm = "" + longest = annotation.loc[annotation['plot_stop'].idxmax()]["plot_stop"] + count = 3 + panels = df_temp.shape[0] + 1 + for j in range(df.shape[0]): + ax=plt.subplot(panels,2,(count,count+1)) + ax.set_xlim((-50, length)) + ax.set_ylim((-0.85, 1.85)) + + transcript_annotation=annotation[annotation['transcript_id']==df.iloc[j]["transcript_id"]] + transcripts_ids.append(df.iloc[j]["transcript_id"]) + transcripts_pos.append(df.shape[0]-j) + if (transcript_annotation.shape[0] > 2): + for idx,row in transcript_annotation.iterrows(): + chrm = row["chrm"] + if row['type']=='transcript': + plt.plot([row['plot_start']*length/longest,row['plot_stop']*length/longest],[1.75,1.75],color=colors[j],linewidth=2) + else: + plt.plot([row['plot_start']*length/longest,row['plot_stop']*length/longest],[1.75,1.75],color=colors[j],linewidth=10) + ax.spines['top'].set_visible(False) + ax.spines['right'].set_visible(False) + ax.spines['left'].set_visible(False) + ax.spines['bottom'].set_bounds(0,ax.get_xticks()[-2]) + #plt.yticks(transcripts_pos,transcripts_ids) + #plt.xlabel(chrm) + count += 2 -transcripts[tcons] = transcript(tcons, idr, ips, ss8) +def plotCodingPotential(plt, panels, df_temp): + count = 3 + for ids in list(df_temp["transcript_id"]): + ax = plt.subplot(panels,2,(count,count+1)) + flip = True + tcons_curr = transcripts[ids] + # Pfam domain + for i in tcons_curr.ips: + + start = int(i[3]) + stop = int(i[4]) + + if (i[1] == "Pfam"): + name = i[8].split(";")[3].split("=")[-1] + plt.gca().add_patch(Rectangle((start,1.05),stop-start,0.2,edgecolor="#3cb44b",facecolor='#3cb44b')) + if (flip): + ax.annotate(name,(start + (stop-start)/2.0,1.2), fontsize = 14, color = "#e6194b", ha = "center", va = "center") + else: + ax.annotate(name,(start + (stop-start)/2.0,1.1), fontsize = 14, color = "#e6194b", ha = "center", va = "center") + flip = not flip + + ax.annotate("Pfam Domain",(-50,1.15), fontsize = 12, color = "#3cb44b", ha = "center", va = "center") + + # Secondary structure prediction + ss8_df = pd.DataFrame(tcons_curr.ss8[1:], columns = tcons_curr.ss8[0]) + ss = list(ss8_df["SS"]) + for i in range(tcons_curr.length): + plt.gca().add_patch(Rectangle((i,-0.4),1,0.35,edgecolor=colors[ss3_abbvs.index(ss[i])],facecolor=colors[ss3_abbvs.index(ss[i])])) + + plt.plot((0,longest_length),(1,1),color = "black") + plt.plot((0,longest_length),(0,0),color = "black") + plt.plot((-1,-1),(1.5,-0.9),color = "black") + ax.annotate("SS Prediction",(-50,-0.2), fontsize = 12, color = "#3cb44b", ha = "center", va = "center") + + # Amino acid sequence + aa = list(ss8_df["AA"]) + for i in range(tcons_curr.length): + plt.gca().add_patch(Rectangle((i,-0.8),1,0.35,edgecolor=colors[aa_abbvs.index(aa[i])],facecolor=colors[aa_abbvs.index(aa[i])])) + ax.annotate("AA Sequence",(-50,-0.6), fontsize = 12, color = "#3cb44b", ha = "center", va = "center") + + #plt.plot((0,longest_length),(1,1),color = "black") + #plt.plot((0,longest_length),(0,0),color = "black") + #plt.plot((-1,-1),(1.5,-0.5),color = "black") + + # IDR prediction + idr_df = pd.DataFrame(tcons_curr.idr[1:], columns = tcons_curr.idr[0]) + idr_df = idr_df.astype({'# POS': 'int32',"IUPRED2":"float"}) + plt.plot(idr_df["# POS"],idr_df["IUPRED2"]) + ax.annotate("IDR Prediction",(-50,0.5), fontsize = 12, color = "#3cb44b", ha = "center", va = "center") + + # Phosphorlyation Site + buffer = 0 + for site_type in tcons_curr.pss: + for i in tcons_curr.pss[site_type]: + start = int(i[0]) + stop = int(i[2]) + plt.gca().add_patch(Rectangle((start,1.3+buffer),stop-start,0.025,edgecolor="black",facecolor='black')) + ax.annotate(site_type,(-50,1.3 + buffer), fontsize = 9, color = "#e6194b", ha = "center", va = "center") + + #plt.gca().add_artist(plt.Circle((start + (stop - start)/2,1.35),0.25,color="black")) + buffer += 0.075 + + ax.spines['top'].set_visible(False) + ax.spines['right'].set_visible(False) + ax.set_yticks([], []) + + ax.title.set_text(tcons_curr.tcons) + + count += 2 + +class Parser(object): + def __init__(self, csv, gtf, geneIDs, geneNames, outDir, minTPM, maxIso, minPct): + self.csv = csv + self.gtf = gtf + self.geneIDs = geneIDs + self.geneNames = geneNames + self.outDir = outDir + self.minTPM = minTPM + self.maxIso = maxIso + self.minPct = minPct + +args = Parser('/project/owlmayerTemporary/Sid/nanopore-analysis/Results_5_1/Quantification/all_counts_deseq2norm.txt', + '/project/owlmayerTemporary/Sid/nanopore-analysis/Results_5_1/GffCompare/nanopore.combined.gtf', + '', '/home/annaldas/projects/isoform_differentiation/test/list.txt', + '/home/annaldas/projects/isoform_differentiation/test/',0,18,0) + +outdir=args.outDir +minimumTPM = args.minTPM +minimumPct = args.minPct +maximumIso = args.maxIso +(identifier, targets, annotation, data, samples, conditions, number_replicates, x) = preprocessArguments(args) +df=data[data[identifier]==gene] +df_temp = df +for j in range(df.shape[0]): + transcript_annotation=annotation[annotation['transcript_id']==df.iloc[j]["transcript_id"]] + if (transcript_annotation.shape[0] < 3): + df_temp = df_temp[df_temp["transcript_id"] != df.iloc[j]["transcript_id"]] +# total gene expression calculation +data_gene=df_temp[samples].sum().to_frame().transpose() +data_gene=calculateStatistics(data_gene,conditions,number_replicates) +# mean transcript expression calculation +df_temp=calculateStatistics(df_temp,conditions,number_replicates) +# isoform percentage calculation +df_temp=(df_temp.filter(like='mean').div(data_gene.filter(like='mean').values[0],1)*100).add_prefix('Pct_').join(df_temp) +#choose isoforms to plot +df_temp=chooseIsoforms2Plot(df_temp,minimumTPM,minimumPct,maximumIso,annotation) +x = [0,3,5] +if df_temp.shape[0]: + panels = df_temp.shape[0] + 1 + fig,axes = plt.subplots(panels,2,figsize = (18,24)) + fig.subplots_adjust(top = 0.95) + fig.suptitle(gene,fontsize=16) + + #plot isoform expression + axg=plt.subplot(panels,2,1) + plotProfiles(x, df_temp, data_gene, axg, colors) + + #plot isoform expression percentage + axt=plt.subplot(panels,2,2) + plotStacked(x,df_temp,axt,colors) + + #prepare annotation + annotation_cut=prepareAnnotation(annotation,df_temp) + + #plot annotation + #axa=plt.subplot(panels,2,(3,4)) + plotAnnotation(annotation_cut, df_temp, plt, colors,longest_length) + + plotCodingPotential(plt,panels,df_temp) -print(transcripts) \ No newline at end of file +fig.savefig(snakemake.output[0]) \ No newline at end of file diff --git a/isoform_transcripts.ipynb b/isoform_transcripts.ipynb index 0c4f65c..c0882b9 100644 --- a/isoform_transcripts.ipynb +++ b/isoform_transcripts.ipynb @@ -23,7 +23,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -34,7 +34,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -71,28 +71,28 @@ }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "['4e6fca82-c640-4aac-be00-57223afed52e|41',\n", - " '0807443e-5f7f-4b1b-b59f-b6a737ad89a9|16']" + "['6356d404-48e6-4c7a-8fbd-5f6e7f9b42bc|5',\n", + " 'a5e2697c-74a1-4ac9-a5b2-d974af83137d|4']" ] }, - "execution_count": 51, + "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "gene_oID[\"MLLT1\"]" + "gene_oID[\"KTN1-AS1\"]" ] }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -109,7 +109,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ @@ -153,28 +153,16 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "ename": "KeyError", - "evalue": "'142e462f-f586-42fc-97b9-b2e3bfa1fd0d'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0moID_tID\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"142e462f-f586-42fc-97b9-b2e3bfa1fd0d\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;31mKeyError\u001b[0m: '142e462f-f586-42fc-97b9-b2e3bfa1fd0d'" - ] - } - ], + "outputs": [], "source": [ "oID_tID[\"142e462f-f586-42fc-97b9-b2e3bfa1fd0d\"]" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -186,7 +174,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -208,7 +196,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -234,36 +222,9 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "92b2df50-a034-43a9-a75a-ad52cbccf075|6 AKAP8L\n", - "78c3fe18-5c83-402c-8269-c07aedbd40a4|3 c1554cca-de07-418f-9818-eca090c80b38 TCONS_00030383\n", - "9a2337b7-740a-4725-838a-8847f42a61bc|6 392ae6cc-c2f2-47a3-bb89-3fdf6caa6719 TCONS_00030384\n", - "fd563dd9-374b-4439-a696-517bcfe44fad|3 79b8683e-5bbc-4bdd-a545-b464ff3564e3 TCONS_00030385\n", - "e172e6af-1c34-43dd-bdb6-3656d795a229|3 4a88e4c4-0501-49dc-b89a-1990cdbaf23c TCONS_00030386\n", - "fc150c74-c6c6-4e61-b4d5-d3639c0dee91|33 89d79286-527c-427b-bb19-83b110feb370 TCONS_00030387\n", - "84a1fcd2-ec02-488e-b1e0-706944cb8fc5|3 9f9ce2f8-92d1-4a3c-90f7-68fcda8fb148 TCONS_00030388\n", - "e0325806-c4fd-4d16-a8a3-3cd3ee13cdfc|6 388b7956-d60f-42cc-948c-f607b14cab9d TCONS_00030389\n", - "b655aec1-864d-418b-b78a-f19b975b80fb|21 6be26262-4cff-4aca-8f54-9c7b3e5bdf22 TCONS_00030390\n", - "bdc78543-c0d2-4ef3-b74e-e60e1c97e9d0|8 ab1a0ddb-1b97-47bc-b004-6c34f2f09b6a TCONS_00030392\n", - "db5576fd-1921-46ce-a0d2-412728ab9db8|3 c62aeaa1-b7c1-47ad-b0a3-9815ad439460 TCONS_00032730\n", - "3fe42f15-a4d1-49ae-a646-64d71a8c43e6|3 36436437-fd4b-4802-b9da-0f048cb4c048 TCONS_00032731\n", - "287bdde2-eb53-4bbc-b96a-5b26773c3dd8|9 36436437-fd4b-4802-b9da-0f048cb4c048 TCONS_00032733\n", - "5a5a062a-2f6c-46d2-aea4-a5e80bbab920|3 21c301c2-0b56-4ee7-8b6b-27f35c89cb08 TCONS_00032734\n", - "6464d1df-7486-4008-a0bb-40d6227027a5|5 fb469fcf-d099-4192-a71a-d5c4c4ae37ba TCONS_00032735\n", - "1a232563-721a-4f76-83af-18a01a9cf221|10 fb469fcf-d099-4192-a71a-d5c4c4ae37ba TCONS_00032736\n", - "43396b69-e18d-4578-903b-782887dd7340|4 fb469fcf-d099-4192-a71a-d5c4c4ae37ba TCONS_00032737\n", - "8004538e-da37-43d1-b8cc-2e8cffc77383|3 fb469fcf-d099-4192-a71a-d5c4c4ae37ba TCONS_00032738\n", - "3debe6b9-8564-4b1f-a8a5-8d3587e3a789|3 fb469fcf-d099-4192-a71a-d5c4c4ae37ba TCONS_00032739\n", - "7ccaec48-68b4-416b-b2b8-79d9e200e14b|11 fb469fcf-d099-4192-a71a-d5c4c4ae37ba TCONS_00032740\n" - ] - } - ], + "outputs": [], "source": [ "print(tID_oID[\"TCONS_00032753\"],oID_gene[tID_oID[\"TCONS_00032753\"]])\n", "\n", @@ -277,7 +238,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -298,30 +259,11 @@ }, { "cell_type": "code", - "execution_count": 60, + "execution_count": null, "metadata": { "scrolled": true }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "ABI2\n", - " OJ32 OJ33 OJ34 class_code gene_id \\\n", - "31847 12.061978 19.252866 14.095533 = ENSG00000138443.16 \n", - "22086 16.082637 13.730726 17.263069 j ENSG00000138443.16 \n", - "79409 11.200408 5.671387 5.384810 = ENSG00000138443.16 \n", - "64102 3.159089 2.387952 1.583768 = ENSG00000138443.16 \n", - "\n", - " gene_name ref_transcript transcript_id \n", - "31847 ABI2 ENST00000261017.9 TCONS_00035868 \n", - "22086 ABI2 ENST00000295851.10 TCONS_00035869 \n", - "79409 ABI2 ENST00000261018.11 TCONS_00035870 \n", - "64102 ABI2 ENST00000424558.5 TCONS_00035871 \n" - ] - } - ], + "outputs": [], "source": [ "group = df_rep1.groupby([\"gene_name\"])\n", "\n", @@ -337,44 +279,11 @@ }, { "cell_type": "code", - "execution_count": 56, + "execution_count": null, "metadata": { "scrolled": true }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "FUS\n", - " OJ40 OJ41 OJ42 class_code gene_id \\\n", - "45283 14.890654 7.486137 11.079592 j ENSG00000089280.18 \n", - "53160 17.810391 4.277792 8.897248 = ENSG00000089280.18 \n", - "68181 131.388127 31.816081 114.992736 = ENSG00000089280.18 \n", - "55151 93.431557 28.073012 56.405196 j ENSG00000089280.18 \n", - "58630 61.606433 41.173751 59.426903 = ENSG00000089280.18 \n", - "64460 13.722760 3.208344 5.707669 j ENSG00000089280.18 \n", - "47855 6.131446 2.138896 6.043414 c ENSG00000089280.18 \n", - "84426 14.014734 6.951413 14.101299 c ENSG00000089280.18 \n", - "36211 16.058549 6.416689 8.393630 c ENSG00000089280.18 \n", - "35581 0.875921 0.267362 1.175108 x ENSG00000089280.18 \n", - "9268 0.000000 0.267362 0.671490 s ENSG00000089280.18 \n", - "\n", - " gene_name ref_transcript transcript_id \n", - "45283 FUS ENST00000254108.11 TCONS_00022528 \n", - "53160 FUS ENST00000566605.5 TCONS_00022529 \n", - "68181 FUS ENST00000254108.11 TCONS_00022530 \n", - "55151 FUS ENST00000254108.11 TCONS_00022531 \n", - "58630 FUS ENST00000487509.6 TCONS_00022532 \n", - "64460 FUS ENST00000254108.11 TCONS_00022533 \n", - "47855 FUS ENST00000487045.6 TCONS_00022534 \n", - "84426 FUS ENST00000487509.6 TCONS_00022535 \n", - "36211 FUS ENST00000254108.11 TCONS_00022536 \n", - "35581 FUS ENST00000254108.11 TCONS_00023974 \n", - "9268 FUS ENST00000254108.11 TCONS_00023975 \n" - ] - } - ], + "outputs": [], "source": [ "group = df_rep2.groupby([\"gene_name\"])\n", "count = 0\n", @@ -392,21 +301,9 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "ename": "NameError", - "evalue": "name 'df' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mgroup\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgroupby\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"gene_name\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mcount\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mgene\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mitem\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mgroup\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;32mif\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mgene\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m\"TLL3\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mgene\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mNameError\u001b[0m: name 'df' is not defined" - ] - } - ], + "outputs": [], "source": [ "group = df.groupby([\"gene_name\"])\n", "count = 0\n", @@ -431,7 +328,7 @@ }, { "cell_type": "code", - "execution_count": 76, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -455,7 +352,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -467,7 +364,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -483,20 +380,9 @@ }, { "cell_type": "code", - "execution_count": 302, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['gene_id \"ENSG00000203667.10\"', ' transcript_id \"ENST00000391839.6\"', ' gene_type \"protein_coding\"', ' gene_name \"COX20\"', ' transcript_type \"lncRNA\"', ' transcript_name \"COX20-202\"', ' level 2', ' transcript_support_level \"1\"', ' hgnc_id \"HGNC:26970\"', ' havana_gene \"OTTHUMG00000040401.2\"', ' havana_transcript \"OTTHUMT00000097176.1\"', ''] transcript\n", - "['gene_id \"ENSG00000203667.10\"', ' transcript_id \"ENST00000391839.6\"', ' gene_type \"protein_coding\"', ' gene_name \"COX20\"', ' transcript_type \"lncRNA\"', ' transcript_name \"COX20-202\"', ' exon_number 1', ' exon_id \"ENSE00001899074.1\"', ' level 2', ' transcript_support_level \"1\"', ' hgnc_id \"HGNC:26970\"', ' havana_gene \"OTTHUMG00000040401.2\"', ' havana_transcript \"OTTHUMT00000097176.1\"', ''] exon\n", - "['gene_id \"ENSG00000203667.10\"', ' transcript_id \"ENST00000391839.6\"', ' gene_type \"protein_coding\"', ' gene_name \"COX20\"', ' transcript_type \"lncRNA\"', ' transcript_name \"COX20-202\"', ' exon_number 2', ' exon_id \"ENSE00003677820.1\"', ' level 2', ' transcript_support_level \"1\"', ' hgnc_id \"HGNC:26970\"', ' havana_gene \"OTTHUMG00000040401.2\"', ' havana_transcript \"OTTHUMT00000097176.1\"', ''] exon\n", - "['gene_id \"ENSG00000203667.10\"', ' transcript_id \"ENST00000391839.6\"', ' gene_type \"protein_coding\"', ' gene_name \"COX20\"', ' transcript_type \"lncRNA\"', ' transcript_name \"COX20-202\"', ' exon_number 3', ' exon_id \"ENSE00001889952.1\"', ' level 2', ' transcript_support_level \"1\"', ' hgnc_id \"HGNC:26970\"', ' havana_gene \"OTTHUMG00000040401.2\"', ' havana_transcript \"OTTHUMT00000097176.1\"', ''] exon\n" - ] - } - ], + "outputs": [], "source": [ "info = list(pd_aline[\"info\"])\n", "types = list(pd_aline[\"type\"])\n", @@ -512,7 +398,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": { "scrolled": true }, @@ -523,28 +409,11 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": null, "metadata": { "scrolled": true }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "chr8\t26044860\t26045413\tENST00000520164\n", - "chr8\t25841725\t25844611\tENST00000520164\n", - "chr8\t26041288\t26041488\tENST00000408929\n", - "chr8\t26040939\t26041002\tENST00000408929\n", - "chr8\t26040616\t26040671\tENST00000408929\n", - "chr8\t26040066\t26040101\tENST00000408929\n", - "chr8\t25844362\t25844611\tENST00000408929\n", - "chr8\t25858059\t25858083\tENST00000535548\n", - "chr8\t25850594\t25850761\tENST00000535548\n", - "chr8\t25844609\t25844640\tENST00000535548\n" - ] - } - ], + "outputs": [], "source": [ "df_utr_regions = pd.read_csv(\"/project/owlmayerTemporary/Sid/nanopore-analysis/ReferenceData/df_utr_regions.csv\")\n", "\n", @@ -567,7 +436,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -589,19 +458,9 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1 TCTTTTCCTCCTTGGCTGTCTGAAGATAGATCGCCATC\n", - "392 GAGCTGGAGATTGGATCACAG\n", - "-1 CGAAGGAGTAAAGGTGCTGCAATGATGTTAGCTGTGGCCACTGTGGATTTTTCGCAAGAACATTAATAAACTAAAAACTTCATGTGTCTGGTTGTTTGAAA\n" - ] - } - ], + "outputs": [], "source": [ "if transcript_id in trans_utr:\n", " seq = s\n", @@ -614,43 +473,9 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "-1 GGCATCGGCGCGGTCAGCCTCGTGGCGCGCCCACGCCCCCACGCCGGCTCTTCCCGGGGTCCTTCCGTGCGCGTTGATATGATTGGCCGGCGAATCGTGGTTCTCTTTTCCTCCTTGGCTGTCTGAAGATAGATCGCCATC\n", - "430 GAGCTGGAGATTGGATCACAG\n", - "-1 CGAAGGAGTAAAGGTGCTGCAATGATGTTAGCTGTGGCCACTGTGGATTTTTCGCAAGAACATTAATAAACTAAAAACTTCATGTGTCTGGTTGTTTG\n", - "-1 GGCATCGGCGCGGTCAGCCTCGTGGCGCGCCCACGCCCCCACGCCGGCTCTTCCCGGGGTCCTTCCGTGCGCGTTGATATGATTGGCCGGCGAATCGTGGTTCTCTTTTCCTCCTTGGCTGTCTGAAGATAGATCGCCATC\n", - "-1 AAAGGTGCTGCAATGATGTTAGCTGTGGCCACTGTGGATTTTTCGCAAGAACATTAATAAACTAAAAACTTCATGTGTCTGGTTGTTTGAAA\n", - "-1 ATATGATTGGCCGGCGAATCGTGGTTCTCTTTTCCTCCTTGGCTGTCTGAAGATAGATCGCCATC\n", - "-1 GAAGTGTCTAGCAG\n", - "430 GAGCTGGAGATTGGATCACAG\n", - "-1 CGAAGGAGTAAAGGTGCTGCAATGATGTTAGCTGTGGCCACTGTGGATTTTTCGCAAGAACATTAATAAACTAAAAACTTCATGTG\n", - "1 TCTTTTCCTCCTTGGCTGTCTGAAGATAGATCGCCATC\n", - "-1 GAAGTGTCTAGCAG\n", - "392 GAGCTGGAGATTGGATCACAG\n", - "-1 CGAAGGAGTAAAGGTGCTGCAATGATGTTAGCTGTGGCCACTGTGGATTTTTCGCAAGAACATTAATAAACTAAAAACTTC\n", - "1 TCTTTTCCTCCTTGGCTGTCTGAAGATAGATCGCCATC\n", - "392 GAGCTGGAGATTGGATCACAG\n", - "-1 CGAAGGAGTAAAGGTGCTGCAATGATGTTAGCTGTGGCCACTGTGGATTTTTCGCAAGAACATTAATAAACTAAAAACTTCA\n", - "1 TCTTTTCCTCCTTGGCTGTCTGAAGATAGATCGCCATC\n", - "392 GAGCTGGAGATTGGATCACAG\n", - "-1 CGAAGGAGTAAAGGTGCTGCAATGATGTTAGCTGTGGCCACTGTGGATTTTTCGCAAGAACATTAATAAACTAAAAACTTCATGTGTCTGGTTGTTTGAAA\n", - "1 TCTTTTCCTCCTTGGCTGTCTGAAGATAGATCGCCATC\n", - "-1 AATGTCACTGCCATGGCCGCCTTGCTGCATTTCTGAGGATGCTTCATCTCTCCACCTTCTTCTCCACTCAGCAGCCAGCAGGGCACTGTGGAAATCGGAGTCACATGAGCTGGCACCTCTGTTCAGAACCCTCCAGGGCTCCACATCTCTCTCACCCAAATGCCAAAGACCTCCCCACGCCCCCACAATCCCCCACGACCTGGCCACTGGCCTCCCACCACCTTCCAGCTCCAGCGGCTCCTACCACATTTAAGGCTTTCCTTCCTAGTTTTAATTTTTCCTCGTCAGCAGTTGATTTTATTATTTTCTTGTTTATTGGTATTTTCCCACTAGAAATGAAGCTGCGTGAAGTTAGAGATTTTTTTTTTTGGTCTGTGTTCCTAATTAGCTCATTGCTATACCCCTGGCGCCCAGAACAATGCCTTGGACACAGTACGCAGTAGACTAAATAAATACTTGTTGAATGACTGACTGACGGAATGACGGCTGTGTGGGGAGTGGATTGGGTCGTGAGGCAGAGGCTGCGGTGGAAACTCAGGCAGGAGGTGATGGTGGTTCTTGGGGCTGCGGAATGCCAAGTTTAGAAGCTCTTCCTCTGCTGTGGCACATGAACCGGTCACTCGAGAAGGCTTTTAGATTTACTTTGCCTAATCCCCTCTTAGTGCATGTGGGGAAACTGAGGTACACAAAAGGAATTCCCCACCAAGTTAGGGGCAGAACCTAGCCCCCTTGTCTCCCAGATGGATATCTTCTTTTTTTTTTGAGACGGAGTCTTGCTCTGTTGCCCAGGCTGGAGTGCAGTGGTACCATCTTGGCTCACTGCAACCTCTGCTTCCCAGGTTCAAGCGATTCTCCTGCCTCAGCCTCCTGAGTGTCTGCGATTACAGGTGCACACAACCACGCCTGGCTAATTTTTGTATTTTTAGTAGAGACGGGGTTTCACCGTGTTGGTCAGGGTGACCTCAAACTCCTGACCTCATGATCCACCCAGCTCAGCCTCCCAACGTGCTGGGATTACAGGCATGAGCCACCGTGCCTGGCTGGACATCTTGTTATTAAAGCTTCTTCTCTCTTTGTAGGGGAGGGGGAGATGCCTCTGGTGGAGAAGACCAGTGTGGCAGTGACTGTGTCTGTTAGTGAACCTGGTGGCTGGTTGAGGGTCTGTCGTGGTGACTGAGGACACATACAAAGTGCTTTTCTCAGTGGTCACCTTGGTGTTGGTGAATAAGGGTCAGAAGATGGCTCCTGTCCTAGGGCACTGCCAGTCGGTTTGGAAGCTGAAATGCCTGCTTAGCAGTTTGAGGAAACACAGACCTTGGAGGATCTTCTGGTTGCCTCTTCAAGAATTCATTCTATTCCCCTTCTGCTCCCCAAATTTGCTTTTCTTGGGGTGGGTCTTGGTTGGCCTAAGCCAAGAAAGTATGGCATCTACTCCTTCCATAGCAATAGCTCAGGAATAGGCAGTGACCCAGACCTGAACCAATCAGTGCATGGAATTACCCCTGGCCAAAGTGGTTGATTGAGGCTGGGTGCAAGCAGAGTTGTGAGAAGGCTCCCATTTGGTGGTTGGAGAGATCGCACTTGCTCCAGAGGTCATAATGTGCAGATCTGAGGCTTGGAACTGCTGCAGACATTTTGCTACCACAAGTGAAGCCACCCTGACGACACAGTTGACAATTTGGAGCAGGGCAGAGCTGAGAGAACAGCAGGGAAACAGCCAGAGTCTTGCTCAAGCCTCCCTGAAGTATCTATACCCCTGGACTCTAGTTATGGGGGCTAATAAATGTTATATACTGTTTAAGGTA\n", - "15 GCTGTCTGAAGATAGATCGCCATC\n", - "-1 AGATGACTTGCCCCAAGTCCTTCAGCTCATTCATGCTGGGGAAAGGAGTAAGCTTCAGGCGTCTTCCCCTGGAGTTCACGCCACCTCTGACAGCAAGTGAGCCGTTTGCTACTCAAGTGCTGTTTCTTGCTTTTTTAAG\n", - "-1 TGAGGGAATTGGGGCTTGGAGTGCAAGCATTGGGAAGAATTTCCCAGGAAGAGAGATGCACAGATGTGAAGAACTCGAAGGCAAGAGAAAGCCGGGGGGTTGTGTGGCAGGTAGAAGTGCCAGGACCGTGGAGCGTGTGGACATG\n", - "-1 GAAGTGTCTAGCAGGTACTGAGATT\n", - "430 GAGCTGGAGATTGGATCACAG\n", - "-1 CGAAGGAGTAAAGGTGCTGCAATGATGTTAGCTGTGGCCACTGTGGATTTTTCGCAAGAACATTAATAAACTAAAAACTTCA\n" - ] - } - ], + "outputs": [], "source": [ "for t in trans_utr:\n", " seq = s\n", @@ -665,7 +490,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -678,27 +503,18 @@ "\n", "a = '''GCTCAGTCCTCCAGGCGTCGGTACTCAGCGGTGTTGGAACTTCGTTGCTTGCTTGCCTGTGCGCGCGTGCGCGGACATGGCCTCAAACGATTATACCCAACAAGCAACCCAAAGCTATGGGGCCTACCCCACCCAGCCCGGGCAGGGCTATTCCCAGCAGAGCAGTCAGCCCTACGGACAGCAGAGTTACAGTGGTTATAGCCAGTCCACGGACACTTCAGGCTATGGCCAGAGCAGCTATTCTTCTTATGGCCAGAGCCAGAACACAGGCTATGGAACTCAGTCAACTCCCCAGGGATATGGCTCGACTGGCGGCTATGGCAGTAGCCAGAGCTCCCAATCGTCTTACGGGCAGCAGTCCTCCTACCCTGGCTATGGCCAGCAGCCAGCTCCCAGCAGCACCTCGGGAAGTTACGGTAGCAGTTCTCAGAGCAGCAGCTATGGGCAGCCCCAGAGTGGGAGCTACAGCCAGCAGCCTAGCTATGGTGGACAGCAGCAAAGCTATGGACAGCAGCAAAGCTATAATCCCCCTCAGGGCTATGGACAGCAGAACCAGTACAACAGCAGCAGTGGTGGTGGAGGTGGAGGTGGAGGTGGAGGTAACTATGGCCAAGATCAATCCTCCATGAGTAGTGGTGGTGGCAGTGGTGGCGGTTATGGCAATCAAGACCAGAGTGGTGGAGGTGGCAGCGGTGGCTATGGACAGCAGGACCGTGGAGGCCGCGGCAGGGGTGGCAGTGGTGGCGGCGGCGGCGGCGGCGGTGGTGGTTACAACCGCAGCAGTGGTGGCTATGAACCCAGAGGTCGTGGAGGTGGCCGTGGAGGCAGAGGTGGCATGGGGTAGGTGTCTCATGAGCCAGGGAGTATCTTTGGTGGGGAGTGTGGAGGATTGCATGAATCTCCCTGAAGCCAGTCCCTAGTGCATGGTTTAGTATTCTTGTTGTCTAGGGATCTGTGAGGGCTTTGATTTGGGGGCAGTGACTTTCTTTTTACATCCCCATTTTATTTTTGTGAGAACTTGGGAGCCTGAACTCCCATCCATACCACTGAATAGAGATTTTGAGTAATGATACTTGTTTCCAAAAAAAAAGAAACCATACATAGATACGTATGGATTGGAGTCATTAATATCCTAGGCAAGAAACATGGAAGTGAAGACTTCTTTCTCTGCAAGGGAAACCGATGATCCCACTCCTGGGAAATAGTAGGGAAACTTGGTATGTGTATTCCCATGTGTCCTCTAGGGAGTTGGTAATGGTTAACCTGACTTCAGCTTCCAGGAATTGGCTACTCTTCCCGTTTTCTATAGTCATTTGAATCCACGAGCTTGATTTGCACTAATTTGACCGACATTGATTTTGTGTGTGACTTGGTTTATGGGGCCAGCTGACTGAAGTAAGCAGACCTTTTGGGCAAAAATATGCTTTGACAGTGGTCTCCCACCTATTTGTTCCACTGTCTGCCTTCCCCTGGTTACTTAAAATTCATCAGCTTGTCCAACTGGACCTTCTTTCCTTCCTGCTGAAGTTGATTTGAAGTAAAACCTTAGATTTGATGTTAAAACAGTTGTCAAATCTGTTGGTAAATAAGATTTGAAGGACCCTACTCTGTCTCCCTTGAAAAAGGGGAGGAATGTCAGTGTTACTGTTTTTGGAAAAAGTAGATTTTTAAACCGAGTTTGGAAATGGTAAGTATGCAGAGGTGGGTGGGGGCAATCTCAAAAACGTGCAAAAATGAGGAAAACAAAAATGAGGAAATGTGTGCGTGTGTTTAATGCAAAACTTTAAAAAGAAAAACAACTGTTATGTGACTGTTAACTTGCTCTGCATTTTATGTGCCACAGGTATGAAAGGTGACATTGCAAAATACTCCGCTCTTCTCGCAGTGTAGAAGGGGTGACCCCGGGGGTTGGGGGAGATCAAAAACAGCTCAGTAGTTAGGACAGAGCTTAGCTAAGTTTGTCTTGCTTTAAGGGGAAGTTGCCTTTGGTTTTGACTTTTTATGGAATGGGGTTGGGTCTGCTTGCTGCTTTCAAAGCAAAAACCACAAAAATGTGTTCAAGGCTACCCCAGCCTGGTGTGAAATGTCTTCTGGGTAAATTGGGGTAGGGTTTTTAAACCAACTACTTGGTTGTCAACCACTTGCGACAAGAGGAAAAAAAAACATCTGCTCCATCGGAAGAACGACCAAGGAAAATGGGTTATTTTTTTTCCAGAGGAAATAGATAACGTAACCTTTTAAAGCAAAATCTTTATAAACTGTGTCTGAGAAATTGCACACGTGTGTGTGACATGCTCAAAGGTCAGACAAGGGGTGGTCAGGAAGGGATGTATTTTAGTAGCCACTTGTATCTTTTTCCAAAAACACCTACCCATGTTTGGGGAATGTTAAACAAAATCAAAAAACAACCTTTTGTAGCCGTTGGAAGCTTCATGTCCTTTCTTCTAACTTGTCTTCTCCAGCGGAAGTGACCGTGGTGGCTTCAATAAATTTGGTGGTAAGTGAACAGAGTTTCCAAAATTCCCAACTCCCAGCAATGCTTTGTCTGATTGTTCATTTGCAGATGTCTTAGCGTGTTAATTTAAATGTCAAAGGTTTTGAGGTGTCCAGAACCACCTCCAGAAAGGGGTAGGGTAGAATGCCACCTGTTGCCTGGTGTGTGCTAACCTGGAGCAGGTAGGGGTAAGACTCAATAGTCATCTTTTACCAAATGGGTTTGCCCCAGGTTAATAAGAGGGGTCTAGTAGGCCTTGGACTGGGCCGTTGCCACACCTGGCACTTAGTGACCATCATCATGAGAAACTGGAGAGTGCGTGCTGGAACACGTGGTGCCATCTTGGCTTTAGGATCCTTTTGATCGTTGTGTCCAAGGCTTGTGTGTGTGTGAGTGTGTGGGAGACAACTCCGAATGTTTAATTCTGGAAGAGGGATGTAACATTGCCCTGAGGATGGTGAAGTTGGTATACATTTATAAAGTACGGAATGGTGTCAATGAATGCAATTCTATGTATATGGACTTAACTGAGATGGGCAAATAGAAACTAGCTCTGGGAAGGAACATGTGCACTACTTCAAGAAAGATTGGAAGCATGTGTGGCTCATGGGAAATAACCAGGTCTTAAACAGCACAAACTGAATTCGTGGACCAGGAAGGTCTTAAACAGCACAAACTGAATTCATGGAAAAATGACAAATTTGAGAAGTCTCCCAGTAAGCTGGAACTTTTCTGGTTTGGTTAACAAAAGGTTTCTTGATTTGTTTCAAGATTTAAAGCCAAAGGTGTGGGTTCATGACTTAGGTGTCATTGCGTGTGGGTACAATATTTATATATGGCGAATTCAGATAAACATTGGTCAAAGATGGTCTCTGGAAAAACAAAATAGAGGCTGCATTACGGAAATAAGATTTCTGGTCTGTTCCCTGGGACATGCTTAAAAAATACAATAGCTATTATGTATGGTTTTTATTTTCATGTGGTTTCGGGGAAACAACACGGTTTTAAGGATGGTTTCTAAAGATGAAATTAAAAATTGTTCCACAAGGGTTAAGTGTCTGGTGGTAAAGTTGGGAGAAACTGGATGGATGCACATCGCATGGCTGGTGGCGAGCCCATCTCTCTTCTCTCGGGTGAGAGAACCGGGCCAAGCTGAGTTGGTTTGTTCACTTTAATGGGTCTCCGTTTCCCCTGCCACCTGTGCTGAGGACATTTCCCAGCCTGAGCTGGGGGAGGCAGCATTTGCTGAAGTGTGGAGTTGTCTCTGTGGAGACTCAAGTTACAGATCTTAAGGGGCCTGCCTAGAATTTTCTCCTCTGGGCAGGCGACCCAGGAAAGGGTTTGGAGTGAGGCTGTGAGCACTTACTTGATATTTTACAAGTTTGGATTTGGTGTTAATTTTTTTCCTTGTCCGTTTTTTCCTGTTGACTAACGGCTCATCTTTTCCTTGTTTTTGTTTTTTTTTTGTTCTTTTTTTCCATGTCACTAAAGGCCCTCGGGACCAAGGATCACGTCATGACTCCGAACAGGATAATTCAGACAACAACACCATCTTTGTGCAAGGCCTGGGTGAGAATGTTACAATTGAGTCTGTGGCTGATTACTTCAAGCAGATTGGTATTATTAAGACAAACAAGAAAACGGGACAGCCCATGATTAATTTGTACACAGACAGGGAAACTGGCAAGCTGAAGGGAGAGGCAACGGTCTCTTTTGATGACCCACCTTCAGCTAAAGCAGCTATTGACTGGTTTGATGGTAAAGAATTCTCCGGAAATCCTATCAAGGTCTCATTTGCTACTCGCCGGGCAGACTTTAATCGGGGTGGTGGCAATGGTCGTGGAGGCCGAGGGCGAGGAGGACCCATGGGCCGTGGAGGCTATGGAGGTGGTGGCAGTGGTGGTGGTGGCCGAGGAGGATTTCCCAGTGGAGGTGGTGGCGGTGGAGGACAGCAGCGAGCTGGTGACTGGAAGTGTCCTAATCCCACCTGTGAGAATATGAACTTCTCTTGGAGGAATGAATGCAACCAGTGTAAGGCCCCTAAACCAGATGGCCCAGGAGGGGGACCAGGTGGCTCTCACATGGGGGGTAACTACGGGGATGATCGTCGTGGTGGCAGAGGAGGCTATGATCGAGGCGGCTACCGGGGCCGCGGCGGGGACCGTGGAGGCTTCCGAGGGGGCCGGGGTGGTGGGGACAGAGGTGGCTTTGGCCCTGGCAAGATGGATTCCAGGGGTGAGCACAGACAGGATCGCAGGGAGAGGCCGTATTAATTAGCCTGGCTCCCCAGGTTCTGGAACAGCTTTTTGTCCTGTACCCAGTGTTACCCTCGTTATTTTGTAACCTTCCAATTCCTGATCACCCAAGGGTTTTTTTGTGTCGGACTATGTAATTGTAACTATACCTCTGGTTCCCATTAAAAGTGACCATTTTAGTTAAA'''\n", "\n", - "a = '''AGTGCGGAGCCTTAGGCGGAGCGAAGAGAACCGGTCGCGGCAATCCTAGCGCGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCACCCGCATCCGCTGCGGGAGTCCGAGCCGGAACCACACCCAAGTAGCTGCCCTTTCCTCTTCTGTCATCTCACCGCCCCACCACAGACCGCGTTCCCCGAGGAAACCGGCCGCCCACGCCCGGAGCATCCTCCCCTGTTGAGCGGGCGCTGACGGACCCGGCGGCATGATGCGGCTGCGAGGCTCGGGGATGCTGCGGGACCTGCTCCTGCGGTCGCCCGCCGGCGTGAGCGCGACTCTGCGGCGGGCACAGCCCTTGGTCACCCTGTGCCGGCGTCCCCGAGGCGGGGGACGGCCGGCCGCGGGCCCGGCTGCCGCCGCGCGACTCCACCCGTGGTGGGGCGGGGGCGGCTGGCCGGCGGAGCCCCTCGCGCGGGGCCTGTCCAGCTCTCCTTCGGAGATCTTGCAGGAGCTGGGCAAGGGGAGCACGCATCCGCAGCCCGGGGTGTCGCCACCCGCTGCCCCGGCGGCGCCCGGCCCCAAGGACGGCCCCGGGGAGACGGACGCGTTTGGCAACAGCGAGGGCAAAGAGCTGGTGGCCTCAGGTGAAAATAAAATAAAACAGGGTCTGTTACCTAGCTTGGAAGATTTGCTGTTCTATACAATTGCTGAAGGACAAGAGAAAATACCTGTTCATAAATTTATTACAGCACTCAAATCTACAGGATTGCGAACGTCTGATCCCAGGTTGAAAGAGTGTATGGATATGTTAAGATTAACTCTTCAAACAACATCAGATGGTGTCATGCTAGACAAAGATCTTTTTAAAAAATGTGTTCAGAGCAACATTGTTTTGTTGACACAAGCATTTAGAAGAAAGTTTGTGATTCCTGACTTTATGTCTTTTACCTCACACATTGATGAGTTATATGAAAGTGCTAAAAAGCAGTCTGGAGGAAAGGTTGCAGATTATATTCCTCAACTGGCCAAATTCAGTCCCGATTTGTGGGGTGTGTCTGTTTGTACAGTAGATGGACAGAGGCATTCTACTGGAGATACCAAAGTTCCCTTCTGTCTTCAGTCCTGTGTAAAACCTTTGAAATATGCCATTGCTGTTAATGATCTTGGAACTGAATATGTGCATCGATATGTTGGAAAAGAGCCGAGTGGACTAAGATTCAACAAACTATTTTTGAATGAAGATGATAAACCACATAATCCTATGGTAAATGCTGGAGCAATTGTTGTGACTTCACTAATAAAGCAAGGAGTAAATAATGCTGAAAAATTTGACTATGTCATGCAGTTTTTGAATAAGATGGCTGGTAATGAATATGTTGGATTCAGTAATGCAACGTTTCAGTCTGAAAGAGAAAGTGGAGATCGAAATTTTGCAATAGGATATTACTTAAAAGAAAAGAAGTGTTTTCCAGAAGGCACAGACATGGTTGGTATATTAGACTTCTACTTCCAGCTGTGCTCCATTGAAGTGACTTGTGAATCAGCCAGTGTGATGGCTGCGACACTGGCTAATGGTGGTTTCTGCCCAATTACTGGTGAAAGAGTACTGAGCCCTGAAGCAGTTCGAAATACATTGAGTTTGATGCATTCCTGTGGCATGTATGACTTCTCAGGGCAGTTTGCTTTCCATGTTGGTCTTCCTGCAAAATCTGGAGTTGCTGGGGGCATTCTTTTAGTTGTCCCCAATGTTATGGGTATGATGTGCTGGTCTCCTCCTCTGGATAAGATGGGCAACAGTGTTAAGGGAATTCACTTTTGTCACGATCTTGTTTCTCTGTGTAATTTCCATAACTATGATAATTTGAGACACTTTGCAAAAAAACTTGATCCTCGAAGAGAAGGTGGTGATCAAAGGGTAAAGTCAGTGATAAATCTTTTGTTTGCTGCATATACTGGAGATGTGTCTGCACTTCGAAGATTTGCTTTGTCAGCTATGGACATGGAACAGCGGGACTATGATTCTAGAACAGCACTCCATGTAGCTGCTGCAGAGGGTCATGTTGAAGTTGTTAAATTTTTGCTGGAAGCCTGCAAAGTAAACCCTTTCCCCAAGGACAGGTGGAATAACACTCCCATGGATGAAGCACTGCACTTTGGACACCATGATGTATTTAAAATTCTCCAAGAATACCAAGTCCAGTACACACCTCAAGGAGATTCTGACAACGGGAAGGAAAATCAAACCGTCCATAAGAATCTTGATGGATTGTTGTAATGGTCTCAAATCCCAAGATTTAAATCACTTACCTATTTAATTGTGGAAAATGATTATGAAGAACATGTGTATTTCTATCTGGTAGTGATGTATATTTTACATTTGTCATTTCAGTGTTACTGGAGTTTTCTTCATTGTGCACACAGGACAAATCTGATCTCTTTGGGAAAAAATAGAAATAAAACAATCTCCCTCCATAATGTGAGCAATATTACCTCGTGCATTGTATAATTTGATGTAAAAGAAATAGTTACCAATGCTAGCTTGTGTGGTCTTCCATGATTTATTTGTGTTTTGTGAATTTTCAATTTATGGTGATGATCTGCTGATATGCATTTATAAAGTAAGCTCTGTTGTACAGTCTGTCCAAATGGGTCAAGGTTGCCTTTAGAAGCAAATAGTGTGATTTTCAAGACTTCAAATACAAATTTAGTTTAAGTGTTTGAACAACTATATGCACTTACGGTTGTGTGTTTAAAATGTCTCTCTCACCCCCTAGCTTCATGATGTGACTCTTAAAAAACTATAATAGTTAACAACTGTTAGTAAGATAGACCAATTCTGATTAGACTTTATCAGGGAATCTGTTTAAGATATGTTTGGTGACCAAAACGTATGTGTGAATGTAGTTATAATGCTTTTGAAAAATTTTCCTTTTTCTATATCCCCTTAGTCCAGCCTCTCTTCTCAGACATTTAGCTATCTGCCTCTTTCCTTTAGCTGGGAAAGTGAGAGCTGGCATACTATGCAGTTTTTATGTTTTCCATAGTAAGTCAGAAAATGCCTCCTATTTCTGGCATCAGAACTTTGCCATTTGTCTACAGAAGACGAACCAGAGACAAAATTACTAAGTATAAATTAGTCAAGTTTATCAGTCTAAAAAACGAAGGGATGTGCAACTGCAGCTCTTTAAGAAGTTTTTTTTTTTTAGCTTCTAGGGTAAAGATAAATTCAGAAATGCTCTAAGCTACCAAAGTTATTCTGAAAGTATGGGAACTGCTACAACTAACAAACATTTGTTTCCAAGCCTGTCATTAAGAGTCTGCATCAAGAGATTTGTCCTCCTTGGGGGACCACTGGATCATTCCAGATTTCTTGTGATTTTTCTATTGTGTAATTCTTGGTGGGCTCTGTAGTTTAATAATAAGAAAAAGGCCATTTCATTTTAAATTGTGACCTATAATTCTTTGTCTTGGGTTGGTAATTCAGGATTCATTTGGAAAGTGGGTAAAAGGGGCTTCAAAAAACGGATAGAACAGGATTTTCTAGGAGTTACACATACATTTTATCCTGTCATACCTCGAGATAAAGTGGCATGTTAGTGAGGAGTTCTGATATTAAGCACACACACACATGCACACAAATGGACTTCTCTGAAGCTGTGTTTAGTGAAATGAGCTCAAGTACATGAATGTTAGTTGTTATCACATACAGCAAATTCCTTTTTTTTTCTTTTTCTATGAGCACACTCTGCTGCTTCTAAACTTTACATGCCTGATGGCACCTTACTCCAGCAGCCTCCAGGTGCTTTCATTTTCACTTCCAGTCTAAGCCAGTGGCTCCTGCCACTGCCCTCCCATTACCTAGATGGCACCTCCTTTGGTGAAACCACGGCCAATGTTCCTTAGCTGCACCAGGCCCGAAGCTGTTCCCATGCTTGAGCTTCCATGGGGAGGATGCTGAGTGAGCAGTTTCCTACCCCGTGGATCTAGCAAGCCATGGAGACAGGTAGCATTTGTAAGATGCTGCACAGGAGCAGCATTATCCCCAAAGATATTACAGGGTAGACACGTTTTAACTGAAATCAATCAAGATAACTTTATTCAAAGAGCAGCCCGCTTTGTGTGACTAAAATGAAACAAGACAGTTGAATTGTGTGACTTGAAGATTACCAATGATTTTGAGGCTTTTCTATAATAAAAAGAGGTTCTAACCATTATTTGGGAACAAAGAGAGTTTTCATCTTTTTTCAGATCAAAACCATTCTGTAAAATCTTTGTTGTTTAATTAAATGTGCCGTTATTTACCCCTGATGTTATTTATGACTATGTGCCGATTCCTGCTCGGGCTGTTTGCTGTTGGCTGGTAATAATATATTTGATTTAAATGCTGTTGACTGTGCTATTAACTGCTGCCGTCAGTAAACTCCAAAGATCTTTTTGTTTTGGCTTTAGTATCATATGTGCTTTTTCTGTATCCTGAGCGCTCTATATGATCATGTTAATTTAAAGCTTTATACACATTGTTGTTTTTGCTGGTCTCATCTTTGGTAATATGCTATACCCCACTGCTGCCCGACACTGCCCTTTAGCTGCAGAGCTGGATTAGCTGTTGACCATTTGATGCTGTTGTCTGTCTGGCAGGGACTGAATGACCTGATGTCAGATTTAGATTCTTCCTGGGGATTACACAGCTATGAATGTATTTGCTTCTAAAACCTCCCAAAGTGAATCTAATCTTAAAACTACAAGTTGTAAGTATTCTGAAATTGGGAAACATTTATTTTAAATGCAATCAGGTAGTGTTGCTTTTTACAGCATAATAAATATATGTATCAAAAAAAAAA'''" + "a = '''GGCTGCAGCCGGGCTCCGTGGCGCTCGCAGCCACCGCCTCCTCTCGGCTCCAGGTCTTCCCCTTCTTTTTACAACTGATCCTGTTGGGGATTTTTTTTTTTTCTAAATTGGAACGGTGGGGAGGAGCAGGGAGGGGGGACCTGGAGGAAGGGGAGAGATTAGGCAGCCATCAATTTCCTCCAGTTTCTCCCAGAACAGGTGATGCTTCTAAATTGTGATCACTTTCAGGAGGCAGCACTGCAGCTGGAAGGATGCGAGCGACCTAGGGTGGAGTGGCTGAGGCGGCAGATCTGAACTTGCGGAGGATAAGAACCCAAACTTTGACTACATCAGTCCGCACCTCGCCAGTGAAGCAAAGGACGGGTTATCTTTTTTTTTTTTCTAAGACTCAAACTTGGGCACTTGATCCCTTTTCTTGGATTGCTTTGGAGGAGACGATTTGCTGGCAACGTTGGGAACAGTCAGGACTGTGTTGTAACTCTTACTTTTAAAGCGACAGTAGAGGATCAGACTTTTTAAATGTTTGGAATTCAAGATACTTTAGGAAGAGGACCAACTCTGAAAGAGAAATCGCTGGGCGCGGAGATGGATTCGGTCAGGTCCTGGGTCCGGAATGTCGGAGTGGTGGACGCTAATGTCGCCGCGCAGAGCGGGGTCGCCCTGTCCCGGGCCCACTTTGAGAAACAGCCTCCTTCCAACTTGAGGAAATCCAACTTCTTTCACTTCGTCCTGGCGCTCTATGACAGGCAGGGCCAGCCGGTGGAGATCGAGCGGACGGCCTTCGTGGACTTTGTGGAGAATGACAAAGAACAAGGCAACGAGAAGACCAACAACGGCACTCACTACAAGTTACAGCTCCTCTACAGCAACGGTGTCCGCACGGAACAGGACCTCTATGTCAGGCTCATCGACTCGGTCACCAAGCAGCCCATCGCTTACGAGGGACAGAATAAGAATCCGGAAATGTGCCGAGTTCTCCTGACGCACGAAGTGATGTGTAGTCGATGCTGCGAAAAGAAAAGCTGTGGAAACCGAAATGAGACTCCATCGGACCCAGTCATAATTGACAGATTCTTTTTAAAATTTTTCCTCAAGTGCAATCAGAATTGTTTGAAAACAGCAGGAAACCCAAGGGACATGAGACGGTTTCAGGTTGTGTTGTCAACAACGGTGAATGTGGATGGACACGTCCTGGCTGTTTCTGACAACATGTTTGTTCATAACAACTCCAAGCATGGACGGAGAGCAAGAAGACTCGATCCATCGGAAGCTACCCCCTGCATCAAAGCCATTAGCCCGAGTGAAGGCTGGACCACAGGAGGAGCCATGGTCATCATCATCGGGGACAACTTCTTTGATGGTCTCCAAGTGGTGTTTGGGACTATGCTTGTATGGAGCGAGCTAATAACCCCTCATGCCATCAGAGTACAGACTCCTCCCCGGCACATCCCAGGCGTGGTAGAGGTGACATTATCTTATAAATCTAAACAGTTCTGCAAAGGAGCCCCAGGAAGGTTCATTTACACAGCATTAAATGAACCCACCATAGACTATGGCTTCCAGAGACTGCAGAAGGTCATCCCTAGGCATCCTGGAGATCCTGAGAGATTAGCTAAGGAGATGCTGTTGAAAAGAGCTGCAGATCTAGTGGAAGCTCTTTATGGCACACCACACAATAACCAGGACATCATTTTGAAGCGAGCCGCAGACATTGCTGAAGCTCTCTACAGCGTCCCCAGGAATCCCAGCCAGCTTCCAGCCCTCTCTAGCTCCCCAGCGCACAGTGGCATGATGGGAATCAACTCCTATGGCAGCCAGCTTGGGGTCAGCATCTCAGAGTCAACACAAGGAAATAATCAAGGGTACATCCGCAACACAAGCAGCATCTCTCCGCGGGGATACTCTTCCAGCTCCACGCCTCAACAGTCTAATTACAGTACCTCCAGCAACAGTATGAATGGCTACAGCAATGTCCCCATGGCCAACTTGGGTGTTCCAGGTTCACCAGGATTTCTAAATGGCTCACCCACCGGCTCTCCTTATGGAATCATGTCATCAAGTCCCACCGTTGGGTCTTCCAGCACATCCTCCATCCTCCCATTTTCCTCTTCAGTTTTTCCTGCTGTCAAACAGAAGAGTGCCTTTGCCCCTGTCATCAGGCCCCAAGGCTCCCCTTCACCTGCCTGCTCCAGCGGCAATGGAAATGGATTCAGAGCCATGACCGGACTTGTTGTACCCCCGATGTAAAGAAGAACTGCTTTCTTATAGCACAAAACTACTTACTCTGATGGACCAATAATGAAGAAAGCACTAGGAGCTCTTTTGGGGGTGTAGTGGTGCCCCCACATGAACATGATGGACACCCTTGGGTCTGCAAGGAGCCAGCATCTTACTTGGTCCCACGTCCTCCTATAGCTCTGATGGTGGCTACACAAACTGACCCTCTTGGGACAAGGACAAAAGATGTCATTGACGTAGTCAGTGCTAAGAGCAGAAATGCAATTCTTTGTTATGAACATTATGAAAACCACCTTCCTATGTTTGTAAAATATTTAAGAAAAAATTGGCAAACAATTAATGCTTAATATTTTGGATACTATTTGTTTTTCTTTGTAGGAAAAAAAAGTTGAAAGTTTCTATTTTCTATGAAGCCTTTCAGATACCAATTTAGTTTATGCAGAAAAAAATTGAACAAAACAGGGTACCAGCACGGAAGACTTTCTTAAAACGCAACCTGAATTGAATGATGAAATGTTGTATGTGTGTTTGCTTATAGCTTAATCTCTTTAAAAAATGAACAAAAAAAA'''" ] }, { "cell_type": "code", - "execution_count": 22, + "execution_count": null, "metadata": { "scrolled": true }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "M+ANLGVPGSPGFLNGSPTGSPYGIM+SSSPTVGSSSTSSILPFSSSVFPAVKQKSAFAPVIRPQGSPSPACSSGNGNGFRAM+TGLVVPPM+_RRTAFL_HKTTYSDGPIM+KKALGALLGV_WCPHM+NM+M+DTLGSARSQHLTWSHVLL_L_WWLHKLTLLGQGQKM+SLT_SVLRAEM+QFFVM+NIM+KTTFLCL_NI_EKIGKQLM+LNILDTICFSL_EKKVESFYFL_SLSDTNLVYAEKN_TKQGTSTEDFLKTQPELNDEM+LYVCLLIA_SL_KM+NKKREKF_NVLQLFK_INV_LIVSRGRN_DLFLEERGISLPDVK_K_CKHVVTSFKGV_LLLQLLTRFLSPVLHPLLFHHFIDPRARTCRFPQNTLM+QKQTL_HM+HKYSNPSIHSQLRLISK_STLL_ILFKIPIISFQRSPLSSPENSHV_VCLRLSASQGKKHLTF_QSQNHRLS_PRDPLGIPPHQNPTTPSALGAWFSTEVHSPIELQQHGLEKITLFDGLFRVSLRTSKSFPLPVNFPFYGIKLS_KT_LCHSTVSSQE_DGM+WERDIKSPNPLF_CSKKGKTRFLGDPEIRGICVLCSTVTQPFDGLFIVL_LALRGEEPCLSC_QWLILESLCNPRDISRHIGIANKDVTCFHVGVLLIDS_APQ_SLYTDLVWGSWLGFQH_K_M+QHSWKNIM+SASEPILDDFINSAYLLFDPLM+QAHIYPAPTVLFAFNPFWNYPV_LILFLCT_ESKVLETWHAYNYC_QSLIVRA_SAKIHGNIREKVPQ_ET_LM+EGEECVEYECYLPKVKAEQN_ACLALDLSQSNIRKFCFF_VALLLEPVNFPFA_R_VQYCLQNDCRISW_LYTEKCV_LNTRHLDHSARTLAATELFNCTNVCKDYF_CTNKLKTKLPCPR_SLLFNHSSPKLFLCHLEYEYFYW_LQTLSCVCKRSCVRFFSFKGTIYFIIYLSFVIAL_FFSVFLCTFQYAWFKNH_LLYLLYSKVSFSHCIVSFYCFALSVIYSIKLLCTELFVKTAFLFTVFNGLT_RISCL_NEYVYFSFKL_IILTKK_NFCTVK\n", - "WPTWVFQVHQDF_M+AHPPALLM+ESCHQVPPLGLPAHPPSSHFPLQFFLLSNRRVPLPLSSGPKAPLHLPAPAAM+EM+DSEP_PDLLYPRCKEELLSYSTKLLTLM+DQ__RKH_ELFWGCSGAPT_T_WTPLGLQGASILLGPTSSYSSDGGYTN_PSWDKDKRCH_RSQC_EQKCNSLL_TL_KPPSYVCKIFKKKLANN_CLIFWILFVFLCRKKKLKVSIFYEAFQIPI_FM+QKKIEQNRVPARKTFLKRNLN_M+M+KCCM+CVCL_LNLFKK_TKKGKNFKM+FYNYLNK_M+CNLL_VEVEIKTFFWKREGFLFLM+_NENDANM+__QVLKVCDYYCSYLLDSYPLSCTPSFSITLLTQGQEHVDSHRTP_CRNRHSSTCTNTPTHQFIPSSV_FLNSPHSCEYFLKFLLLAFNVLP_AVQRIPM+FESVSGFQHPKERSTSPSSRAKTTDFLNQGTPWEFLLTRTPPLPPLWVPGSVQRYTAP_NFSNTVWRR_LCLM+DSLESVLEHPSPFLSLSISPFM+ALNSLEKHSYATRQFLVKNETECGRGILKALIPSSDVRRKARPDSWEIQRLGASAFYAAQLPSPLM+GSSSFCD_LLEGKNLALAANSG_FLKAYVILETYPGILGLPIKM+SLVSTLGFSLLTHEHHNRVYTQT_FGVLGWDFSIRNRCSTHGKTS_VHLSQF_M+ISLILHTFSLTH_CRPTSIQPQLSYLPLTHSGITQYDLYYFYAHENPRF_KPGM+LITTADSPS_SGLKAPKYTET_EKKSPSEKPD_WKGKNVWNM+NVTFQK_RQSKIKHV_PWI_ANPT_GNFVFFK_HCF_NL_IFLLHEDECSTVFKM+IVEFLGSFTPKNACN_IPDILTIQLEPWQQQSYLIVQM+CVRIIFSVLIN_KQSYPVLVSHCYSIIPVPSYFCATWNM+SISIGNYKLYPVFVRGAV_DFFHLKGQFTSLFIFLLL_RCNSSQFFFAHFNM+HGLKTINFSTFCTVRFHFHTV_FHFIVLLCQLYTV_SCYAQSFL_RQLFCLLFLM+VLLKEYLVCKM+NM+STSVLNFKLS_QKNKIFVL_K\n", - "GQLGCSRFTRISKWLTHRLSLWNHVIKSHRWVFQHILHPPIFLFSFSCCQTEECLCPCHQAPRLPFTCLLQRQWKWIQSHDRTCCTPDVKKNCFLIAQNYLL_WTNNEESTRSSFGGVVVPPHEHDGHPWVCKEPASYLVPRPPIALM+VATQTDPLGTRTKDVIDVVSAKSRNAILCYEHYENHLPM+FVKYLRKNWQTINA_YFGYYLFFFVGKKS_KFLFSM+KPFRYQFSLCRKKLNKTGYQHGRLS_NAT_IE__NVVCVFAYSLISLKNEQKKGKILKCFTII_INKCVTYCK_R_KLRPFFGRERDFSS_CKM+KM+M+QTCSNKF_RCVIITAVTY_ILIPCPAPPPFPSLY_PKGKNM+_IPTEHPNAETDTLAHAQILQPINSFPAPSDF_IVHTLVNTF_NSYY_LSTFSPEQSREFPCLSLSQAFSIPRKEAPHLLAEPKPQTFLTKGPLGNSSSPEPHHSLRFGCLVQYRGTQPHRTSATRFGEDNSV_WTL_SQS_NIQVLSSPCQFPLLWH_TLLKNIAM+PLDSF_SRM+RRNVGEGY_KP_SPLLM+FEERQDQILGRSRD_GHLRSM+QHSYPAL_WALHRFVTSS_RGRTLP_LLTVVDS_KLM+_S_RHIQAYWDCQ_RCHLFPRWGSPY_LM+STTIESIHRLSLGFLVGISALEIDAALM+EKHHECI_ANFR_FH_FCIPSL_PIDAGPHLSSPNCLICL_PILELPSM+TYIISM+HM+RIQGFRNLACL_LLLTVPHSQGLKRQNTRKHKRKSPPVRNLIDGRGRM+CGI_M+LPSKSEGRAKLSM+FSPGFEPIQHKEILFFLSSIAFRTCEFSFCM+KM+SAVLSSK_L_NFLVALHRKM+RVTKYQTS_PFS_NPGSNRAI_LYKCV_GLFLVY__IKNKATLSSLVIAIQSFQSQVIFVPLGI_VFLLVTTNSILCL_EELCKIFFI_RDNLLHYLSFFCYSVVILLSFSLHISICM+V_KPLTSLPFVQ_GFIFTLYSFILLFCFVSYIQYKVAM+HRAFCKDSFFVYCF_WSYLKNILFVK_ICLLQF_TLNYPNKKIKFLYCKK\n" - ] - } - ], + "outputs": [], "source": [ + "a = \"TGGACAGCTCCCGCTCACCCAAACAGAAGACGTCGGCGCCGGAGCGGGCTCGGACATGGCGAGGCTGCGAGCCGGCCCGAGCGGCGGGGCCCGGTGATCCCTCCCTCCCTCCCCGTCCCCTCCCCTCTCCCGCACGCACGCCCCGTCCGCCCCCACCCCGCCCCCACCCCGGGCGAGCCCGCCCGCAGCCCGGGGCGCACACCCGCACGCGCACTCCTCTCCACTCACTCCCGCGCCCGCCCCCACTCCCGCAGCCGAGCCCCGCCACGCGCGCCTTGCCCGCCCGCCGGCCGCCCCCGCCGCCCCCGCCGCCCCCGGGCCCTGATGGACTGAATGAAGGCTGCCTACACCGCCTATCGATGCCTCACCAAAGACCTAGAAGGCTGCGCCATGAACCCGGAGCTGACAATGGAAAGTCTGGGCACTTTGCACGGGCCGGCCGGCGGCGGCAGTGGCGGGGGCGGCGGCGGGGGCGGCGGGGGCGGCGGCGGGGGCCCGGGCCATGAGCAGGAGCTGCTGGCCAGCCCCAGCCCCCACCACGCGGGCCGCGGCGCCGCTGGCTCGCTGCGGGGCCCTCCGCCGCCTCCAACCGCGCACCAGGAGCTGGGCACGGCGGCAGCGGCGGCAGCGGCGGCGTCGCGCTCGGCCATGGTCACCAGCATGGCCTCGATCCTGGACGGCGGCGACTACCGGCCCGAGCTCTCCATCCCGCTGCACCACGCCATGAGCATGTCCTGCGACTCGTCTCCGCCTGGCATGGGCATGAGCAACACCTACACCACGCTGACACCGCTCCAGCCGCTGCCACCCATCTCCACCGTGTCTGACAAGTTCCACCACCCTCACCCGCACCACCATCCGCACCACCACCACCACCACCACCACCAGCGCCTGTCCGGCAACGTCAGCGGCAGCTTCACCCTCATGCGCGACGAGCGCGGGCTCCCGGCCATGAACAACCTCTACAGTCCCTACAAGGAGATGCCCGGCATGAGCCAGAGCCTGTCCCCGCTGGCCGCCACGCCGCTGGGCAACGGGCTAGGCGGCCTCCACAACGCGCAGCAGAGTCTGCCCAACTACGGTCCGCCGGGCCACGACAAAATGCTCAGCCCCAACTTCGACGCGCACCACACTGCCATGCTGACCCGCGGTGAGCAACACCTGTCCCGCGGCCTGGGCACCCCACCTGCGGCCATGATGTCGCACCTGAACGGCCTGCACCACCCGGGCCACACTCAGTCTCACGGGCCGGTGCTGGCACCCAGTCGCGAGCGGCCACCCTCGTCCTCATCGGGCTCGCAGGTGGCCACGTCGGGCCAGCTGGAAGAAATCAACACCAAAGAGGTGGCCCAGCGCATCACAGCGGAGCTGAAGCGCTACAGTATCCCCCAGGCGATCTTTGCGCAGAGGGTGCTGTGCCGGTCTCAGGGGACTCTCTCCGACCTGCTCCGGAATCCAAAACCGTGGAGTAAACTCAAATCTGGCAGGGAGACCTTCCGCAGGATGTGGAAGTGGCTTCAGGAGCCCGAGTTCCAGCGCATGTCCGCCTTACGCCTGGCAGCGTGCAAACGCAAAGAGCAAGAACCAAACAAAGACAGGAACAATTCCCAGAAGAAGTCCCGCCTGGTGTTCACTGACCTCCAACGCCGAACACTCTTCGCCATCTTCAAGGAGAACAAACGCCCGTCAAAGGAGATGCAGATCACCATTTCCCAGCAGCTGGGCCTGGAGCTCACAACCGTCAGCAACTTCTTCATGAACGCCCGGCGCCGCAGCCTGGAGAAGTGGCAAGACGATCTGAGCACAGGGGGCTCCTCGTCCACCTCCAGCACGTGTACCAAAGCATGATGGAAGGACTCTCACTTGGGCACAAGTCACCTCCAAATGAGGACAACAGATACCAAAAGAAAACAAAGGAAAAAGACACCGGATTCCTAGCTGGGGCCCTTCACTGGTGATTTGAAAGCACAATTCTCTTGCAAAGAAACTTATATTCTAGCTGTAATCATAGGCCAGGTGTTCTTCTTTTGTTTTTAATGGCTATGGAGTCCAAGTGCAAGCTGAAAAATTAATCTCTTAGAACCAGACACTGTTCTCTGAGCATGCTAAGCATCCCAGAAACCCAAATGGGGCCTTCCTGGAGCGAGTTAATTCCAGTATGGTGTCAACCAAGCTCGGGATTGCTTAAAATATCATCCATCCCACTTCAGGTCCTGTCAGCTTCTTGCAGTCAGAGTTCCTATGAGTAACAATAGGAGTTTGGCCTATGTAAGGACTCTGAGTTTAGGCTTCCAAGATACAACAATAAGAGAAGAATCTAGCAACGAGAATGACCTCATTTGCTTTCCACATGCTTAGCCTCATTATACCATGTTATGTCCAAGTTCACAGCCACAACATCAGAATGGTAATTACTGAGCACAAGTTTTAAATATGGACGTTAAAAAAAAAAATCCAAGGACCTGTTTTTCCAACCCAGACATCTTTTCATTGAATGATTTAGAAAGCTTTAAGTTGATCCAGCTTACAATTTTTTTTTTCTTTACCTCCTGGAAATCTCATATGGTCTTGGATCCGTCAAAAAAACCAGTCAGTTCACTTGCGCTCAAAGTATCAAGCACAACAAAGATAAACAGAAGTGAGGAAGGTTCTGGGTTCACTACATCTGGATTTTCAAGACACCTATTGTGAAGTCATTAGGGAATTGATGAGAATATGGCTTCAAGCACATTTTGCAGTTTGCTACAAATTCTGTTGTACATAATGCAGACGCACACTCAGGAGGCCAATTTAACTGTTAACAGTGCATGGAGCGAATGCAGCATTTTAAAAGATCTAGGTTTTTTTAGGTCATTAATGTGTCCTTGGTTGATCAGTCATCTGGTCCCTCCTACTGTGTGTTATGACCACCACGTAATCCATTCTCGCTCTTTCTGATTTGGGGTTTTTCCTCATCCATCCCATTAGTAGGGATGTTTTCTGTGTTTTCTAGCAAGAAAAAAAAATCAATCAATCAAACCTGCATACATGTTACTCATGACTGTCATCTAGTCCTAAATCTCTTCTGTTGTTGAATCATCCTTGCAAAACAGCTGAATACATCTGGAGAAAACACAGCACACCAAAGAAGCAGAATACTGCAAACCAAAGACATTTATGACTTGTCATTTTCTAGCCTAAAAATACTGTGATTACTTTTAGAAATCAGAAAACCTCTGCAACTCCGAATGGCATTCAGCTCTTGCATTTGGCGCATCATCGGGCTGAGCGGACCAGCTACACCAAGGACATTAGCCAAGCCACCCAGAGGGGTGGCTTTGCCACACCAGTTGTCACCTTCCCATAGCAAGTGGAAGAGCGCCCACAGAACTCTGGGAGATTGCAAAGGTCACAATGTGCATATTTACCAGTGAATGGCCCCGGGTGGGGCCACGTGGGGGTGTTCAAAGCAAGCCAAACGCTGCAATCATTCTTTACAGACACTTGAGACTGACTTTTTTATGAATTACTTAGTCGAAACCAAAGAAACTTTTTCTGCACCTACTTCTGCAACAAACAAAACTGTCCCATTAAAATGAATAAATAAATCCGTAAATCAATGGAAATCACCACCAATAAGAAGGAAGCACGCCAGAAAATAAACGAAAACAAAAACAGGGAGACACACTGTGTTCAAACAGACCTCTTGGGACATTTTTTGGAAGCAGATTTTAAAGAAAGGGTTGAGACAAAGATAGAAATAAGGAAGAGCCTCAGTGGCTGCTGCTTCATTTGACAACTCACACGGTAATCTTAAAGCTGAAGATTGTCTTTAATTTGTGCCTATGCAGTTTTTCAAAAGAACACGGAACAGAGCAACAGAAACCTCAACAGCTACAATACCAAAGATGAGGATTTCTCACACCTTTTGTTTCAGTTCATTATCTCCTCTTGCCTGGCTAAAATACTAATAGCGCCATTGAACTGTATAAAGGTAATCAATTATGTTTCTCTGAGCAACAAAAGGAAAGGGCCATTTATTTGATTTTATTGTTTCATTTCAATTTTGTCTTATGGTTTTTTGCCCCAACATGGAATCTCTCAAAAGTTTCCATGGACTCCAAGTTTAAGATGTTGGGATATTGAACAGTTCTCTCTGCTCAGCAGAGGGTAGGGAATAACATTATCACTTGAATGTTCTTTGCTTAACCCTTAGACTTGGTTCCTTCTATGTTCAGAGTCTCATCATCAGGGGAAGGAAAGGGAGTGAGGGTCAGGGATAGGGGTCTTGGTGATGCATCCTCTCCCGAGCCACAGAACCAAAGAGTTTATAGAGGAATTTACAGCCTCGTTTTCATGTGATTGCTACATCCTAACAGGGCTTCATTTGGGGGTGGGGGGAAACATGTAAAAATAATTGCCAGTTTCTACTTTTCTATTAGCTTTTTAAAAATCAGCTGTAAAGTTGCATTTCTAAAGAAAGATATATATAATATATAAAATACATATATAGATCAACTTGACATTGGTGATAACCAAAATTATTGCTGTCCAAATTCATGTCTTGTTTTGGTCCAGTGCTTCATTTGCTAAGTATTCGGTTCAGAATTTTTCTCATTTCTCATGCCATTCCAGAGTTAATTTGCCACTGTGGATGATTTGAAGTATTCAGATCTCTATGGAAGTTTCTGGGACAGGTTTAAAGTCAAGATCAAGCATTTTAGCATTTAACCTGTTGATAAATGGATCCATGGTGTACATGAGTTTTATTTGTATTCGGAGTCATCTCTATTCTATCCCTCAGCCTCGATTAAGGTGGTGAGTGAAGTGCATCCAACAGACTCGGCCCAGAACTGGGTCCTGACAGTGGGGTGCTCATCTTCTGTAACTGTTGGGAAGGCTCGGTGGTCCATTTTCACCAGTTAAAGAATATGAGGCCAGCCCAGAAATCTGTTCTCCAGGAGCTGCCCTGTCCCATCTGGGTGTGCCAGACCCCCTCAGTGAGCAGGTCCACCAAAGGGACTTCTCACAGGGGAAGCCCAACTCCTGTTGCAATGGGTTGATAGATTTCCTCAGGGTGGTAATTACCAATTCGTATTTTGACAAGCCTATGTGCAACCACAGCTGGCACTGGGGTGGGCAGTGGTGTTGGGTGGGATGGGGGAGAGTGTCTCAATCCTGAAGAGAAAATATAAAGCAGGTTTTGGGGAGACTTCTGGAGTCCTGCCCCTAGAGAGCCCCATTGTTGTTCTTTGTGCCCCCTCCTCATTCCCCCTATGTGGGTCTCCCTATGCAGGAGCTGTGAGAGAATGTGACTCTCCACAATTTTTATAATTCATCCTTCCTAGGAGATTGTTCATTGGCTCTTCCCTTGTGTCCCTTTGTCCCTTGCTCATACTCCATGTTTCCTTTGTCAAAGGACTAAGAAAAGAGCATATTTCAGCAGAGGAGTGTTCCCATGTGGGTTGATTTCAACTTGGGTATTTCTAAAAGAGTCCTTGTGACATGTGTCCAGTGGAAATGGTTGCTCTTTTCCAGACTGGATTGAGGAATGGAGCCTGTTTGATTTGGTTAGTGATTCTTTGACATACTAATCTCAGCGTTTGGGTCTCCAGCATCCTCTGAAGATGTCTAGACTAGTAGAGGCTGCCTTTGTGACCTGACATTACAACATTGGTCAAACCAGTCCTCTGATAATCAGAAGAACATGTCATAATTGTTTAAAAAAAAAAAAAAGGCAAGAATTTCTCTCCAAGGAGCTTTAATAAATGTCTCATTCCAGATAATGTCATACCAGAGAAAAGTGCTTGCTTTTAGAAAATTATTTACATACATATATAAATATATATGTGTATCTATACAGTTATGTATCAAAATTTTAAGCCCTGCAGAATTTCAATTTGTTAGAAATCTAACAGAAAAAAATTTCTATATTGAAAGGTAATAGAATTTAACCCAGTGAGTTTACTCAAGGATTTTTAAATTTAAGTTAATAATTTCAGAGAAAATAACCATTTGGGTGTGGTTATAGTTTAGTATCCATTACCTCAATCCAAGGAAAATTCCAGGCATTCCTCAACCATCAGGAAAAGGTACAGTGTGAAGGAACAGTTCTCAGCCAAATTTCACATTCTTGAGGCAACAGAAATCAAAACACTCAGAGCCATTGAGTGGAAAAACAATTTACTTTATTCCTTTACACAAATAGGCTTGCATTGTTTTTGTTTTAATGTGATTTTGGTACTAGGGATATAATTATTTCATTCCAGGAAATAATAAAAAAAAACAGACAGAGCCAATACATTTCTTTTTTTAAAGGAAACAGCAACAACAATAAAAACTCAGCACCAATATTTAAAAGCTTTTCCAAAATGTAAAAGAAGTGTTTAGCTTGCACCATGCATAAAGGTGCAGGCTAGTTGAACCAGGAAGCATGGCACTTCCTCTGGAGAAATCCAGAAAGAGTTGCTTCTAAGCTCCCTTTTCCCCCTGCAGGCTCTTGGCAATTGTAGGCTTTAGCAAATCCAGAATAATTTTCAATTCAAGCTAAAATAAAATCAACATTTGGAATGTAAATCTGATACACACACACTTTTCTAAGTCAAACAACATATTTCAAAACCAAAAATAAATACCTTTTAGATAATCAGTTATTTTCTTTGTCTATACTGGGCACCCACCTACTAGTGCCAGTAAATTCAAGTTGAACAGATTTTTAAAATCACTATTATCTGGGTATGGGGGAAACTTCCCCACTTTTGAAAATGTTGGTAGAATTATAGGAATGTCTGTTTGATTATCATTACCAAAGTGTCATGACAGTATGCCTTTGTAGTGAACTCGGATTTTCAGGAGTTTGAATAGTTGGATATTTTAAAATCTAAGAAGAAAAGGCCTGTTTCCAATGTTGTTGAAGAATAATGAACTCTATTAAAAAGTGGAGAAAAAGATAATACATGTGGTCAAGGTTGACCACAAGGCCCAGGCACAACTACCTTGGCGATAATCTTCTAGATTCGTAACAGGTTAGAGCTGACTTTTTGTTTTTGTTGTTGCTGATGCTGTGTGATTCAGACTTCTCAGCCTAACCAGGAAGAGTAAGTGGAAATGGTAGATGAAGAAGGGGTAGAGCTGGTGTATCTATAACTTTCTGATATTTGTCTGCCAAACTTGATATATTAGTAATTTTTTTATCTTTAGCTAAGATCAAGTCACCCCTGAAACAACAGGAGATTCTAGTTTTAAAATAAGGCCACAAAAATCCTTACGGAATGAAGAATGGCACCCCAGTTGGTTGTATAAGTCTCATAAGATAATGATGTTGATTTTAAATATGGATGTCTCAATGCCTGTTTTCTATCAATGATTTGTTTGTTTCCAAGGTCGGGGAGGGAAAGAGGGGAGGGTTTATCTGTTTTAGAAAGTCTCAGAATACTTATAAAATACAGAAGTAGTTATTAAAATATATAGGACCTCACATAGGTAGATACAGAACTTACCATTGAGGCTGATGGGCTGTTGTGTGAATCACACAGGACCTTAAATGAGGCTCATTATTCTCACACACCAAAATGACTCTGACAGCCTGAAGCAGTTATTGCTAGAGCCCAAGCTTTCCTTGGAGGTTTTGGAGTTAGGTTGATTGGAAGTAACCAGCTAATACCTTTTCTAGTGGAGAAAAAGACATTGCTACCAGCTTGTTCATCCCATAGAAGTCTTCCACTCTGCTCCATTTTTAGCAGCAAGCATTTCATGTAGCATAAACCTTGGCAGATAAGTGTGCCTAAGGTTTATACAGTCTGTCCGCTTGGATGTATACAAATTTAGATACATATTTTAACATGTGTTCTCATAGATGACTTTATAACAACACACATTACCTATAGGTGTCTAGACTGTGTACATACAAGTGTGTACAGACAAGCTTCATACGTATATACTGTAATCCGTTACAACAAATAAATTTTAAATCATCGTTTAACATGTATGTGGTACTTCTACAGTGTACATTGTTTTCATTATTTATTGTAACATTGAAAACCACAGTGCAGGGAAAACAAAAGTATCCCAGCATCTTCATCCTGTACACTTGGAATTAATTTCATTTGGGCATATCCAAGATAAACTCAACTTTCAAGAAATCTTGTATATTATTTAATCATCTGTGTTAGGATGACACCTATGATTGATGACTTCGGTTGAATAGCTTTATTCTGGATTTTTCATAACTAAAGCTAAATCCAAAGACCTGAAAAAGGACAAAAAGAAAAAAAAAAAAAGAAA\"\n", "a = a.upper()\n", "a = a.replace(\"\\n\",\"\")\n", " \n", @@ -743,19 +559,9 @@ }, { "cell_type": "code", - "execution_count": 118, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "MMRLRGSGMLRDLLLRSPAGVSATLRRAQPLVTLCRRPRGGGRPAAGPAAAARLHPWWGGGGWPAEPLARGLSSSPSEILQELGKGSTHPQPGVSPPAAPAAPGPKDGPGETDAFGNSEGKELVASGENKIKQGLLPSLEDLLFYTIAEGQEKIPVHKFITALKSTGLRTSDPRLKECMDMLRLTLQTTSDGVMLDKDLFKKCVQSNIVLLTQAFRRKFVIPDFMSFTSHIDELYESAKKQSGGKVADYIPQLAKFSPDLWGVSVCTVDGQRHSTGDTKVPFCLQSCVKPLKYAIAVNDLGTEYVHRYVGKEPSGLRFNKLFLNEDDKPHNPMVNAGAIVVTSLIKQGVNNAEKFDYVMQFLNKMAGNEYVGFSNATFQSERESGDRNFAIGYYLKEKKCFPEGTDMVGILDFYFQLCSIEVTCESASVMAATLANGGFCPITGERVLSPEAVRNTLSLMHSCGMYDFSGQFAFHVGLPAKSGVAGGILLVVPNVMGMMCWSPPLDKMGNSVKGIHFCHDLVSLCNFHNYDNLRHFAKKLDPRREGGDQRVKSVINLLFAAYTGDVSALRRFALSAMDMEQRDYDSRTALHVAAAEGHVEVVKFLLEACKVNPFPKDRWNNTPMDEALHFGHHDVFKILQEYQVQYTPQGDSDNGKENQTVHKNLDGLL\n", - "MVSC\n", - "MCSEQHCFVDTSI\n" - ] - } - ], + "outputs": [], "source": [ "def translate(seq,orf):\n", " seq = seq.upper()\n", @@ -806,7 +612,7 @@ " \n", " return protein\n", "\n", - "a = \"AGTGCGGAGCCTTAGGCGGAGCGAAGAGAACCGGTCGCGGCAATCCTAGCGCGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCACCCGCATCCGCTGCGGGAGTCCGAGCCGGAACCACACCCAAGTAGCTGCCCTTTCCTCTTCTGTCATCTCACCGCCCCACCACAGACCGCGTTCCCCGAGGAAACCGGCCGCCCACGCCCGGAGCATCCTCCCCTGTTGAGCGGGCGCTGACGGACCCGGCGGCATGATGCGGCTGCGAGGCTCGGGGATGCTGCGGGACCTGCTCCTGCGGTCGCCCGCCGGCGTGAGCGCGACTCTGCGGCGGGCACAGCCCTTGGTCACCCTGTGCCGGCGTCCCCGAGGCGGGGGACGGCCGGCCGCGGGCCCGGCTGCCGCCGCGCGACTCCACCCGTGGTGGGGCGGGGGCGGCTGGCCGGCGGAGCCCCTCGCGCGGGGCCTGTCCAGCTCTCCTTCGGAGATCTTGCAGGAGCTGGGCAAGGGGAGCACGCATCCGCAGCCCGGGGTGTCGCCACCCGCTGCCCCGGCGGCGCCCGGCCCCAAGGACGGCCCCGGGGAGACGGACGCGTTTGGCAACAGCGAGGGCAAAGAGCTGGTGGCCTCAGGTGAAAATAAAATAAAACAGGGTCTGTTACCTAGCTTGGAAGATTTGCTGTTCTATACAATTGCTGAAGGACAAGAGAAAATACCTGTTCATAAATTTATTACAGCACTCAAATCTACAGGATTGCGAACGTCTGATCCCAGGTTGAAAGAGTGTATGGATATGTTAAGATTAACTCTTCAAACAACATCAGATGGTGTCATGCTAGACAAAGATCTTTTTAAAAAATGTGTTCAGAGCAACATTGTTTTGTTGACACAAGCATTTAGAAGAAAGTTTGTGATTCCTGACTTTATGTCTTTTACCTCACACATTGATGAGTTATATGAAAGTGCTAAAAAGCAGTCTGGAGGAAAGGTTGCAGATTATATTCCTCAACTGGCCAAATTCAGTCCCGATTTGTGGGGTGTGTCTGTTTGTACAGTAGATGGACAGAGGCATTCTACTGGAGATACCAAAGTTCCCTTCTGTCTTCAGTCCTGTGTAAAACCTTTGAAATATGCCATTGCTGTTAATGATCTTGGAACTGAATATGTGCATCGATATGTTGGAAAAGAGCCGAGTGGACTAAGATTCAACAAACTATTTTTGAATGAAGATGATAAACCACATAATCCTATGGTAAATGCTGGAGCAATTGTTGTGACTTCACTAATAAAGCAAGGAGTAAATAATGCTGAAAAATTTGACTATGTCATGCAGTTTTTGAATAAGATGGCTGGTAATGAATATGTTGGATTCAGTAATGCAACGTTTCAGTCTGAAAGAGAAAGTGGAGATCGAAATTTTGCAATAGGATATTACTTAAAAGAAAAGAAGTGTTTTCCAGAAGGCACAGACATGGTTGGTATATTAGACTTCTACTTCCAGCTGTGCTCCATTGAAGTGACTTGTGAATCAGCCAGTGTGATGGCTGCGACACTGGCTAATGGTGGTTTCTGCCCAATTACTGGTGAAAGAGTACTGAGCCCTGAAGCAGTTCGAAATACATTGAGTTTGATGCATTCCTGTGGCATGTATGACTTCTCAGGGCAGTTTGCTTTCCATGTTGGTCTTCCTGCAAAATCTGGAGTTGCTGGGGGCATTCTTTTAGTTGTCCCCAATGTTATGGGTATGATGTGCTGGTCTCCTCCTCTGGATAAGATGGGCAACAGTGTTAAGGGAATTCACTTTTGTCACGATCTTGTTTCTCTGTGTAATTTCCATAACTATGATAATTTGAGACACTTTGCAAAAAAACTTGATCCTCGAAGAGAAGGTGGTGATCAAAGGGTAAAGTCAGTGATAAATCTTTTGTTTGCTGCATATACTGGAGATGTGTCTGCACTTCGAAGATTTGCTTTGTCAGCTATGGACATGGAACAGCGGGACTATGATTCTAGAACAGCACTCCATGTAGCTGCTGCAGAGGGTCATGTTGAAGTTGTTAAATTTTTGCTGGAAGCCTGCAAAGTAAACCCTTTCCCCAAGGACAGGTGGAATAACACTCCCATGGATGAAGCACTGCACTTTGGACACCATGATGTATTTAAAATTCTCCAAGAATACCAAGTCCAGTACACACCTCAAGGAGATTCTGACAACGGGAAGGAAAATCAAACCGTCCATAAGAATCTTGATGGATTGTTGT\"\n", + "a = \"TGGACAGCTCCCGCTCACCCAAACAGAAGACGTCGGCGCCGGAGCGGGCTCGGACATGGCGAGGCTGCGAGCCGGCCCGAGCGGCGGGGCCCGGTGATCCCTCCCTCCCTCCCCGTCCCCTCCCCTCTCCCGCACGCACGCCCCGTCCGCCCCCACCCCGCCCCCACCCCGGGCGAGCCCGCCCGCAGCCCGGGGCGCACACCCGCACGCGCACTCCTCTCCACTCACTCCCGCGCCCGCCCCCACTCCCGCAGCCGAGCCCCGCCACGCGCGCCTTGCCCGCCCGCCGGCCGCCCCCGCCGCCCCCGCCGCCCCCGGGCCCTGATGGACTGAATGAAGGCTGCCTACACCGCCTATCGATGCCTCACCAAAGACCTAGAAGGCTGCGCCATGAACCCGGAGCTGACAATGGAAAGTCTGGGCACTTTGCACGGGCCGGCCGGCGGCGGCAGTGGCGGGGGCGGCGGCGGGGGCGGCGGGGGCGGCGGCGGGGGCCCGGGCCATGAGCAGGAGCTGCTGGCCAGCCCCAGCCCCCACCACGCGGGCCGCGGCGCCGCTGGCTCGCTGCGGGGCCCTCCGCCGCCTCCAACCGCGCACCAGGAGCTGGGCACGGCGGCAGCGGCGGCAGCGGCGGCGTCGCGCTCGGCCATGGTCACCAGCATGGCCTCGATCCTGGACGGCGGCGACTACCGGCCCGAGCTCTCCATCCCGCTGCACCACGCCATGAGCATGTCCTGCGACTCGTCTCCGCCTGGCATGGGCATGAGCAACACCTACACCACGCTGACACCGCTCCAGCCGCTGCCACCCATCTCCACCGTGTCTGACAAGTTCCACCACCCTCACCCGCACCACCATCCGCACCACCACCACCACCACCACCACCAGCGCCTGTCCGGCAACGTCAGCGGCAGCTTCACCCTCATGCGCGACGAGCGCGGGCTCCCGGCCATGAACAACCTCTACAGTCCCTACAAGGAGATGCCCGGCATGAGCCAGAGCCTGTCCCCGCTGGCCGCCACGCCGCTGGGCAACGGGCTAGGCGGCCTCCACAACGCGCAGCAGAGTCTGCCCAACTACGGTCCGCCGGGCCACGACAAAATGCTCAGCCCCAACTTCGACGCGCACCACACTGCCATGCTGACCCGCGGTGAGCAACACCTGTCCCGCGGCCTGGGCACCCCACCTGCGGCCATGATGTCGCACCTGAACGGCCTGCACCACCCGGGCCACACTCAGTCTCACGGGCCGGTGCTGGCACCCAGTCGCGAGCGGCCACCCTCGTCCTCATCGGGCTCGCAGGTGGCCACGTCGGGCCAGCTGGAAGAAATCAACACCAAAGAGGTGGCCCAGCGCATCACAGCGGAGCTGAAGCGCTACAGTATCCCCCAGGCGATCTTTGCGCAGAGGGTGCTGTGCCGGTCTCAGGGGACTCTCTCCGACCTGCTCCGGAATCCAAAACCGTGGAGTAAACTCAAATCTGGCAGGGAGACCTTCCGCAGGATGTGGAAGTGGCTTCAGGAGCCCGAGTTCCAGCGCATGTCCGCCTTACGCCTGGCAGCGTGCAAACGCAAAGAGCAAGAACCAAACAAAGACAGGAACAATTCCCAGAAGAAGTCCCGCCTGGTGTTCACTGACCTCCAACGCCGAACACTCTTCGCCATCTTCAAGGAGAACAAACGCCCGTCAAAGGAGATGCAGATCACCATTTCCCAGCAGCTGGGCCTGGAGCTCACAACCGTCAGCAACTTCTTCATGAACGCCCGGCGCCGCAGCCTGGAGAAGTGGCAAGACGATCTGAGCACAGGGGGCTCCTCGTCCACCTCCAGCACGTGTACCAAAGCATGATGGAAGGACTCTCACTTGGGCACAAGTCACCTCCAAATGAGGACAACAGATACCAAAAGAAAACAAAGGAAAAAGACACCGGATTCCTAGCTGGGGCCCTTCACTGGTGATTTGAAAGCACAATTCTCTTGCAAAGAAACTTATATTCTAGCTGTAATCATAGGCCAGGTGTTCTTCTTTTGTTTTTAATGGCTATGGAGTCCAAGTGCAAGCTGAAAAATTAATCTCTTAGAACCAGACACTGTTCTCTGAGCATGCTAAGCATCCCAGAAACCCAAATGGGGCCTTCCTGGAGCGAGTTAATTCCAGTATGGTGTCAACCAAGCTCGGGATTGCTTAAAATATCATCCATCCCACTTCAGGTCCTGTCAGCTTCTTGCAGTCAGAGTTCCTATGAGTAACAATAGGAGTTTGGCCTATGTAAGGACTCTGAGTTTAGGCTTCCAAGATACAACAATAAGAGAAGAATCTAGCAACGAGAATGACCTCATTTGCTTTCCACATGCTTAGCCTCATTATACCATGTTATGTCCAAGTTCACAGCCACAACATCAGAATGGTAATTACTGAGCACAAGTTTTAAATATGGACGTTAAAAAAAAAAATCCAAGGACCTGTTTTTCCAACCCAGACATCTTTTCATTGAATGATTTAGAAAGCTTTAAGTTGATCCAGCTTACAATTTTTTTTTTCTTTACCTCCTGGAAATCTCATATGGTCTTGGATCCGTCAAAAAAACCAGTCAGTTCACTTGCGCTCAAAGTATCAAGCACAACAAAGATAAACAGAAGTGAGGAAGGTTCTGGGTTCACTACATCTGGATTTTCAAGACACCTATTGTGAAGTCATTAGGGAATTGATGAGAATATGGCTTCAAGCACATTTTGCAGTTTGCTACAAATTCTGTTGTACATAATGCAGACGCACACTCAGGAGGCCAATTTAACTGTTAACAGTGCATGGAGCGAATGCAGCATTTTAAAAGATCTAGGTTTTTTTAGGTCATTAATGTGTCCTTGGTTGATCAGTCATCTGGTCCCTCCTACTGTGTGTTATGACCACCACGTAATCCATTCTCGCTCTTTCTGATTTGGGGTTTTTCCTCATCCATCCCATTAGTAGGGATGTTTTCTGTGTTTTCTAGCAAGAAAAAAAAATCAATCAATCAAACCTGCATACATGTTACTCATGACTGTCATCTAGTCCTAAATCTCTTCTGTTGTTGAATCATCCTTGCAAAACAGCTGAATACATCTGGAGAAAACACAGCACACCAAAGAAGCAGAATACTGCAAACCAAAGACATTTATGACTTGTCATTTTCTAGCCTAAAAATACTGTGATTACTTTTAGAAATCAGAAAACCTCTGCAACTCCGAATGGCATTCAGCTCTTGCATTTGGCGCATCATCGGGCTGAGCGGACCAGCTACACCAAGGACATTAGCCAAGCCACCCAGAGGGGTGGCTTTGCCACACCAGTTGTCACCTTCCCATAGCAAGTGGAAGAGCGCCCACAGAACTCTGGGAGATTGCAAAGGTCACAATGTGCATATTTACCAGTGAATGGCCCCGGGTGGGGCCACGTGGGGGTGTTCAAAGCAAGCCAAACGCTGCAATCATTCTTTACAGACACTTGAGACTGACTTTTTTATGAATTACTTAGTCGAAACCAAAGAAACTTTTTCTGCACCTACTTCTGCAACAAACAAAACTGTCCCATTAAAATGAATAAATAAATCCGTAAATCAATGGAAATCACCACCAATAAGAAGGAAGCACGCCAGAAAATAAACGAAAACAAAAACAGGGAGACACACTGTGTTCAAACAGACCTCTTGGGACATTTTTTGGAAGCAGATTTTAAAGAAAGGGTTGAGACAAAGATAGAAATAAGGAAGAGCCTCAGTGGCTGCTGCTTCATTTGACAACTCACACGGTAATCTTAAAGCTGAAGATTGTCTTTAATTTGTGCCTATGCAGTTTTTCAAAAGAACACGGAACAGAGCAACAGAAACCTCAACAGCTACAATACCAAAGATGAGGATTTCTCACACCTTTTGTTTCAGTTCATTATCTCCTCTTGCCTGGCTAAAATACTAATAGCGCCATTGAACTGTATAAAGGTAATCAATTATGTTTCTCTGAGCAACAAAAGGAAAGGGCCATTTATTTGATTTTATTGTTTCATTTCAATTTTGTCTTATGGTTTTTTGCCCCAACATGGAATCTCTCAAAAGTTTCCATGGACTCCAAGTTTAAGATGTTGGGATATTGAACAGTTCTCTCTGCTCAGCAGAGGGTAGGGAATAACATTATCACTTGAATGTTCTTTGCTTAACCCTTAGACTTGGTTCCTTCTATGTTCAGAGTCTCATCATCAGGGGAAGGAAAGGGAGTGAGGGTCAGGGATAGGGGTCTTGGTGATGCATCCTCTCCCGAGCCACAGAACCAAAGAGTTTATAGAGGAATTTACAGCCTCGTTTTCATGTGATTGCTACATCCTAACAGGGCTTCATTTGGGGGTGGGGGGAAACATGTAAAAATAATTGCCAGTTTCTACTTTTCTATTAGCTTTTTAAAAATCAGCTGTAAAGTTGCATTTCTAAAGAAAGATATATATAATATATAAAATACATATATAGATCAACTTGACATTGGTGATAACCAAAATTATTGCTGTCCAAATTCATGTCTTGTTTTGGTCCAGTGCTTCATTTGCTAAGTATTCGGTTCAGAATTTTTCTCATTTCTCATGCCATTCCAGAGTTAATTTGCCACTGTGGATGATTTGAAGTATTCAGATCTCTATGGAAGTTTCTGGGACAGGTTTAAAGTCAAGATCAAGCATTTTAGCATTTAACCTGTTGATAAATGGATCCATGGTGTACATGAGTTTTATTTGTATTCGGAGTCATCTCTATTCTATCCCTCAGCCTCGATTAAGGTGGTGAGTGAAGTGCATCCAACAGACTCGGCCCAGAACTGGGTCCTGACAGTGGGGTGCTCATCTTCTGTAACTGTTGGGAAGGCTCGGTGGTCCATTTTCACCAGTTAAAGAATATGAGGCCAGCCCAGAAATCTGTTCTCCAGGAGCTGCCCTGTCCCATCTGGGTGTGCCAGACCCCCTCAGTGAGCAGGTCCACCAAAGGGACTTCTCACAGGGGAAGCCCAACTCCTGTTGCAATGGGTTGATAGATTTCCTCAGGGTGGTAATTACCAATTCGTATTTTGACAAGCCTATGTGCAACCACAGCTGGCACTGGGGTGGGCAGTGGTGTTGGGTGGGATGGGGGAGAGTGTCTCAATCCTGAAGAGAAAATATAAAGCAGGTTTTGGGGAGACTTCTGGAGTCCTGCCCCTAGAGAGCCCCATTGTTGTTCTTTGTGCCCCCTCCTCATTCCCCCTATGTGGGTCTCCCTATGCAGGAGCTGTGAGAGAATGTGACTCTCCACAATTTTTATAATTCATCCTTCCTAGGAGATTGTTCATTGGCTCTTCCCTTGTGTCCCTTTGTCCCTTGCTCATACTCCATGTTTCCTTTGTCAAAGGACTAAGAAAAGAGCATATTTCAGCAGAGGAGTGTTCCCATGTGGGTTGATTTCAACTTGGGTATTTCTAAAAGAGTCCTTGTGACATGTGTCCAGTGGAAATGGTTGCTCTTTTCCAGACTGGATTGAGGAATGGAGCCTGTTTGATTTGGTTAGTGATTCTTTGACATACTAATCTCAGCGTTTGGGTCTCCAGCATCCTCTGAAGATGTCTAGACTAGTAGAGGCTGCCTTTGTGACCTGACATTACAACATTGGTCAAACCAGTCCTCTGATAATCAGAAGAACATGTCATAATTGTTTAAAAAAAAAAAAAAGGCAAGAATTTCTCTCCAAGGAGCTTTAATAAATGTCTCATTCCAGATAATGTCATACCAGAGAAAAGTGCTTGCTTTTAGAAAATTATTTACATACATATATAAATATATATGTGTATCTATACAGTTATGTATCAAAATTTTAAGCCCTGCAGAATTTCAATTTGTTAGAAATCTAACAGAAAAAAATTTCTATATTGAAAGGTAATAGAATTTAACCCAGTGAGTTTACTCAAGGATTTTTAAATTTAAGTTAATAATTTCAGAGAAAATAACCATTTGGGTGTGGTTATAGTTTAGTATCCATTACCTCAATCCAAGGAAAATTCCAGGCATTCCTCAACCATCAGGAAAAGGTACAGTGTGAAGGAACAGTTCTCAGCCAAATTTCACATTCTTGAGGCAACAGAAATCAAAACACTCAGAGCCATTGAGTGGAAAAACAATTTACTTTATTCCTTTACACAAATAGGCTTGCATTGTTTTTGTTTTAATGTGATTTTGGTACTAGGGATATAATTATTTCATTCCAGGAAATAATAAAAAAAAACAGACAGAGCCAATACATTTCTTTTTTTAAAGGAAACAGCAACAACAATAAAAACTCAGCACCAATATTTAAAAGCTTTTCCAAAATGTAAAAGAAGTGTTTAGCTTGCACCATGCATAAAGGTGCAGGCTAGTTGAACCAGGAAGCATGGCACTTCCTCTGGAGAAATCCAGAAAGAGTTGCTTCTAAGCTCCCTTTTCCCCCTGCAGGCTCTTGGCAATTGTAGGCTTTAGCAAATCCAGAATAATTTTCAATTCAAGCTAAAATAAAATCAACATTTGGAATGTAAATCTGATACACACACACTTTTCTAAGTCAAACAACATATTTCAAAACCAAAAATAAATACCTTTTAGATAATCAGTTATTTTCTTTGTCTATACTGGGCACCCACCTACTAGTGCCAGTAAATTCAAGTTGAACAGATTTTTAAAATCACTATTATCTGGGTATGGGGGAAACTTCCCCACTTTTGAAAATGTTGGTAGAATTATAGGAATGTCTGTTTGATTATCATTACCAAAGTGTCATGACAGTATGCCTTTGTAGTGAACTCGGATTTTCAGGAGTTTGAATAGTTGGATATTTTAAAATCTAAGAAGAAAAGGCCTGTTTCCAATGTTGTTGAAGAATAATGAACTCTATTAAAAAGTGGAGAAAAAGATAATACATGTGGTCAAGGTTGACCACAAGGCCCAGGCACAACTACCTTGGCGATAATCTTCTAGATTCGTAACAGGTTAGAGCTGACTTTTTGTTTTTGTTGTTGCTGATGCTGTGTGATTCAGACTTCTCAGCCTAACCAGGAAGAGTAAGTGGAAATGGTAGATGAAGAAGGGGTAGAGCTGGTGTATCTATAACTTTCTGATATTTGTCTGCCAAACTTGATATATTAGTAATTTTTTTATCTTTAGCTAAGATCAAGTCACCCCTGAAACAACAGGAGATTCTAGTTTTAAAATAAGGCCACAAAAATCCTTACGGAATGAAGAATGGCACCCCAGTTGGTTGTATAAGTCTCATAAGATAATGATGTTGATTTTAAATATGGATGTCTCAATGCCTGTTTTCTATCAATGATTTGTTTGTTTCCAAGGTCGGGGAGGGAAAGAGGGGAGGGTTTATCTGTTTTAGAAAGTCTCAGAATACTTATAAAATACAGAAGTAGTTATTAAAATATATAGGACCTCACATAGGTAGATACAGAACTTACCATTGAGGCTGATGGGCTGTTGTGTGAATCACACAGGACCTTAAATGAGGCTCATTATTCTCACACACCAAAATGACTCTGACAGCCTGAAGCAGTTATTGCTAGAGCCCAAGCTTTCCTTGGAGGTTTTGGAGTTAGGTTGATTGGAAGTAACCAGCTAATACCTTTTCTAGTGGAGAAAAAGACATTGCTACCAGCTTGTTCATCCCATAGAAGTCTTCCACTCTGCTCCATTTTTAGCAGCAAGCATTTCATGTAGCATAAACCTTGGCAGATAAGTGTGCCTAAGGTTTATACAGTCTGTCCGCTTGGATGTATACAAATTTAGATACATATTTTAACATGTGTTCTCATAGATGACTTTATAACAACACACATTACCTATAGGTGTCTAGACTGTGTACATACAAGTGTGTACAGACAAGCTTCATACGTATATACTGTAATCCGTTACAACAAATAAATTTTAAATCATCGTTTAACATGTATGTGGTACTTCTACAGTGTACATTGTTTTCATTATTTATTGTAACATTGAAAACCACAGTGCAGGGAAAACAAAAGTATCCCAGCATCTTCATCCTGTACACTTGGAATTAATTTCATTTGGGCATATCCAAGATAAACTCAACTTTCAAGAAATCTTGTATATTATTTAATCATCTGTGTTAGGATGACACCTATGATTGATGACTTCGGTTGAATAGCTTTATTCTGGATTTTTCATAACTAAAGCTAAATCCAAAGACCTGAAAAAGGACAAAAAGAAAAAAAAAAAAAGAAA\"\n", "print(translate(a,0))\n", "print(translate(a,1))\n", "print(translate(a,2))" @@ -814,25 +620,23 @@ }, { "cell_type": "code", - "execution_count": 160, + "execution_count": 9, "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "264 CGGCATGA MMRLRGSGMLRDLLLRSPAGVSATLRRAQPLVTLCRRPRGGGRPAAGPAAAARLHPWWGGGGWPAEPLARGLSSSPSEILQELGKGSTHPQPGVSPPAAPAAPGPKDGPGETDAFGNSEGKELVASGENKIKQGLLPSLEDLLFYTIAEGQEKIPVHKFITALKSTGLRTSDPRLKECMDMLRLTLQTTSDGVMLDKDLFKKCVQSNIVLLTQAFRRKFVIPDFMSFTSHIDELYESAKKQSGGKVADYIPQLAKFSPDLWGVSVCTVDGQRHSTGDTKVPFCLQSCVKPLKYAIAVNDLGTEYVHRYVGKEPSGLRFNKLFLNEDDKPHNPMVNAGAIVVTSLIKQGVNNAEKFDYVMQFLNKMAGNEYVGFSNATFQSERESGDRNFAIGYYLKEKKCFPEGTDMVGILDFYFQLCSIEVTCESASVMAATLANGGFCPITGERVLSPEAVRNTLSLMHSCGMYDFSGQFAFHVGLPAKSGVAGGILLVVPNVMGMMCWSPPLDKMGNSVKGIHFCHDLVSLCNFHNYDNLRHFAKKLDPRREGGDQRVKSVINLLFAAYTGDVSALRRFALSAMDMEQRDYDSRTALHVAAAEGHVEVVKFLLEACKVNPFPKDRWNNTPMDEALHFGHHDVFKILQEYQVQYTPQGDSDNGKENQTVHKNLDGLL 0.003137706\n" + "ename": "FileNotFoundError", + "evalue": "[Errno 2] No such file or directory: '/home/annaldas/projects/result/ONECUT2/ONECUT2_seq.fa'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 125\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 126\u001b[0m \u001b[0mtranscripts_filename\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"/home/annaldas/projects/result/ONECUT2/ONECUT2_seq.fa\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 127\u001b[0;31m \u001b[0mtranscripts\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mSeqIO\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtranscripts_filename\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"fasta\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 128\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 129\u001b[0m \u001b[0mgene\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"ONECUT2\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/pkg/python-3.7.4-0/lib/python3.7/site-packages/Bio/SeqIO/__init__.py\u001b[0m in \u001b[0;36mindex\u001b[0;34m(filename, format, alphabet, key_function)\u001b[0m\n\u001b[1;32m 951\u001b[0m \u001b[0mrepr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"SeqIO.index(%r, %r, alphabet=%r, key_function=%r)\"\u001b[0m\u001b[0;31m \u001b[0m\u001b[0;31m\\\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 952\u001b[0m \u001b[0;34m%\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mfilename\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mformat\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0malphabet\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkey_function\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 953\u001b[0;31m return _IndexedSeqFileDict(proxy_class(filename, format, alphabet),\n\u001b[0m\u001b[1;32m 954\u001b[0m key_function, repr, \"SeqRecord\")\n\u001b[1;32m 955\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/pkg/python-3.7.4-0/lib/python3.7/site-packages/Bio/SeqIO/_index.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, filename, format, alphabet)\u001b[0m\n\u001b[1;32m 183\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__init__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfilename\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mformat\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0malphabet\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 184\u001b[0m \u001b[0;34m\"\"\"Initialize the class.\"\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 185\u001b[0;31m \u001b[0mSeqFileRandomAccess\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__init__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfilename\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mformat\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0malphabet\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 186\u001b[0m marker = {\"ace\": b\"CO \",\n\u001b[1;32m 187\u001b[0m \u001b[0;34m\"embl\"\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34mb\"ID \"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/pkg/python-3.7.4-0/lib/python3.7/site-packages/Bio/SeqIO/_index.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, filename, format, alphabet)\u001b[0m\n\u001b[1;32m 44\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__init__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfilename\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mformat\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0malphabet\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 45\u001b[0m \u001b[0;34m\"\"\"Initialize the class.\"\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 46\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_handle\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_open_for_random_access\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilename\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 47\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_alphabet\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0malphabet\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 48\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_format\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mformat\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/pkg/python-3.7.4-0/lib/python3.7/site-packages/Bio/File.py\u001b[0m in \u001b[0;36m_open_for_random_access\u001b[0;34m(filename)\u001b[0m\n\u001b[1;32m 130\u001b[0m \u001b[0mIf\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mfile\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0mgzipped\u001b[0m \u001b[0mbut\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mBGZF\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0ma\u001b[0m \u001b[0mspecific\u001b[0m \u001b[0mValueError\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0mraised\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 131\u001b[0m \"\"\"\n\u001b[0;32m--> 132\u001b[0;31m \u001b[0mhandle\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilename\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"rb\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 133\u001b[0m \u001b[0mmagic\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mhandle\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 134\u001b[0m \u001b[0mhandle\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mseek\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '/home/annaldas/projects/result/ONECUT2/ONECUT2_seq.fa'" ] - }, - { - "data": { - "text/plain": [ - "'MMRLRGSGMLRDLLLRSPAGVSATLRRAQPLVTLCRRPRGGGRPAAGPAAAARLHPWWGGGGWPAEPLARGLSSSPSEILQELGKGSTHPQPGVSPPAAPAAPGPKDGPGETDAFGNSEGKELVASGENKIKQGLLPSLEDLLFYTIAEGQEKIPVHKFITALKSTGLRTSDPRLKECMDMLRLTLQTTSDGVMLDKDLFKKCVQSNIVLLTQAFRRKFVIPDFMSFTSHIDELYESAKKQSGGKVADYIPQLAKFSPDLWGVSVCTVDGQRHSTGDTKVPFCLQSCVKPLKYAIAVNDLGTEYVHRYVGKEPSGLRFNKLFLNEDDKPHNPMVNAGAIVVTSLIKQGVNNAEKFDYVMQFLNKMAGNEYVGFSNATFQSERESGDRNFAIGYYLKEKKCFPEGTDMVGILDFYFQLCSIEVTCESASVMAATLANGGFCPITGERVLSPEAVRNTLSLMHSCGMYDFSGQFAFHVGLPAKSGVAGGILLVVPNVMGMMCWSPPLDKMGNSVKGIHFCHDLVSLCNFHNYDNLRHFAKKLDPRREGGDQRVKSVINLLFAAYTGDVSALRRFALSAMDMEQRDYDSRTALHVAAAEGHVEVVKFLLEACKVNPFPKDRWNNTPMDEALHFGHHDVFKILQEYQVQYTPQGDSDNGKENQTVHKNLDGLL'" - ] - }, - "execution_count": 160, - "metadata": {}, - "output_type": "execute_result" } ], "source": [ @@ -888,13 +692,13 @@ " translating = True\n", " aa = \"\"\n", " \n", + " in_utr = False\n", + " for utr in utr_regions:\n", + " start,stop = utr\n", + " if ((start < i) and (i < stop)):\n", + " in_utr = True\n", + " \n", " while(translating): \n", - " in_utr = False\n", - " for utr in utr_regions:\n", - " start,stop = utr\n", - " if ((start < i) and (i < stop)):\n", - " in_utr = True\n", - " \n", " if ((len(seq) < 3) or (in_utr)):\n", " translating = False\n", " aa = \"\"\n", @@ -906,47 +710,80 @@ " aa += codon_table[codon]\n", " seq = seq[3:]\n", " i += 3\n", - " return aa\n", + " return aa,i\n", "\n", + "def find_utrs(seq,utr):\n", + " pos = seq.find(utr)\n", + " if (pos == -1):\n", + " if (len(utr) > 20): \n", + " for i in range(len(utr) - 1,len(utr)*5//10 - 1,-1):\n", + " pos = seq.find(utr[:i])\n", + " return pos\n", "\n", "def translate_aa_seq(seq,enst,gene_utrs):\n", " utr_regions = []\n", " for utr in gene_utrs[enst]:\n", - " pos = seq.find(utr)\n", + " pos = find_utrs(seq,utr)\n", " if (pos != -1):\n", " utr_regions.append([pos,pos + len(utr)])\n", " \n", - " longest_aa_seq = \"\"\n", + " longest_aa_seq = \"M\"\n", " longest_aa_seq_sc = 0\n", + " longest_aa_seq_sc_end = 0\n", " for i in range(len(seq)):\n", " if (seq[i:i+3] == \"ATG\"):\n", " sc = score(seq[i-4:i+4],0)\n", - " aa = translate(seq[i:], i, utr_regions)\n", - " if ((aa != \"\") and (sc > longest_aa_seq_sc) and (aa not in longest_aa_seq)):\n", - " print(i,seq[i-4:i+4],aa,sc)\n", + " aa,end = translate(seq[i:], i, utr_regions)\n", + " #print(i,seq[i-4:i+4],aa,sc, end)\n", + " if ((len(aa) > 20) and (sc > longest_aa_seq_sc) and (i > longest_aa_seq_sc_end)):\n", " longest_aa_seq = aa\n", " longest_aa_seq_sc = sc\n", + " longest_aa_seq_sc_end = end\n", " return (longest_aa_seq,longest_aa_seq_sc)\n", "\n", - "def find_all_aa_seqs(seq,enst):\n", - " gene_utrs = determine_utrs(\"GLS\")\n", + "\n", + "\n", + "def translate_aa_seq_length(seq,enst,gene_utrs):\n", + " utr_regions = []\n", " \n", - " if (enst in gene_utrs):\n", - " longest_aa_seq,longest_aa_seq_sc = translate_aa_seq(seq,enst,gene_utrs)\n", - " else:\n", - " longest_aa_seq = \"\"\n", - " longest_aa_seq_sc = 0\n", - " for enst_id in gene_utr:\n", - " aa_seq,aa_seq_sc = translate_aa_seq(seq,enst_id,gene_utrs)\n", - " if (aa_seq_sc > longest_aa_seq_sc):\n", - " longest_aa_seq = aa_seq\n", - " longest_aa_seq_sc = aa_seq_sc\n", + " longest_aa_seq = \"M\"\n", + " for i in range(len(seq)):\n", + " if (seq[i:i+3] == \"ATG\"):\n", + " aa,end = translate(seq[i:], i, utr_regions)\n", + " #print(i,seq[i-4:i+4],aa, end)\n", + " if (len(aa) > len(longest_aa_seq)):\n", + " longest_aa_seq = aa\n", + " return longest_aa_seq\n", + "\n", + "def find_all_aa_seqs(seq,enst,gene):\n", + " gene_utrs = determine_utrs(gene)\n", + " \n", + " longest_aa_seq = translate_aa_seq_length(seq,enst,gene_utrs)\n", + " if gene in gene_utrs:\n", + " for utr in gene_utrs[gene]:\n", + " if (find_utrs(seq,utr) != -1):\n", + " longest_aa_seq,longest_aa_seq_sc = translate_aa_seq(seq,enst,gene_utrs)\n", " \n", " return longest_aa_seq\n", " \n", - "enst = \"ENST00000320717\"\n", - "a=\"AGTGCGGAGCCTTAGGCGGAGCGAAGAGAACCGGTCGCGGCAATCCTAGCGCGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCACCCGCATCCGCTGCGGGAGTCCGAGCCGGAACCACACCCAAGTAGCTGCCCTTTCCTCTTCTGTCATCTCACCGCCCCACCACAGACCGCGTTCCCCGAGGAAACCGGCCGCCCACGCCCGGAGCATCCTCCCCTGTTGAGCGGGCGCTGACGGACCCGGCGGCATGATGCGGCTGCGAGGCTCGGGGATGCTGCGGGACCTGCTCCTGCGGTCGCCCGCCGGCGTGAGCGCGACTCTGCGGCGGGCACAGCCCTTGGTCACCCTGTGCCGGCGTCCCCGAGGCGGGGGACGGCCGGCCGCGGGCCCGGCTGCCGCCGCGCGACTCCACCCGTGGTGGGGCGGGGGCGGCTGGCCGGCGGAGCCCCTCGCGCGGGGCCTGTCCAGCTCTCCTTCGGAGATCTTGCAGGAGCTGGGCAAGGGGAGCACGCATCCGCAGCCCGGGGTGTCGCCACCCGCTGCCCCGGCGGCGCCCGGCCCCAAGGACGGCCCCGGGGAGACGGACGCGTTTGGCAACAGCGAGGGCAAAGAGCTGGTGGCCTCAGGTGAAAATAAAATAAAACAGGGTCTGTTACCTAGCTTGGAAGATTTGCTGTTCTATACAATTGCTGAAGGACAAGAGAAAATACCTGTTCATAAATTTATTACAGCACTCAAATCTACAGGATTGCGAACGTCTGATCCCAGGTTGAAAGAGTGTATGGATATGTTAAGATTAACTCTTCAAACAACATCAGATGGTGTCATGCTAGACAAAGATCTTTTTAAAAAATGTGTTCAGAGCAACATTGTTTTGTTGACACAAGCATTTAGAAGAAAGTTTGTGATTCCTGACTTTATGTCTTTTACCTCACACATTGATGAGTTATATGAAAGTGCTAAAAAGCAGTCTGGAGGAAAGGTTGCAGATTATATTCCTCAACTGGCCAAATTCAGTCCCGATTTGTGGGGTGTGTCTGTTTGTACAGTAGATGGACAGAGGCATTCTACTGGAGATACCAAAGTTCCCTTCTGTCTTCAGTCCTGTGTAAAACCTTTGAAATATGCCATTGCTGTTAATGATCTTGGAACTGAATATGTGCATCGATATGTTGGAAAAGAGCCGAGTGGACTAAGATTCAACAAACTATTTTTGAATGAAGATGATAAACCACATAATCCTATGGTAAATGCTGGAGCAATTGTTGTGACTTCACTAATAAAGCAAGGAGTAAATAATGCTGAAAAATTTGACTATGTCATGCAGTTTTTGAATAAGATGGCTGGTAATGAATATGTTGGATTCAGTAATGCAACGTTTCAGTCTGAAAGAGAAAGTGGAGATCGAAATTTTGCAATAGGATATTACTTAAAAGAAAAGAAGTGTTTTCCAGAAGGCACAGACATGGTTGGTATATTAGACTTCTACTTCCAGCTGTGCTCCATTGAAGTGACTTGTGAATCAGCCAGTGTGATGGCTGCGACACTGGCTAATGGTGGTTTCTGCCCAATTACTGGTGAAAGAGTACTGAGCCCTGAAGCAGTTCGAAATACATTGAGTTTGATGCATTCCTGTGGCATGTATGACTTCTCAGGGCAGTTTGCTTTCCATGTTGGTCTTCCTGCAAAATCTGGAGTTGCTGGGGGCATTCTTTTAGTTGTCCCCAATGTTATGGGTATGATGTGCTGGTCTCCTCCTCTGGATAAGATGGGCAACAGTGTTAAGGGAATTCACTTTTGTCACGATCTTGTTTCTCTGTGTAATTTCCATAACTATGATAATTTGAGACACTTTGCAAAAAAACTTGATCCTCGAAGAGAAGGTGGTGATCAAAGGGTAAAGTCAGTGATAAATCTTTTGTTTGCTGCATATACTGGAGATGTGTCTGCACTTCGAAGATTTGCTTTGTCAGCTATGGACATGGAACAGCGGGACTATGATTCTAGAACAGCACTCCATGTAGCTGCTGCAGAGGGTCATGTTGAAGTTGTTAAATTTTTGCTGGAAGCCTGCAAAGTAAACCCTTTCCCCAAGGACAGGTGGAATAACACTCCCATGGATGAAGCACTGCACTTTGGACACCATGATGTATTTAAAATTCTCCAAGAATACCAAGTCCAGTACACACCTCAAGGAGATTCTGACAACGGGAAGGAAAATCAAACCGTCCATAAGAATCTTGATGGATTGTTGTAATGGTCTCAAATCCCAAGATTTAAATCACTTACCTATTTAATTGTGGAAAATGATTATGAAGAACATGTGTATTTCTATCTGGTAGTGATGTATATTTTACATTTGTCATTTCAGTGTTACTGGAGTTTTCTTCATTGTGCACACAGGACAAATCTGATCTCTTTGGGAAAAAATAGAAATAAAACAATCTCCCTCCATAATGTGAGCAATATTACCTCGTGCATTGTATAATTTGATGTAAAAGAAATAGTTACCAATGCTAGCTTGTGTGGTCTTCCATGATTTATTTGTGTTTTGTGAATTTTCAATTTATGGTGATGATCTGCTGATATGCATTTATAAAGTAAGCTCTGTTGTACAGTCTGTCCAAATGGGTCAAGGTTGCCTTTAGAAGCAAATAGTGTGATTTTCAAGACTTCAAATACAAATTTAGTTTAAGTGTTTGAACAACTATATGCACTTACGGTTGTGTGTTTAAAATGTCTCTCTCACCCCCTAGCTTCATGATGTGACTCTTAAAAAACTATAATAGTTAACAACTGTTAGTAAGATAGACCAATTCTGATTAGACTTTATCAGGGAATCTGTTTAAGATATGTTTGGTGACCAAAACGTATGTGTGAATGTAGTTATAATGCTTTTGAAAAATTTTCCTTTTTCTATATCCCCTTAGTCCAGCCTCTCTTCTCAGACATTTAGCTATCTGCCTCTTTCCTTTAGCTGGGAAAGTGAGAGCTGGCATACTATGCAGTTTTTATGTTTTCCATAGTAAGTCAGAAAATGCCTCCTATTTCTGGCATCAGAACTTTGCCATTTGTCTACAGAAGACGAACCAGAGACAAAATTACTAAGTATAAATTAGTCAAGTTTATCAGTCTAAAAAACGAAGGGATGTGCAACTGCAGCTCTTTAAGAAGTTTTTTTTTTTTAGCTTCTAGGGTAAAGATAAATTCAGAAATGCTCTAAGCTACCAAAGTTATTCTGAAAGTATGGGAACTGCTACAACTAACAAACATTTGTTTCCAAGCCTGTCATTAAGAGTCTGCATCAAGAGATTTGTCCTCCTTGGGGGACCACTGGATCATTCCAGATTTCTTGTGATTTTTCTATTGTGTAATTCTTGGTGGGCTCTGTAGTTTAATAATAAGAAAAAGGCCATTTCATTTTAAATTGTGACCTATAATTCTTTGTCTTGGGTTGGTAATTCAGGATTCATTTGGAAAGTGGGTAAAAGGGGCTTCAAAAAACGGATAGAACAGGATTTTCTAGGAGTTACACATACATTTTATCCTGTCATACCTCGAGATAAAGTGGCATGTTAGTGAGGAGTTCTGATATTAAGCACACACACACATGCACACAAATGGACTTCTCTGAAGCTGTGTTTAGTGAAATGAGCTCAAGTACATGAATGTTAGTTGTTATCACATACAGCAAATTCCTTTTTTTTTCTTTTTCTATGAGCACACTCTGCTGCTTCTAAACTTTACATGCCTGATGGCACCTTACTCCAGCAGCCTCCAGGTGCTTTCATTTTCACTTCCAGTCTAAGCCAGTGGCTCCTGCCACTGCCCTCCCATTACCTAGATGGCACCTCCTTTGGTGAAACCACGGCCAATGTTCCTTAGCTGCACCAGGCCCGAAGCTGTTCCCATGCTTGAGCTTCCATGGGGAGGATGCTGAGTGAGCAGTTTCCTACCCCGTGGATCTAGCAAGCCATGGAGACAGGTAGCATTTGTAAGATGCTGCACAGGAGCAGCATTATCCCCAAAGATATTACAGGGTAGACACGTTTTAACTGAAATCAATCAAGATAACTTTATTCAAAGAGCAGCCCGCTTTGTGTGACTAAAATGAAACAAGACAGTTGAATTGTGTGACTTGAAGATTACCAATGATTTTGAGGCTTTTCTATAATAAAAAGAGGTTCTAACCATTATTTGGGAACAAAGAGAGTTTTCATCTTTTTTCAGATCAAAACCATTCTGTAAAATCTTTGTTGTTTAATTAAATGTGCCGTTATTTACCCCTGATGTTATTTATGACTATGTGCCGATTCCTGCTCGGGCTGTTTGCTGTTGGCTGGTAATAATATATTTGATTTAAATGCTGTTGACTGTGCTATTAACTGCTGCCGTCAGTAAACTCCAAAGATCTTTTTGTTTTGGCTTTAGTATCATATGTGCTTTTTCTGTATCCTGAGCGCTCTATATGATCATGTTAATTTAAAGCTTTATACACATTGTTGTTTTTGCTGGTCTCATCTTTGGTAATATGCTATACCCCACTGCTGCCCGACACTGCCCTTTAGCTGCAGAGCTGGATTAGCTGTTGACCATTTGATGCTGTTGTCTGTCTGGCAGGGACTGAATGACCTGATGTCAGATTTAGATTCTTCCTGGGGATTACACAGCTATGAATGTATTTGCTTCTAAAACCTCCCAAAGTGAATCTAATCTTAAAACTACAAGTTGTAAGTATTCTGAAATTGGGAAACATTTATTTTAAATGCAATCAGGTAGTGTTGCTTTTTACAGCATAATAAATATATGTATCAAAAAAAAAA\"\n", - "find_all_aa_seqs(a,enst)" + "transcripts_filename = \"/home/annaldas/projects/result/ONECUT2/ONECUT2_seq.fa\"\n", + "transcripts = SeqIO.index(transcripts_filename, \"fasta\")\n", + "\n", + "gene = \"ONECUT2\"\n", + "\n", + "for transcript in transcripts:\n", + " seq = str(transcripts[transcript].seq).strip()\n", + " enst = str(transcripts[transcript].id).split(\"|\")[-1].strip()\n", + " protein = find_all_aa_seqs(seq,enst,gene)\n", + " transcript_name = str(transcripts[transcript].id)\n", + " transcript_name = str(transcripts[transcript].id)\n", + " transcript_filename = transcript_name.replace(\"|\",\"_\")\n", + " transcript_filename = transcript_filename.replace(\"_\",\"\")\n", + " print(transcript_name,transcript_filename,protein)\n", + " \n", + "#enst = \"enst\"\n", + "#a=\"TGGACAGCTCCCGCTCACCCAAACAGAAGACGTCGGCGCCGGAGCGGGCTCGGACATGGCGAGGCTGCGAGCCGGCCCGAGCGGCGGGGCCCGGTGATCCCTCCCTCCCTCCCCGTCCCCTCCCCTCTCCCGCACGCACGCCCCGTCCGCCCCCACCCCGCCCCCACCCCGGGCGAGCCCGCCCGCAGCCCGGGGCGCACACCCGCACGCGCACTCCTCTCCACTCACTCCCGCGCCCGCCCCCACTCCCGCAGCCGAGCCCCGCCACGCGCGCCTTGCCCGCCCGCCGGCCGCCCCCGCCGCCCCCGCCGCCCCCGGGCCCTGATGGACTGAATGAAGGCTGCCTACACCGCCTATCGATGCCTCACCAAAGACCTAGAAGGCTGCGCCATGAACCCGGAGCTGACAATGGAAAGTCTGGGCACTTTGCACGGGCCGGCCGGCGGCGGCAGTGGCGGGGGCGGCGGCGGGGGCGGCGGGGGCGGCGGCGGGGGCCCGGGCCATGAGCAGGAGCTGCTGGCCAGCCCCAGCCCCCACCACGCGGGCCGCGGCGCCGCTGGCTCGCTGCGGGGCCCTCCGCCGCCTCCAACCGCGCACCAGGAGCTGGGCACGGCGGCAGCGGCGGCAGCGGCGGCGTCGCGCTCGGCCATGGTCACCAGCATGGCCTCGATCCTGGACGGCGGCGACTACCGGCCCGAGCTCTCCATCCCGCTGCACCACGCCATGAGCATGTCCTGCGACTCGTCTCCGCCTGGCATGGGCATGAGCAACACCTACACCACGCTGACACCGCTCCAGCCGCTGCCACCCATCTCCACCGTGTCTGACAAGTTCCACCACCCTCACCCGCACCACCATCCGCACCACCACCACCACCACCACCACCAGCGCCTGTCCGGCAACGTCAGCGGCAGCTTCACCCTCATGCGCGACGAGCGCGGGCTCCCGGCCATGAACAACCTCTACAGTCCCTACAAGGAGATGCCCGGCATGAGCCAGAGCCTGTCCCCGCTGGCCGCCACGCCGCTGGGCAACGGGCTAGGCGGCCTCCACAACGCGCAGCAGAGTCTGCCCAACTACGGTCCGCCGGGCCACGACAAAATGCTCAGCCCCAACTTCGACGCGCACCACACTGCCATGCTGACCCGCGGTGAGCAACACCTGTCCCGCGGCCTGGGCACCCCACCTGCGGCCATGATGTCGCACCTGAACGGCCTGCACCACCCGGGCCACACTCAGTCTCACGGGCCGGTGCTGGCACCCAGTCGCGAGCGGCCACCCTCGTCCTCATCGGGCTCGCAGGTGGCCACGTCGGGCCAGCTGGAAGAAATCAACACCAAAGAGGTGGCCCAGCGCATCACAGCGGAGCTGAAGCGCTACAGTATCCCCCAGGCGATCTTTGCGCAGAGGGTGCTGTGCCGGTCTCAGGGGACTCTCTCCGACCTGCTCCGGAATCCAAAACCGTGGAGTAAACTCAAATCTGGCAGGGAGACCTTCCGCAGGATGTGGAAGTGGCTTCAGGAGCCCGAGTTCCAGCGCATGTCCGCCTTACGCCTGGCAGCGTGCAAACGCAAAGAGCAAGAACCAAACAAAGACAGGAACAATTCCCAGAAGAAGTCCCGCCTGGTGTTCACTGACCTCCAACGCCGAACACTCTTCGCCATCTTCAAGGAGAACAAACGCCCGTCAAAGGAGATGCAGATCACCATTTCCCAGCAGCTGGGCCTGGAGCTCACAACCGTCAGCAACTTCTTCATGAACGCCCGGCGCCGCAGCCTGGAGAAGTGGCAAGACGATCTGAGCACAGGGGGCTCCTCGTCCACCTCCAGCACGTGTACCAAAGCATGATGGAAGGACTCTCACTTGGGCACAAGTCACCTCCAAATGAGGACAACAGATACCAAAAGAAAACAAAGGAAAAAGACACCGGATTCCTAGCTGGGGCCCTTCACTGGTGATTTGAAAGCACAATTCTCTTGCAAAGAAACTTATATTCTAGCTGTAATCATAGGCCAGGTGTTCTTCTTTTGTTTTTAATGGCTATGGAGTCCAAGTGCAAGCTGAAAAATTAATCTCTTAGAACCAGACACTGTTCTCTGAGCATGCTAAGCATCCCAGAAACCCAAATGGGGCCTTCCTGGAGCGAGTTAATTCCAGTATGGTGTCAACCAAGCTCGGGATTGCTTAAAATATCATCCATCCCACTTCAGGTCCTGTCAGCTTCTTGCAGTCAGAGTTCCTATGAGTAACAATAGGAGTTTGGCCTATGTAAGGACTCTGAGTTTAGGCTTCCAAGATACAACAATAAGAGAAGAATCTAGCAACGAGAATGACCTCATTTGCTTTCCACATGCTTAGCCTCATTATACCATGTTATGTCCAAGTTCACAGCCACAACATCAGAATGGTAATTACTGAGCACAAGTTTTAAATATGGACGTTAAAAAAAAAAATCCAAGGACCTGTTTTTCCAACCCAGACATCTTTTCATTGAATGATTTAGAAAGCTTTAAGTTGATCCAGCTTACAATTTTTTTTTTCTTTACCTCCTGGAAATCTCATATGGTCTTGGATCCGTCAAAAAAACCAGTCAGTTCACTTGCGCTCAAAGTATCAAGCACAACAAAGATAAACAGAAGTGAGGAAGGTTCTGGGTTCACTACATCTGGATTTTCAAGACACCTATTGTGAAGTCATTAGGGAATTGATGAGAATATGGCTTCAAGCACATTTTGCAGTTTGCTACAAATTCTGTTGTACATAATGCAGACGCACACTCAGGAGGCCAATTTAACTGTTAACAGTGCATGGAGCGAATGCAGCATTTTAAAAGATCTAGGTTTTTTTAGGTCATTAATGTGTCCTTGGTTGATCAGTCATCTGGTCCCTCCTACTGTGTGTTATGACCACCACGTAATCCATTCTCGCTCTTTCTGATTTGGGGTTTTTCCTCATCCATCCCATTAGTAGGGATGTTTTCTGTGTTTTCTAGCAAGAAAAAAAAATCAATCAATCAAACCTGCATACATGTTACTCATGACTGTCATCTAGTCCTAAATCTCTTCTGTTGTTGAATCATCCTTGCAAAACAGCTGAATACATCTGGAGAAAACACAGCACACCAAAGAAGCAGAATACTGCAAACCAAAGACATTTATGACTTGTCATTTTCTAGCCTAAAAATACTGTGATTACTTTTAGAAATCAGAAAACCTCTGCAACTCCGAATGGCATTCAGCTCTTGCATTTGGCGCATCATCGGGCTGAGCGGACCAGCTACACCAAGGACATTAGCCAAGCCACCCAGAGGGGTGGCTTTGCCACACCAGTTGTCACCTTCCCATAGCAAGTGGAAGAGCGCCCACAGAACTCTGGGAGATTGCAAAGGTCACAATGTGCATATTTACCAGTGAATGGCCCCGGGTGGGGCCACGTGGGGGTGTTCAAAGCAAGCCAAACGCTGCAATCATTCTTTACAGACACTTGAGACTGACTTTTTTATGAATTACTTAGTCGAAACCAAAGAAACTTTTTCTGCACCTACTTCTGCAACAAACAAAACTGTCCCATTAAAATGAATAAATAAATCCGTAAATCAATGGAAATCACCACCAATAAGAAGGAAGCACGCCAGAAAATAAACGAAAACAAAAACAGGGAGACACACTGTGTTCAAACAGACCTCTTGGGACATTTTTTGGAAGCAGATTTTAAAGAAAGGGTTGAGACAAAGATAGAAATAAGGAAGAGCCTCAGTGGCTGCTGCTTCATTTGACAACTCACACGGTAATCTTAAAGCTGAAGATTGTCTTTAATTTGTGCCTATGCAGTTTTTCAAAAGAACACGGAACAGAGCAACAGAAACCTCAACAGCTACAATACCAAAGATGAGGATTTCTCACACCTTTTGTTTCAGTTCATTATCTCCTCTTGCCTGGCTAAAATACTAATAGCGCCATTGAACTGTATAAAGGTAATCAATTATGTTTCTCTGAGCAACAAAAGGAAAGGGCCATTTATTTGATTTTATTGTTTCATTTCAATTTTGTCTTATGGTTTTTTGCCCCAACATGGAATCTCTCAAAAGTTTCCATGGACTCCAAGTTTAAGATGTTGGGATATTGAACAGTTCTCTCTGCTCAGCAGAGGGTAGGGAATAACATTATCACTTGAATGTTCTTTGCTTAACCCTTAGACTTGGTTCCTTCTATGTTCAGAGTCTCATCATCAGGGGAAGGAAAGGGAGTGAGGGTCAGGGATAGGGGTCTTGGTGATGCATCCTCTCCCGAGCCACAGAACCAAAGAGTTTATAGAGGAATTTACAGCCTCGTTTTCATGTGATTGCTACATCCTAACAGGGCTTCATTTGGGGGTGGGGGGAAACATGTAAAAATAATTGCCAGTTTCTACTTTTCTATTAGCTTTTTAAAAATCAGCTGTAAAGTTGCATTTCTAAAGAAAGATATATATAATATATAAAATACATATATAGATCAACTTGACATTGGTGATAACCAAAATTATTGCTGTCCAAATTCATGTCTTGTTTTGGTCCAGTGCTTCATTTGCTAAGTATTCGGTTCAGAATTTTTCTCATTTCTCATGCCATTCCAGAGTTAATTTGCCACTGTGGATGATTTGAAGTATTCAGATCTCTATGGAAGTTTCTGGGACAGGTTTAAAGTCAAGATCAAGCATTTTAGCATTTAACCTGTTGATAAATGGATCCATGGTGTACATGAGTTTTATTTGTATTCGGAGTCATCTCTATTCTATCCCTCAGCCTCGATTAAGGTGGTGAGTGAAGTGCATCCAACAGACTCGGCCCAGAACTGGGTCCTGACAGTGGGGTGCTCATCTTCTGTAACTGTTGGGAAGGCTCGGTGGTCCATTTTCACCAGTTAAAGAATATGAGGCCAGCCCAGAAATCTGTTCTCCAGGAGCTGCCCTGTCCCATCTGGGTGTGCCAGACCCCCTCAGTGAGCAGGTCCACCAAAGGGACTTCTCACAGGGGAAGCCCAACTCCTGTTGCAATGGGTTGATAGATTTCCTCAGGGTGGTAATTACCAATTCGTATTTTGACAAGCCTATGTGCAACCACAGCTGGCACTGGGGTGGGCAGTGGTGTTGGGTGGGATGGGGGAGAGTGTCTCAATCCTGAAGAGAAAATATAAAGCAGGTTTTGGGGAGACTTCTGGAGTCCTGCCCCTAGAGAGCCCCATTGTTGTTCTTTGTGCCCCCTCCTCATTCCCCCTATGTGGGTCTCCCTATGCAGGAGCTGTGAGAGAATGTGACTCTCCACAATTTTTATAATTCATCCTTCCTAGGAGATTGTTCATTGGCTCTTCCCTTGTGTCCCTTTGTCCCTTGCTCATACTCCATGTTTCCTTTGTCAAAGGACTAAGAAAAGAGCATATTTCAGCAGAGGAGTGTTCCCATGTGGGTTGATTTCAACTTGGGTATTTCTAAAAGAGTCCTTGTGACATGTGTCCAGTGGAAATGGTTGCTCTTTTCCAGACTGGATTGAGGAATGGAGCCTGTTTGATTTGGTTAGTGATTCTTTGACATACTAATCTCAGCGTTTGGGTCTCCAGCATCCTCTGAAGATGTCTAGACTAGTAGAGGCTGCCTTTGTGACCTGACATTACAACATTGGTCAAACCAGTCCTCTGATAATCAGAAGAACATGTCATAATTGTTTAAAAAAAAAAAAAAGGCAAGAATTTCTCTCCAAGGAGCTTTAATAAATGTCTCATTCCAGATAATGTCATACCAGAGAAAAGTGCTTGCTTTTAGAAAATTATTTACATACATATATAAATATATATGTGTATCTATACAGTTATGTATCAAAATTTTAAGCCCTGCAGAATTTCAATTTGTTAGAAATCTAACAGAAAAAAATTTCTATATTGAAAGGTAATAGAATTTAACCCAGTGAGTTTACTCAAGGATTTTTAAATTTAAGTTAATAATTTCAGAGAAAATAACCATTTGGGTGTGGTTATAGTTTAGTATCCATTACCTCAATCCAAGGAAAATTCCAGGCATTCCTCAACCATCAGGAAAAGGTACAGTGTGAAGGAACAGTTCTCAGCCAAATTTCACATTCTTGAGGCAACAGAAATCAAAACACTCAGAGCCATTGAGTGGAAAAACAATTTACTTTATTCCTTTACACAAATAGGCTTGCATTGTTTTTGTTTTAATGTGATTTTGGTACTAGGGATATAATTATTTCATTCCAGGAAATAATAAAAAAAAACAGACAGAGCCAATACATTTCTTTTTTTAAAGGAAACAGCAACAACAATAAAAACTCAGCACCAATATTTAAAAGCTTTTCCAAAATGTAAAAGAAGTGTTTAGCTTGCACCATGCATAAAGGTGCAGGCTAGTTGAACCAGGAAGCATGGCACTTCCTCTGGAGAAATCCAGAAAGAGTTGCTTCTAAGCTCCCTTTTCCCCCTGCAGGCTCTTGGCAATTGTAGGCTTTAGCAAATCCAGAATAATTTTCAATTCAAGCTAAAATAAAATCAACATTTGGAATGTAAATCTGATACACACACACTTTTCTAAGTCAAACAACATATTTCAAAACCAAAAATAAATACCTTTTAGATAATCAGTTATTTTCTTTGTCTATACTGGGCACCCACCTACTAGTGCCAGTAAATTCAAGTTGAACAGATTTTTAAAATCACTATTATCTGGGTATGGGGGAAACTTCCCCACTTTTGAAAATGTTGGTAGAATTATAGGAATGTCTGTTTGATTATCATTACCAAAGTGTCATGACAGTATGCCTTTGTAGTGAACTCGGATTTTCAGGAGTTTGAATAGTTGGATATTTTAAAATCTAAGAAGAAAAGGCCTGTTTCCAATGTTGTTGAAGAATAATGAACTCTATTAAAAAGTGGAGAAAAAGATAATACATGTGGTCAAGGTTGACCACAAGGCCCAGGCACAACTACCTTGGCGATAATCTTCTAGATTCGTAACAGGTTAGAGCTGACTTTTTGTTTTTGTTGTTGCTGATGCTGTGTGATTCAGACTTCTCAGCCTAACCAGGAAGAGTAAGTGGAAATGGTAGATGAAGAAGGGGTAGAGCTGGTGTATCTATAACTTTCTGATATTTGTCTGCCAAACTTGATATATTAGTAATTTTTTTATCTTTAGCTAAGATCAAGTCACCCCTGAAACAACAGGAGATTCTAGTTTTAAAATAAGGCCACAAAAATCCTTACGGAATGAAGAATGGCACCCCAGTTGGTTGTATAAGTCTCATAAGATAATGATGTTGATTTTAAATATGGATGTCTCAATGCCTGTTTTCTATCAATGATTTGTTTGTTTCCAAGGTCGGGGAGGGAAAGAGGGGAGGGTTTATCTGTTTTAGAAAGTCTCAGAATACTTATAAAATACAGAAGTAGTTATTAAAATATATAGGACCTCACATAGGTAGATACAGAACTTACCATTGAGGCTGATGGGCTGTTGTGTGAATCACACAGGACCTTAAATGAGGCTCATTATTCTCACACACCAAAATGACTCTGACAGCCTGAAGCAGTTATTGCTAGAGCCCAAGCTTTCCTTGGAGGTTTTGGAGTTAGGTTGATTGGAAGTAACCAGCTAATACCTTTTCTAGTGGAGAAAAAGACATTGCTACCAGCTTGTTCATCCCATAGAAGTCTTCCACTCTGCTCCATTTTTAGCAGCAAGCATTTCATGTAGCATAAACCTTGGCAGATAAGTGTGCCTAAGGTTTATACAGTCTGTCCGCTTGGATGTATACAAATTTAGATACATATTTTAACATGTGTTCTCATAGATGACTTTATAACAACACACATTACCTATAGGTGTCTAGACTGTGTACATACAAGTGTGTACAGACAAGCTTCATACGTATATACTGTAATCCGTTACAACAAATAAATTTTAAATCATCGTTTAACATGTATGTGGTACTTCTACAGTGTACATTGTTTTCATTATTTATTGTAACATTGAAAACCACAGTGCAGGGAAAACAAAAGTATCCCAGCATCTTCATCCTGTACACTTGGAATTAATTTCATTTGGGCATATCCAAGATAAACTCAACTTTCAAGAAATCTTGTATATTATTTAATCATCTGTGTTAGGATGACACCTATGATTGATGACTTCGGTTGAATAGCTTTATTCTGGATTTTTCATAACTAAAGCTAAATCCAAAGACCTGAAAAAGGACAAAAAGAAAAAAAAAAAAAGAAA\"\n", + "#find_all_aa_seqs(a,enst,\"ONECUT2\")" ] }, { @@ -954,18 +791,25 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "test = determine_utrs(\"ONECUT2\")\n", + "for i in test:\n", + " for j in test[i]:\n", + " print(i,find_utrs(a,j), j)" + ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "len(\"MALNGAEVDDFSWEPPTEAETKVLQARRERQDRISRLMGDYLLRGYRMLGETCADCGTILLQDKQRKIYCVACQELDSDVDKDNPALRDVVPQPLPF\") *3" + ] }, { "cell_type": "code", - "execution_count": 18, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -1010,25 +854,9 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'TCONS_00010063|ENST00000533115': {'IPR039499'},\n", - " 'TCONS_00010061|TCONS_00010060': {'IPR009563'},\n", - " 'TCONS_00010064|ENST00000533115': {'IPR009563'},\n", - " 'TCONS_00011857|ENST00000531405': {'IPR009563'},\n", - " 'TCONS_00010062|ENST00000533115': {'IPR009563'},\n", - " 'TCONS_00010060|ENST00000533115': {'IPR009563'}}" - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "domains" ] diff --git a/test/list.txt b/test/list.txt index a6957de..5e478ac 100644 --- a/test/list.txt +++ b/test/list.txt @@ -1,38 +1,27 @@ -BRD4 -BRD3 -BRD2 -PAF1 -CTR9 -CDC73 -LEO1 -RTF1 -WDR61 -SPT5 -SPT4 -SPT6 -TCEA1 -TCEA2 -TCEA3 -TCEANC -TCEANC2 -CDK9 -TRIM28 -SUPT16H -SSRP1 -ELF1 -CDK12 -SUPT16H -SSRP1 -ELL2 -AFF4 -SKI -CCNT1 -NELFA -NELFB -NELFC -NELFD -NELFE -TCEAL -SNUPN -MYC -MLLT1 \ No newline at end of file +SMARCC2 +SMARCB1 +SMARCE1 +SMARCD1 +SMARCD2 +SMARCD3 +BRD7 +SMARCA4 +SMARCA2 +ARID1A +ARID1B +ARID2 +PBRM1 +ACTL6A +ACTL6B +PHF10 +DPF1 +DPF2 +DPF3 +BCL7B +BCL7A +BCL7C +BRD9 +BCL7C +SS18 +SS1BL1 +GLTSCR1L diff --git a/translation_protein.py b/translation_protein.py index 1dabda4..a754d89 100644 --- a/translation_protein.py +++ b/translation_protein.py @@ -1,11 +1,7 @@ from Bio import SeqIO import os -def translate(seq,orf): - seq = seq.upper() - seq = seq.replace("\n","") - - table = { +codon_table = { 'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M', 'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T', 'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K', @@ -22,80 +18,133 @@ def translate(seq,orf): 'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L', 'TAC':'Y', 'TAT':'Y', 'TAA':'_', 'TAG':'_', 'TGC':'C', 'TGT':'C', 'TGA':'_', 'TGG':'W', - } + } - protein = "" - exon = False - translating = True - i = orf +def determine_utrs(gene): + utr_file = open(snakemake.input[0],"r") + utr_lines = utr_file.readlines() + utr_file.close() + gene_utr = dict() + + for line in utr_lines: + if (line.startswith(">")): + trans_id = line[1:].strip() + if (trans_id not in gene_utr): + gene_utr[trans_id] = [] + else: + gene_utr[trans_id].append(line.strip()) + return gene_utr - while (translating): - codon = seq[i:i+3] +def score(seq,start): + kozak = { + "A":[0.25,0.61,0.27,0.15,1.00,0.00,0.00,0.23], + "C":[0.53,0.02,0.49,0.55,0.00,0.00,0.00,0.16], + "G":[0.15,0.36,0.13,0.21,0.00,0.00,1.00,0.46], + "T":[0.07,0.01,0.11,0.09,0.00,1.00,0.00,0.15] + } + + score = 1.0 + for i in range(start,len(seq)): + score *= kozak[seq[i]][i] + return score - try: table[codon] - except: break - - if (table[codon] == "M"): - exon = True - if (exon): - if (table[codon] == "_"): - exon = False +def translate(seq, i, utr_regions): + translating = True + aa = "" + + in_utr = False + for utr in utr_regions: + start,stop = utr + if ((start < i) and (i < stop)): + in_utr = True + + while(translating): + if ((len(seq) < 3) or (in_utr)): + translating = False + aa = "" + else: + codon = seq[0:3] + if (codon_table[codon] == "_"): translating = False else: - protein += table[codon] - i += 3 - else: + aa += codon_table[codon] + seq = seq[3:] i += 3 - - return protein + return aa,i +def find_utrs(seq,utr): + pos = seq.find(utr) + if (pos == -1): + if (len(utr) > 20): + for i in range(len(utr) - 1,len(utr)*5//10 - 1,-1): + pos = seq.find(utr[:i]) + return pos +def translate_aa_seq(seq,enst,gene_utrs): + utr_regions = [] + for utr in gene_utrs[enst]: + pos = find_utrs(seq,utr) + if (pos != -1): + utr_regions.append([pos,pos + len(utr)]) + + longest_aa_seq = "M" + longest_aa_seq_sc = 0 + longest_aa_seq_sc_end = 0 + for i in range(len(seq)): + if (seq[i:i+3] == "ATG"): + sc = score(seq[i-4:i+4],0) + aa,end = translate(seq[i:], i, utr_regions) + #print(i,seq[i-4:i+4],aa,sc, end) + if ((len(aa) > 20) and (sc > longest_aa_seq_sc) and (i > longest_aa_seq_sc_end)): + longest_aa_seq = aa + longest_aa_seq_sc = sc + longest_aa_seq_sc_end = end + return (longest_aa_seq,longest_aa_seq_sc) -transcripts_filename = snakemake.input[0] + + +def translate_aa_seq_length(seq,enst): + utr_regions = [] + + longest_aa_seq = "M" + for i in range(len(seq)): + if (seq[i:i+3] == "ATG"): + aa,end = translate(seq[i:], i, utr_regions) + #print(i,seq[i-4:i+4],aa, end) + if (len(aa) > len(longest_aa_seq)): + longest_aa_seq = aa + return longest_aa_seq + +def find_all_aa_seqs(seq,enst,gene): + gene_utrs = determine_utrs(gene) + + longest_aa_seq = translate_aa_seq_length(seq,enst) + if gene in gene_utrs: + for utr in gene_utrs[gene]: + if (find_utrs(seq,utr) != -1): + longest_aa_seq,longest_aa_seq_sc = translate_aa_seq(seq,enst,gene_utrs) + + return longest_aa_seq + + +transcripts_filename = snakemake.input[1] transcripts = SeqIO.index(transcripts_filename, "fasta") output = [] -gene = snakemake.input[0].split("/")[5] -os.mkdir("/home/annaldas/projects/result/%s/transcripts" %(gene)) +gene = snakemake.params[0] +os.mkdir("/project/owlmayerTemporary/Sid/isoform_analysis/result/%s/transcripts" %(gene)) for transcript in transcripts: - protein = translate(str(transcripts[transcript].seq),0) - if (protein != ""): - transcript_name = str(transcripts[transcript].id) + "_1" - transcript_filename = transcript_name.replace("|","_") - transcript_filename = transcript_filename.replace("_","") - transcript_filename_path = "/home/annaldas/projects/result/%s/transcripts/%s_map_protein.fa" %(gene,transcript_filename) - #os.mkdir("/home/annaldas/projects/result/%s/transcripts/%s" %(gene,transcript_filename)) - transcript_file = open(transcript_filename_path, "w+") - transcript_file.write(">" + transcript_name + "\n" + protein) - transcript_file.close() - - protein = translate(str(transcripts[transcript].seq),1) - if (protein != ""): - transcript_name = str(transcripts[transcript].id) + "_2" - transcript_filename = transcript_name.replace("|","_") - transcript_filename = transcript_filename.replace("_","") - transcript_filename_path = "/home/annaldas/projects/result/%s/transcripts/%s_map_protein.fa" %(gene,transcript_filename) - #os.mkdir("/home/annaldas/projects/result/%s/transcripts/%s" %(gene,transcript_filename)) - transcript_file = open(transcript_filename_path, "w+") - transcript_file.write(">" + transcript_name + "\n" + protein) - transcript_file.close() - - protein = translate(str(transcripts[transcript].seq),2) - if (protein != ""): - transcript_name = str(transcripts[transcript].id) + "_3" - transcript_filename = transcript_name.replace("|","_") - transcript_filename = transcript_filename.replace("_","") - transcript_filename_path = "/home/annaldas/projects/result/%s/transcripts/%s_map_protein.fa" %(gene,transcript_filename) - #os.mkdir("/home/annaldas/projects/result/%s/transcripts/%s" %(gene,transcript_filename)) - transcript_file = open(transcript_filename_path, "w+") - transcript_file.write(">" + transcript_name + "\n" + protein) - transcript_file.close() + seq = str(transcripts[transcript].seq).strip() + enst = str(transcripts[transcript].id).split("|")[-1].strip() + protein = find_all_aa_seqs(seq,enst,gene) -#output_filename = snakemake.output[0] -#output_file = open(output_filename,"w+") -#output_file.write("\n".join(output)) -#output_file.close() - - \ No newline at end of file + transcript_name = str(transcripts[transcript].id) + transcript_filename = transcript_name.replace("|","_") + transcript_filename = transcript_filename.replace("_","") + transcript_filename_path = "/project/owlmayerTemporary/Sid/isoform_analysis/result/%s/transcripts/%s_map_protein.fa" %(gene,transcript_filename) + transcript_file = open(transcript_filename_path, "w+") + transcript_file.write(">" + transcript_name + "\n" + protein + "\n") + transcript_file.close() +