diff --git a/Snakefile b/Snakefile index 3678212..6b6bd58 100644 --- a/Snakefile +++ b/Snakefile @@ -10,8 +10,8 @@ Transcripts = config["polished_reads"] rule all: input: - expand("/home/annaldas/projects/result/{gene}/{gene}_map_protein_analysis.txt", gene = GENES), - expand("/home/annaldas/projects/result/{gene}/{gene}_blastx_protein_analysis.txt", gene = GENES) + expand("/home/annaldas/projects/result/{gene}/{gene}_transcripts_filtered_coding_potential_analysis.txt", gene = GENES), + #expand("/home/annaldas/projects/result/{gene}/{gene}_blastx_protein_analysis.txt", gene = GENES) #expand("/home/annaldas/projects/result/{gene}/{gene}_sashimi.pdf", gene = GENES) rule gene_transcript: @@ -19,8 +19,8 @@ rule gene_transcript: NanoporeGTF, Transcripts output: - "/home/annaldas/projects/result/{gene}/{gene}_seq.fa", - "/home/annaldas/projects/result/{gene}/{gene}_sashimi.sh" + "/home/annaldas/projects/result/{gene}/{gene}_seq.fa"#, + #"/home/annaldas/projects/result/{gene}/{gene}_sashimi.sh" params: gene = "{gene}" script: @@ -75,41 +75,119 @@ rule transcript_filter_utr: script: "filter_utr.py" -rule mapping: +checkpoint mapping: input: "/home/annaldas/projects/result/{gene}/{gene}_seq_filt.fa" output: - "/home/annaldas/projects/result/{gene}/{gene}_map_protein.fa" + directory("/home/annaldas/projects/result/{gene}/transcripts") #/{transcript}/{transcript}_map_protein.fa" + params: + gene = "{gene}" script: "translation_protein.py" +rule iupred2a_analysis: + input: + "/home/annaldas/projects/result/{gene}/transcripts/{transcript}_map_protein.fa", + config["iupred2a"] + output: + "/home/annaldas/projects/result/{gene}/transcripts/{transcript}_transcript_sequence.txt", + "/home/annaldas/projects/result/{gene}/transcripts/{transcript}_map_protein_iupred2a.txt" + script: + "iupred2a_analysis.py" + rule interpro_scan: input: - "/home/annaldas/projects/result/{gene}/{gene}_{type}_protein.fa" + "/home/annaldas/projects/result/{gene}/transcripts/{transcript}_map_protein.fa" output: - "/home/annaldas/projects/result/{gene}/{gene}_{type}.gff3" + "/home/annaldas/projects/result/{gene}/transcripts/{transcript}_map.gff3" params: - db = "Pfam,ProDom,Gene3D,CDD,Coils,MobiDBLite" + db = "Pfam,ProDom,Gene3D,CDD,Coils,MobiDBLite,SMART" shell: "sh /home/annaldas/my_interproscan/interproscan-5.38-76.0/interproscan.sh -i {input} -o {output} -f GFF3 -appl {params.db} -dra" -rule protein_domain_analysis: +rule brewery_analysis: input: - "/home/annaldas/projects/result/{gene}/{gene}_{type}.gff3" + "/home/annaldas/projects/result/{gene}/transcripts/{transcript}_map_protein.fa" output: - "/home/annaldas/projects/result/{gene}/{gene}_{type}_protein_analysis.txt" + "/home/annaldas/projects/result/{gene}/transcripts/{transcript}_map_protein.fa.ss3" + #"/home/annaldas/projects/result/{gene}/transcripts/{transcript}_map_protein.fa.ss8" params: - gene = "{gene}" + brewery = config["brewery"] + shell: + "python3 {params.brewery} -i {input} --cpu 32 --noTA --noSA --noCD" + +rule functional_site_analysis: + input: + "/home/annaldas/projects/result/{gene}/transcripts/{transcript}_map_protein.fa" + output: + "/home/annaldas/projects/result/{gene}/transcripts/{transcript}_map_protein_sites.txt" + params: + ps_scan = config["prosite_scan"], + prosite_dat = config["prosite_dat"] + shell: + "perl {params.ps_scan} -d {params.prosite_dat} {input} > {output}" + +rule individual_transcript_analysis: + input: + "/home/annaldas/projects/result/{gene}/transcripts/{transcript}_map_protein_iupred2a.txt", + "/home/annaldas/projects/result/{gene}/transcripts/{transcript}_map.gff3", + "/home/annaldas/projects/result/{gene}/transcripts/{transcript}_map_protein.fa.ss3", + "/home/annaldas/projects/result/{gene}/transcripts/{transcript}_map_protein_sites.txt" + output: + "/home/annaldas/projects/result/{gene}/transcripts/{transcript}_map_protein_analysis.txt" script: - "interproscan_analysis.py" + "transcript_analysis.py" -rule sashimi_plot: +def aggregate_input(wildcards): + checkpoint_output = checkpoints.mapping.get(**wildcards).output[0] + return expand("/home/annaldas/projects/result/{gene}/transcripts/{transcript}_map_protein_analysis.txt", + gene=wildcards.gene, + transcript=glob_wildcards(os.path.join(checkpoint_output,"{transcript}_map_protein.fa")).transcript) + +rule aggregate: input: - sashimi_sh = "/home/annaldas/projects/result/{gene}/{gene}_sashimi.sh", - sashimi_py = config["sashimi"], - bams = config["input_bams"], - gtf = NanoporeGTF + aggregate_input output: - "/home/annaldas/projects/result/{gene}/{gene}_sashimi.pdf" + "/home/annaldas/projects/result/{gene}/{gene}_transcripts_filtered_analysis.txt" + params: + gene = "{gene}" shell: - "sh {input.sashimi_sh} {input.sashimi_py} {input.bams} {input.gtf} {output}" \ No newline at end of file + "cat {input} > {output}" + +#rule filter_transcripts: +# input: +# "/home/annaldas/projects/result/{gene}/{gene}_transcripts_analysis.txt" +# output: +# "/home/annaldas/projects/result/{gene}/{gene}_transcripts_filtered_analysis.txt" +# script: +# "filter_transcripts.py" + +rule protein_coding_potential_analysis: + input: + "/home/annaldas/projects/result/{gene}/{gene}_transcripts_filtered_analysis.txt" + output: + "/home/annaldas/projects/result/{gene}/{gene}_transcripts_filtered_coding_potential_analysis.txt" + script: + "interproscan_analysis.py" + +#rule protein_domain_analysis: +# input: +# "/home/annaldas/projects/result/{gene}/transcripts/{transcript}/{transcript}_map.gff3", +# "/home/annaldas/projects/result/{gene}/transcripts/{transcript}/{transcript}_map_protein_iupred2a.txt" +# output: +# "/home/annaldas/projects/result/{gene}/{gene}_map_protein_analysis.txt" +# params: +# gene = "{gene}" +# script: +# "interproscan_analysis.py" + +#rule sashimi_plot: +# input: +# sashimi_sh = "/home/annaldas/projects/result/{gene}/{gene}_sashimi.sh", +# sashimi_py = config["sashimi"], +# bams = config["input_bams"], +# gtf = NanoporeGTF +# output: +# "/home/annaldas/projects/result/{gene}/{gene}_sashimi.pdf" +# shell: +# "sh {input.sashimi_sh} {input.sashimi_py} {input.bams} {input.gtf} {output}" \ No newline at end of file diff --git a/gene_transcripts.py b/gene_transcripts.py index be78531..ef2b0f8 100644 --- a/gene_transcripts.py +++ b/gene_transcripts.py @@ -73,12 +73,12 @@ # Create sashmimi sh -output_filename = snakemake.output[1] -output_file = open(output_filename,"w+") - -chrm,start,stop = gene_pos[gene][0],gene_pos[gene][1],gene_pos[gene][2] -binbash = "#!/bin/bash" -sashimi = "python $1 -b $2 -c %s:%d-%d -g $3 -M 10 -C 3 -O 3 --shrink --alpha 1 --base-size=20 --ann-height=5 --height=7 --width=18 -S both -o $4" %(chrm,start,stop) -output_file.write("\n".join([binbash,sashimi])) -output_file.close() +#output_filename = snakemake.output[1] +#output_file = open(output_filename,"w+") + +#chrm,start,stop = gene_pos[gene][0],gene_pos[gene][1],gene_pos[gene][2] +#binbash = "#!/bin/bash" +#sashimi = "python $1 -b $2 -c %s:%d-%d -g $3 -M 10 -C 3 -O 3 --shrink --alpha 1 --base-size=20 --ann-height=5 --height=7 --width=18 -S both -o $4" %(chrm,start,stop) +#output_file.write("\n".join([binbash,sashimi])) +#output_file.close() diff --git a/genes.tab b/genes.tab index 7dc3bf5..ba1c398 100644 --- a/genes.tab +++ b/genes.tab @@ -1,15 +1,2 @@ gene_symbol -APP -ABI2 -DNMBP -SEPTIN8 -EPB41L5 -SEPTIN6 -TFDP2 -ERBB2 -RPS24 -ZNRD2 -COX11 -ERGIC3 -PFN2 -NKAIN4 \ No newline at end of file +RPS24 \ No newline at end of file diff --git a/interproscan_analysis.py b/interproscan_analysis.py index e69de29..62037c5 100644 --- a/interproscan_analysis.py +++ b/interproscan_analysis.py @@ -0,0 +1,59 @@ +filename = snakemake.input[0] +file = open(filename, "r") +lines = file.readlines() +file.close() + +class transcript: + def __init__(self, tcons, idr, ips, ss8): + self.tcons = tcons + self.idr = idr + self.ips = ips + self.ss8 = ss8 + self.pss = pss + +transcripts = dict() + +idr_lines = [] +ips_lines = [] +ss8_lines = [] +pss_lines = [] + +for line in lines: + if (line.startswith(">")): + if (len(ips_lines) > 0): + transcripts[tcons] = transcript(tcons, idr, ips, ss8) + + new = True + idr_lines = [] + ips_lines = [] + ss8_lines = [] + pss_lines = [] + idr = False + ips = False + ss8 = False + pss = False + tcons = line[1:].strip().split("|")[0] + + + if (line.startswith("#####IUPred2A Analysis")): + idr = True + elif (line.startswith("#####InterProScan")): + ips = True + idr = False + elif (line.startswith("#####BrewerySS8 Analysis")): + ss8 = True + ips = False + + if (idr): + idr_lines.append(line.strip()) + elif (ips): + ips_lines.append(line.strip()) + elif (ss8): + ss8_lines.append(line.strip()) + +transcripts[tcons] = transcript(tcons, idr, ips, ss8) + + + + +print(transcripts) \ No newline at end of file diff --git a/isoform_transcripts.ipynb b/isoform_transcripts.ipynb index d574150..0c4f65c 100644 --- a/isoform_transcripts.ipynb +++ b/isoform_transcripts.ipynb @@ -23,7 +23,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 25, "metadata": {}, "outputs": [], "source": [ @@ -34,7 +34,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 24, "metadata": {}, "outputs": [], "source": [ @@ -71,14 +71,28 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 51, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/plain": [ + "['4e6fca82-c640-4aac-be00-57223afed52e|41',\n", + " '0807443e-5f7f-4b1b-b59f-b6a737ad89a9|16']" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "gene_oID[\"MLLT1\"]" + ] }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 26, "metadata": {}, "outputs": [], "source": [ @@ -95,7 +109,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 27, "metadata": {}, "outputs": [], "source": [ @@ -139,23 +153,23 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 27, "metadata": {}, "outputs": [ { "ename": "KeyError", - "evalue": "'cc2e8b66-6da7-4245-9544-c06d33b50252'", + "evalue": "'142e462f-f586-42fc-97b9-b2e3bfa1fd0d'", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0moID_tID\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"cc2e8b66-6da7-4245-9544-c06d33b50252\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;31mKeyError\u001b[0m: 'cc2e8b66-6da7-4245-9544-c06d33b50252'" + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0moID_tID\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"142e462f-f586-42fc-97b9-b2e3bfa1fd0d\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;31mKeyError\u001b[0m: '142e462f-f586-42fc-97b9-b2e3bfa1fd0d'" ] } ], "source": [ - "oID_tID[\"cc2e8b66-6da7-4245-9544-c06d33b50252\"]" + "oID_tID[\"142e462f-f586-42fc-97b9-b2e3bfa1fd0d\"]" ] }, { @@ -441,11 +455,11 @@ }, { "cell_type": "code", - "execution_count": 222, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ - "afilename = \"/home/annaldas/projects/nanopore-transcriptome-analysis/ReferenceData/gencode.v32.primary_assembly.annotation.gtf\"\n", + "afilename = \"/project/owlmayerTemporary/Sid/nanopore-analysis/ReferenceData/gencode.v32.primary_assembly.annotation.gtf\"\n", "afile = open(afilename,\"r\")\n", "afile_lines = afile.readlines()\n", "afile.close()" @@ -453,18 +467,9 @@ }, { "cell_type": "code", - "execution_count": 298, + "execution_count": 5, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['gene' 'transcript' 'exon' 'CDS' 'start_codon' 'stop_codon' 'UTR'\n", - " 'Selenocysteine']\n" - ] - } - ], + "outputs": [], "source": [ "output = []\n", "for line in afile_lines:\n", @@ -472,8 +477,8 @@ " output.append(line.strip().split(\"\\t\"))\n", "pd_aline = pd.DataFrame(output,columns=[\"chr\",\"source\",\"type\",\"start\",\"stop\",\"a\",\"b\",\"c\",\"info\"])\n", "pd_aline = pd_aline.astype({'start': 'int32', \"stop\":\"int32\"})\n", - "print(pd_aline[\"type\"].unique())\n", - "#pd_aline = pd_aline[pd_aline[\"type\"] == \"UTR\"]\n" + "#print(pd_aline[\"type\"].unique())\n", + "pd_aline = pd_aline[pd_aline[\"type\"] == \"UTR\"]\n" ] }, { @@ -507,18 +512,18 @@ }, { "cell_type": "code", - "execution_count": 224, + "execution_count": 6, "metadata": { "scrolled": true }, "outputs": [], "source": [ - "pd_aline.to_csv(path_or_buf = \"/home/annaldas/projects/nanopore-transcriptome-analysis/df_utr_regions.csv\",index = False)" + "pd_aline.to_csv(path_or_buf = \"/project/owlmayerTemporary/Sid/nanopore-analysis/ReferenceData/df_utr_regions.csv\",index = False)" ] }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 15, "metadata": { "scrolled": true }, @@ -527,22 +532,21 @@ "name": "stdout", "output_type": "stream", "text": [ - "chr11\t65570460\t65570695\tENST00000531405\n", - "chr11\t65571732\t65571888\tENST00000531405\n", - "chr11\t65570477\t65570491\tENST00000309328\n", - "chr11\t65571732\t65571888\tENST00000309328\n", - "chr11\t65570485\t65570695\tENST00000527920\n", - "chr11\t65572478\t65572892\tENST00000527920\n", - "chr11\t65570487\t65570491\tENST00000526877\n", - "chr11\t65571394\t65571858\tENST00000526877\n", - "chr11\t65571732\t65571779\tENST00000533115\n", - "chr11\t65572443\t65572892\tENST00000533115\n", - "chr11\t65573524\t65573942\tENST00000526433\n" + "chr8\t26044860\t26045413\tENST00000520164\n", + "chr8\t25841725\t25844611\tENST00000520164\n", + "chr8\t26041288\t26041488\tENST00000408929\n", + "chr8\t26040939\t26041002\tENST00000408929\n", + "chr8\t26040616\t26040671\tENST00000408929\n", + "chr8\t26040066\t26040101\tENST00000408929\n", + "chr8\t25844362\t25844611\tENST00000408929\n", + "chr8\t25858059\t25858083\tENST00000535548\n", + "chr8\t25850594\t25850761\tENST00000535548\n", + "chr8\t25844609\t25844640\tENST00000535548\n" ] } ], "source": [ - "df_utr_regions = pd.read_csv(\"/home/annaldas/projects/nanopore-transcriptome-analysis/df_utr_regions.csv\")\n", + "df_utr_regions = pd.read_csv(\"/project/owlmayerTemporary/Sid/nanopore-analysis/ReferenceData/df_utr_regions.csv\")\n", "\n", "info = list(df_utr_regions[\"info\"])\n", "chrms = list(df_utr_regions[\"chr\"])\n", @@ -556,18 +560,18 @@ " gene = line[3].split(\" \")[-1][1:-1]\n", " \n", " \n", - " if (gene == \"ZNRD2\"):\n", + " if (gene == \"EBF2\"):\n", " print(chrms[ann] + \"\\t\" + str(start[ann]) + \"\\t\" + str(stop[ann]) + \"\\t\" + transID)\n", " " ] }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 32, "metadata": {}, "outputs": [], "source": [ - "bedfastafile = open(\"/home/annaldas/projects/result/ZNRD2/ZNRD2_utr_regions.fa\")\n", + "bedfastafile = open(\"/home/annaldas/projects/result/RPS24/RPS24_utr_regions.fa\")\n", "bedfastalines = bedfastafile.readlines()\n", "bedfastafile.close()\n", "trans_utr = dict()\n", @@ -579,31 +583,71 @@ " else:\n", " trans_utr[trans_id].append(line.strip())\n", "\n", - "transcript_id = \"ENST00000533115\"\n", - "s = '''GGTGACAACGGCAACATGGCCCTGAACGGAGCTGGTGAGGACCTGGGCGGCAGGGGTTTGTGGCTGTGAGGTACGGGAGGCAGCCCACTCCGGCAAGACCCCCAGTCCCTATGCCTCTCTTCCCCAGAAGTCGACGACTTCTCCTGGGAGCCCCCGACTGAGGCGGAGACGAAGACGATCCTCCTCCAAGACAAACAGCGGAAAATCTACTGCGTGGCTTGTCAGGAACTCGACTCAGACGTGGATAAAGATAATCCCGCTCTGAATGCCCAGGCTGCCCTCTCCCAAGCTCGGGAGCACCAGCTGGCCTCAGCCTCAGAGCTCCCCCTGGGCTCTCGACCTGCGCCCCAGCCCCCAGTACCTCGTCCGGAGCACTGTGAGGGAGCTGCAGCAGGACTCAAGGCAGCCCAGGGGCCACCTGCTCCTGCTGTGCCTCCAAATACAGATGTCATGGCCTGCACACAGACAGCCCTCTTGCAGAAGCTGACCTGGGCCTCTGCTGAACTGGGCTCTAGCACCTCCCTGGAGACTAGCATCCAGCTGTGTGGCCTTATCCGCGCATGTGCGGAGGCCCTGCGCAGCCTGCAGCAGCTACAGCACTAAGAGAAGCCCCTGAGAAAAACCCTCTAGAAAA'''" + "transcript_id = \"ENST00000372360\"\n", + "s = '''CTCTTTTCCTCCTTGGCTGTCTGAAGATAGATCGCCATCATGAACGACACCGTAACTATCCGCACTAGAAAGTTCATGACCAACCGACTACTTCAGAGGAAACAAATGGTCATTGATGTCCTTCACCCCGGGAAGGCGACAGTGCCTAAGACAGAAATTCGGGAAAAACTAGCCAAAATGTACAAGACCACACCGGATGTCATCTTTGTATTTGGATTCAGAACTCATTTTGGTGGTGGCAAGACAACTGGCTTTGGCATGATTTATGATTCCCTGGATTATGCAAAGAAAAATGAACCCAAACATAGACTTGCAAGACATGGCCTGTATGAGAAGAAAAAGACCTCAAGAAAGCAACGAAAGGAACGCAAGAACAGAATGAAGAAAGTCAGGGGGACTGCAAAGGCCAATGTTGGTGCTGGCAAAAAGTGAGCTGGAGATTGGATCACAGCCGAAGGAGTAAAGGTGCTGCAATGATGTTAGCTGTGGCCACTGTGGATTTTTCGCAAGAACATTAATAAACTAAAAAC'''" ] }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1 TCTTTTCCTCCTTGGCTGTCTGAAGATAGATCGCCATC\n", + "392 GAGCTGGAGATTGGATCACAG\n", + "-1 CGAAGGAGTAAAGGTGCTGCAATGATGTTAGCTGTGGCCACTGTGGATTTTTCGCAAGAACATTAATAAACTAAAAACTTCATGTGTCTGGTTGTTTGAAA\n" + ] + } + ], + "source": [ + "if transcript_id in trans_utr:\n", + " seq = s\n", + " for utr in trans_utr[transcript_id]:\n", + " pos = seq.find(utr)\n", + " print(pos, utr)\n", + " if (pos != -1):\n", + " seq = seq[:pos] + seq[pos + len(utr):]" + ] + }, + { + "cell_type": "code", + "execution_count": 30, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "-1 TTCCTGTGAGCCCGGCGGTGACAACGGCAACATGGCCCTGAACGGAGCTGGTGAGGACCTGGGCGGCAGGGGTTTGTGGCTGTGAGGTACGGGAGGCAGCCCACTCCGGCAAGACCCCCAGTCCCTATGCCTCTCTTCCCCAGAAGTCGACGACTTCTCCTGGGAGCCCCCGACTGAGGCGGAGACGAAGGTGCTGCAGGCGCGACGGGAGCGGCAAGATCGCATCTCCCGGCTC\n", - "-1 AAGAGAAGCCCCTGAGAAAAACCCTCTAGAAAAACAGCTGTTCCTCTGTGTGGTTTGTTTTTTTCCTGGTTCCAAGTGTGCATGCCAGCCCCAGCTCCACTCACCTTTTTCCAGCTTTTGGCCTCTTCACCTCTCCACTCTGCTCTCCTTGACGCC\n", - "1 GTGACAACGGCAAC\n", - "-1 AAGAGAAGCCCCTGAGAAAAACCCTCTAGAAAAACAGCTGTTCCTCTGTGTGGTTTGTTTTTTTCCTGGTTCCAAGTGTGCATGCCAGCCCCAGCTCCACTCACCTTTTTCCAGCTTTTGGCCTCTTCACCTCTCCACTCTGCTCTCCTTGACGCC\n", - "-1 GGCAACATGGCCCTGAACGGAGCTGGTGAGGACCTGGGCGGCAGGGGTTTGTGGCTGTGAGGTACGGGAGGCAGCCCACTCCGGCAAGACCCCCAGTCCCTATGCCTCTCTTCCCCAGAAGTCGACGACTTCTCCTGGGAGCCCCCGACTGAGGCGGAGACGAAGGTGCTGCAGGCGCGACGGGAGCGGCAAGATCGCATCTCCCGGCTC\n", - "-1 AAGAGAGGCCCGGCCCATCCAGAGGGGGTGGGGCAGAGGCGGAGTCTGAGGAGCTGGGGAAGGAACAAAGCGAGGCCTGCGGGCGGCGGCTGGGCTCCGGCGGGGCCGCGGGGTGCGGGGCCTGCGGGCGGCGGCCCGGGCGGAGCGTTGGAGGGAAGGAGGTGGCATCGCCGTCCGCGCCGGCCCCGGCCATGAACGGGCTGCCCTCGGCAGAGGCGCCGGGCGGGGCGGGCTGCGCTTTGGCCGGGCTCCCACCGCTGCCGCGCGGCCTCAGCGGCCTCCTTAATGCGAGCGGGGGCTCGTGGCGGGAGCTGGAGCGCGTCTACAGCCAGCGCAGCCGCATCCACGACGAGCTGAGCCGCGCCGCCCGCGCCCCGGACGGGCCCCGCCACGCCGCCGGCGCCGCCAACGCGG\n", - "5 CAAC\n", - "-1 GAATGCCCAGGCTGCCCTCTCCCAAGCTCGGGAGCACCAGCTGGCCTCAGCCTCAGAGCTCCCCCTGGGCTCTCGACCTGCGCCCCAGCCCCCAGTACCTCGTCCGGAGCACTGTGAGGGAGCTGCAGCAGGACTCAAGGCAGCCCAGGGGCCACCTGCTCCTGCTGTGCCTCCAAATACAGATGTCATGGCCTGCACACAGACAGCCCTCTTGCAGAAGCTGACCTGGGCCTCTGCTGAACTGGGCTCTAGCACCTCCCTGGAGACTAGCATCCAGCTGTGTGGCCTTATCCGCGCATGTGCGGAGGCCCTGCGCAGCCTGCAGCAGCTACAGCACTAAGAGAAGCCCCTGAGAAAAACCCTCTAGAAAAACAGCTGTTCCTCTGTGTGGTTTGTTTTTTTCCTGGTTCCAAGTGTGCATGCCAGCCCCAGCTCCACTCACCTTTTTCCAGCTTTTGGCCTCT\n", - "-1 AAGAGAAGCCCCTGAGAAAAACCCTCTAGAAAAACAGCTGTTCCTCT\n", - "-1 TTTGCGCGACGTGGTCCCACAACCGTTGCCTTTTTAAGAGAGGCCCGGCCCATCCAGAGGGGGTGGGGCAGAGGCGGAGTCTGAGGAGCTGGGGAAGGAACAAAGCGAGGCCTGCGGGCGGCGGCTGGGCTCCGGCGGGGCCGCGGGGTGCGGGGCCTGCGGGCGGCGGCCCGGGCGGAGCGTTGGAGGGAAGGAGGTGGCATCGCCGTCCGCGCCGGCCCCGGCCATGAACGGGCTGCCCTCGGCAGAGGCGCCGGGCGGGGCGGGCTGCGCTTTGGCCGGGCTCCCACCGCTGCCGCGCGGCCTCAGCGGCCTCCTTAATGCGAGCGGGGGCTCGTGGCGGGAGCTGGAGCGCGTCTACAGCCAGCGCAGCCGCATCCACGACGAGCTGAGCCGCGCCGCCCGCGCCCCGGACGGGCCCCGCCACGCCGCCGGCGCCGCCAACGCGG\n", - "ENST00000533115 2 ['AAGAGAAGCCCCTGAGAAAAACCCTCTAGAAAAACAGCTGTTCCTCT', 'TTTGCGCGACGTGGTCCCACAACCGTTGCCTTTTTAAGAGAGGCCCGGCCCATCCAGAGGGGGTGGGGCAGAGGCGGAGTCTGAGGAGCTGGGGAAGGAACAAAGCGAGGCCTGCGGGCGGCGGCTGGGCTCCGGCGGGGCCGCGGGGTGCGGGGCCTGCGGGCGGCGGCCCGGGCGGAGCGTTGGAGGGAAGGAGGTGGCATCGCCGTCCGCGCCGGCCCCGGCCATGAACGGGCTGCCCTCGGCAGAGGCGCCGGGCGGGGCGGGCTGCGCTTTGGCCGGGCTCCCACCGCTGCCGCGCGGCCTCAGCGGCCTCCTTAATGCGAGCGGGGGCTCGTGGCGGGAGCTGGAGCGCGTCTACAGCCAGCGCAGCCGCATCCACGACGAGCTGAGCCGCGCCGCCCGCGCCCCGGACGGGCCCCGCCACGCCGCCGGCGCCGCCAACGCGG']\n", - "-1 GACGACGAGGAGCCTCCCGATGCCAGCCTGCCTCCTGACCCGCCACCCCTTACTGTGCCCCAGACGCACAATGCCCGTGACCAGTGGCTGCAGGATGCCTTCCACATCAGCCTCTGAAGGGCTGGGGGGCAGGGGGCATGCACCCATGCAAAAGGCTCAGAAACTCCCCCTCCGGCAAGCCCTCAGACTTCGGAGCCTGCGCCTTCCCCCCTACCGCCTCACCTCACAGGAGGGCCAGGCATGTATTCCTCAGAGGCGAAACTGCCAAACTCTTTCTCCTGTCTTGGGTTGGCTGGCACTGGGGCGGGCATCTAGGGTACAGCCTCTGCTCATGGCACTGGGCCTCCAGTTCTTCCACATGTGTGCACCCCCAGCTTGGCCAACCCTCAGCCTTGCGGTGGGGCCCGAAGCATCTTCC\n" + "-1 GGCATCGGCGCGGTCAGCCTCGTGGCGCGCCCACGCCCCCACGCCGGCTCTTCCCGGGGTCCTTCCGTGCGCGTTGATATGATTGGCCGGCGAATCGTGGTTCTCTTTTCCTCCTTGGCTGTCTGAAGATAGATCGCCATC\n", + "430 GAGCTGGAGATTGGATCACAG\n", + "-1 CGAAGGAGTAAAGGTGCTGCAATGATGTTAGCTGTGGCCACTGTGGATTTTTCGCAAGAACATTAATAAACTAAAAACTTCATGTGTCTGGTTGTTTG\n", + "-1 GGCATCGGCGCGGTCAGCCTCGTGGCGCGCCCACGCCCCCACGCCGGCTCTTCCCGGGGTCCTTCCGTGCGCGTTGATATGATTGGCCGGCGAATCGTGGTTCTCTTTTCCTCCTTGGCTGTCTGAAGATAGATCGCCATC\n", + "-1 AAAGGTGCTGCAATGATGTTAGCTGTGGCCACTGTGGATTTTTCGCAAGAACATTAATAAACTAAAAACTTCATGTGTCTGGTTGTTTGAAA\n", + "-1 ATATGATTGGCCGGCGAATCGTGGTTCTCTTTTCCTCCTTGGCTGTCTGAAGATAGATCGCCATC\n", + "-1 GAAGTGTCTAGCAG\n", + "430 GAGCTGGAGATTGGATCACAG\n", + "-1 CGAAGGAGTAAAGGTGCTGCAATGATGTTAGCTGTGGCCACTGTGGATTTTTCGCAAGAACATTAATAAACTAAAAACTTCATGTG\n", + "1 TCTTTTCCTCCTTGGCTGTCTGAAGATAGATCGCCATC\n", + "-1 GAAGTGTCTAGCAG\n", + "392 GAGCTGGAGATTGGATCACAG\n", + "-1 CGAAGGAGTAAAGGTGCTGCAATGATGTTAGCTGTGGCCACTGTGGATTTTTCGCAAGAACATTAATAAACTAAAAACTTC\n", + "1 TCTTTTCCTCCTTGGCTGTCTGAAGATAGATCGCCATC\n", + "392 GAGCTGGAGATTGGATCACAG\n", + "-1 CGAAGGAGTAAAGGTGCTGCAATGATGTTAGCTGTGGCCACTGTGGATTTTTCGCAAGAACATTAATAAACTAAAAACTTCA\n", + "1 TCTTTTCCTCCTTGGCTGTCTGAAGATAGATCGCCATC\n", + "392 GAGCTGGAGATTGGATCACAG\n", + "-1 CGAAGGAGTAAAGGTGCTGCAATGATGTTAGCTGTGGCCACTGTGGATTTTTCGCAAGAACATTAATAAACTAAAAACTTCATGTGTCTGGTTGTTTGAAA\n", + "1 TCTTTTCCTCCTTGGCTGTCTGAAGATAGATCGCCATC\n", + "-1 AATGTCACTGCCATGGCCGCCTTGCTGCATTTCTGAGGATGCTTCATCTCTCCACCTTCTTCTCCACTCAGCAGCCAGCAGGGCACTGTGGAAATCGGAGTCACATGAGCTGGCACCTCTGTTCAGAACCCTCCAGGGCTCCACATCTCTCTCACCCAAATGCCAAAGACCTCCCCACGCCCCCACAATCCCCCACGACCTGGCCACTGGCCTCCCACCACCTTCCAGCTCCAGCGGCTCCTACCACATTTAAGGCTTTCCTTCCTAGTTTTAATTTTTCCTCGTCAGCAGTTGATTTTATTATTTTCTTGTTTATTGGTATTTTCCCACTAGAAATGAAGCTGCGTGAAGTTAGAGATTTTTTTTTTTGGTCTGTGTTCCTAATTAGCTCATTGCTATACCCCTGGCGCCCAGAACAATGCCTTGGACACAGTACGCAGTAGACTAAATAAATACTTGTTGAATGACTGACTGACGGAATGACGGCTGTGTGGGGAGTGGATTGGGTCGTGAGGCAGAGGCTGCGGTGGAAACTCAGGCAGGAGGTGATGGTGGTTCTTGGGGCTGCGGAATGCCAAGTTTAGAAGCTCTTCCTCTGCTGTGGCACATGAACCGGTCACTCGAGAAGGCTTTTAGATTTACTTTGCCTAATCCCCTCTTAGTGCATGTGGGGAAACTGAGGTACACAAAAGGAATTCCCCACCAAGTTAGGGGCAGAACCTAGCCCCCTTGTCTCCCAGATGGATATCTTCTTTTTTTTTTGAGACGGAGTCTTGCTCTGTTGCCCAGGCTGGAGTGCAGTGGTACCATCTTGGCTCACTGCAACCTCTGCTTCCCAGGTTCAAGCGATTCTCCTGCCTCAGCCTCCTGAGTGTCTGCGATTACAGGTGCACACAACCACGCCTGGCTAATTTTTGTATTTTTAGTAGAGACGGGGTTTCACCGTGTTGGTCAGGGTGACCTCAAACTCCTGACCTCATGATCCACCCAGCTCAGCCTCCCAACGTGCTGGGATTACAGGCATGAGCCACCGTGCCTGGCTGGACATCTTGTTATTAAAGCTTCTTCTCTCTTTGTAGGGGAGGGGGAGATGCCTCTGGTGGAGAAGACCAGTGTGGCAGTGACTGTGTCTGTTAGTGAACCTGGTGGCTGGTTGAGGGTCTGTCGTGGTGACTGAGGACACATACAAAGTGCTTTTCTCAGTGGTCACCTTGGTGTTGGTGAATAAGGGTCAGAAGATGGCTCCTGTCCTAGGGCACTGCCAGTCGGTTTGGAAGCTGAAATGCCTGCTTAGCAGTTTGAGGAAACACAGACCTTGGAGGATCTTCTGGTTGCCTCTTCAAGAATTCATTCTATTCCCCTTCTGCTCCCCAAATTTGCTTTTCTTGGGGTGGGTCTTGGTTGGCCTAAGCCAAGAAAGTATGGCATCTACTCCTTCCATAGCAATAGCTCAGGAATAGGCAGTGACCCAGACCTGAACCAATCAGTGCATGGAATTACCCCTGGCCAAAGTGGTTGATTGAGGCTGGGTGCAAGCAGAGTTGTGAGAAGGCTCCCATTTGGTGGTTGGAGAGATCGCACTTGCTCCAGAGGTCATAATGTGCAGATCTGAGGCTTGGAACTGCTGCAGACATTTTGCTACCACAAGTGAAGCCACCCTGACGACACAGTTGACAATTTGGAGCAGGGCAGAGCTGAGAGAACAGCAGGGAAACAGCCAGAGTCTTGCTCAAGCCTCCCTGAAGTATCTATACCCCTGGACTCTAGTTATGGGGGCTAATAAATGTTATATACTGTTTAAGGTA\n", + "15 GCTGTCTGAAGATAGATCGCCATC\n", + "-1 AGATGACTTGCCCCAAGTCCTTCAGCTCATTCATGCTGGGGAAAGGAGTAAGCTTCAGGCGTCTTCCCCTGGAGTTCACGCCACCTCTGACAGCAAGTGAGCCGTTTGCTACTCAAGTGCTGTTTCTTGCTTTTTTAAG\n", + "-1 TGAGGGAATTGGGGCTTGGAGTGCAAGCATTGGGAAGAATTTCCCAGGAAGAGAGATGCACAGATGTGAAGAACTCGAAGGCAAGAGAAAGCCGGGGGGTTGTGTGGCAGGTAGAAGTGCCAGGACCGTGGAGCGTGTGGACATG\n", + "-1 GAAGTGTCTAGCAGGTACTGAGATT\n", + "430 GAGCTGGAGATTGGATCACAG\n", + "-1 CGAAGGAGTAAAGGTGCTGCAATGATGTTAGCTGTGGCCACTGTGGATTTTTCGCAAGAACATTAATAAACTAAAAACTTCA\n" ] } ], @@ -615,14 +659,13 @@ " print(pos, utr)\n", " if (pos != -1):\n", " seq = seq[:pos] + seq[pos + len(utr):]\n", - " if (t == transcript_id):\n", - " print(t, len(trans_utr[t]),trans_utr[t])\n", + " #print(t, len(trans_utr[t]),trans_utr[t])\n", "a = seq" ] }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 21, "metadata": {}, "outputs": [], "source": [ @@ -635,12 +678,12 @@ "\n", "a = '''GCTCAGTCCTCCAGGCGTCGGTACTCAGCGGTGTTGGAACTTCGTTGCTTGCTTGCCTGTGCGCGCGTGCGCGGACATGGCCTCAAACGATTATACCCAACAAGCAACCCAAAGCTATGGGGCCTACCCCACCCAGCCCGGGCAGGGCTATTCCCAGCAGAGCAGTCAGCCCTACGGACAGCAGAGTTACAGTGGTTATAGCCAGTCCACGGACACTTCAGGCTATGGCCAGAGCAGCTATTCTTCTTATGGCCAGAGCCAGAACACAGGCTATGGAACTCAGTCAACTCCCCAGGGATATGGCTCGACTGGCGGCTATGGCAGTAGCCAGAGCTCCCAATCGTCTTACGGGCAGCAGTCCTCCTACCCTGGCTATGGCCAGCAGCCAGCTCCCAGCAGCACCTCGGGAAGTTACGGTAGCAGTTCTCAGAGCAGCAGCTATGGGCAGCCCCAGAGTGGGAGCTACAGCCAGCAGCCTAGCTATGGTGGACAGCAGCAAAGCTATGGACAGCAGCAAAGCTATAATCCCCCTCAGGGCTATGGACAGCAGAACCAGTACAACAGCAGCAGTGGTGGTGGAGGTGGAGGTGGAGGTGGAGGTAACTATGGCCAAGATCAATCCTCCATGAGTAGTGGTGGTGGCAGTGGTGGCGGTTATGGCAATCAAGACCAGAGTGGTGGAGGTGGCAGCGGTGGCTATGGACAGCAGGACCGTGGAGGCCGCGGCAGGGGTGGCAGTGGTGGCGGCGGCGGCGGCGGCGGTGGTGGTTACAACCGCAGCAGTGGTGGCTATGAACCCAGAGGTCGTGGAGGTGGCCGTGGAGGCAGAGGTGGCATGGGGTAGGTGTCTCATGAGCCAGGGAGTATCTTTGGTGGGGAGTGTGGAGGATTGCATGAATCTCCCTGAAGCCAGTCCCTAGTGCATGGTTTAGTATTCTTGTTGTCTAGGGATCTGTGAGGGCTTTGATTTGGGGGCAGTGACTTTCTTTTTACATCCCCATTTTATTTTTGTGAGAACTTGGGAGCCTGAACTCCCATCCATACCACTGAATAGAGATTTTGAGTAATGATACTTGTTTCCAAAAAAAAAGAAACCATACATAGATACGTATGGATTGGAGTCATTAATATCCTAGGCAAGAAACATGGAAGTGAAGACTTCTTTCTCTGCAAGGGAAACCGATGATCCCACTCCTGGGAAATAGTAGGGAAACTTGGTATGTGTATTCCCATGTGTCCTCTAGGGAGTTGGTAATGGTTAACCTGACTTCAGCTTCCAGGAATTGGCTACTCTTCCCGTTTTCTATAGTCATTTGAATCCACGAGCTTGATTTGCACTAATTTGACCGACATTGATTTTGTGTGTGACTTGGTTTATGGGGCCAGCTGACTGAAGTAAGCAGACCTTTTGGGCAAAAATATGCTTTGACAGTGGTCTCCCACCTATTTGTTCCACTGTCTGCCTTCCCCTGGTTACTTAAAATTCATCAGCTTGTCCAACTGGACCTTCTTTCCTTCCTGCTGAAGTTGATTTGAAGTAAAACCTTAGATTTGATGTTAAAACAGTTGTCAAATCTGTTGGTAAATAAGATTTGAAGGACCCTACTCTGTCTCCCTTGAAAAAGGGGAGGAATGTCAGTGTTACTGTTTTTGGAAAAAGTAGATTTTTAAACCGAGTTTGGAAATGGTAAGTATGCAGAGGTGGGTGGGGGCAATCTCAAAAACGTGCAAAAATGAGGAAAACAAAAATGAGGAAATGTGTGCGTGTGTTTAATGCAAAACTTTAAAAAGAAAAACAACTGTTATGTGACTGTTAACTTGCTCTGCATTTTATGTGCCACAGGTATGAAAGGTGACATTGCAAAATACTCCGCTCTTCTCGCAGTGTAGAAGGGGTGACCCCGGGGGTTGGGGGAGATCAAAAACAGCTCAGTAGTTAGGACAGAGCTTAGCTAAGTTTGTCTTGCTTTAAGGGGAAGTTGCCTTTGGTTTTGACTTTTTATGGAATGGGGTTGGGTCTGCTTGCTGCTTTCAAAGCAAAAACCACAAAAATGTGTTCAAGGCTACCCCAGCCTGGTGTGAAATGTCTTCTGGGTAAATTGGGGTAGGGTTTTTAAACCAACTACTTGGTTGTCAACCACTTGCGACAAGAGGAAAAAAAAACATCTGCTCCATCGGAAGAACGACCAAGGAAAATGGGTTATTTTTTTTCCAGAGGAAATAGATAACGTAACCTTTTAAAGCAAAATCTTTATAAACTGTGTCTGAGAAATTGCACACGTGTGTGTGACATGCTCAAAGGTCAGACAAGGGGTGGTCAGGAAGGGATGTATTTTAGTAGCCACTTGTATCTTTTTCCAAAAACACCTACCCATGTTTGGGGAATGTTAAACAAAATCAAAAAACAACCTTTTGTAGCCGTTGGAAGCTTCATGTCCTTTCTTCTAACTTGTCTTCTCCAGCGGAAGTGACCGTGGTGGCTTCAATAAATTTGGTGGTAAGTGAACAGAGTTTCCAAAATTCCCAACTCCCAGCAATGCTTTGTCTGATTGTTCATTTGCAGATGTCTTAGCGTGTTAATTTAAATGTCAAAGGTTTTGAGGTGTCCAGAACCACCTCCAGAAAGGGGTAGGGTAGAATGCCACCTGTTGCCTGGTGTGTGCTAACCTGGAGCAGGTAGGGGTAAGACTCAATAGTCATCTTTTACCAAATGGGTTTGCCCCAGGTTAATAAGAGGGGTCTAGTAGGCCTTGGACTGGGCCGTTGCCACACCTGGCACTTAGTGACCATCATCATGAGAAACTGGAGAGTGCGTGCTGGAACACGTGGTGCCATCTTGGCTTTAGGATCCTTTTGATCGTTGTGTCCAAGGCTTGTGTGTGTGTGAGTGTGTGGGAGACAACTCCGAATGTTTAATTCTGGAAGAGGGATGTAACATTGCCCTGAGGATGGTGAAGTTGGTATACATTTATAAAGTACGGAATGGTGTCAATGAATGCAATTCTATGTATATGGACTTAACTGAGATGGGCAAATAGAAACTAGCTCTGGGAAGGAACATGTGCACTACTTCAAGAAAGATTGGAAGCATGTGTGGCTCATGGGAAATAACCAGGTCTTAAACAGCACAAACTGAATTCGTGGACCAGGAAGGTCTTAAACAGCACAAACTGAATTCATGGAAAAATGACAAATTTGAGAAGTCTCCCAGTAAGCTGGAACTTTTCTGGTTTGGTTAACAAAAGGTTTCTTGATTTGTTTCAAGATTTAAAGCCAAAGGTGTGGGTTCATGACTTAGGTGTCATTGCGTGTGGGTACAATATTTATATATGGCGAATTCAGATAAACATTGGTCAAAGATGGTCTCTGGAAAAACAAAATAGAGGCTGCATTACGGAAATAAGATTTCTGGTCTGTTCCCTGGGACATGCTTAAAAAATACAATAGCTATTATGTATGGTTTTTATTTTCATGTGGTTTCGGGGAAACAACACGGTTTTAAGGATGGTTTCTAAAGATGAAATTAAAAATTGTTCCACAAGGGTTAAGTGTCTGGTGGTAAAGTTGGGAGAAACTGGATGGATGCACATCGCATGGCTGGTGGCGAGCCCATCTCTCTTCTCTCGGGTGAGAGAACCGGGCCAAGCTGAGTTGGTTTGTTCACTTTAATGGGTCTCCGTTTCCCCTGCCACCTGTGCTGAGGACATTTCCCAGCCTGAGCTGGGGGAGGCAGCATTTGCTGAAGTGTGGAGTTGTCTCTGTGGAGACTCAAGTTACAGATCTTAAGGGGCCTGCCTAGAATTTTCTCCTCTGGGCAGGCGACCCAGGAAAGGGTTTGGAGTGAGGCTGTGAGCACTTACTTGATATTTTACAAGTTTGGATTTGGTGTTAATTTTTTTCCTTGTCCGTTTTTTCCTGTTGACTAACGGCTCATCTTTTCCTTGTTTTTGTTTTTTTTTTGTTCTTTTTTTCCATGTCACTAAAGGCCCTCGGGACCAAGGATCACGTCATGACTCCGAACAGGATAATTCAGACAACAACACCATCTTTGTGCAAGGCCTGGGTGAGAATGTTACAATTGAGTCTGTGGCTGATTACTTCAAGCAGATTGGTATTATTAAGACAAACAAGAAAACGGGACAGCCCATGATTAATTTGTACACAGACAGGGAAACTGGCAAGCTGAAGGGAGAGGCAACGGTCTCTTTTGATGACCCACCTTCAGCTAAAGCAGCTATTGACTGGTTTGATGGTAAAGAATTCTCCGGAAATCCTATCAAGGTCTCATTTGCTACTCGCCGGGCAGACTTTAATCGGGGTGGTGGCAATGGTCGTGGAGGCCGAGGGCGAGGAGGACCCATGGGCCGTGGAGGCTATGGAGGTGGTGGCAGTGGTGGTGGTGGCCGAGGAGGATTTCCCAGTGGAGGTGGTGGCGGTGGAGGACAGCAGCGAGCTGGTGACTGGAAGTGTCCTAATCCCACCTGTGAGAATATGAACTTCTCTTGGAGGAATGAATGCAACCAGTGTAAGGCCCCTAAACCAGATGGCCCAGGAGGGGGACCAGGTGGCTCTCACATGGGGGGTAACTACGGGGATGATCGTCGTGGTGGCAGAGGAGGCTATGATCGAGGCGGCTACCGGGGCCGCGGCGGGGACCGTGGAGGCTTCCGAGGGGGCCGGGGTGGTGGGGACAGAGGTGGCTTTGGCCCTGGCAAGATGGATTCCAGGGGTGAGCACAGACAGGATCGCAGGGAGAGGCCGTATTAATTAGCCTGGCTCCCCAGGTTCTGGAACAGCTTTTTGTCCTGTACCCAGTGTTACCCTCGTTATTTTGTAACCTTCCAATTCCTGATCACCCAAGGGTTTTTTTGTGTCGGACTATGTAATTGTAACTATACCTCTGGTTCCCATTAAAAGTGACCATTTTAGTTAAA'''\n", "\n", - "a = '''GGTGACAACGGCAACATGGCCCTGAACGGAGCTGGTGAGGACCTGGGCGGCAGGGGTTTGTGGCTGTGAGGTACGGGAGGCAGCCCACTCCGGCAAGACCCCCAGTCCCTATGCCTCTCTTCCCCAGAAGTCGACGACTTCTCCTGGGAGCCCCCGACTGAGGCGGAGACGAAGACGATCCTCCTCCAAGACAAACAGCGGAAAATCTACTGCGTGGCTTGTCAGGAACTCGACTCAGACGTGGATAAAGATAATCCCGCTCTGAATGCCCAGGCTGCCCTCTCCCAAGCTCGGGAGCACCAGCTGGCCTCAGCCTCAGAGCTCCCCCTGGGCTCTCGACCTGCGCCCCAGCCCCCAGTACCTCGTCCGGAGCACTGTGAGGGAGCTGCAGCAGGACTCAAGGCAGCCCAGGGGCCACCTGCTCCTGCTGTGCCTCCAAATACAGATGTCATGGCCTGCACACAGACAGCCCTCTTGCAGAAGCTGACCTGGGCCTCTGCTGAACTGGGCTCTAGCACCTCCCTGGAGACTAGCATCCAGCTGTGTGGCCTTATCCGCGCATGTGCGGAGGCCCTGCGCAGCCTGCAGCAGCTACAGCACTAAGAGAAGCCCCTGAGAAAAACCCTCTAGAAAAA'''" + "a = '''AGTGCGGAGCCTTAGGCGGAGCGAAGAGAACCGGTCGCGGCAATCCTAGCGCGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCACCCGCATCCGCTGCGGGAGTCCGAGCCGGAACCACACCCAAGTAGCTGCCCTTTCCTCTTCTGTCATCTCACCGCCCCACCACAGACCGCGTTCCCCGAGGAAACCGGCCGCCCACGCCCGGAGCATCCTCCCCTGTTGAGCGGGCGCTGACGGACCCGGCGGCATGATGCGGCTGCGAGGCTCGGGGATGCTGCGGGACCTGCTCCTGCGGTCGCCCGCCGGCGTGAGCGCGACTCTGCGGCGGGCACAGCCCTTGGTCACCCTGTGCCGGCGTCCCCGAGGCGGGGGACGGCCGGCCGCGGGCCCGGCTGCCGCCGCGCGACTCCACCCGTGGTGGGGCGGGGGCGGCTGGCCGGCGGAGCCCCTCGCGCGGGGCCTGTCCAGCTCTCCTTCGGAGATCTTGCAGGAGCTGGGCAAGGGGAGCACGCATCCGCAGCCCGGGGTGTCGCCACCCGCTGCCCCGGCGGCGCCCGGCCCCAAGGACGGCCCCGGGGAGACGGACGCGTTTGGCAACAGCGAGGGCAAAGAGCTGGTGGCCTCAGGTGAAAATAAAATAAAACAGGGTCTGTTACCTAGCTTGGAAGATTTGCTGTTCTATACAATTGCTGAAGGACAAGAGAAAATACCTGTTCATAAATTTATTACAGCACTCAAATCTACAGGATTGCGAACGTCTGATCCCAGGTTGAAAGAGTGTATGGATATGTTAAGATTAACTCTTCAAACAACATCAGATGGTGTCATGCTAGACAAAGATCTTTTTAAAAAATGTGTTCAGAGCAACATTGTTTTGTTGACACAAGCATTTAGAAGAAAGTTTGTGATTCCTGACTTTATGTCTTTTACCTCACACATTGATGAGTTATATGAAAGTGCTAAAAAGCAGTCTGGAGGAAAGGTTGCAGATTATATTCCTCAACTGGCCAAATTCAGTCCCGATTTGTGGGGTGTGTCTGTTTGTACAGTAGATGGACAGAGGCATTCTACTGGAGATACCAAAGTTCCCTTCTGTCTTCAGTCCTGTGTAAAACCTTTGAAATATGCCATTGCTGTTAATGATCTTGGAACTGAATATGTGCATCGATATGTTGGAAAAGAGCCGAGTGGACTAAGATTCAACAAACTATTTTTGAATGAAGATGATAAACCACATAATCCTATGGTAAATGCTGGAGCAATTGTTGTGACTTCACTAATAAAGCAAGGAGTAAATAATGCTGAAAAATTTGACTATGTCATGCAGTTTTTGAATAAGATGGCTGGTAATGAATATGTTGGATTCAGTAATGCAACGTTTCAGTCTGAAAGAGAAAGTGGAGATCGAAATTTTGCAATAGGATATTACTTAAAAGAAAAGAAGTGTTTTCCAGAAGGCACAGACATGGTTGGTATATTAGACTTCTACTTCCAGCTGTGCTCCATTGAAGTGACTTGTGAATCAGCCAGTGTGATGGCTGCGACACTGGCTAATGGTGGTTTCTGCCCAATTACTGGTGAAAGAGTACTGAGCCCTGAAGCAGTTCGAAATACATTGAGTTTGATGCATTCCTGTGGCATGTATGACTTCTCAGGGCAGTTTGCTTTCCATGTTGGTCTTCCTGCAAAATCTGGAGTTGCTGGGGGCATTCTTTTAGTTGTCCCCAATGTTATGGGTATGATGTGCTGGTCTCCTCCTCTGGATAAGATGGGCAACAGTGTTAAGGGAATTCACTTTTGTCACGATCTTGTTTCTCTGTGTAATTTCCATAACTATGATAATTTGAGACACTTTGCAAAAAAACTTGATCCTCGAAGAGAAGGTGGTGATCAAAGGGTAAAGTCAGTGATAAATCTTTTGTTTGCTGCATATACTGGAGATGTGTCTGCACTTCGAAGATTTGCTTTGTCAGCTATGGACATGGAACAGCGGGACTATGATTCTAGAACAGCACTCCATGTAGCTGCTGCAGAGGGTCATGTTGAAGTTGTTAAATTTTTGCTGGAAGCCTGCAAAGTAAACCCTTTCCCCAAGGACAGGTGGAATAACACTCCCATGGATGAAGCACTGCACTTTGGACACCATGATGTATTTAAAATTCTCCAAGAATACCAAGTCCAGTACACACCTCAAGGAGATTCTGACAACGGGAAGGAAAATCAAACCGTCCATAAGAATCTTGATGGATTGTTGTAATGGTCTCAAATCCCAAGATTTAAATCACTTACCTATTTAATTGTGGAAAATGATTATGAAGAACATGTGTATTTCTATCTGGTAGTGATGTATATTTTACATTTGTCATTTCAGTGTTACTGGAGTTTTCTTCATTGTGCACACAGGACAAATCTGATCTCTTTGGGAAAAAATAGAAATAAAACAATCTCCCTCCATAATGTGAGCAATATTACCTCGTGCATTGTATAATTTGATGTAAAAGAAATAGTTACCAATGCTAGCTTGTGTGGTCTTCCATGATTTATTTGTGTTTTGTGAATTTTCAATTTATGGTGATGATCTGCTGATATGCATTTATAAAGTAAGCTCTGTTGTACAGTCTGTCCAAATGGGTCAAGGTTGCCTTTAGAAGCAAATAGTGTGATTTTCAAGACTTCAAATACAAATTTAGTTTAAGTGTTTGAACAACTATATGCACTTACGGTTGTGTGTTTAAAATGTCTCTCTCACCCCCTAGCTTCATGATGTGACTCTTAAAAAACTATAATAGTTAACAACTGTTAGTAAGATAGACCAATTCTGATTAGACTTTATCAGGGAATCTGTTTAAGATATGTTTGGTGACCAAAACGTATGTGTGAATGTAGTTATAATGCTTTTGAAAAATTTTCCTTTTTCTATATCCCCTTAGTCCAGCCTCTCTTCTCAGACATTTAGCTATCTGCCTCTTTCCTTTAGCTGGGAAAGTGAGAGCTGGCATACTATGCAGTTTTTATGTTTTCCATAGTAAGTCAGAAAATGCCTCCTATTTCTGGCATCAGAACTTTGCCATTTGTCTACAGAAGACGAACCAGAGACAAAATTACTAAGTATAAATTAGTCAAGTTTATCAGTCTAAAAAACGAAGGGATGTGCAACTGCAGCTCTTTAAGAAGTTTTTTTTTTTTAGCTTCTAGGGTAAAGATAAATTCAGAAATGCTCTAAGCTACCAAAGTTATTCTGAAAGTATGGGAACTGCTACAACTAACAAACATTTGTTTCCAAGCCTGTCATTAAGAGTCTGCATCAAGAGATTTGTCCTCCTTGGGGGACCACTGGATCATTCCAGATTTCTTGTGATTTTTCTATTGTGTAATTCTTGGTGGGCTCTGTAGTTTAATAATAAGAAAAAGGCCATTTCATTTTAAATTGTGACCTATAATTCTTTGTCTTGGGTTGGTAATTCAGGATTCATTTGGAAAGTGGGTAAAAGGGGCTTCAAAAAACGGATAGAACAGGATTTTCTAGGAGTTACACATACATTTTATCCTGTCATACCTCGAGATAAAGTGGCATGTTAGTGAGGAGTTCTGATATTAAGCACACACACACATGCACACAAATGGACTTCTCTGAAGCTGTGTTTAGTGAAATGAGCTCAAGTACATGAATGTTAGTTGTTATCACATACAGCAAATTCCTTTTTTTTTCTTTTTCTATGAGCACACTCTGCTGCTTCTAAACTTTACATGCCTGATGGCACCTTACTCCAGCAGCCTCCAGGTGCTTTCATTTTCACTTCCAGTCTAAGCCAGTGGCTCCTGCCACTGCCCTCCCATTACCTAGATGGCACCTCCTTTGGTGAAACCACGGCCAATGTTCCTTAGCTGCACCAGGCCCGAAGCTGTTCCCATGCTTGAGCTTCCATGGGGAGGATGCTGAGTGAGCAGTTTCCTACCCCGTGGATCTAGCAAGCCATGGAGACAGGTAGCATTTGTAAGATGCTGCACAGGAGCAGCATTATCCCCAAAGATATTACAGGGTAGACACGTTTTAACTGAAATCAATCAAGATAACTTTATTCAAAGAGCAGCCCGCTTTGTGTGACTAAAATGAAACAAGACAGTTGAATTGTGTGACTTGAAGATTACCAATGATTTTGAGGCTTTTCTATAATAAAAAGAGGTTCTAACCATTATTTGGGAACAAAGAGAGTTTTCATCTTTTTTCAGATCAAAACCATTCTGTAAAATCTTTGTTGTTTAATTAAATGTGCCGTTATTTACCCCTGATGTTATTTATGACTATGTGCCGATTCCTGCTCGGGCTGTTTGCTGTTGGCTGGTAATAATATATTTGATTTAAATGCTGTTGACTGTGCTATTAACTGCTGCCGTCAGTAAACTCCAAAGATCTTTTTGTTTTGGCTTTAGTATCATATGTGCTTTTTCTGTATCCTGAGCGCTCTATATGATCATGTTAATTTAAAGCTTTATACACATTGTTGTTTTTGCTGGTCTCATCTTTGGTAATATGCTATACCCCACTGCTGCCCGACACTGCCCTTTAGCTGCAGAGCTGGATTAGCTGTTGACCATTTGATGCTGTTGTCTGTCTGGCAGGGACTGAATGACCTGATGTCAGATTTAGATTCTTCCTGGGGATTACACAGCTATGAATGTATTTGCTTCTAAAACCTCCCAAAGTGAATCTAATCTTAAAACTACAAGTTGTAAGTATTCTGAAATTGGGAAACATTTATTTTAAATGCAATCAGGTAGTGTTGCTTTTTACAGCATAATAAATATATGTATCAAAAAAAAAA'''" ] }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 22, "metadata": { "scrolled": true }, @@ -649,9 +692,9 @@ "name": "stdout", "output_type": "stream", "text": [ - "GDNGNM+ALNGAGEDLGGRGLWL_GTGGSPLRQDPQSLCLSSPEVDDFSWEPPTEAETKTILLQDKQRKIYCVACQELDSDVDKDNPALNAQAALSQAREHQLASASELPLGSRPAPQPPVPRPEHCEGAAAGLKAAQGPPAPAVPPNTDVM+ACTQTALLQKLTWASAELGSSTSLETSIQLCGLIRACAEALRSLQQLQH_EKPLRKTL_K\n", - "VTTATWP_TELVRTWAAGVCGCEVREAAHSGKTPSPYASLPQKSTTSPGSPRLRRRRRRSSSKTNSGKSTAWLVRNSTQTWIKIIPL_M+PRLPSPKLGSTSWPQPQSSPWALDLRPSPQYLVRSTVRELQQDSRQPRGHLLLLCLQIQM+SWPAHRQPSCRS_PGPLLNWALAPPWRLASSCVALSAHVRRPCAACSSYSTKRSP_EKPSRK\n", - "_QRQHGPERSW_GPGRQGFVAVRYGRQPTPARPPVPM+PLFPRSRRLLLGAPD_GGDEDDPPPRQTAENLLRGLSGTRLRRG_R_SRSECPGCPLPSSGAPAGLSLRAPPGLSTCAPAPSTSSGAL_GSCSRTQGSPGATCSCCASKYRCHGLHTDSPLAEADLGLC_TGL_HLPGD_HPAVWPYPRM+CGGPAQPAAATALREAPEKNPLEK\n" + "M+ANLGVPGSPGFLNGSPTGSPYGIM+SSSPTVGSSSTSSILPFSSSVFPAVKQKSAFAPVIRPQGSPSPACSSGNGNGFRAM+TGLVVPPM+_RRTAFL_HKTTYSDGPIM+KKALGALLGV_WCPHM+NM+M+DTLGSARSQHLTWSHVLL_L_WWLHKLTLLGQGQKM+SLT_SVLRAEM+QFFVM+NIM+KTTFLCL_NI_EKIGKQLM+LNILDTICFSL_EKKVESFYFL_SLSDTNLVYAEKN_TKQGTSTEDFLKTQPELNDEM+LYVCLLIA_SL_KM+NKKREKF_NVLQLFK_INV_LIVSRGRN_DLFLEERGISLPDVK_K_CKHVVTSFKGV_LLLQLLTRFLSPVLHPLLFHHFIDPRARTCRFPQNTLM+QKQTL_HM+HKYSNPSIHSQLRLISK_STLL_ILFKIPIISFQRSPLSSPENSHV_VCLRLSASQGKKHLTF_QSQNHRLS_PRDPLGIPPHQNPTTPSALGAWFSTEVHSPIELQQHGLEKITLFDGLFRVSLRTSKSFPLPVNFPFYGIKLS_KT_LCHSTVSSQE_DGM+WERDIKSPNPLF_CSKKGKTRFLGDPEIRGICVLCSTVTQPFDGLFIVL_LALRGEEPCLSC_QWLILESLCNPRDISRHIGIANKDVTCFHVGVLLIDS_APQ_SLYTDLVWGSWLGFQH_K_M+QHSWKNIM+SASEPILDDFINSAYLLFDPLM+QAHIYPAPTVLFAFNPFWNYPV_LILFLCT_ESKVLETWHAYNYC_QSLIVRA_SAKIHGNIREKVPQ_ET_LM+EGEECVEYECYLPKVKAEQN_ACLALDLSQSNIRKFCFF_VALLLEPVNFPFA_R_VQYCLQNDCRISW_LYTEKCV_LNTRHLDHSARTLAATELFNCTNVCKDYF_CTNKLKTKLPCPR_SLLFNHSSPKLFLCHLEYEYFYW_LQTLSCVCKRSCVRFFSFKGTIYFIIYLSFVIAL_FFSVFLCTFQYAWFKNH_LLYLLYSKVSFSHCIVSFYCFALSVIYSIKLLCTELFVKTAFLFTVFNGLT_RISCL_NEYVYFSFKL_IILTKK_NFCTVK\n", + "WPTWVFQVHQDF_M+AHPPALLM+ESCHQVPPLGLPAHPPSSHFPLQFFLLSNRRVPLPLSSGPKAPLHLPAPAAM+EM+DSEP_PDLLYPRCKEELLSYSTKLLTLM+DQ__RKH_ELFWGCSGAPT_T_WTPLGLQGASILLGPTSSYSSDGGYTN_PSWDKDKRCH_RSQC_EQKCNSLL_TL_KPPSYVCKIFKKKLANN_CLIFWILFVFLCRKKKLKVSIFYEAFQIPI_FM+QKKIEQNRVPARKTFLKRNLN_M+M+KCCM+CVCL_LNLFKK_TKKGKNFKM+FYNYLNK_M+CNLL_VEVEIKTFFWKREGFLFLM+_NENDANM+__QVLKVCDYYCSYLLDSYPLSCTPSFSITLLTQGQEHVDSHRTP_CRNRHSSTCTNTPTHQFIPSSV_FLNSPHSCEYFLKFLLLAFNVLP_AVQRIPM+FESVSGFQHPKERSTSPSSRAKTTDFLNQGTPWEFLLTRTPPLPPLWVPGSVQRYTAP_NFSNTVWRR_LCLM+DSLESVLEHPSPFLSLSISPFM+ALNSLEKHSYATRQFLVKNETECGRGILKALIPSSDVRRKARPDSWEIQRLGASAFYAAQLPSPLM+GSSSFCD_LLEGKNLALAANSG_FLKAYVILETYPGILGLPIKM+SLVSTLGFSLLTHEHHNRVYTQT_FGVLGWDFSIRNRCSTHGKTS_VHLSQF_M+ISLILHTFSLTH_CRPTSIQPQLSYLPLTHSGITQYDLYYFYAHENPRF_KPGM+LITTADSPS_SGLKAPKYTET_EKKSPSEKPD_WKGKNVWNM+NVTFQK_RQSKIKHV_PWI_ANPT_GNFVFFK_HCF_NL_IFLLHEDECSTVFKM+IVEFLGSFTPKNACN_IPDILTIQLEPWQQQSYLIVQM+CVRIIFSVLIN_KQSYPVLVSHCYSIIPVPSYFCATWNM+SISIGNYKLYPVFVRGAV_DFFHLKGQFTSLFIFLLL_RCNSSQFFFAHFNM+HGLKTINFSTFCTVRFHFHTV_FHFIVLLCQLYTV_SCYAQSFL_RQLFCLLFLM+VLLKEYLVCKM+NM+STSVLNFKLS_QKNKIFVL_K\n", + "GQLGCSRFTRISKWLTHRLSLWNHVIKSHRWVFQHILHPPIFLFSFSCCQTEECLCPCHQAPRLPFTCLLQRQWKWIQSHDRTCCTPDVKKNCFLIAQNYLL_WTNNEESTRSSFGGVVVPPHEHDGHPWVCKEPASYLVPRPPIALM+VATQTDPLGTRTKDVIDVVSAKSRNAILCYEHYENHLPM+FVKYLRKNWQTINA_YFGYYLFFFVGKKS_KFLFSM+KPFRYQFSLCRKKLNKTGYQHGRLS_NAT_IE__NVVCVFAYSLISLKNEQKKGKILKCFTII_INKCVTYCK_R_KLRPFFGRERDFSS_CKM+KM+M+QTCSNKF_RCVIITAVTY_ILIPCPAPPPFPSLY_PKGKNM+_IPTEHPNAETDTLAHAQILQPINSFPAPSDF_IVHTLVNTF_NSYY_LSTFSPEQSREFPCLSLSQAFSIPRKEAPHLLAEPKPQTFLTKGPLGNSSSPEPHHSLRFGCLVQYRGTQPHRTSATRFGEDNSV_WTL_SQS_NIQVLSSPCQFPLLWH_TLLKNIAM+PLDSF_SRM+RRNVGEGY_KP_SPLLM+FEERQDQILGRSRD_GHLRSM+QHSYPAL_WALHRFVTSS_RGRTLP_LLTVVDS_KLM+_S_RHIQAYWDCQ_RCHLFPRWGSPY_LM+STTIESIHRLSLGFLVGISALEIDAALM+EKHHECI_ANFR_FH_FCIPSL_PIDAGPHLSSPNCLICL_PILELPSM+TYIISM+HM+RIQGFRNLACL_LLLTVPHSQGLKRQNTRKHKRKSPPVRNLIDGRGRM+CGI_M+LPSKSEGRAKLSM+FSPGFEPIQHKEILFFLSSIAFRTCEFSFCM+KM+SAVLSSK_L_NFLVALHRKM+RVTKYQTS_PFS_NPGSNRAI_LYKCV_GLFLVY__IKNKATLSSLVIAIQSFQSQVIFVPLGI_VFLLVTTNSILCL_EELCKIFFI_RDNLLHYLSFFCYSVVILLSFSLHISICM+V_KPLTSLPFVQ_GFIFTLYSFILLFCFVSYIQYKVAM+HRAFCKDSFFVYCF_WSYLKNILFVK_ICLLQF_TLNYPNKKIKFLYCKK\n" ] } ], @@ -700,26 +743,226 @@ }, { "cell_type": "code", - "execution_count": 184, + "execution_count": 118, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "MMRLRGSGMLRDLLLRSPAGVSATLRRAQPLVTLCRRPRGGGRPAAGPAAAARLHPWWGGGGWPAEPLARGLSSSPSEILQELGKGSTHPQPGVSPPAAPAAPGPKDGPGETDAFGNSEGKELVASGENKIKQGLLPSLEDLLFYTIAEGQEKIPVHKFITALKSTGLRTSDPRLKECMDMLRLTLQTTSDGVMLDKDLFKKCVQSNIVLLTQAFRRKFVIPDFMSFTSHIDELYESAKKQSGGKVADYIPQLAKFSPDLWGVSVCTVDGQRHSTGDTKVPFCLQSCVKPLKYAIAVNDLGTEYVHRYVGKEPSGLRFNKLFLNEDDKPHNPMVNAGAIVVTSLIKQGVNNAEKFDYVMQFLNKMAGNEYVGFSNATFQSERESGDRNFAIGYYLKEKKCFPEGTDMVGILDFYFQLCSIEVTCESASVMAATLANGGFCPITGERVLSPEAVRNTLSLMHSCGMYDFSGQFAFHVGLPAKSGVAGGILLVVPNVMGMMCWSPPLDKMGNSVKGIHFCHDLVSLCNFHNYDNLRHFAKKLDPRREGGDQRVKSVINLLFAAYTGDVSALRRFALSAMDMEQRDYDSRTALHVAAAEGHVEVVKFLLEACKVNPFPKDRWNNTPMDEALHFGHHDVFKILQEYQVQYTPQGDSDNGKENQTVHKNLDGLL\n", + "MVSC\n", + "MCSEQHCFVDTSI\n" + ] + } + ], + "source": [ + "def translate(seq,orf):\n", + " seq = seq.upper()\n", + " seq = seq.replace(\"\\n\",\"\")\n", + "\n", + " table = { \n", + " 'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M', \n", + " 'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T', \n", + " 'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K', \n", + " 'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R', \n", + " 'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L', \n", + " 'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P', \n", + " 'CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q', \n", + " 'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R', \n", + " 'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V', \n", + " 'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A', \n", + " 'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E', \n", + " 'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G', \n", + " 'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S', \n", + " 'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L', \n", + " 'TAC':'Y', 'TAT':'Y', 'TAA':'_', 'TAG':'_', \n", + " 'TGC':'C', 'TGT':'C', 'TGA':'_', 'TGG':'W', \n", + " } \n", + "\n", + " protein = \"\"\n", + " exon = False\n", + " translating = True\n", + " i = orf\n", + "\n", + " while (translating):\n", + " codon = seq[i:i+3]\n", + " \n", + " try: table[codon]\n", + " except: break\n", + "\n", + " if (table[codon] == \"M\"):\n", + " exon = True\n", + "\n", + " if (exon):\n", + " if (table[codon] == \"_\"):\n", + " exon = False\n", + " translating = False\n", + " else:\n", + " protein += table[codon]\n", + " i += 3\n", + " else:\n", + " i += 3\n", + " \n", + " return protein\n", + "\n", + "a = \"AGTGCGGAGCCTTAGGCGGAGCGAAGAGAACCGGTCGCGGCAATCCTAGCGCGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCACCCGCATCCGCTGCGGGAGTCCGAGCCGGAACCACACCCAAGTAGCTGCCCTTTCCTCTTCTGTCATCTCACCGCCCCACCACAGACCGCGTTCCCCGAGGAAACCGGCCGCCCACGCCCGGAGCATCCTCCCCTGTTGAGCGGGCGCTGACGGACCCGGCGGCATGATGCGGCTGCGAGGCTCGGGGATGCTGCGGGACCTGCTCCTGCGGTCGCCCGCCGGCGTGAGCGCGACTCTGCGGCGGGCACAGCCCTTGGTCACCCTGTGCCGGCGTCCCCGAGGCGGGGGACGGCCGGCCGCGGGCCCGGCTGCCGCCGCGCGACTCCACCCGTGGTGGGGCGGGGGCGGCTGGCCGGCGGAGCCCCTCGCGCGGGGCCTGTCCAGCTCTCCTTCGGAGATCTTGCAGGAGCTGGGCAAGGGGAGCACGCATCCGCAGCCCGGGGTGTCGCCACCCGCTGCCCCGGCGGCGCCCGGCCCCAAGGACGGCCCCGGGGAGACGGACGCGTTTGGCAACAGCGAGGGCAAAGAGCTGGTGGCCTCAGGTGAAAATAAAATAAAACAGGGTCTGTTACCTAGCTTGGAAGATTTGCTGTTCTATACAATTGCTGAAGGACAAGAGAAAATACCTGTTCATAAATTTATTACAGCACTCAAATCTACAGGATTGCGAACGTCTGATCCCAGGTTGAAAGAGTGTATGGATATGTTAAGATTAACTCTTCAAACAACATCAGATGGTGTCATGCTAGACAAAGATCTTTTTAAAAAATGTGTTCAGAGCAACATTGTTTTGTTGACACAAGCATTTAGAAGAAAGTTTGTGATTCCTGACTTTATGTCTTTTACCTCACACATTGATGAGTTATATGAAAGTGCTAAAAAGCAGTCTGGAGGAAAGGTTGCAGATTATATTCCTCAACTGGCCAAATTCAGTCCCGATTTGTGGGGTGTGTCTGTTTGTACAGTAGATGGACAGAGGCATTCTACTGGAGATACCAAAGTTCCCTTCTGTCTTCAGTCCTGTGTAAAACCTTTGAAATATGCCATTGCTGTTAATGATCTTGGAACTGAATATGTGCATCGATATGTTGGAAAAGAGCCGAGTGGACTAAGATTCAACAAACTATTTTTGAATGAAGATGATAAACCACATAATCCTATGGTAAATGCTGGAGCAATTGTTGTGACTTCACTAATAAAGCAAGGAGTAAATAATGCTGAAAAATTTGACTATGTCATGCAGTTTTTGAATAAGATGGCTGGTAATGAATATGTTGGATTCAGTAATGCAACGTTTCAGTCTGAAAGAGAAAGTGGAGATCGAAATTTTGCAATAGGATATTACTTAAAAGAAAAGAAGTGTTTTCCAGAAGGCACAGACATGGTTGGTATATTAGACTTCTACTTCCAGCTGTGCTCCATTGAAGTGACTTGTGAATCAGCCAGTGTGATGGCTGCGACACTGGCTAATGGTGGTTTCTGCCCAATTACTGGTGAAAGAGTACTGAGCCCTGAAGCAGTTCGAAATACATTGAGTTTGATGCATTCCTGTGGCATGTATGACTTCTCAGGGCAGTTTGCTTTCCATGTTGGTCTTCCTGCAAAATCTGGAGTTGCTGGGGGCATTCTTTTAGTTGTCCCCAATGTTATGGGTATGATGTGCTGGTCTCCTCCTCTGGATAAGATGGGCAACAGTGTTAAGGGAATTCACTTTTGTCACGATCTTGTTTCTCTGTGTAATTTCCATAACTATGATAATTTGAGACACTTTGCAAAAAAACTTGATCCTCGAAGAGAAGGTGGTGATCAAAGGGTAAAGTCAGTGATAAATCTTTTGTTTGCTGCATATACTGGAGATGTGTCTGCACTTCGAAGATTTGCTTTGTCAGCTATGGACATGGAACAGCGGGACTATGATTCTAGAACAGCACTCCATGTAGCTGCTGCAGAGGGTCATGTTGAAGTTGTTAAATTTTTGCTGGAAGCCTGCAAAGTAAACCCTTTCCCCAAGGACAGGTGGAATAACACTCCCATGGATGAAGCACTGCACTTTGGACACCATGATGTATTTAAAATTCTCCAAGAATACCAAGTCCAGTACACACCTCAAGGAGATTCTGACAACGGGAAGGAAAATCAAACCGTCCATAAGAATCTTGATGGATTGTTGT\"\n", + "print(translate(a,0))\n", + "print(translate(a,1))\n", + "print(translate(a,2))" + ] + }, + { + "cell_type": "code", + "execution_count": 160, "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "264 CGGCATGA MMRLRGSGMLRDLLLRSPAGVSATLRRAQPLVTLCRRPRGGGRPAAGPAAAARLHPWWGGGGWPAEPLARGLSSSPSEILQELGKGSTHPQPGVSPPAAPAAPGPKDGPGETDAFGNSEGKELVASGENKIKQGLLPSLEDLLFYTIAEGQEKIPVHKFITALKSTGLRTSDPRLKECMDMLRLTLQTTSDGVMLDKDLFKKCVQSNIVLLTQAFRRKFVIPDFMSFTSHIDELYESAKKQSGGKVADYIPQLAKFSPDLWGVSVCTVDGQRHSTGDTKVPFCLQSCVKPLKYAIAVNDLGTEYVHRYVGKEPSGLRFNKLFLNEDDKPHNPMVNAGAIVVTSLIKQGVNNAEKFDYVMQFLNKMAGNEYVGFSNATFQSERESGDRNFAIGYYLKEKKCFPEGTDMVGILDFYFQLCSIEVTCESASVMAATLANGGFCPITGERVLSPEAVRNTLSLMHSCGMYDFSGQFAFHVGLPAKSGVAGGILLVVPNVMGMMCWSPPLDKMGNSVKGIHFCHDLVSLCNFHNYDNLRHFAKKLDPRREGGDQRVKSVINLLFAAYTGDVSALRRFALSAMDMEQRDYDSRTALHVAAAEGHVEVVKFLLEACKVNPFPKDRWNNTPMDEALHFGHHDVFKILQEYQVQYTPQGDSDNGKENQTVHKNLDGLL 0.003137706\n" + ] + }, { "data": { "text/plain": [ - "4437" + "'MMRLRGSGMLRDLLLRSPAGVSATLRRAQPLVTLCRRPRGGGRPAAGPAAAARLHPWWGGGGWPAEPLARGLSSSPSEILQELGKGSTHPQPGVSPPAAPAAPGPKDGPGETDAFGNSEGKELVASGENKIKQGLLPSLEDLLFYTIAEGQEKIPVHKFITALKSTGLRTSDPRLKECMDMLRLTLQTTSDGVMLDKDLFKKCVQSNIVLLTQAFRRKFVIPDFMSFTSHIDELYESAKKQSGGKVADYIPQLAKFSPDLWGVSVCTVDGQRHSTGDTKVPFCLQSCVKPLKYAIAVNDLGTEYVHRYVGKEPSGLRFNKLFLNEDDKPHNPMVNAGAIVVTSLIKQGVNNAEKFDYVMQFLNKMAGNEYVGFSNATFQSERESGDRNFAIGYYLKEKKCFPEGTDMVGILDFYFQLCSIEVTCESASVMAATLANGGFCPITGERVLSPEAVRNTLSLMHSCGMYDFSGQFAFHVGLPAKSGVAGGILLVVPNVMGMMCWSPPLDKMGNSVKGIHFCHDLVSLCNFHNYDNLRHFAKKLDPRREGGDQRVKSVINLLFAAYTGDVSALRRFALSAMDMEQRDYDSRTALHVAAAEGHVEVVKFLLEACKVNPFPKDRWNNTPMDEALHFGHHDVFKILQEYQVQYTPQGDSDNGKENQTVHKNLDGLL'" ] }, - "execution_count": 184, + "execution_count": 160, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "a.find(\"\n", - " \n", - " \")" + "codon_table = {\n", + " 'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M', \n", + " 'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T', \n", + " 'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K', \n", + " 'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R', \n", + " 'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L', \n", + " 'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P', \n", + " 'CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q', \n", + " 'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R', \n", + " 'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V', \n", + " 'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A', \n", + " 'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E', \n", + " 'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G', \n", + " 'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S', \n", + " 'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L', \n", + " 'TAC':'Y', 'TAT':'Y', 'TAA':'_', 'TAG':'_', \n", + " 'TGC':'C', 'TGT':'C', 'TGA':'_', 'TGG':'W', \n", + " }\n", + "\n", + "def determine_utrs(gene):\n", + " filename = \"/home/annaldas/projects/result/%s/%s_utr_regions.fa\" %(gene,gene)\n", + " bedfastafile = open(filename,\"r\")\n", + " bedfastalines = bedfastafile.readlines()\n", + " bedfastafile.close()\n", + " gene_utr = dict()\n", + " for line in bedfastalines:\n", + " if (line.startswith(\">\")):\n", + " trans_id = line[1:].strip()\n", + " if (trans_id not in gene_utr):\n", + " gene_utr[trans_id] = []\n", + " else:\n", + " gene_utr[trans_id].append(line.strip())\n", + " return gene_utr\n", + "\n", + "def score(seq,start): \n", + " kozak = {\n", + " \"A\":[0.25,0.61,0.27,0.15,1.00,0.00,0.00,0.23],\n", + " \"C\":[0.53,0.02,0.49,0.55,0.00,0.00,0.00,0.16],\n", + " \"G\":[0.15,0.36,0.13,0.21,0.00,0.00,1.00,0.46],\n", + " \"T\":[0.07,0.01,0.11,0.09,0.00,1.00,0.00,0.15]\n", + " }\n", + " \n", + " score = 1.0\n", + " for i in range(start,len(seq)):\n", + " score *= kozak[seq[i]][i]\n", + " return score\n", + " \n", + "\n", + "def translate(seq, i, utr_regions):\n", + " translating = True\n", + " aa = \"\"\n", + " \n", + " while(translating): \n", + " in_utr = False\n", + " for utr in utr_regions:\n", + " start,stop = utr\n", + " if ((start < i) and (i < stop)):\n", + " in_utr = True\n", + " \n", + " if ((len(seq) < 3) or (in_utr)):\n", + " translating = False\n", + " aa = \"\"\n", + " else:\n", + " codon = seq[0:3]\n", + " if (codon_table[codon] == \"_\"):\n", + " translating = False\n", + " else:\n", + " aa += codon_table[codon]\n", + " seq = seq[3:]\n", + " i += 3\n", + " return aa\n", + "\n", + "\n", + "def translate_aa_seq(seq,enst,gene_utrs):\n", + " utr_regions = []\n", + " for utr in gene_utrs[enst]:\n", + " pos = seq.find(utr)\n", + " if (pos != -1):\n", + " utr_regions.append([pos,pos + len(utr)])\n", + " \n", + " longest_aa_seq = \"\"\n", + " longest_aa_seq_sc = 0\n", + " for i in range(len(seq)):\n", + " if (seq[i:i+3] == \"ATG\"):\n", + " sc = score(seq[i-4:i+4],0)\n", + " aa = translate(seq[i:], i, utr_regions)\n", + " if ((aa != \"\") and (sc > longest_aa_seq_sc) and (aa not in longest_aa_seq)):\n", + " print(i,seq[i-4:i+4],aa,sc)\n", + " longest_aa_seq = aa\n", + " longest_aa_seq_sc = sc\n", + " return (longest_aa_seq,longest_aa_seq_sc)\n", + "\n", + "def find_all_aa_seqs(seq,enst):\n", + " gene_utrs = determine_utrs(\"GLS\")\n", + " \n", + " if (enst in gene_utrs):\n", + " longest_aa_seq,longest_aa_seq_sc = translate_aa_seq(seq,enst,gene_utrs)\n", + " else:\n", + " longest_aa_seq = \"\"\n", + " longest_aa_seq_sc = 0\n", + " for enst_id in gene_utr:\n", + " aa_seq,aa_seq_sc = translate_aa_seq(seq,enst_id,gene_utrs)\n", + " if (aa_seq_sc > longest_aa_seq_sc):\n", + " longest_aa_seq = aa_seq\n", + " longest_aa_seq_sc = aa_seq_sc\n", + " \n", + " return longest_aa_seq\n", + " \n", + "enst = \"ENST00000320717\"\n", + "a=\"AGTGCGGAGCCTTAGGCGGAGCGAAGAGAACCGGTCGCGGCAATCCTAGCGCGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCACCCGCATCCGCTGCGGGAGTCCGAGCCGGAACCACACCCAAGTAGCTGCCCTTTCCTCTTCTGTCATCTCACCGCCCCACCACAGACCGCGTTCCCCGAGGAAACCGGCCGCCCACGCCCGGAGCATCCTCCCCTGTTGAGCGGGCGCTGACGGACCCGGCGGCATGATGCGGCTGCGAGGCTCGGGGATGCTGCGGGACCTGCTCCTGCGGTCGCCCGCCGGCGTGAGCGCGACTCTGCGGCGGGCACAGCCCTTGGTCACCCTGTGCCGGCGTCCCCGAGGCGGGGGACGGCCGGCCGCGGGCCCGGCTGCCGCCGCGCGACTCCACCCGTGGTGGGGCGGGGGCGGCTGGCCGGCGGAGCCCCTCGCGCGGGGCCTGTCCAGCTCTCCTTCGGAGATCTTGCAGGAGCTGGGCAAGGGGAGCACGCATCCGCAGCCCGGGGTGTCGCCACCCGCTGCCCCGGCGGCGCCCGGCCCCAAGGACGGCCCCGGGGAGACGGACGCGTTTGGCAACAGCGAGGGCAAAGAGCTGGTGGCCTCAGGTGAAAATAAAATAAAACAGGGTCTGTTACCTAGCTTGGAAGATTTGCTGTTCTATACAATTGCTGAAGGACAAGAGAAAATACCTGTTCATAAATTTATTACAGCACTCAAATCTACAGGATTGCGAACGTCTGATCCCAGGTTGAAAGAGTGTATGGATATGTTAAGATTAACTCTTCAAACAACATCAGATGGTGTCATGCTAGACAAAGATCTTTTTAAAAAATGTGTTCAGAGCAACATTGTTTTGTTGACACAAGCATTTAGAAGAAAGTTTGTGATTCCTGACTTTATGTCTTTTACCTCACACATTGATGAGTTATATGAAAGTGCTAAAAAGCAGTCTGGAGGAAAGGTTGCAGATTATATTCCTCAACTGGCCAAATTCAGTCCCGATTTGTGGGGTGTGTCTGTTTGTACAGTAGATGGACAGAGGCATTCTACTGGAGATACCAAAGTTCCCTTCTGTCTTCAGTCCTGTGTAAAACCTTTGAAATATGCCATTGCTGTTAATGATCTTGGAACTGAATATGTGCATCGATATGTTGGAAAAGAGCCGAGTGGACTAAGATTCAACAAACTATTTTTGAATGAAGATGATAAACCACATAATCCTATGGTAAATGCTGGAGCAATTGTTGTGACTTCACTAATAAAGCAAGGAGTAAATAATGCTGAAAAATTTGACTATGTCATGCAGTTTTTGAATAAGATGGCTGGTAATGAATATGTTGGATTCAGTAATGCAACGTTTCAGTCTGAAAGAGAAAGTGGAGATCGAAATTTTGCAATAGGATATTACTTAAAAGAAAAGAAGTGTTTTCCAGAAGGCACAGACATGGTTGGTATATTAGACTTCTACTTCCAGCTGTGCTCCATTGAAGTGACTTGTGAATCAGCCAGTGTGATGGCTGCGACACTGGCTAATGGTGGTTTCTGCCCAATTACTGGTGAAAGAGTACTGAGCCCTGAAGCAGTTCGAAATACATTGAGTTTGATGCATTCCTGTGGCATGTATGACTTCTCAGGGCAGTTTGCTTTCCATGTTGGTCTTCCTGCAAAATCTGGAGTTGCTGGGGGCATTCTTTTAGTTGTCCCCAATGTTATGGGTATGATGTGCTGGTCTCCTCCTCTGGATAAGATGGGCAACAGTGTTAAGGGAATTCACTTTTGTCACGATCTTGTTTCTCTGTGTAATTTCCATAACTATGATAATTTGAGACACTTTGCAAAAAAACTTGATCCTCGAAGAGAAGGTGGTGATCAAAGGGTAAAGTCAGTGATAAATCTTTTGTTTGCTGCATATACTGGAGATGTGTCTGCACTTCGAAGATTTGCTTTGTCAGCTATGGACATGGAACAGCGGGACTATGATTCTAGAACAGCACTCCATGTAGCTGCTGCAGAGGGTCATGTTGAAGTTGTTAAATTTTTGCTGGAAGCCTGCAAAGTAAACCCTTTCCCCAAGGACAGGTGGAATAACACTCCCATGGATGAAGCACTGCACTTTGGACACCATGATGTATTTAAAATTCTCCAAGAATACCAAGTCCAGTACACACCTCAAGGAGATTCTGACAACGGGAAGGAAAATCAAACCGTCCATAAGAATCTTGATGGATTGTTGTAATGGTCTCAAATCCCAAGATTTAAATCACTTACCTATTTAATTGTGGAAAATGATTATGAAGAACATGTGTATTTCTATCTGGTAGTGATGTATATTTTACATTTGTCATTTCAGTGTTACTGGAGTTTTCTTCATTGTGCACACAGGACAAATCTGATCTCTTTGGGAAAAAATAGAAATAAAACAATCTCCCTCCATAATGTGAGCAATATTACCTCGTGCATTGTATAATTTGATGTAAAAGAAATAGTTACCAATGCTAGCTTGTGTGGTCTTCCATGATTTATTTGTGTTTTGTGAATTTTCAATTTATGGTGATGATCTGCTGATATGCATTTATAAAGTAAGCTCTGTTGTACAGTCTGTCCAAATGGGTCAAGGTTGCCTTTAGAAGCAAATAGTGTGATTTTCAAGACTTCAAATACAAATTTAGTTTAAGTGTTTGAACAACTATATGCACTTACGGTTGTGTGTTTAAAATGTCTCTCTCACCCCCTAGCTTCATGATGTGACTCTTAAAAAACTATAATAGTTAACAACTGTTAGTAAGATAGACCAATTCTGATTAGACTTTATCAGGGAATCTGTTTAAGATATGTTTGGTGACCAAAACGTATGTGTGAATGTAGTTATAATGCTTTTGAAAAATTTTCCTTTTTCTATATCCCCTTAGTCCAGCCTCTCTTCTCAGACATTTAGCTATCTGCCTCTTTCCTTTAGCTGGGAAAGTGAGAGCTGGCATACTATGCAGTTTTTATGTTTTCCATAGTAAGTCAGAAAATGCCTCCTATTTCTGGCATCAGAACTTTGCCATTTGTCTACAGAAGACGAACCAGAGACAAAATTACTAAGTATAAATTAGTCAAGTTTATCAGTCTAAAAAACGAAGGGATGTGCAACTGCAGCTCTTTAAGAAGTTTTTTTTTTTTAGCTTCTAGGGTAAAGATAAATTCAGAAATGCTCTAAGCTACCAAAGTTATTCTGAAAGTATGGGAACTGCTACAACTAACAAACATTTGTTTCCAAGCCTGTCATTAAGAGTCTGCATCAAGAGATTTGTCCTCCTTGGGGGACCACTGGATCATTCCAGATTTCTTGTGATTTTTCTATTGTGTAATTCTTGGTGGGCTCTGTAGTTTAATAATAAGAAAAAGGCCATTTCATTTTAAATTGTGACCTATAATTCTTTGTCTTGGGTTGGTAATTCAGGATTCATTTGGAAAGTGGGTAAAAGGGGCTTCAAAAAACGGATAGAACAGGATTTTCTAGGAGTTACACATACATTTTATCCTGTCATACCTCGAGATAAAGTGGCATGTTAGTGAGGAGTTCTGATATTAAGCACACACACACATGCACACAAATGGACTTCTCTGAAGCTGTGTTTAGTGAAATGAGCTCAAGTACATGAATGTTAGTTGTTATCACATACAGCAAATTCCTTTTTTTTTCTTTTTCTATGAGCACACTCTGCTGCTTCTAAACTTTACATGCCTGATGGCACCTTACTCCAGCAGCCTCCAGGTGCTTTCATTTTCACTTCCAGTCTAAGCCAGTGGCTCCTGCCACTGCCCTCCCATTACCTAGATGGCACCTCCTTTGGTGAAACCACGGCCAATGTTCCTTAGCTGCACCAGGCCCGAAGCTGTTCCCATGCTTGAGCTTCCATGGGGAGGATGCTGAGTGAGCAGTTTCCTACCCCGTGGATCTAGCAAGCCATGGAGACAGGTAGCATTTGTAAGATGCTGCACAGGAGCAGCATTATCCCCAAAGATATTACAGGGTAGACACGTTTTAACTGAAATCAATCAAGATAACTTTATTCAAAGAGCAGCCCGCTTTGTGTGACTAAAATGAAACAAGACAGTTGAATTGTGTGACTTGAAGATTACCAATGATTTTGAGGCTTTTCTATAATAAAAAGAGGTTCTAACCATTATTTGGGAACAAAGAGAGTTTTCATCTTTTTTCAGATCAAAACCATTCTGTAAAATCTTTGTTGTTTAATTAAATGTGCCGTTATTTACCCCTGATGTTATTTATGACTATGTGCCGATTCCTGCTCGGGCTGTTTGCTGTTGGCTGGTAATAATATATTTGATTTAAATGCTGTTGACTGTGCTATTAACTGCTGCCGTCAGTAAACTCCAAAGATCTTTTTGTTTTGGCTTTAGTATCATATGTGCTTTTTCTGTATCCTGAGCGCTCTATATGATCATGTTAATTTAAAGCTTTATACACATTGTTGTTTTTGCTGGTCTCATCTTTGGTAATATGCTATACCCCACTGCTGCCCGACACTGCCCTTTAGCTGCAGAGCTGGATTAGCTGTTGACCATTTGATGCTGTTGTCTGTCTGGCAGGGACTGAATGACCTGATGTCAGATTTAGATTCTTCCTGGGGATTACACAGCTATGAATGTATTTGCTTCTAAAACCTCCCAAAGTGAATCTAATCTTAAAACTACAAGTTGTAAGTATTCTGAAATTGGGAAACATTTATTTTAAATGCAATCAGGTAGTGTTGCTTTTTACAGCATAATAAATATATGTATCAAAAAAAAAA\"\n", + "find_all_aa_seqs(a,enst)" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "code", "execution_count": 18, @@ -835,6 +1078,13 @@ "file.close()\n", "print(common_domains)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/plotIsoforms.py b/plotIsoforms.py index a1b2543..cf5442c 100644 --- a/plotIsoforms.py +++ b/plotIsoforms.py @@ -117,7 +117,6 @@ def preprocessArguments(args): #conditions=list(set([x.split('_')[0] for x in samples])) #conditions = ["0","3","5"] conditions = ["day0","day3","day5"] - print(conditions) conditions.sort() number_replicates={} numerical=True @@ -162,7 +161,9 @@ def __init__(self, csv, gtf, geneIDs, geneNames, outDir, minTPM, maxIso, minPct) (identifier, targets, annotation, data, samples, conditions, number_replicates, x) = preprocessArguments(args) with PdfPages(outdir+'test.pdf') as pdf: - for gene in targets: + for gene_ensembl in targets: + gene = gene_ensembl.split("_")[0] + print(gene) df=data[data[identifier]==gene] if not df.shape[0]: continue @@ -180,15 +181,16 @@ def __init__(self, csv, gtf, geneIDs, geneNames, outDir, minTPM, maxIso, minPct) # isoform percentage calculation df_temp=(df_temp.filter(like='mean').div(data_gene.filter(like='mean').values[0],1)*100).add_prefix('Pct_').join(df_temp) #choose isoforms to plot + print(df_temp) df_temp=chooseIsoforms2Plot(df_temp,minimumTPM,minimumPct,maximumIso,annotation) - + x = [0,3,5] if df_temp.shape[0]: panels = 2 if (df_temp.shape[0] > 9): panels += 1 fig,axes = plt.subplots(panels,2,figsize = (12,9)) fig.subplots_adjust(top = 0.9) - fig.suptitle(gene,fontsize=16) + fig.suptitle(gene_ensembl,fontsize=16) #plot isoform expression axg=plt.subplot(panels,2,1) plotProfiles(x, df_temp, data_gene, axg, colors) diff --git a/test/list.txt b/test/list.txt index 3c82b7d..a6957de 100644 --- a/test/list.txt +++ b/test/list.txt @@ -1,8 +1,38 @@ -ADAM11 -SST -SHOX2 -TNC -GNG8 -IGF2 -REEP1 -CNTFR +BRD4 +BRD3 +BRD2 +PAF1 +CTR9 +CDC73 +LEO1 +RTF1 +WDR61 +SPT5 +SPT4 +SPT6 +TCEA1 +TCEA2 +TCEA3 +TCEANC +TCEANC2 +CDK9 +TRIM28 +SUPT16H +SSRP1 +ELF1 +CDK12 +SUPT16H +SSRP1 +ELL2 +AFF4 +SKI +CCNT1 +NELFA +NELFB +NELFC +NELFD +NELFE +TCEAL +SNUPN +MYC +MLLT1 \ No newline at end of file diff --git a/test/test.pdf b/test/test.pdf deleted file mode 100644 index 5cebb3b..0000000 Binary files a/test/test.pdf and /dev/null differ diff --git a/test/test1.pdf b/test/test1.pdf deleted file mode 100644 index fb1572f..0000000 Binary files a/test/test1.pdf and /dev/null differ diff --git a/test/test2.pdf b/test/test2.pdf deleted file mode 100644 index d1e4219..0000000 Binary files a/test/test2.pdf and /dev/null differ diff --git a/test/test_all_oldlist.pdf b/test/test_all_oldlist.pdf deleted file mode 100644 index b24f440..0000000 Binary files a/test/test_all_oldlist.pdf and /dev/null differ diff --git a/test/test_deseq2.pdf b/test/test_deseq2.pdf deleted file mode 100644 index 6b3b2d5..0000000 Binary files a/test/test_deseq2.pdf and /dev/null differ diff --git a/test/test_deseq2_tpm.pdf b/test/test_deseq2_tpm.pdf deleted file mode 100644 index 1a30353..0000000 Binary files a/test/test_deseq2_tpm.pdf and /dev/null differ diff --git a/test/test_tpm.pdf b/test/test_tpm.pdf deleted file mode 100644 index e0ff774..0000000 Binary files a/test/test_tpm.pdf and /dev/null differ diff --git a/translation_protein.py b/translation_protein.py index 79c4fdb..1dabda4 100644 --- a/translation_protein.py +++ b/translation_protein.py @@ -1,4 +1,5 @@ from Bio import SeqIO +import os def translate(seq,orf): seq = seq.upper() @@ -55,25 +56,46 @@ def translate(seq,orf): transcripts = SeqIO.index(transcripts_filename, "fasta") output = [] +gene = snakemake.input[0].split("/")[5] +os.mkdir("/home/annaldas/projects/result/%s/transcripts" %(gene)) + for transcript in transcripts: protein = translate(str(transcripts[transcript].seq),0) if (protein != ""): - output.append(">" + str(transcripts[transcript].id) + "_1") - output.append(protein) + transcript_name = str(transcripts[transcript].id) + "_1" + transcript_filename = transcript_name.replace("|","_") + transcript_filename = transcript_filename.replace("_","") + transcript_filename_path = "/home/annaldas/projects/result/%s/transcripts/%s_map_protein.fa" %(gene,transcript_filename) + #os.mkdir("/home/annaldas/projects/result/%s/transcripts/%s" %(gene,transcript_filename)) + transcript_file = open(transcript_filename_path, "w+") + transcript_file.write(">" + transcript_name + "\n" + protein) + transcript_file.close() protein = translate(str(transcripts[transcript].seq),1) if (protein != ""): - output.append(">" + str(transcripts[transcript].id) + "_2") - output.append(protein) + transcript_name = str(transcripts[transcript].id) + "_2" + transcript_filename = transcript_name.replace("|","_") + transcript_filename = transcript_filename.replace("_","") + transcript_filename_path = "/home/annaldas/projects/result/%s/transcripts/%s_map_protein.fa" %(gene,transcript_filename) + #os.mkdir("/home/annaldas/projects/result/%s/transcripts/%s" %(gene,transcript_filename)) + transcript_file = open(transcript_filename_path, "w+") + transcript_file.write(">" + transcript_name + "\n" + protein) + transcript_file.close() protein = translate(str(transcripts[transcript].seq),2) if (protein != ""): - output.append(">" + str(transcripts[transcript].id) + "_3") - output.append(protein) + transcript_name = str(transcripts[transcript].id) + "_3" + transcript_filename = transcript_name.replace("|","_") + transcript_filename = transcript_filename.replace("_","") + transcript_filename_path = "/home/annaldas/projects/result/%s/transcripts/%s_map_protein.fa" %(gene,transcript_filename) + #os.mkdir("/home/annaldas/projects/result/%s/transcripts/%s" %(gene,transcript_filename)) + transcript_file = open(transcript_filename_path, "w+") + transcript_file.write(">" + transcript_name + "\n" + protein) + transcript_file.close() -output_filename = snakemake.output[0] -output_file = open(output_filename,"w+") -output_file.write("\n".join(output)) -output_file.close() +#output_filename = snakemake.output[0] +#output_file = open(output_filename,"w+") +#output_file.write("\n".join(output)) +#output_file.close() \ No newline at end of file