Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
isoform_differentiation/isoform_transcripts.ipynb
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
939 lines (939 sloc)
80.1 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Siddharth Annaldasula\n", | |
"# Last Modified: 15.10.2019" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Command line prompts to download data\n", | |
"\n", | |
"# /home/annaldas/ncbi-blast-2.9.0+/bin/blastx -query test.fa -db refseq_protein -out test.out -evalue 1e-5 -max_target_seqs 1 -outfmt 7" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import pandas as pd\n", | |
"import numpy as np\n", | |
"from Bio import SeqIO" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"annotation_filename = \"/project/owlmayerTemporary/Sid/nanopore-analysis/Results_5_1/GffCompare/nanopore.combined.gtf\"\n", | |
"# Read the whole GTF into memory; the context manager closes the handle even if the read fails.\n", | |
"with open(annotation_filename, \"r\") as annotate_df:\n", | |
"    annotate_df_lines = annotate_df.readlines()\n", | |
"transcripts = []\n", | |
"\n", | |
"# Collect the ID of every transcript whose exon count differs from one.\n", | |
"# A transcript can only be judged once all of its exon lines have been seen, i.e. when the\n", | |
"# next transcript line (or the end of the file) is reached.\n", | |
"exon = 1\n", | |
"lines = []\n", | |
"result = []\n", | |
"transcript = None  # ID of the transcript whose exons are currently being counted\n", | |
"for line in annotate_df_lines:\n", | |
"    fields = line.split(\"\\t\")\n", | |
"    # Comment/header lines and truncated lines have no feature or attribute column; skip them\n", | |
"    # instead of failing on the column lookups below.\n", | |
"    if line.startswith(\"#\") or len(fields) < 9:\n", | |
"        continue\n", | |
"    t = fields[2]\n", | |
"    if t == \"transcript\":\n", | |
"        if transcript is not None and exon != 1:\n", | |
"            result.append(transcript)\n", | |
"        # The ID is the quoted value of the first ';'-separated attribute.\n", | |
"        transcript = fields[8].split(\";\")[0].split(\" \")[-1][1:-1].strip()\n", | |
"        exon = 0\n", | |
"    elif t == \"exon\":\n", | |
"        exon += 1\n", | |
"    # NOTE(review): the original nesting of this statement was lost; it is kept at loop level\n", | |
"    # (every parsed line is collected) - confirm against the original notebook.\n", | |
"    lines.append(line.strip())\n", | |
"\n", | |
"# The final transcript has no following transcript line to trigger the check, so test it here.\n", | |
"if transcript is not None and exon != 1:\n", | |
"    result.append(transcript)\n", | |
"\n", | |
"#output_filename = \"/project/owlmayerTemporary/Sid/nanopore-analysis/Results_5_1/multiple_exons.txt\"\n", | |
"#output = open(output_filename,\"w+\")\n", | |
"#output.writelines(\"\\n\".join(result))\n", | |
"#output.close()\n", | |
"\n", | |
"#output_filename = \"/project/owlmayerTemporary/Sid/nanopore-analysis/Results_10_1/Results_10_1/GffCompare/nanopore.combined_filt.gtf\"\n", | |
"#output = open(output_filename,\"w+\")\n", | |
"#output.writelines(\"\\n\".join(result))\n", | |
"#output.close()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 32, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"['6356d404-48e6-4c7a-8fbd-5f6e7f9b42bc|5',\n", | |
" 'a5e2697c-74a1-4ac9-a5b2-d974af83137d|4']" | |
] | |
}, | |
"execution_count": 32, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"gene_oID[\"KTN1-AS1\"]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Import the nanopore annotation file\n", | |
"\n", | |
"annotation_filename = \"/project/owlmayerTemporary/Sid/nanopore-analysis/Results_5_1/GffCompare/nanopore.combined_filt.gtf\"\n", | |
"# The file has no header row, so columns are addressed by position.\n", | |
"annotate_df = pd.read_csv(annotation_filename, sep=\"\\t\", header=None)\n", | |
"# Keep only the rows whose third column is not \"exon\".\n", | |
"annotate_df = annotate_df.loc[annotate_df[2] != \"exon\"]\n", | |
"# Parallel lists, one entry per remaining row: attribute string, column 0, column 3, column 4.\n", | |
"annotate_lines = list(annotate_df[8])\n", | |
"chrms, start, stop = (list(annotate_df[column]) for column in (0, 3, 4))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Lookup tables built from the attribute column of the non-exon annotation rows:\n", | |
"#   gene_oID: gene name -> list of oIDs\n", | |
"#   oID_gene: oID -> gene name (first one seen)\n", | |
"#   oID_tID:  oID -> transcript id (first one seen)\n", | |
"#   tID_oID:  transcript id -> oID (first one seen)\n", | |
"\n", | |
"BRD4_info = []\n", | |
"\n", | |
"gene_oID = dict()\n", | |
"oID_gene = dict()\n", | |
"oID_tID = dict()\n", | |
"tID_oID = dict()\n", | |
"\n", | |
"#tID_exon = dict()\n", | |
"\n", | |
"for ann, attributes in enumerate(annotate_lines):\n", | |
"    # Rows without a gene_name attribute are not mapped.\n", | |
"    if \"gene_name\" not in attributes:\n", | |
"        continue\n", | |
"    line = attributes.split(\";\")\n", | |
"    # Fields are taken by position (0, 2, 3); each value is the last space-separated token\n", | |
"    # with its surrounding quote characters removed.\n", | |
"    tID = line[0].split(\" \")[-1][1:-1]\n", | |
"    gene = line[2].split(\" \")[-1][1:-1]\n", | |
"    oID = line[3].split(\" \")[-1][1:-1]\n", | |
"\n", | |
"    gene_oID.setdefault(gene, []).append(oID)\n", | |
"\n", | |
"    if oID not in oID_tID:\n", | |
"        oID_tID[oID] = tID\n", | |
"    if tID not in tID_oID:\n", | |
"        tID_oID[tID] = oID\n", | |
"    else:\n", | |
"        # The same transcript id appeared on more than one row.\n", | |
"        print(\"this sucks\")\n", | |
"    # The guard must consult oID_gene itself; testing gene_oID (keyed by gene name) never\n", | |
"    # matched an oID, so earlier entries were silently overwritten.\n", | |
"    if oID not in oID_gene:\n", | |
"        oID_gene[oID] = gene\n", | |
"\n", | |
"    if gene == \"BRD4\":\n", | |
"        BRD4_info.append([chrms[ann], start[ann], stop[ann], attributes])\n", | |
"    #if (tID not in tID_exon): tID_exon[tID] = []\n", | |
"\n", | |
"KDM1A = gene_oID[\"KDM1A\"]\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"oID_tID[\"142e462f-f586-42fc-97b9-b2e3bfa1fd0d\"]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Import transcript isoform sequences\n", | |
"\n", | |
"transcripts_filename = \"/home/annaldas/projects/nanopore-transcriptome-analysis/Results/Pinfish/corrected_transcriptome_polished_collapsed.fas\"\n", | |
"# Index the FASTA file so records can be looked up by ID in later cells.\n", | |
"transcripts = SeqIO.index(transcripts_filename, format=\"fasta\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Extracting isoforms from related genes\n", | |
"# Builds FASTA-style text: a \">\" header line followed by the sequence line, per isoform.\n", | |
"\n", | |
"output = []\n", | |
"for gene in gene_oID:\n", | |
"    for oID in gene_oID[gene]:\n", | |
"        # Fetch the record once; each lookup on the index is a separate retrieval.\n", | |
"        record = transcripts[oID]\n", | |
"        tID = \">\" + oID_tID[record.id]\n", | |
"        seq = str(record.seq)\n", | |
"        output.extend([tID, seq])\n", | |
"\n", | |
"#output_filename = \"/project/owlmayerTemporary/Sid/blast/test/polished_transcripts_tcons.fa\"\n", | |
"#output_file = open(output_filename,\"w+\")\n", | |
"#output_file.write(\"\\n\".join(output))\n", | |
"#output_file.close()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# oID_geneID: transcript id (text before the first \"|\") -> gene id\n", | |
"# geneID_oID: gene id -> transcript id (the last one read for that gene)\n", | |
"oID_geneID = dict()\n", | |
"geneID_oID = dict()\n", | |
"\n", | |
"filename = \"/home/annaldas/projects/nanopore-transcriptome-analysis/Results/Pinfish/clustered_transcripts_collapsed.gff\"\n", | |
"with open(filename, \"r\") as gff:\n", | |
"    lines = gff.readlines()\n", | |
"\n", | |
"for line in lines:\n", | |
"    if line.startswith(\"##\"):\n", | |
"        continue\n", | |
"    # Only the last tab-separated column is used; it is split into ';'-separated attributes.\n", | |
"    s = line.split(\"\\t\")[-1]\n", | |
"    g_t = s.split(\";\")\n", | |
"    if len(g_t) <= 2:\n", | |
"        continue\n", | |
"    # The ids are the first double-quoted value of the first and second attribute.\n", | |
"    geneid = g_t[0].split('\"')[1]\n", | |
"    transcriptid = g_t[1].split('\"')[1].split(\"|\")[0]\n", | |
"    oID_geneID[transcriptid] = geneid\n", | |
"    geneID_oID[geneid] = transcriptid\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"print(tID_oID[\"TCONS_00032753\"],oID_gene[tID_oID[\"TCONS_00032753\"]])\n", | |
"\n", | |
"# For every BRD4 oID print the oID, its gene id and its transcript id.\n", | |
"for oID in gene_oID[\"BRD4\"]:\n", | |
"    try:\n", | |
"        print(oID, oID_geneID[oID.split(\"|\")[0]], oID_tID[oID])\n", | |
"    except KeyError:\n", | |
"        # Only a missing dictionary entry is expected here; any other error (for example a\n", | |
"        # dictionary that was never built) should surface instead of printing \"None\".\n", | |
"        print(\"None\")\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"scrolled": true | |
}, | |
"outputs": [], | |
"source": [ | |
"filename = \"/home/annaldas/projects/nanopore-transcriptome-analysis/Results/Quantification/all_counts.txt\"\n", | |
"df = pd.read_csv(filename)\n", | |
"\n", | |
"# Two sets of sample columns; each per-set frame is the full table minus the other set.\n", | |
"rep1 = [\"OJ32\", \"OJ33\", \"OJ34\"]\n", | |
"rep2 = [\"OJ40\", \"OJ41\", \"OJ42\"]\n", | |
"df_rep1 = df.drop(rep2, axis=1)\n", | |
"df_rep2 = df.drop(rep1, axis=1)\n", | |
"\n", | |
"#for name in rep1:\n", | |
"# df_rep1[name] = (df_rep1[name] - df_rep1[name].min())/(df_rep1[name].max() - df_rep1[name].min())\n", | |
" \n", | |
"#for name in rep2:\n", | |
"# df_rep2[name] = (df_rep2[name] - df_rep2[name].min())/(df_rep2[name].max() - df_rep2[name].min())" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"scrolled": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Print the rows of one gene from df_rep1, ordered by transcript id.\n", | |
"# Grouping by the column name (not a one-element list) keeps the group keys plain strings;\n", | |
"# with a list grouper newer pandas versions yield tuple keys, so a comparison with the gene\n", | |
"# name would never match and nothing would be printed.\n", | |
"group = df_rep1.groupby(\"gene_name\")\n", | |
"\n", | |
"key = \"ABI2\"\n", | |
"if key in group.groups:\n", | |
"    item = group.get_group(key)\n", | |
"    #for name in rep1:\n", | |
"    #    item[name] = (item[name] - item[name].mean())/item[name].std()\n", | |
"    print(key)\n", | |
"    print(item.sort_values(\"transcript_id\"))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"scrolled": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Print the rows of one gene from df_rep2, ordered by transcript id.\n", | |
"# Grouping by the column name (not a one-element list) keeps the group keys plain strings;\n", | |
"# with a list grouper newer pandas versions yield tuple keys, so a comparison with the gene\n", | |
"# name would never match and nothing would be printed.\n", | |
"group = df_rep2.groupby(\"gene_name\")\n", | |
"\n", | |
"gene = \"FUS\"\n", | |
"if gene in group.groups:\n", | |
"    item = group.get_group(gene)\n", | |
"    print(gene)\n", | |
"    #for name in rep2:\n", | |
"    #    item[name] = (item[name] - item[name].mean())/item[name].std()\n", | |
"    #for name in rep2:\n", | |
"    #    if (item[name].max() > 3):\n", | |
"    #        print(gene)\n", | |
"    print(item.sort_values(\"transcript_id\"))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Print the rows of one gene from the full counts table, ordered by transcript id.\n", | |
"# Grouping by the column name (not a one-element list) keeps the group keys plain strings;\n", | |
"# with a list grouper newer pandas versions yield tuple keys, so a comparison with the gene\n", | |
"# name would never match and nothing would be printed.\n", | |
"group = df.groupby(\"gene_name\")\n", | |
"\n", | |
"gene = \"TLL3\"\n", | |
"if gene in group.groups:\n", | |
"    item = group.get_group(gene)\n", | |
"    print(gene)\n", | |
"    #for name in rep2:\n", | |
"    #    item[name] = (item[name] - item[name].mean())/item[name].std()\n", | |
"    #for name in rep2:\n", | |
"    #    if (item[name].max() > 3):\n", | |
"    #        print(gene)\n", | |
"    print(item.sort_values(\"transcript_id\"))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Build FASTA-style text for the isoforms of one gene: a \">\" header line followed by the\n", | |
"# sequence line, per isoform.\n", | |
"output = []\n", | |
"gene = \"BRD4\"\n", | |
"for oID in gene_oID[gene]:\n", | |
"    # Fetch the record once; each lookup on the index is a separate retrieval.\n", | |
"    record = transcripts[oID]\n", | |
"    tID = \">\" + oID_tID[record.id]\n", | |
"    seq = str(record.seq)\n", | |
"    output.extend([tID, seq])\n", | |
"\n", | |
" \n", | |
"#output_filename = \"/project/owlmayerTemporary/Sid/blast/test/kdm1a.fa\"\n", | |
"#output_file = open(output_filename,\"w+\")\n", | |
"#output_file.write(\"\\n\".join(output))\n", | |
"#output_file.close()\n", | |
"\n", | |
"\n", | |
"#/home/annaldas/ncbi-blast-2.9.0+/bin/blastx -query /project/owlmayerTemporary/Sid/blast/test/kdm1a.fa -db /project/owlmayerTemporary/Sid/blast/test/human.protein.fa -out kdm1a.out -evalue 1e-5 -max_target_seqs 5 -max_hsps 1 -outfmt 7 -num_threads 4" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Load every line of the reference annotation into memory for the cells below.\n", | |
"afilename = \"/project/owlmayerTemporary/Sid/nanopore-analysis/ReferenceData/gencode.v32.primary_assembly.annotation.gtf\"\n", | |
"with open(afilename, \"r\") as afile:\n", | |
"    afile_lines = afile.readlines()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Split every non-comment line into its tab-separated fields.\n", | |
"output = [\n", | |
"    line.strip().split(\"\\t\")\n", | |
"    for line in afile_lines\n", | |
"    if not line.startswith(\"#\")\n", | |
"]\n", | |
"column_names = [\"chr\", \"source\", \"type\", \"start\", \"stop\", \"a\", \"b\", \"c\", \"info\"]\n", | |
"pd_aline = pd.DataFrame(output, columns=column_names)\n", | |
"# The fields arrive as text; the coordinates are converted to integers.\n", | |
"pd_aline = pd_aline.astype({\"start\": \"int32\", \"stop\": \"int32\"})\n", | |
"#print(pd_aline[\"type\"].unique())\n", | |
"# Keep only the rows whose type is \"UTR\".\n", | |
"pd_aline = pd_aline[pd_aline[\"type\"] == \"UTR\"]\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"info = list(pd_aline[\"info\"])\n", | |
"types = list(pd_aline[\"type\"])\n", | |
"# Print the attributes and type of every row belonging to one transcript id.\n", | |
"for ann, entry in enumerate(info):\n", | |
"    if \"gene_name\" not in entry:\n", | |
"        continue\n", | |
"    line = entry.split(\";\")\n", | |
"    # Second attribute: quoted id with everything from the first \".\" onwards removed.\n", | |
"    transID = line[1].split(\" \")[-1][1:-1].split(\".\")[0]\n", | |
"    gene = line[3].split(\" \")[-1][1:-1]\n", | |
"\n", | |
"    if transID == \"ENST00000391839\":\n", | |
"        print(line, types[ann])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"scrolled": true | |
}, | |
"outputs": [], | |
"source": [ | |
"pd_aline.to_csv(path_or_buf = \"/project/owlmayerTemporary/Sid/nanopore-analysis/ReferenceData/df_utr_regions.csv\",index = False)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"scrolled": true | |
}, | |
"outputs": [], | |
"source": [ | |
"df_utr_regions = pd.read_csv(\"/project/owlmayerTemporary/Sid/nanopore-analysis/ReferenceData/df_utr_regions.csv\")\n", | |
"\n", | |
"# NOTE: chrms, start and stop replace the lists of the same names built from the nanopore\n", | |
"# annotation in an earlier cell.\n", | |
"info = list(df_utr_regions[\"info\"])\n", | |
"chrms = list(df_utr_regions[\"chr\"])\n", | |
"start = list(df_utr_regions[\"start\"])\n", | |
"stop = list(df_utr_regions[\"stop\"])\n", | |
"\n", | |
"# Print one tab-separated line (chr, start, stop, transcript id) per row of the chosen gene.\n", | |
"for ann, entry in enumerate(info):\n", | |
"    if \"gene_name\" not in entry:\n", | |
"        continue\n", | |
"    line = entry.split(\";\")\n", | |
"    transID = line[1].split(\" \")[-1][1:-1].split(\".\")[0]\n", | |
"    gene = line[3].split(\" \")[-1][1:-1]\n", | |
"\n", | |
"    if gene == \"EBF2\":\n", | |
"        print(\"\\t\".join([chrms[ann], str(start[ann]), str(stop[ann]), transID]))\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"with open(\"/home/annaldas/projects/result/RPS24/RPS24_utr_regions.fa\") as bedfastafile:\n", | |
"    bedfastalines = bedfastafile.readlines()\n", | |
"\n", | |
"# trans_utr: header text (without the leading \">\") -> list of the lines that follow it.\n", | |
"trans_utr = dict()\n", | |
"for line in bedfastalines:\n", | |
"    if not line.startswith(\">\"):\n", | |
"        # Non-header line: it belongs to the most recently read header.\n", | |
"        trans_utr[trans_id].append(line.strip())\n", | |
"        continue\n", | |
"    trans_id = line[1:].strip()\n", | |
"    # A repeated header keeps accumulating into its existing list.\n", | |
"    trans_utr.setdefault(trans_id, [])\n", | |
"\n", | |
"transcript_id = \"ENST00000372360\"\n", | |
"s = '''CTCTTTTCCTCCTTGGCTGTCTGAAGATAGATCGCCATCATGAACGACACCGTAACTATCCGCACTAGAAAGTTCATGACCAACCGACTACTTCAGAGGAAACAAATGGTCATTGATGTCCTTCACCCCGGGAAGGCGACAGTGCCTAAGACAGAAATTCGGGAAAAACTAGCCAAAATGTACAAGACCACACCGGATGTCATCTTTGTATTTGGATTCAGAACTCATTTTGGTGGTGGCAAGACAACTGGCTTTGGCATGATTTATGATTCCCTGGATTATGCAAAGAAAAATGAACCCAAACATAGACTTGCAAGACATGGCCTGTATGAGAAGAAAAAGACCTCAAGAAAGCAACGAAAGGAACGCAAGAACAGAATGAAGAAAGTCAGGGGGACTGCAAAGGCCAATGTTGGTGCTGGCAAAAAGTGAGCTGGAGATTGGATCACAGCCGAAGGAGTAAAGGTGCTGCAATGATGTTAGCTGTGGCCACTGTGGATTTTTCGCAAGAACATTAATAAACTAAAAAC'''" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Remove the first occurrence of each stored UTR string of transcript_id from the sequence s,\n", | |
"# printing where each one was found (-1 when it is absent).\n", | |
"if transcript_id in trans_utr:\n", | |
"    seq = s\n", | |
"    for utr in trans_utr[transcript_id]:\n", | |
"        pos = seq.find(utr)\n", | |
"        print(pos, utr)\n", | |
"        # Deleting at most one occurrence leaves seq unchanged when utr is absent.\n", | |
"        seq = seq.replace(utr, \"\", 1)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# For every entry of trans_utr, start again from the full sequence s and remove the first\n", | |
"# occurrence of each of its UTR strings, printing where each one was found (-1 when absent).\n", | |
"for t in trans_utr:\n", | |
"    seq = s\n", | |
"    for utr in trans_utr[t]:\n", | |
"        pos = seq.find(utr)\n", | |
"        print(pos, utr)\n", | |
"        # Deleting at most one occurrence leaves seq unchanged when utr is absent.\n", | |
"        seq = seq.replace(utr, \"\", 1)\n", | |
"    #print(t, len(trans_utr[t]),trans_utr[t])\n", | |
"# Only the result for the last entry survives the loop.\n", | |
"a = seq" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"a = '''GGCGGCGGCGGCGGCGGCGGCGGCGGCGGCTGGGCTGTTTGTTCTGGTCTCCCGCAGCCGAGGAGCCGAAGCAGTGGCGGCGGCAGCGGCTGCGGCGGCTGCCGGCGGTGCCCGCGGGCGAGCGCGGCCTGTGAGCTCGGCAGAGCGGCGGGCGGGCCCCGGCGCCGCGCAGGCAGCTCGGGGAGGGGGCGGCGGCAGCGGGCGGACGGCCGGCGGGGGCGGCGTGCGGCCTAGCGTCTCAGAGTGCCTGGTGAAGAATGTGATGGGATCACTAGCATGTCTGCGGAGAGCGGCCCTGGGACGAGATTGAGAAATCTGCCAGTAATGGGGGATGGACTAGAAACTTCCCAAATGTCTACAACACAGGCCCAGGCCCAACCCCAGCCAGCCAACGCAGCCAGCACCAACCCCCCGCCCCCAGAGACCTCCAACCCTAACAAGCCCAAGAGGCAGACCAACCAACTGCAATACCTGCTCAGAGTGGTGCTCAAGACACTATGGAAACACCAGTTTGCATGGCCTTTCCAGCAGCCTGTGGATGCCGTCAAGCTGAACCTCCCTGATTACTATAAGATCATTAAAACGCCTATGGATATGGGAACAATAAAGAAGCGCTTGGAAAACAACTATTACTGGAATGCTCAGGAATGTATCCAGGACTTCAACACTATGTTTACAAATTGTTACATCTACAACAAGCCTGGAGATGACATAGTCTTAATGGCAGAAGCTCTGGAAAAGCTCTTCTTGCAAAAAATAAATGAGCTACCCACAGAAGAAACCGAGATCATGATAGTCCAGGCAAAAGGAAGAGGACGTGGGAGGAAAGAAACAGGGACAGCAAAACCTGGCGTTTCCACGGTACCAAACACAACTCAAGCATCGACTCCTCCGCAGACCCAGACCCCTCAGCCGAATCCTCCTCCTGTGCAGGCCACGCCTCACCCCTTCCCTGCCGTCACCCCGGACCTCATCGTCCAGACCCCTGTCATGACAGTGGTGCCTCCCCAGCCACTGCAGACGCCCCCGCCAGTGCCCCCCCAGCCACAACCCCCACCCGCTCCAGCTCCCCAGCCCGTACAGAGCCACCCACCCATCATCGCGGCCACCCCACAGCCTGTGAAGACAAAGAAGGGAGTGAAGAGGAAAGCAGACACCACCACCCCCACCACCATTGACCCCATTCACGAGCCACCCTCGCTGCCCCCGGAGCCCAAGACCACCAAGCTGGGCCAGCGGCGGGAGAGCAGCCGGCCTGTGAAACCTCCAAAGAAGGACGTGCCCGACTCTCAGCAGCACCCAGCACCAGAGAAGAGCAGCAAGGTCTCGGAGCAGCTCAAGTGCTGCAGCGGCATCCTCAAGGAGATGTTTGCCAAGAAGCACGCCGCCTACGCCTGGCCCTTCTACAAGCCTGTGGACGTGGAGGCACTGGGCCTACACGACTACTGTGACATCATCAAGCACCCCATGGACATGAGCACAATCAAGTCTAAACTGGAGGCCCGTGAGTACCGTGATGCTCAGGAGTTTGGTGCTGACGTCCGATTGATGTTCTCCAACTGCTATAAGTACAACCCTCCTGACCATGAGGTGGTGGCCATGGCCCGCAAGCTCCAGGATGTGTTCGAAATGCGCTTTGCCAAGATGCCGGACGAGCCTGAGGAGCCAGTGGTGGCCGTGTCCTCCCCGGCAGTGCCCCCTCCCACCAAGGTTGTGGCCCCGCCCTCATCCAGCGACAGCAGCAGCGATAGCTCCTCGGACAGTGACAGTTCGACTGATGACTCTGAGGAGGAGCGAGCCCAGCGGCTGGCTGAGCTCCAGGAGCAGCTCAAAGCCGTGCACGAGCAGCTTGCAGCCCTCTCTCAGCCCCAGCAGAACAAACCAAAGAAAAAGGAGAAAGACAAGAAGGAAAAGAAAAAAGAAAAGCACAAAAGGAAAGAGGAAGTGGAAGAGAATAAAAAAAGCAAAGCCAAGGAACCTCCTCCTAAAAAG
ACGAAGAAAAATAATAGCAGCAACAGCAATGTGAGCAAGAAGGAGCCAGCGCCCATGAAGAGCAAGCCCCCTCCCACGTATGAGTCGGAGGAAGAGGACAAGTGCAAGCCTATGTCCTATGAGGAGAAGCGGCAGCTCAGCTTGGACATCAACAAGCTCCCCGGCGAGAAGCTGGGCCGCGTGGTGCACATCATCCAGTCACGGGAGCCCTCCCTGAAGAATTCCAACCCCGACGAGATTGAAATCGACTTTGAGACCCTGAAGCCGTCCACACTGCGTGAGCTGGAGCGCTATGTCACCTCCTGTTTGCGGAAGAAAAGGAAACCTCAAGCTGAGAAAGTTGATGTGATTGCCGGCTCCTCCAAGATGAAGGGCTTCTCGTCCTCAGAGTCGGAGAGCTCCAGTGAGTCCAGCTCCTCTGACAGCGAAGACTCCGAAACAGGTCCTGCCTAATCATTGGACACGGACTCTTAATAAAACGGTCTTCAGTTCCAGATTCCTTCCCAGCAAGCTATAGCTTAAGTCCATTTTCTTCCGTGAAAGGGACAGGACTCCATCAAGTTATGGAATTCCTCAGAGCCCTGGGCCTGTCCCCCGGGGTGGATTAGTCATGTCCAGCAGCACACGCCTAGTCCCGCCTTCGGGAAGGCTGCCTGCCTGGCCAGCCGCCCAGGCCTCTCTGTGTAAAGACTGCCTGGCTGTCCTGCCCAGCCTTCCTGGTTCTCTGGGGTCCTCTGGGTGGGTGGCATCTCCTGGAGGGTGATGACAATCCCCAACACATGCATTCATGTGGTGCTACTCTGTGTGCAAAGCCAGACCCCAAGTATGTTTTCTCTCTTTGTCCCATCCCTCTTTTTCTGGGACTTTGGACCCTAACTACTTCCCTCCTGAACCTTGCAGTGACATCAGTCCAGGAGAGCTCTCGTTCAGTGTGCGGAAGAACACTCTGACCTCTAGAGCTGTCCTAGATAAGGAGTGGGAGCTTTAGAGGCAAGGCCTCTAGACCCTGGAAGGCTCAGTGAGGCTCTTCCCACAGCATGCTTCTCACTGGTGCCCTGTAAGGCTCGAGCCACCGCTGACTCTGAGCCTTTTGGAGTCTTTCCTCCTTCGTCTCCATTGTTCCCGTGCATTTCCAAAAGCTTAAGTTGCCTGGTGGGCATTTCCCCAGTTTCTTTGGCCTCCGTCTTCTCAAGTCACATAGGGAAAGTACCTCCTGGAACCAGGCTGCAGTATGCAGGACCTGCCAGGCAGGCACTGGTGAAGGGCCTTGGGCCTATCATCCCCCCAACCCCACCTCACCCCACCCGCCTCCTCTAGTGGGGTGAGTCTGGGCTGGTGGACCAGAGAGGGTGTCACAGACCCTCAGGGACTGCCCCATGGACACCTCTGACTGGTGTTAACAGTGTGAACATTTTCCCCGTCTTCAGTCCCTTAGAATGACGACAGCCCCTGGGGTTGGGGCAGGCGAGTGTGGCCACATCATCCAAGCCCTCCCAGAGACACAAATAGGCTTTTTTGCTCTAAAAATAAATACCAGCCCTTTTTTGGTCACAAATCCAGCATCTCAGCAGAAAACTGCCTGACATGAAAAGTCCCCTGAGGAACTGCATCTGCGTTTCAGGGGCTTTTCATTTTTTCTCCTTTTTTAAAGTGTAGATTGTGGGTGCTTCCTAGAGGCCTGCCTTCTTCTGGAACTGGAAGTGGGCTATCACCATGGGCAAGCCCTTGGGTGCAGGCTCCCCACCTGCCTGGGAACTCTGGCAGCTCTCCTCAGCTCCTTGGGCTTGAGCAGCTGCAACTGCCCCAGATTTGCTGTGGAAGCAGGGGCTAGCCCTGGCCTCACCAGGGCCTCCCGGGGCCCTGCATTGATGCTCAGGAGTTCCTGGGCTGCTCTTGATCCTTTCTGGGCATCCAGCTTCCAGTTAAGCTCTGTTTGCCAAACAAACTATTCTCAGCTGCCCTTTGGCCTGCGCCTGATGTGTTCCTGTTGCAGTCCCGCC
TGCCTGAGACAGGAGCAGGCAGGAGAGCCTTCATGCCCAGATTCCCACAGGACAATTGGGGAGCTGCTGGCATTGTCTTTCTGGGAAGATTCTGCTTTCTTGGACCAAATGGCAGCCTGATTACCAGTGTCGGGCCTGCATGCTGCCCCCGACACACGCACGCACGCGCACACACGTGTGCACATGGGCCATAGCCACAAGCCAGCTCTCCTCCAGGGTCCTTTCAACCTCGCTGTCCAGGGACCCTGTCCTTCTTGCCCGTGGGGCTTCCATCTGGCAGAGAACGTTCAGGGCTTGTTGAACTTGAAAGCTCATTAGACTTAAGCTGTCACCTGTGCTTGGTGCCCCAGGAACAGCCAGAGAGGACAGTGCCCACTCACTTCTTGTTGGCAGCCTCCTGTGCAGGAAGTGCCAGCCGGGCCTCGACGCACCAGCTGGCTGTGGGTCCTGAGGAGGGGCGGGAGGCGGCCGCTCAGTGCAGATGGGGACTCCTCTCCTCTGCCCTGACCTTACCCTCCATTACCTCCTTCACTGGAGTGGGGCTGGGGGGTGGGTGGAATCAGTGTTTTAATCGGATTTTTAAAAAACATTTTATTTCTTTGTACAATTACCATCCTATGTAAAGATGAAATTTGTGTTGAGTTGAAGATTGTCATGGAATAAAGATCACACCGTA'''\n", | |
"\n", | |
"#a = '''GTACGCGACGGCGGTTGGCGGCGCGCGGGCAGCGTGAAGCGAGGCGAGGCAAGGCTTTTCGGACCCACGGAGCGACAGAGCGAGCGGCCCCTACGGCCGTCGGCGGCCCGGCGGCCCGAGATGTTATCTGGGAAGAAGGCGGCAGCCGCGGCGGCGGCGGCTGCAGCGGCAGCAACCGGGACGGAGGCTGGCCCTGGGACAGCAGGCGGCTCCGAGAACGGGTCTGAGGTGGCCGCGCAGCCCGCGGGCCTGTCGGGCCCAGCCGAGGTCGGGCCGGGGGCGGTGGGGGAGCGCACACCCCGCAAGAAAGAGCCTCCGCGGGCCTCGCCCCCCGGGGGCCTGGCGGAACCGCCGGGGTCCGCAGGGCCTCAGGCCGGCCCTACTGTCGTGCCTGGGTCTGCGACCCCCATGGAAACTGGAATAGCAGAGACTCCGGAGGGGCGTCGGACCAGCCGGCGCAAGCGGGCGAAGGTAGAGTACAGAGAGATGGATGAAAGCTTGGCCAACCTCTCAGAAGATGAGTATTATTCAGAAGAAGAGAGAAATGCCAAAGCAGAGAAGGAAAAGAAGCTTCCCCCACCACCCCCTCAAGCCCCACCTGAGGAAGAAAATGAAAGTGAGCCTGAAGAACCATCGGGGCAAGCAGGAGGACTTCAAGACGACAGTTCTGGAGGGTATGGAGACGGCCAAGCATCAGGTGTGGAGGGCGCAGCTTTCCAGAGCCGACTTCCTCATGACCGGATGACTTCTCAAGAAGCAGCCTGTTTTCCAGATATTATCAGTGGACCACAACAGACCCAGAAGGTTTTTCTTTTCATTAGAAACCGCACACTGCAGTTGTGGTTGGATAATCCAAAGATTCAGCTGACATTTGAGGCTACTCTCCAACAATTAGAAGCACCTTATAACAGTGATACTGTGCTTGTCCACCGAGTTCACAGTTATTTAGAGCGTCATGGTCTTATCAACTTCGGCATCTATAAGAGGATAAAACCCCTACCAACTAAAAAGACAGGAAAGGTAATTATTATAGGCTCTGGGGTCTCAGGCTTGGCAGCAGCTCGACAGTTACAAAGTTTTGGAATGGATGTCACACTTTTGGAAGCCAGGGATCGTGTGGGTGGACGAGTTGCCACATTTCGCAAAGGAAACTATGTAGCTGATCTTGGAGCCATGGTGGTAACAGGTCTTGGAGGGAATCCTATGGCTGTGGTCAGCAAACAAGTAAATATGGAACTGGCCAAGATCAAGCAAAAATGCCCACTTTATGAAGCCAACGGACAAGCTGTTCCTAAAGAGAAAGATGAAATGGTAGAGCAAGAGTTTAACCGGTTGCTAGAAGCTACATCTTACCTTAGTCATCAACTAGACTTCAATGTCCTCAATAATAAGCCTGTGTCCCTTGGCCAGGCATTGGAAGTTGTCATTCAGTTACAAGAGAAGCATGTCAAAGATGAGCAGATTGAACATTGGAAGAAGATAGTGAAAACTCAGGAAGAATTGAAAGAACTTCTTAATAAGATGGTAAATTTGAAAGAGAAAATTAAAGAACTCCATCAGCAATACAAAGAAGCATCTGAAGTAAAGCCACCCAGAGATATTACTGCCGAGTTCTTAGTGAAAAGCAAACACAGGGATCTGACCGCCCTATGCAAGGAATATGATGAATTAGCTGAAACACAAGGAAAGCTAGAAGAAAAACTTCAGGAGTTGGAAGCGAATCCCCCAAGTGATGTATATCTCTCATCAAGAGACAGACAAATACTTGATTGGCATTTTGCAAATCTTGAATTTGCTAATGCCACACCTCTCTCAACTCTCTCCCTTAAGCACTGGGATCAGGATGATGACTTTGAGTTCACTGGCAGCCACCTGACAGTAAGGAATGGCTACTCGTGTGTGCCTGTGGCTTTAGCAGAAGGCCTAGACATTAAACTGAATACAGCAGTGCGACAGGTTCGCTACACGGCTTCAGGATGTGAAGTGATAGC
TGTGAATACCCGCTCCACGAGTCAAACCTTTATTTATAAATGCGACGCAGTTCTCTGTACCCTTCCCCTGGGTGTGCTGAAGCAGCAGCCACCAGCCGTTCAGTTTGTGCCACCTCTCCCTGAGTGGAAAACATCTGCAGTCCAAAGGATGGGATTTGGCAACCTTAACAAGGTGGTGTTGTGTTTTGATCGGGTGTTCTGGGATCCAAGTGTCAATTTGTTCGGGCATGTTGGCAGTACGACTGCCAGCAGGGGTGAGCTCTTCCTCTTCTGGAACCTCTATAAAGCTCCAATACTGTTGGCACTAGTGGCAGGAGAAGCTGCTGGTATCATGGAAAACATAAGTGACGATGTGATTGTTGGCCGATGCCTGGCCATTCTCAAAGGGATTTTTGGTAGCAGTGCAGTACCTCAGCCCAAAGAAACTGTGGTGTCTCGTTGGCGTGCTGATCCCTGGGCTCGGGGCTCTTATTCCTATGTTGCTGCAGGATCATCTGGAAATGACTATGATTTAATGGCTCAGCCAATCACTCCTGGCCCCTCGATTCCAGGTGCCCCACAGATTTTAGTCATTCCATCTTCGGACCCTTTCAGATAACCAAGAGCAGAGTTAAAAGGATGGGCAGCATTTCTGATTTCTCATAATTTGGTTTTGTAAATCTAGGCAGTCTACATTCAGAATGGAGGAGTCCAGAGTATAAGTCAAATAACATTTTTCCTTATTCAGGTTTTCTCCCTAAAAAACAAAACCATTTTTAATTGCACTTCCATTTTGTAATGGCTCATAGGATTGCTGTAAGGTCTCAATTACTAGGGGCTTCCAGGGCATTTCTGAGAAATAACCCTGGGTCCTTGTCTAGACCCTTATGCCAGACCCCACTCCAAAGAGCGGTAAGAATTCCTTAGTGTCATAGCCCAGACCTGCTGAGCTGCGAGGCTTAAGTGTCCCTGATCACCAAATGTCCTGTGCTTCAGGGTAGCGAGGCTCCCTCCCAGAAGGTAGTTATACATGAGGGAAGGACGCTTTACAACTGGGTATCTAAACTGATGAGAACACATGTTAAGCATCACTTTAGGACTGAGCCTAGGTAGAGTTTTATTGTCTCATTTCTACTTGTCAATTCTGGGAAAGTGCCTACTGATAAGGGAGACTCTTCGATAGAATGATGAATAGTAATTGGGGGGGTCAGCCTTTAAAAAGGTCAACAGCAATTTAAGTACTTAGCAATTTAAGTACAAGAATAAAGGTATATGTGCAGCCTGCCAATTTTCTCTTTTTCCCCTAAAATAGCCGATTCCACGACTCTTCTTTGCGGGAGAACATACGATCCGTAACTACCCAGCCACAGTGCATGGTGCTCTGCTGAGTGGGCTGCGAGAAGCGGGAAGAATTGCAGACCAGTTTTTGGGGGCCATGTATACGCTGCCTCGCCAGGCCACACCAGGTGTTCCTGCACAGCAGTCCCCAAGCATGTGAGACAGATGCATTCTAAGGGAAGAGGCCCATGTGCCTGTTTCTGCCATGTAAGGAAGGCTCTTCTAGCAATACTAGATCCCACTGAGAAAATCCACCCTGGCATCTGGGCTCCTGATCAGCTGATGGAGCTCCTGATTTGACAAAGGAGCTTGCCTCCTTTGAATGACCTAGAGCACAGGGAGGAACTTGTCCATTAGTTTGGAATTGTGTTCTTCGTAAAGACTGAGGCAAGCAAGTGCTGTGAAATAACATCATCTTAGTCCCTTGGTGTGTGGGGTTTTTGTTTTTTTTTTATATTTTGAGAATAAAACTTCATATAAAAA'''\n", | |
"\n", | |
"#a = '''GCTTGGCGCGTGCGTACGCGACGGCGGTTGGCGGCGCGCGGGCAGCGTGAAGCGAGGCGAGGCAAGGCTTTTCGGACCCACGGAGCGACAGAGCGAGCGGCCCCTACGGCCGTCGGCGGCCCGGCGGCCCGAGATGTTATCTGGGAAGAAGGCGGCAGCCGCGGCGGCGGCGGCTGCAGCGGCAGCAACCGGGACGGAGGCTGGCCCTGGGACAGCAGGCGGCTCCGAGAACGGGTCTGAGGTGGCCGCGCAGCCCGCGGGCCTGTCGGGCCCAGCCGAGGTCGGGCCGGGGGCGGTGGGGGAGCGCACACCCCGCAAGAAAGAGCCTCCGCGGGCCTCGCCCCCCGGGGGCCTGGCGGAACCGCCGGGGTCCGCAGGGCCTCAGGCCGGCCCTACTGTCGTGCCTGGGTCTGCGACCCCCATGGAAACTGGAATAGCAGAGACTCCGGAGGGGCGTCGGACCAGCCGGCGCAAGCGGGCGAAGGTAGAGTACAGAGAGATGGATGAAAGCTTGGCCAACCTCTCAGAAGATGAGTATTATTCAGAAGAAGAGAGAAATGCCAAAGCAGAGAAGGAAAAGAAGCTTCCCCCACCACCCCCTCAAGCCCCACCTGAGGAAGAAAATGAAAGTGAGCCTGAAGAACCATCGGGGCAAGCAGGAGGACTTCAAGACGACAGTTCTGGAGGGTATGGAGACGGCCAAGCATCAGGTGTGGAGGGCGCAGCTTTCCAGAGCCGACTTCCTCATGACCGGATGACTTCTCAAGAAGCAGCCTGTTTTCCAGATATTATCAGTGGACCACAACAGACCCAGAAGGTTTTTCTTTTCATTAGAAACCGCACACTGCAGTTGTGGTTGGATAATCCAAAGATTCAGCTGACATTTGAGGCTACTCTCCAACAATTAGAAGCACCTTATAACAGTGATACTGTGCTTGTCCACCGAGTTCACAGTTATTTAGAGCGTCATGGTCTTATCAACTTCGGCATCTATAAGAGGATAAAACCCCTACCAACTAAAAAGACAGGAAAGGTAATTATTATAGGCTCTGGGGTCTCAGGCTTGGCAGCAGCTCGACAGTTACAAAGTTTTGGAATGGATGTCACACTTTTGGAAGCCAGGGATCGTGTGGGTGGACGAGTTGCCACATTTCGCAAAGGAAACTATGTAGCTGATCTTGGAGCCATGGTGGTAACAGGTCTTGGAGGGAATCCTATGGCTGTGGTCAGCAAACAAGTAAATATGGAACTGGCCAAGATCAAGCAAAAATGCCCACTTTATGAAGCCAACGGACAAGCTGTTCCTAAAGAGAAAGATGAAATGGTAGAGCAAGAGTTTAACCGGTTGCTAGAAGCTACATCTTACCTTAGTCATCAACTAGACTTCAATGTCCTCAATAATAAGCCTGTGTCCCTTGGCCAGGCATTGGAAGTTGTCATTCAGTTACAAGAGAAGCATGTCAAAGATGAGCAGATTGAACATTGGAAGAAGATAGTGAAAACTCAGGAAGAATTGAAAGAACTTCTTAATAAGATGGTAAATTTGAAAGAGAAAATTAAAGAACTCCATCAGCAATACAAAGAAGCATCTGAAGTAAAGCCACCCAGAGATATTACTGCCGAGTTCTTAGTGAAAAGCAAACACAGGGATCTGACCGCCCTATGCAAGGAATATGATGAATTAGCTGAAACACAAGGAAAGCTAGAAGAAAAACTTCAGGAGTTGGAAGCGAATCCCCCAAGTGATGTATATCTCTCATCAAGAGACAGACAAATACTTGATTGGCATTTTGCAAATCTTGAATTTGCTAATGCCACACCTCTCTCAACTCTCTCCCTTAAGCACTGGGATCAGGATGATGACTTTGAGTTCACTGGCAGCCACCTGACAGTAAGGAATGGCTACTCGTGTGTGCCTGTGGCTTTAGCAGAAGGCCTAGACATTAAACTGAATACAGCAGTGCGACAGGTTCGCTACACGGCTTCAGGAT
GTGAAGTGATAGCTGTGAATACCCGCTCCACGAGTCAAACCTTTATTTATAAATGCGACGCAGTTCTCTGTACCCTTCCCCTGGGTGTGCTGAAGCAGCAGCCACCAGCCGTTCAGTTTGTGCCACCTCTCCCTGAGTGGAAAACATCTGCAGTCCAAAGGATGGGATTTGGCAACCTTAACAAGGTGGTGTTGTGTTTTGATCGGGTGTTCTGGGATCCAAGTGTCAATTTGTTCGGGCATGTTGGCAGTACGACTGCCAGCAGGGGTGAGCTCTTCCTCTTCTGGAACCTCTATAAAGCTCCAATACTGTTGGCACTAGTGGCAGGAGAAGCTGCTGGTATCATGGAAAACATAAGTGACGATGTGATTGTTGGCCGATGCCTGGCCATTCTCAAAGGGATTTTTGGTAGCAGTGCAGTACCTCAGCCCAAAGAAACTGTGGTGTCTCGTTGGCGTGCTGATCCCTGGGCTCGGGGCTCTTATTCCTATGTTGCTGCAGGATCATCTGGAAATGACTATGATTTAATGGCTCAGCCAATCACTCCTGGCCCCTCGATTCCAGGTGCCCCACAGCCGATTCCACGACTCTTCTTTGCGGGAGAACATACGATCCGTAACTACCCAGCCACAGTGCATGGTGCTCTGCTGAGTGGGCTGCGAGAAGCGGGAAGAATTGCAGACCAGTTTTTGGGGGCCATGTATACGCTGCCTCGCCAGGCCACACCAGGTGTTCCTGCACAGCAGTCCCCAAGCATGTGAGACAGATGCATTCTAAGGGAAGAGGCCCATGTGCCTGTTTCTGCCATGTAAGGAAGGCTCTTCTAGCAATACTAGATCCCACTGAGAAAATCCACCCTGGCATCTGGGCTCCTGATCAGCTGATGGAGCTCCTGATTTGACAAAGGAGCTTGCCTCCTTTGAATGACCTAGAGCACAGGGAGGAACTTGTCCATTAGTTTGGAATTGTGTTCTTCGTAAAGACTGAGGCAAGCAAGTGCTGTGAAATAACATCATCTTAGTCCCTTGGTGTGTGGGGTTTTTGTTTTTTTTTTATATTTTGAGAATAAAACTTCATATAAAA'''\n", | |
"a = '''GGGAAGTGTCTGAACAAAACAAGGAACAAAAATGAGTGGGTGGGATGAGATGGAAGAAACCCCAAAGAACCTAAATACGTCACACTTGAAGTGGCACAGTGGTATATGAGAAAGTGCCTGAGTGGGCATGGACATCGAAGAAGTGGGGGGACCTCTGGCTGGATGCCTGAACCACACCCTCACAGGGAGGTTTGGCAGATCCAAAGACAATAAAGGACCTTTCAGGTCAAAAAGAGGGGATGTAGGAGAATAAACAGTGCTAGAAATGTTTCAATCAGTTTGTCTCTGCATACATATAAATTATATACTTGTATCTTACATCTTAGGGGCTCTTCTTGGGCGTCACCCGTGCCTTGTGGCTGGGGCATGTACATACACAAGTGGACACACAGGCAGAGCAGCCACCTGTGGGCTTTCTTGGACAGGAATGTGTGTGTATGTGCATGGGGGGTGAGGATCCTATTTTGGGGGATGTAGACTTATATCTAAGAGTATCTGGTTAACCCTGAGCTTAAATGAAAGGAGGAGTTAGGTTGAGGCAGGCAAGGTGGGAGAAGTGGCCCAAGTCCTTTGGTGAGTGGGGGGACAGGATGGAGTAGGGGGGACAGGATGGAGTAGGGTGGAGGGGAAGAATACTGTCTGGAGTCTGGCCAGGGTTCTGCTAGAGCACACCCTCCACCTCAGCCAGGGTCCACAAGGATGGGTACCGGGCTCTGCGTCACAGCTTCAGCTTGGGGTGGTTGCTATGAGTCTGCGTGGCTCCCGCCCAGGGCAGACAGGGACAGGTCACAGGAGAGGGGCTAGGTAATCCCTGGCAGTAGTTCCTGTACAGAGGTGGTCTGGGGTCCAGGGGGTCCCCTGGGCCTAGCCTAGGCAACAGTTGGTTCACAAAGAAATGTCAGGGAGACGCCAGCATTAAAAAAAGAGAGATGTGTTTATTCCATGATCAGTACAGACCAAATGCATATTCACCGTATGAAAGTCAAACCAGTCAGTGACTCCAGAGTTTGGCCAACACTGAGGCACCAGCGTCGTGGTGTAGAGTGGGTTCTCATGGCACGCGTAACCTCACCAGGGGCTCCAATTATAAAAATTAAAAAAAAAAAAAAAAAAAAG'''\n", | |
"\n", | |
"a = '''GCTCAGTCCTCCAGGCGTCGGTACTCAGCGGTGTTGGAACTTCGTTGCTTGCTTGCCTGTGCGCGCGTGCGCGGACATGGCCTCAAACGATTATACCCAACAAGCAACCCAAAGCTATGGGGCCTACCCCACCCAGCCCGGGCAGGGCTATTCCCAGCAGAGCAGTCAGCCCTACGGACAGCAGAGTTACAGTGGTTATAGCCAGTCCACGGACACTTCAGGCTATGGCCAGAGCAGCTATTCTTCTTATGGCCAGAGCCAGAACACAGGCTATGGAACTCAGTCAACTCCCCAGGGATATGGCTCGACTGGCGGCTATGGCAGTAGCCAGAGCTCCCAATCGTCTTACGGGCAGCAGTCCTCCTACCCTGGCTATGGCCAGCAGCCAGCTCCCAGCAGCACCTCGGGAAGTTACGGTAGCAGTTCTCAGAGCAGCAGCTATGGGCAGCCCCAGAGTGGGAGCTACAGCCAGCAGCCTAGCTATGGTGGACAGCAGCAAAGCTATGGACAGCAGCAAAGCTATAATCCCCCTCAGGGCTATGGACAGCAGAACCAGTACAACAGCAGCAGTGGTGGTGGAGGTGGAGGTGGAGGTGGAGGTAACTATGGCCAAGATCAATCCTCCATGAGTAGTGGTGGTGGCAGTGGTGGCGGTTATGGCAATCAAGACCAGAGTGGTGGAGGTGGCAGCGGTGGCTATGGACAGCAGGACCGTGGAGGCCGCGGCAGGGGTGGCAGTGGTGGCGGCGGCGGCGGCGGCGGTGGTGGTTACAACCGCAGCAGTGGTGGCTATGAACCCAGAGGTCGTGGAGGTGGCCGTGGAGGCAGAGGTGGCATGGGGTAGGTGTCTCATGAGCCAGGGAGTATCTTTGGTGGGGAGTGTGGAGGATTGCATGAATCTCCCTGAAGCCAGTCCCTAGTGCATGGTTTAGTATTCTTGTTGTCTAGGGATCTGTGAGGGCTTTGATTTGGGGGCAGTGACTTTCTTTTTACATCCCCATTTTATTTTTGTGAGAACTTGGGAGCCTGAACTCCCATCCATACCACTGAATAGAGATTTTGAGTAATGATACTTGTTTCCAAAAAAAAAGAAACCATACATAGATACGTATGGATTGGAGTCATTAATATCCTAGGCAAGAAACATGGAAGTGAAGACTTCTTTCTCTGCAAGGGAAACCGATGATCCCACTCCTGGGAAATAGTAGGGAAACTTGGTATGTGTATTCCCATGTGTCCTCTAGGGAGTTGGTAATGGTTAACCTGACTTCAGCTTCCAGGAATTGGCTACTCTTCCCGTTTTCTATAGTCATTTGAATCCACGAGCTTGATTTGCACTAATTTGACCGACATTGATTTTGTGTGTGACTTGGTTTATGGGGCCAGCTGACTGAAGTAAGCAGACCTTTTGGGCAAAAATATGCTTTGACAGTGGTCTCCCACCTATTTGTTCCACTGTCTGCCTTCCCCTGGTTACTTAAAATTCATCAGCTTGTCCAACTGGACCTTCTTTCCTTCCTGCTGAAGTTGATTTGAAGTAAAACCTTAGATTTGATGTTAAAACAGTTGTCAAATCTGTTGGTAAATAAGATTTGAAGGACCCTACTCTGTCTCCCTTGAAAAAGGGGAGGAATGTCAGTGTTACTGTTTTTGGAAAAAGTAGATTTTTAAACCGAGTTTGGAAATGGTAAGTATGCAGAGGTGGGTGGGGGCAATCTCAAAAACGTGCAAAAATGAGGAAAACAAAAATGAGGAAATGTGTGCGTGTGTTTAATGCAAAACTTTAAAAAGAAAAACAACTGTTATGTGACTGTTAACTTGCTCTGCATTTTATGTGCCACAGGTATGAAAGGTGACATTGCAAAATACTCCGCTCTTCTCGCAGTGTAGAAGGGGTGACCCCGGGGGTTGGGGGAGATCAAAAACAGCTCAGTAGTTAGGACAGAGCTTAGCTAAGTTTGTCTTGCTTTAAGGGGAAGTTGCCTTTGGTTT
TGACTTTTTATGGAATGGGGTTGGGTCTGCTTGCTGCTTTCAAAGCAAAAACCACAAAAATGTGTTCAAGGCTACCCCAGCCTGGTGTGAAATGTCTTCTGGGTAAATTGGGGTAGGGTTTTTAAACCAACTACTTGGTTGTCAACCACTTGCGACAAGAGGAAAAAAAAACATCTGCTCCATCGGAAGAACGACCAAGGAAAATGGGTTATTTTTTTTCCAGAGGAAATAGATAACGTAACCTTTTAAAGCAAAATCTTTATAAACTGTGTCTGAGAAATTGCACACGTGTGTGTGACATGCTCAAAGGTCAGACAAGGGGTGGTCAGGAAGGGATGTATTTTAGTAGCCACTTGTATCTTTTTCCAAAAACACCTACCCATGTTTGGGGAATGTTAAACAAAATCAAAAAACAACCTTTTGTAGCCGTTGGAAGCTTCATGTCCTTTCTTCTAACTTGTCTTCTCCAGCGGAAGTGACCGTGGTGGCTTCAATAAATTTGGTGGTAAGTGAACAGAGTTTCCAAAATTCCCAACTCCCAGCAATGCTTTGTCTGATTGTTCATTTGCAGATGTCTTAGCGTGTTAATTTAAATGTCAAAGGTTTTGAGGTGTCCAGAACCACCTCCAGAAAGGGGTAGGGTAGAATGCCACCTGTTGCCTGGTGTGTGCTAACCTGGAGCAGGTAGGGGTAAGACTCAATAGTCATCTTTTACCAAATGGGTTTGCCCCAGGTTAATAAGAGGGGTCTAGTAGGCCTTGGACTGGGCCGTTGCCACACCTGGCACTTAGTGACCATCATCATGAGAAACTGGAGAGTGCGTGCTGGAACACGTGGTGCCATCTTGGCTTTAGGATCCTTTTGATCGTTGTGTCCAAGGCTTGTGTGTGTGTGAGTGTGTGGGAGACAACTCCGAATGTTTAATTCTGGAAGAGGGATGTAACATTGCCCTGAGGATGGTGAAGTTGGTATACATTTATAAAGTACGGAATGGTGTCAATGAATGCAATTCTATGTATATGGACTTAACTGAGATGGGCAAATAGAAACTAGCTCTGGGAAGGAACATGTGCACTACTTCAAGAAAGATTGGAAGCATGTGTGGCTCATGGGAAATAACCAGGTCTTAAACAGCACAAACTGAATTCGTGGACCAGGAAGGTCTTAAACAGCACAAACTGAATTCATGGAAAAATGACAAATTTGAGAAGTCTCCCAGTAAGCTGGAACTTTTCTGGTTTGGTTAACAAAAGGTTTCTTGATTTGTTTCAAGATTTAAAGCCAAAGGTGTGGGTTCATGACTTAGGTGTCATTGCGTGTGGGTACAATATTTATATATGGCGAATTCAGATAAACATTGGTCAAAGATGGTCTCTGGAAAAACAAAATAGAGGCTGCATTACGGAAATAAGATTTCTGGTCTGTTCCCTGGGACATGCTTAAAAAATACAATAGCTATTATGTATGGTTTTTATTTTCATGTGGTTTCGGGGAAACAACACGGTTTTAAGGATGGTTTCTAAAGATGAAATTAAAAATTGTTCCACAAGGGTTAAGTGTCTGGTGGTAAAGTTGGGAGAAACTGGATGGATGCACATCGCATGGCTGGTGGCGAGCCCATCTCTCTTCTCTCGGGTGAGAGAACCGGGCCAAGCTGAGTTGGTTTGTTCACTTTAATGGGTCTCCGTTTCCCCTGCCACCTGTGCTGAGGACATTTCCCAGCCTGAGCTGGGGGAGGCAGCATTTGCTGAAGTGTGGAGTTGTCTCTGTGGAGACTCAAGTTACAGATCTTAAGGGGCCTGCCTAGAATTTTCTCCTCTGGGCAGGCGACCCAGGAAAGGGTTTGGAGTGAGGCTGTGAGCACTTACTTGATATTTTACAAGTTTGGATTTGGTGTTAATTTTTTTCCTTGTCCGTTTTTTCCTGTTGACTAACGGCTCATCTTTTCCTTGTTTTTGTTTTTTTTTTGTTCTTTTTTTCCATGTCACTAAAGGCCCTCG
GGACCAAGGATCACGTCATGACTCCGAACAGGATAATTCAGACAACAACACCATCTTTGTGCAAGGCCTGGGTGAGAATGTTACAATTGAGTCTGTGGCTGATTACTTCAAGCAGATTGGTATTATTAAGACAAACAAGAAAACGGGACAGCCCATGATTAATTTGTACACAGACAGGGAAACTGGCAAGCTGAAGGGAGAGGCAACGGTCTCTTTTGATGACCCACCTTCAGCTAAAGCAGCTATTGACTGGTTTGATGGTAAAGAATTCTCCGGAAATCCTATCAAGGTCTCATTTGCTACTCGCCGGGCAGACTTTAATCGGGGTGGTGGCAATGGTCGTGGAGGCCGAGGGCGAGGAGGACCCATGGGCCGTGGAGGCTATGGAGGTGGTGGCAGTGGTGGTGGTGGCCGAGGAGGATTTCCCAGTGGAGGTGGTGGCGGTGGAGGACAGCAGCGAGCTGGTGACTGGAAGTGTCCTAATCCCACCTGTGAGAATATGAACTTCTCTTGGAGGAATGAATGCAACCAGTGTAAGGCCCCTAAACCAGATGGCCCAGGAGGGGGACCAGGTGGCTCTCACATGGGGGGTAACTACGGGGATGATCGTCGTGGTGGCAGAGGAGGCTATGATCGAGGCGGCTACCGGGGCCGCGGCGGGGACCGTGGAGGCTTCCGAGGGGGCCGGGGTGGTGGGGACAGAGGTGGCTTTGGCCCTGGCAAGATGGATTCCAGGGGTGAGCACAGACAGGATCGCAGGGAGAGGCCGTATTAATTAGCCTGGCTCCCCAGGTTCTGGAACAGCTTTTTGTCCTGTACCCAGTGTTACCCTCGTTATTTTGTAACCTTCCAATTCCTGATCACCCAAGGGTTTTTTTGTGTCGGACTATGTAATTGTAACTATACCTCTGGTTCCCATTAAAAGTGACCATTTTAGTTAAA'''\n", | |
"\n", | |
"a = '''GGCTGCAGCCGGGCTCCGTGGCGCTCGCAGCCACCGCCTCCTCTCGGCTCCAGGTCTTCCCCTTCTTTTTACAACTGATCCTGTTGGGGATTTTTTTTTTTTCTAAATTGGAACGGTGGGGAGGAGCAGGGAGGGGGGACCTGGAGGAAGGGGAGAGATTAGGCAGCCATCAATTTCCTCCAGTTTCTCCCAGAACAGGTGATGCTTCTAAATTGTGATCACTTTCAGGAGGCAGCACTGCAGCTGGAAGGATGCGAGCGACCTAGGGTGGAGTGGCTGAGGCGGCAGATCTGAACTTGCGGAGGATAAGAACCCAAACTTTGACTACATCAGTCCGCACCTCGCCAGTGAAGCAAAGGACGGGTTATCTTTTTTTTTTTTCTAAGACTCAAACTTGGGCACTTGATCCCTTTTCTTGGATTGCTTTGGAGGAGACGATTTGCTGGCAACGTTGGGAACAGTCAGGACTGTGTTGTAACTCTTACTTTTAAAGCGACAGTAGAGGATCAGACTTTTTAAATGTTTGGAATTCAAGATACTTTAGGAAGAGGACCAACTCTGAAAGAGAAATCGCTGGGCGCGGAGATGGATTCGGTCAGGTCCTGGGTCCGGAATGTCGGAGTGGTGGACGCTAATGTCGCCGCGCAGAGCGGGGTCGCCCTGTCCCGGGCCCACTTTGAGAAACAGCCTCCTTCCAACTTGAGGAAATCCAACTTCTTTCACTTCGTCCTGGCGCTCTATGACAGGCAGGGCCAGCCGGTGGAGATCGAGCGGACGGCCTTCGTGGACTTTGTGGAGAATGACAAAGAACAAGGCAACGAGAAGACCAACAACGGCACTCACTACAAGTTACAGCTCCTCTACAGCAACGGTGTCCGCACGGAACAGGACCTCTATGTCAGGCTCATCGACTCGGTCACCAAGCAGCCCATCGCTTACGAGGGACAGAATAAGAATCCGGAAATGTGCCGAGTTCTCCTGACGCACGAAGTGATGTGTAGTCGATGCTGCGAAAAGAAAAGCTGTGGAAACCGAAATGAGACTCCATCGGACCCAGTCATAATTGACAGATTCTTTTTAAAATTTTTCCTCAAGTGCAATCAGAATTGTTTGAAAACAGCAGGAAACCCAAGGGACATGAGACGGTTTCAGGTTGTGTTGTCAACAACGGTGAATGTGGATGGACACGTCCTGGCTGTTTCTGACAACATGTTTGTTCATAACAACTCCAAGCATGGACGGAGAGCAAGAAGACTCGATCCATCGGAAGCTACCCCCTGCATCAAAGCCATTAGCCCGAGTGAAGGCTGGACCACAGGAGGAGCCATGGTCATCATCATCGGGGACAACTTCTTTGATGGTCTCCAAGTGGTGTTTGGGACTATGCTTGTATGGAGCGAGCTAATAACCCCTCATGCCATCAGAGTACAGACTCCTCCCCGGCACATCCCAGGCGTGGTAGAGGTGACATTATCTTATAAATCTAAACAGTTCTGCAAAGGAGCCCCAGGAAGGTTCATTTACACAGCATTAAATGAACCCACCATAGACTATGGCTTCCAGAGACTGCAGAAGGTCATCCCTAGGCATCCTGGAGATCCTGAGAGATTAGCTAAGGAGATGCTGTTGAAAAGAGCTGCAGATCTAGTGGAAGCTCTTTATGGCACACCACACAATAACCAGGACATCATTTTGAAGCGAGCCGCAGACATTGCTGAAGCTCTCTACAGCGTCCCCAGGAATCCCAGCCAGCTTCCAGCCCTCTCTAGCTCCCCAGCGCACAGTGGCATGATGGGAATCAACTCCTATGGCAGCCAGCTTGGGGTCAGCATCTCAGAGTCAACACAAGGAAATAATCAAGGGTACATCCGCAACACAAGCAGCATCTCTCCGCGGGGATACTCTTCCAGCTCCACGCCTCAACAGTCTAATTACAGTACCTCCAGCAACAGTATGAATGGCTACAGCAATGTCCCCATGGCCAACTTGGGT
GTTCCAGGTTCACCAGGATTTCTAAATGGCTCACCCACCGGCTCTCCTTATGGAATCATGTCATCAAGTCCCACCGTTGGGTCTTCCAGCACATCCTCCATCCTCCCATTTTCCTCTTCAGTTTTTCCTGCTGTCAAACAGAAGAGTGCCTTTGCCCCTGTCATCAGGCCCCAAGGCTCCCCTTCACCTGCCTGCTCCAGCGGCAATGGAAATGGATTCAGAGCCATGACCGGACTTGTTGTACCCCCGATGTAAAGAAGAACTGCTTTCTTATAGCACAAAACTACTTACTCTGATGGACCAATAATGAAGAAAGCACTAGGAGCTCTTTTGGGGGTGTAGTGGTGCCCCCACATGAACATGATGGACACCCTTGGGTCTGCAAGGAGCCAGCATCTTACTTGGTCCCACGTCCTCCTATAGCTCTGATGGTGGCTACACAAACTGACCCTCTTGGGACAAGGACAAAAGATGTCATTGACGTAGTCAGTGCTAAGAGCAGAAATGCAATTCTTTGTTATGAACATTATGAAAACCACCTTCCTATGTTTGTAAAATATTTAAGAAAAAATTGGCAAACAATTAATGCTTAATATTTTGGATACTATTTGTTTTTCTTTGTAGGAAAAAAAAGTTGAAAGTTTCTATTTTCTATGAAGCCTTTCAGATACCAATTTAGTTTATGCAGAAAAAAATTGAACAAAACAGGGTACCAGCACGGAAGACTTTCTTAAAACGCAACCTGAATTGAATGATGAAATGTTGTATGTGTGTTTGCTTATAGCTTAATCTCTTTAAAAAATGAACAAAAAAAA'''" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"scrolled": true | |
}, | |
"outputs": [], | |
"source": [ | |
"a = \"TGGACAGCTCCCGCTCACCCAAACAGAAGACGTCGGCGCCGGAGCGGGCTCGGACATGGCGAGGCTGCGAGCCGGCCCGAGCGGCGGGGCCCGGTGATCCCTCCCTCCCTCCCCGTCCCCTCCCCTCTCCCGCACGCACGCCCCGTCCGCCCCCACCCCGCCCCCACCCCGGGCGAGCCCGCCCGCAGCCCGGGGCGCACACCCGCACGCGCACTCCTCTCCACTCACTCCCGCGCCCGCCCCCACTCCCGCAGCCGAGCCCCGCCACGCGCGCCTTGCCCGCCCGCCGGCCGCCCCCGCCGCCCCCGCCGCCCCCGGGCCCTGATGGACTGAATGAAGGCTGCCTACACCGCCTATCGATGCCTCACCAAAGACCTAGAAGGCTGCGCCATGAACCCGGAGCTGACAATGGAAAGTCTGGGCACTTTGCACGGGCCGGCCGGCGGCGGCAGTGGCGGGGGCGGCGGCGGGGGCGGCGGGGGCGGCGGCGGGGGCCCGGGCCATGAGCAGGAGCTGCTGGCCAGCCCCAGCCCCCACCACGCGGGCCGCGGCGCCGCTGGCTCGCTGCGGGGCCCTCCGCCGCCTCCAACCGCGCACCAGGAGCTGGGCACGGCGGCAGCGGCGGCAGCGGCGGCGTCGCGCTCGGCCATGGTCACCAGCATGGCCTCGATCCTGGACGGCGGCGACTACCGGCCCGAGCTCTCCATCCCGCTGCACCACGCCATGAGCATGTCCTGCGACTCGTCTCCGCCTGGCATGGGCATGAGCAACACCTACACCACGCTGACACCGCTCCAGCCGCTGCCACCCATCTCCACCGTGTCTGACAAGTTCCACCACCCTCACCCGCACCACCATCCGCACCACCACCACCACCACCACCACCAGCGCCTGTCCGGCAACGTCAGCGGCAGCTTCACCCTCATGCGCGACGAGCGCGGGCTCCCGGCCATGAACAACCTCTACAGTCCCTACAAGGAGATGCCCGGCATGAGCCAGAGCCTGTCCCCGCTGGCCGCCACGCCGCTGGGCAACGGGCTAGGCGGCCTCCACAACGCGCAGCAGAGTCTGCCCAACTACGGTCCGCCGGGCCACGACAAAATGCTCAGCCCCAACTTCGACGCGCACCACACTGCCATGCTGACCCGCGGTGAGCAACACCTGTCCCGCGGCCTGGGCACCCCACCTGCGGCCATGATGTCGCACCTGAACGGCCTGCACCACCCGGGCCACACTCAGTCTCACGGGCCGGTGCTGGCACCCAGTCGCGAGCGGCCACCCTCGTCCTCATCGGGCTCGCAGGTGGCCACGTCGGGCCAGCTGGAAGAAATCAACACCAAAGAGGTGGCCCAGCGCATCACAGCGGAGCTGAAGCGCTACAGTATCCCCCAGGCGATCTTTGCGCAGAGGGTGCTGTGCCGGTCTCAGGGGACTCTCTCCGACCTGCTCCGGAATCCAAAACCGTGGAGTAAACTCAAATCTGGCAGGGAGACCTTCCGCAGGATGTGGAAGTGGCTTCAGGAGCCCGAGTTCCAGCGCATGTCCGCCTTACGCCTGGCAGCGTGCAAACGCAAAGAGCAAGAACCAAACAAAGACAGGAACAATTCCCAGAAGAAGTCCCGCCTGGTGTTCACTGACCTCCAACGCCGAACACTCTTCGCCATCTTCAAGGAGAACAAACGCCCGTCAAAGGAGATGCAGATCACCATTTCCCAGCAGCTGGGCCTGGAGCTCACAACCGTCAGCAACTTCTTCATGAACGCCCGGCGCCGCAGCCTGGAGAAGTGGCAAGACGATCTGAGCACAGGGGGCTCCTCGTCCACCTCCAGCACGTGTACCAAAGCATGATGGAAGGACTCTCACTTGGGCACAAGTCACCTCCAAATGAGGACAACAGATACCAAAAGAAAACAAAGGAAAAAGACACCGGATTCCTAGCTGGGGCCCTTCACTGGTGATTTGAAAGCACAATTCTCTTGCAAAGAAACTTATA
TTCTAGCTGTAATCATAGGCCAGGTGTTCTTCTTTTGTTTTTAATGGCTATGGAGTCCAAGTGCAAGCTGAAAAATTAATCTCTTAGAACCAGACACTGTTCTCTGAGCATGCTAAGCATCCCAGAAACCCAAATGGGGCCTTCCTGGAGCGAGTTAATTCCAGTATGGTGTCAACCAAGCTCGGGATTGCTTAAAATATCATCCATCCCACTTCAGGTCCTGTCAGCTTCTTGCAGTCAGAGTTCCTATGAGTAACAATAGGAGTTTGGCCTATGTAAGGACTCTGAGTTTAGGCTTCCAAGATACAACAATAAGAGAAGAATCTAGCAACGAGAATGACCTCATTTGCTTTCCACATGCTTAGCCTCATTATACCATGTTATGTCCAAGTTCACAGCCACAACATCAGAATGGTAATTACTGAGCACAAGTTTTAAATATGGACGTTAAAAAAAAAAATCCAAGGACCTGTTTTTCCAACCCAGACATCTTTTCATTGAATGATTTAGAAAGCTTTAAGTTGATCCAGCTTACAATTTTTTTTTTCTTTACCTCCTGGAAATCTCATATGGTCTTGGATCCGTCAAAAAAACCAGTCAGTTCACTTGCGCTCAAAGTATCAAGCACAACAAAGATAAACAGAAGTGAGGAAGGTTCTGGGTTCACTACATCTGGATTTTCAAGACACCTATTGTGAAGTCATTAGGGAATTGATGAGAATATGGCTTCAAGCACATTTTGCAGTTTGCTACAAATTCTGTTGTACATAATGCAGACGCACACTCAGGAGGCCAATTTAACTGTTAACAGTGCATGGAGCGAATGCAGCATTTTAAAAGATCTAGGTTTTTTTAGGTCATTAATGTGTCCTTGGTTGATCAGTCATCTGGTCCCTCCTACTGTGTGTTATGACCACCACGTAATCCATTCTCGCTCTTTCTGATTTGGGGTTTTTCCTCATCCATCCCATTAGTAGGGATGTTTTCTGTGTTTTCTAGCAAGAAAAAAAAATCAATCAATCAAACCTGCATACATGTTACTCATGACTGTCATCTAGTCCTAAATCTCTTCTGTTGTTGAATCATCCTTGCAAAACAGCTGAATACATCTGGAGAAAACACAGCACACCAAAGAAGCAGAATACTGCAAACCAAAGACATTTATGACTTGTCATTTTCTAGCCTAAAAATACTGTGATTACTTTTAGAAATCAGAAAACCTCTGCAACTCCGAATGGCATTCAGCTCTTGCATTTGGCGCATCATCGGGCTGAGCGGACCAGCTACACCAAGGACATTAGCCAAGCCACCCAGAGGGGTGGCTTTGCCACACCAGTTGTCACCTTCCCATAGCAAGTGGAAGAGCGCCCACAGAACTCTGGGAGATTGCAAAGGTCACAATGTGCATATTTACCAGTGAATGGCCCCGGGTGGGGCCACGTGGGGGTGTTCAAAGCAAGCCAAACGCTGCAATCATTCTTTACAGACACTTGAGACTGACTTTTTTATGAATTACTTAGTCGAAACCAAAGAAACTTTTTCTGCACCTACTTCTGCAACAAACAAAACTGTCCCATTAAAATGAATAAATAAATCCGTAAATCAATGGAAATCACCACCAATAAGAAGGAAGCACGCCAGAAAATAAACGAAAACAAAAACAGGGAGACACACTGTGTTCAAACAGACCTCTTGGGACATTTTTTGGAAGCAGATTTTAAAGAAAGGGTTGAGACAAAGATAGAAATAAGGAAGAGCCTCAGTGGCTGCTGCTTCATTTGACAACTCACACGGTAATCTTAAAGCTGAAGATTGTCTTTAATTTGTGCCTATGCAGTTTTTCAAAAGAACACGGAACAGAGCAACAGAAACCTCAACAGCTACAATACCAAAGATGAGGATTTCTCACACCTTTTGTTTCAGTTCATTATCTCCTCTTGCCTGGCTAAAATACTAATAGCGCCATTGAACTGTATAAAGGTAATCAATTATGTTTCTCT
GAGCAACAAAAGGAAAGGGCCATTTATTTGATTTTATTGTTTCATTTCAATTTTGTCTTATGGTTTTTTGCCCCAACATGGAATCTCTCAAAAGTTTCCATGGACTCCAAGTTTAAGATGTTGGGATATTGAACAGTTCTCTCTGCTCAGCAGAGGGTAGGGAATAACATTATCACTTGAATGTTCTTTGCTTAACCCTTAGACTTGGTTCCTTCTATGTTCAGAGTCTCATCATCAGGGGAAGGAAAGGGAGTGAGGGTCAGGGATAGGGGTCTTGGTGATGCATCCTCTCCCGAGCCACAGAACCAAAGAGTTTATAGAGGAATTTACAGCCTCGTTTTCATGTGATTGCTACATCCTAACAGGGCTTCATTTGGGGGTGGGGGGAAACATGTAAAAATAATTGCCAGTTTCTACTTTTCTATTAGCTTTTTAAAAATCAGCTGTAAAGTTGCATTTCTAAAGAAAGATATATATAATATATAAAATACATATATAGATCAACTTGACATTGGTGATAACCAAAATTATTGCTGTCCAAATTCATGTCTTGTTTTGGTCCAGTGCTTCATTTGCTAAGTATTCGGTTCAGAATTTTTCTCATTTCTCATGCCATTCCAGAGTTAATTTGCCACTGTGGATGATTTGAAGTATTCAGATCTCTATGGAAGTTTCTGGGACAGGTTTAAAGTCAAGATCAAGCATTTTAGCATTTAACCTGTTGATAAATGGATCCATGGTGTACATGAGTTTTATTTGTATTCGGAGTCATCTCTATTCTATCCCTCAGCCTCGATTAAGGTGGTGAGTGAAGTGCATCCAACAGACTCGGCCCAGAACTGGGTCCTGACAGTGGGGTGCTCATCTTCTGTAACTGTTGGGAAGGCTCGGTGGTCCATTTTCACCAGTTAAAGAATATGAGGCCAGCCCAGAAATCTGTTCTCCAGGAGCTGCCCTGTCCCATCTGGGTGTGCCAGACCCCCTCAGTGAGCAGGTCCACCAAAGGGACTTCTCACAGGGGAAGCCCAACTCCTGTTGCAATGGGTTGATAGATTTCCTCAGGGTGGTAATTACCAATTCGTATTTTGACAAGCCTATGTGCAACCACAGCTGGCACTGGGGTGGGCAGTGGTGTTGGGTGGGATGGGGGAGAGTGTCTCAATCCTGAAGAGAAAATATAAAGCAGGTTTTGGGGAGACTTCTGGAGTCCTGCCCCTAGAGAGCCCCATTGTTGTTCTTTGTGCCCCCTCCTCATTCCCCCTATGTGGGTCTCCCTATGCAGGAGCTGTGAGAGAATGTGACTCTCCACAATTTTTATAATTCATCCTTCCTAGGAGATTGTTCATTGGCTCTTCCCTTGTGTCCCTTTGTCCCTTGCTCATACTCCATGTTTCCTTTGTCAAAGGACTAAGAAAAGAGCATATTTCAGCAGAGGAGTGTTCCCATGTGGGTTGATTTCAACTTGGGTATTTCTAAAAGAGTCCTTGTGACATGTGTCCAGTGGAAATGGTTGCTCTTTTCCAGACTGGATTGAGGAATGGAGCCTGTTTGATTTGGTTAGTGATTCTTTGACATACTAATCTCAGCGTTTGGGTCTCCAGCATCCTCTGAAGATGTCTAGACTAGTAGAGGCTGCCTTTGTGACCTGACATTACAACATTGGTCAAACCAGTCCTCTGATAATCAGAAGAACATGTCATAATTGTTTAAAAAAAAAAAAAAGGCAAGAATTTCTCTCCAAGGAGCTTTAATAAATGTCTCATTCCAGATAATGTCATACCAGAGAAAAGTGCTTGCTTTTAGAAAATTATTTACATACATATATAAATATATATGTGTATCTATACAGTTATGTATCAAAATTTTAAGCCCTGCAGAATTTCAATTTGTTAGAAATCTAACAGAAAAAAATTTCTATATTGAAAGGTAATAGAATTTAACCCAGTGAGTTTACTCAAGGATTTTTAAATTTAAGTTAATAATTTCAGAGAAAATAACCA
TTTGGGTGTGGTTATAGTTTAGTATCCATTACCTCAATCCAAGGAAAATTCCAGGCATTCCTCAACCATCAGGAAAAGGTACAGTGTGAAGGAACAGTTCTCAGCCAAATTTCACATTCTTGAGGCAACAGAAATCAAAACACTCAGAGCCATTGAGTGGAAAAACAATTTACTTTATTCCTTTACACAAATAGGCTTGCATTGTTTTTGTTTTAATGTGATTTTGGTACTAGGGATATAATTATTTCATTCCAGGAAATAATAAAAAAAAACAGACAGAGCCAATACATTTCTTTTTTTAAAGGAAACAGCAACAACAATAAAAACTCAGCACCAATATTTAAAAGCTTTTCCAAAATGTAAAAGAAGTGTTTAGCTTGCACCATGCATAAAGGTGCAGGCTAGTTGAACCAGGAAGCATGGCACTTCCTCTGGAGAAATCCAGAAAGAGTTGCTTCTAAGCTCCCTTTTCCCCCTGCAGGCTCTTGGCAATTGTAGGCTTTAGCAAATCCAGAATAATTTTCAATTCAAGCTAAAATAAAATCAACATTTGGAATGTAAATCTGATACACACACACTTTTCTAAGTCAAACAACATATTTCAAAACCAAAAATAAATACCTTTTAGATAATCAGTTATTTTCTTTGTCTATACTGGGCACCCACCTACTAGTGCCAGTAAATTCAAGTTGAACAGATTTTTAAAATCACTATTATCTGGGTATGGGGGAAACTTCCCCACTTTTGAAAATGTTGGTAGAATTATAGGAATGTCTGTTTGATTATCATTACCAAAGTGTCATGACAGTATGCCTTTGTAGTGAACTCGGATTTTCAGGAGTTTGAATAGTTGGATATTTTAAAATCTAAGAAGAAAAGGCCTGTTTCCAATGTTGTTGAAGAATAATGAACTCTATTAAAAAGTGGAGAAAAAGATAATACATGTGGTCAAGGTTGACCACAAGGCCCAGGCACAACTACCTTGGCGATAATCTTCTAGATTCGTAACAGGTTAGAGCTGACTTTTTGTTTTTGTTGTTGCTGATGCTGTGTGATTCAGACTTCTCAGCCTAACCAGGAAGAGTAAGTGGAAATGGTAGATGAAGAAGGGGTAGAGCTGGTGTATCTATAACTTTCTGATATTTGTCTGCCAAACTTGATATATTAGTAATTTTTTTATCTTTAGCTAAGATCAAGTCACCCCTGAAACAACAGGAGATTCTAGTTTTAAAATAAGGCCACAAAAATCCTTACGGAATGAAGAATGGCACCCCAGTTGGTTGTATAAGTCTCATAAGATAATGATGTTGATTTTAAATATGGATGTCTCAATGCCTGTTTTCTATCAATGATTTGTTTGTTTCCAAGGTCGGGGAGGGAAAGAGGGGAGGGTTTATCTGTTTTAGAAAGTCTCAGAATACTTATAAAATACAGAAGTAGTTATTAAAATATATAGGACCTCACATAGGTAGATACAGAACTTACCATTGAGGCTGATGGGCTGTTGTGTGAATCACACAGGACCTTAAATGAGGCTCATTATTCTCACACACCAAAATGACTCTGACAGCCTGAAGCAGTTATTGCTAGAGCCCAAGCTTTCCTTGGAGGTTTTGGAGTTAGGTTGATTGGAAGTAACCAGCTAATACCTTTTCTAGTGGAGAAAAAGACATTGCTACCAGCTTGTTCATCCCATAGAAGTCTTCCACTCTGCTCCATTTTTAGCAGCAAGCATTTCATGTAGCATAAACCTTGGCAGATAAGTGTGCCTAAGGTTTATACAGTCTGTCCGCTTGGATGTATACAAATTTAGATACATATTTTAACATGTGTTCTCATAGATGACTTTATAACAACACACATTACCTATAGGTGTCTAGACTGTGTACATACAAGTGTGTACAGACAAGCTTCATACGTATATACTGTAATCCGTTACAACAAATAAATTTTAAATCATCGTTTAACATGTATGTGGTACTTCTACAGTGTACATTGTT
TTCATTATTTATTGTAACATTGAAAACCACAGTGCAGGGAAAACAAAAGTATCCCAGCATCTTCATCCTGTACACTTGGAATTAATTTCATTTGGGCATATCCAAGATAAACTCAACTTTCAAGAAATCTTGTATATTATTTAATCATCTGTGTTAGGATGACACCTATGATTGATGACTTCGGTTGAATAGCTTTATTCTGGATTTTTCATAACTAAAGCTAAATCCAAAGACCTGAAAAAGGACAAAAAGAAAAAAAAAAAAAGAAA\"\n", | |
"a = a.upper()\n", | |
"a = a.replace(\"\\n\",\"\")\n", | |
" \n", | |
"table = { \n", | |
" 'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M+', \n", | |
" 'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T', \n", | |
" 'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K', \n", | |
" 'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R', \n", | |
" 'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L', \n", | |
" 'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P', \n", | |
" 'CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q', \n", | |
" 'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R', \n", | |
" 'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V', \n", | |
" 'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A', \n", | |
" 'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E', \n", | |
" 'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G', \n", | |
" 'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S', \n", | |
" 'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L', \n", | |
" 'TAC':'Y', 'TAT':'Y', 'TAA':'_', 'TAG':'_', \n", | |
" 'TGC':'C', 'TGT':'C', 'TGA':'_', 'TGG':'W', \n", | |
"} \n", | |
"\n", | |
"protein = \"\"\n", | |
"protein2 = \"\"\n", | |
"protein3 = \"\"\n", | |
"\n", | |
"exon = False\n", | |
"intron = True\n", | |
"for i in range(0, len(a) - 3,3): \n", | |
" codon = a[i:i+3] \n", | |
" codon2 = a[i+1:i+4]\n", | |
" codon3 = a[i+2:i+5]\n", | |
" \n", | |
" protein += table[codon]\n", | |
" protein2 += table[codon2]\n", | |
" protein3 += table[codon3]\n", | |
" \n", | |
"print(protein)\n", | |
"print(protein2)\n", | |
"print(protein3)\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def translate(seq,orf):\n", | |
" seq = seq.upper()\n", | |
" seq = seq.replace(\"\\n\",\"\")\n", | |
"\n", | |
" table = { \n", | |
" 'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M', \n", | |
" 'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T', \n", | |
" 'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K', \n", | |
" 'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R', \n", | |
" 'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L', \n", | |
" 'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P', \n", | |
" 'CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q', \n", | |
" 'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R', \n", | |
" 'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V', \n", | |
" 'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A', \n", | |
" 'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E', \n", | |
" 'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G', \n", | |
" 'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S', \n", | |
" 'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L', \n", | |
" 'TAC':'Y', 'TAT':'Y', 'TAA':'_', 'TAG':'_', \n", | |
" 'TGC':'C', 'TGT':'C', 'TGA':'_', 'TGG':'W', \n", | |
" } \n", | |
"\n", | |
" protein = \"\"\n", | |
" exon = False\n", | |
" translating = True\n", | |
" i = orf\n", | |
"\n", | |
" while (translating):\n", | |
" codon = seq[i:i+3]\n", | |
" \n", | |
" try: table[codon]\n", | |
" except: break\n", | |
"\n", | |
" if (table[codon] == \"M\"):\n", | |
" exon = True\n", | |
"\n", | |
" if (exon):\n", | |
" if (table[codon] == \"_\"):\n", | |
" exon = False\n", | |
" translating = False\n", | |
" else:\n", | |
" protein += table[codon]\n", | |
" i += 3\n", | |
" else:\n", | |
" i += 3\n", | |
" \n", | |
" return protein\n", | |
"\n", | |
"a = \"TGGACAGCTCCCGCTCACCCAAACAGAAGACGTCGGCGCCGGAGCGGGCTCGGACATGGCGAGGCTGCGAGCCGGCCCGAGCGGCGGGGCCCGGTGATCCCTCCCTCCCTCCCCGTCCCCTCCCCTCTCCCGCACGCACGCCCCGTCCGCCCCCACCCCGCCCCCACCCCGGGCGAGCCCGCCCGCAGCCCGGGGCGCACACCCGCACGCGCACTCCTCTCCACTCACTCCCGCGCCCGCCCCCACTCCCGCAGCCGAGCCCCGCCACGCGCGCCTTGCCCGCCCGCCGGCCGCCCCCGCCGCCCCCGCCGCCCCCGGGCCCTGATGGACTGAATGAAGGCTGCCTACACCGCCTATCGATGCCTCACCAAAGACCTAGAAGGCTGCGCCATGAACCCGGAGCTGACAATGGAAAGTCTGGGCACTTTGCACGGGCCGGCCGGCGGCGGCAGTGGCGGGGGCGGCGGCGGGGGCGGCGGGGGCGGCGGCGGGGGCCCGGGCCATGAGCAGGAGCTGCTGGCCAGCCCCAGCCCCCACCACGCGGGCCGCGGCGCCGCTGGCTCGCTGCGGGGCCCTCCGCCGCCTCCAACCGCGCACCAGGAGCTGGGCACGGCGGCAGCGGCGGCAGCGGCGGCGTCGCGCTCGGCCATGGTCACCAGCATGGCCTCGATCCTGGACGGCGGCGACTACCGGCCCGAGCTCTCCATCCCGCTGCACCACGCCATGAGCATGTCCTGCGACTCGTCTCCGCCTGGCATGGGCATGAGCAACACCTACACCACGCTGACACCGCTCCAGCCGCTGCCACCCATCTCCACCGTGTCTGACAAGTTCCACCACCCTCACCCGCACCACCATCCGCACCACCACCACCACCACCACCACCAGCGCCTGTCCGGCAACGTCAGCGGCAGCTTCACCCTCATGCGCGACGAGCGCGGGCTCCCGGCCATGAACAACCTCTACAGTCCCTACAAGGAGATGCCCGGCATGAGCCAGAGCCTGTCCCCGCTGGCCGCCACGCCGCTGGGCAACGGGCTAGGCGGCCTCCACAACGCGCAGCAGAGTCTGCCCAACTACGGTCCGCCGGGCCACGACAAAATGCTCAGCCCCAACTTCGACGCGCACCACACTGCCATGCTGACCCGCGGTGAGCAACACCTGTCCCGCGGCCTGGGCACCCCACCTGCGGCCATGATGTCGCACCTGAACGGCCTGCACCACCCGGGCCACACTCAGTCTCACGGGCCGGTGCTGGCACCCAGTCGCGAGCGGCCACCCTCGTCCTCATCGGGCTCGCAGGTGGCCACGTCGGGCCAGCTGGAAGAAATCAACACCAAAGAGGTGGCCCAGCGCATCACAGCGGAGCTGAAGCGCTACAGTATCCCCCAGGCGATCTTTGCGCAGAGGGTGCTGTGCCGGTCTCAGGGGACTCTCTCCGACCTGCTCCGGAATCCAAAACCGTGGAGTAAACTCAAATCTGGCAGGGAGACCTTCCGCAGGATGTGGAAGTGGCTTCAGGAGCCCGAGTTCCAGCGCATGTCCGCCTTACGCCTGGCAGCGTGCAAACGCAAAGAGCAAGAACCAAACAAAGACAGGAACAATTCCCAGAAGAAGTCCCGCCTGGTGTTCACTGACCTCCAACGCCGAACACTCTTCGCCATCTTCAAGGAGAACAAACGCCCGTCAAAGGAGATGCAGATCACCATTTCCCAGCAGCTGGGCCTGGAGCTCACAACCGTCAGCAACTTCTTCATGAACGCCCGGCGCCGCAGCCTGGAGAAGTGGCAAGACGATCTGAGCACAGGGGGCTCCTCGTCCACCTCCAGCACGTGTACCAAAGCATGATGGAAGGACTCTCACTTGGGCACAAGTCACCTCCAAATGAGGACAACAGATACCAAAAGAAAACAAAGGAAAAAGACACCGGATTCCTAGCTGGGGCCCTTCACTGGTGATTTGAAAGCACAATTCTCTTGCAAAGAAACTTATA
TTCTAGCTGTAATCATAGGCCAGGTGTTCTTCTTTTGTTTTTAATGGCTATGGAGTCCAAGTGCAAGCTGAAAAATTAATCTCTTAGAACCAGACACTGTTCTCTGAGCATGCTAAGCATCCCAGAAACCCAAATGGGGCCTTCCTGGAGCGAGTTAATTCCAGTATGGTGTCAACCAAGCTCGGGATTGCTTAAAATATCATCCATCCCACTTCAGGTCCTGTCAGCTTCTTGCAGTCAGAGTTCCTATGAGTAACAATAGGAGTTTGGCCTATGTAAGGACTCTGAGTTTAGGCTTCCAAGATACAACAATAAGAGAAGAATCTAGCAACGAGAATGACCTCATTTGCTTTCCACATGCTTAGCCTCATTATACCATGTTATGTCCAAGTTCACAGCCACAACATCAGAATGGTAATTACTGAGCACAAGTTTTAAATATGGACGTTAAAAAAAAAAATCCAAGGACCTGTTTTTCCAACCCAGACATCTTTTCATTGAATGATTTAGAAAGCTTTAAGTTGATCCAGCTTACAATTTTTTTTTTCTTTACCTCCTGGAAATCTCATATGGTCTTGGATCCGTCAAAAAAACCAGTCAGTTCACTTGCGCTCAAAGTATCAAGCACAACAAAGATAAACAGAAGTGAGGAAGGTTCTGGGTTCACTACATCTGGATTTTCAAGACACCTATTGTGAAGTCATTAGGGAATTGATGAGAATATGGCTTCAAGCACATTTTGCAGTTTGCTACAAATTCTGTTGTACATAATGCAGACGCACACTCAGGAGGCCAATTTAACTGTTAACAGTGCATGGAGCGAATGCAGCATTTTAAAAGATCTAGGTTTTTTTAGGTCATTAATGTGTCCTTGGTTGATCAGTCATCTGGTCCCTCCTACTGTGTGTTATGACCACCACGTAATCCATTCTCGCTCTTTCTGATTTGGGGTTTTTCCTCATCCATCCCATTAGTAGGGATGTTTTCTGTGTTTTCTAGCAAGAAAAAAAAATCAATCAATCAAACCTGCATACATGTTACTCATGACTGTCATCTAGTCCTAAATCTCTTCTGTTGTTGAATCATCCTTGCAAAACAGCTGAATACATCTGGAGAAAACACAGCACACCAAAGAAGCAGAATACTGCAAACCAAAGACATTTATGACTTGTCATTTTCTAGCCTAAAAATACTGTGATTACTTTTAGAAATCAGAAAACCTCTGCAACTCCGAATGGCATTCAGCTCTTGCATTTGGCGCATCATCGGGCTGAGCGGACCAGCTACACCAAGGACATTAGCCAAGCCACCCAGAGGGGTGGCTTTGCCACACCAGTTGTCACCTTCCCATAGCAAGTGGAAGAGCGCCCACAGAACTCTGGGAGATTGCAAAGGTCACAATGTGCATATTTACCAGTGAATGGCCCCGGGTGGGGCCACGTGGGGGTGTTCAAAGCAAGCCAAACGCTGCAATCATTCTTTACAGACACTTGAGACTGACTTTTTTATGAATTACTTAGTCGAAACCAAAGAAACTTTTTCTGCACCTACTTCTGCAACAAACAAAACTGTCCCATTAAAATGAATAAATAAATCCGTAAATCAATGGAAATCACCACCAATAAGAAGGAAGCACGCCAGAAAATAAACGAAAACAAAAACAGGGAGACACACTGTGTTCAAACAGACCTCTTGGGACATTTTTTGGAAGCAGATTTTAAAGAAAGGGTTGAGACAAAGATAGAAATAAGGAAGAGCCTCAGTGGCTGCTGCTTCATTTGACAACTCACACGGTAATCTTAAAGCTGAAGATTGTCTTTAATTTGTGCCTATGCAGTTTTTCAAAAGAACACGGAACAGAGCAACAGAAACCTCAACAGCTACAATACCAAAGATGAGGATTTCTCACACCTTTTGTTTCAGTTCATTATCTCCTCTTGCCTGGCTAAAATACTAATAGCGCCATTGAACTGTATAAAGGTAATCAATTATGTTTCTCT
GAGCAACAAAAGGAAAGGGCCATTTATTTGATTTTATTGTTTCATTTCAATTTTGTCTTATGGTTTTTTGCCCCAACATGGAATCTCTCAAAAGTTTCCATGGACTCCAAGTTTAAGATGTTGGGATATTGAACAGTTCTCTCTGCTCAGCAGAGGGTAGGGAATAACATTATCACTTGAATGTTCTTTGCTTAACCCTTAGACTTGGTTCCTTCTATGTTCAGAGTCTCATCATCAGGGGAAGGAAAGGGAGTGAGGGTCAGGGATAGGGGTCTTGGTGATGCATCCTCTCCCGAGCCACAGAACCAAAGAGTTTATAGAGGAATTTACAGCCTCGTTTTCATGTGATTGCTACATCCTAACAGGGCTTCATTTGGGGGTGGGGGGAAACATGTAAAAATAATTGCCAGTTTCTACTTTTCTATTAGCTTTTTAAAAATCAGCTGTAAAGTTGCATTTCTAAAGAAAGATATATATAATATATAAAATACATATATAGATCAACTTGACATTGGTGATAACCAAAATTATTGCTGTCCAAATTCATGTCTTGTTTTGGTCCAGTGCTTCATTTGCTAAGTATTCGGTTCAGAATTTTTCTCATTTCTCATGCCATTCCAGAGTTAATTTGCCACTGTGGATGATTTGAAGTATTCAGATCTCTATGGAAGTTTCTGGGACAGGTTTAAAGTCAAGATCAAGCATTTTAGCATTTAACCTGTTGATAAATGGATCCATGGTGTACATGAGTTTTATTTGTATTCGGAGTCATCTCTATTCTATCCCTCAGCCTCGATTAAGGTGGTGAGTGAAGTGCATCCAACAGACTCGGCCCAGAACTGGGTCCTGACAGTGGGGTGCTCATCTTCTGTAACTGTTGGGAAGGCTCGGTGGTCCATTTTCACCAGTTAAAGAATATGAGGCCAGCCCAGAAATCTGTTCTCCAGGAGCTGCCCTGTCCCATCTGGGTGTGCCAGACCCCCTCAGTGAGCAGGTCCACCAAAGGGACTTCTCACAGGGGAAGCCCAACTCCTGTTGCAATGGGTTGATAGATTTCCTCAGGGTGGTAATTACCAATTCGTATTTTGACAAGCCTATGTGCAACCACAGCTGGCACTGGGGTGGGCAGTGGTGTTGGGTGGGATGGGGGAGAGTGTCTCAATCCTGAAGAGAAAATATAAAGCAGGTTTTGGGGAGACTTCTGGAGTCCTGCCCCTAGAGAGCCCCATTGTTGTTCTTTGTGCCCCCTCCTCATTCCCCCTATGTGGGTCTCCCTATGCAGGAGCTGTGAGAGAATGTGACTCTCCACAATTTTTATAATTCATCCTTCCTAGGAGATTGTTCATTGGCTCTTCCCTTGTGTCCCTTTGTCCCTTGCTCATACTCCATGTTTCCTTTGTCAAAGGACTAAGAAAAGAGCATATTTCAGCAGAGGAGTGTTCCCATGTGGGTTGATTTCAACTTGGGTATTTCTAAAAGAGTCCTTGTGACATGTGTCCAGTGGAAATGGTTGCTCTTTTCCAGACTGGATTGAGGAATGGAGCCTGTTTGATTTGGTTAGTGATTCTTTGACATACTAATCTCAGCGTTTGGGTCTCCAGCATCCTCTGAAGATGTCTAGACTAGTAGAGGCTGCCTTTGTGACCTGACATTACAACATTGGTCAAACCAGTCCTCTGATAATCAGAAGAACATGTCATAATTGTTTAAAAAAAAAAAAAAGGCAAGAATTTCTCTCCAAGGAGCTTTAATAAATGTCTCATTCCAGATAATGTCATACCAGAGAAAAGTGCTTGCTTTTAGAAAATTATTTACATACATATATAAATATATATGTGTATCTATACAGTTATGTATCAAAATTTTAAGCCCTGCAGAATTTCAATTTGTTAGAAATCTAACAGAAAAAAATTTCTATATTGAAAGGTAATAGAATTTAACCCAGTGAGTTTACTCAAGGATTTTTAAATTTAAGTTAATAATTTCAGAGAAAATAACCA
TTTGGGTGTGGTTATAGTTTAGTATCCATTACCTCAATCCAAGGAAAATTCCAGGCATTCCTCAACCATCAGGAAAAGGTACAGTGTGAAGGAACAGTTCTCAGCCAAATTTCACATTCTTGAGGCAACAGAAATCAAAACACTCAGAGCCATTGAGTGGAAAAACAATTTACTTTATTCCTTTACACAAATAGGCTTGCATTGTTTTTGTTTTAATGTGATTTTGGTACTAGGGATATAATTATTTCATTCCAGGAAATAATAAAAAAAAACAGACAGAGCCAATACATTTCTTTTTTTAAAGGAAACAGCAACAACAATAAAAACTCAGCACCAATATTTAAAAGCTTTTCCAAAATGTAAAAGAAGTGTTTAGCTTGCACCATGCATAAAGGTGCAGGCTAGTTGAACCAGGAAGCATGGCACTTCCTCTGGAGAAATCCAGAAAGAGTTGCTTCTAAGCTCCCTTTTCCCCCTGCAGGCTCTTGGCAATTGTAGGCTTTAGCAAATCCAGAATAATTTTCAATTCAAGCTAAAATAAAATCAACATTTGGAATGTAAATCTGATACACACACACTTTTCTAAGTCAAACAACATATTTCAAAACCAAAAATAAATACCTTTTAGATAATCAGTTATTTTCTTTGTCTATACTGGGCACCCACCTACTAGTGCCAGTAAATTCAAGTTGAACAGATTTTTAAAATCACTATTATCTGGGTATGGGGGAAACTTCCCCACTTTTGAAAATGTTGGTAGAATTATAGGAATGTCTGTTTGATTATCATTACCAAAGTGTCATGACAGTATGCCTTTGTAGTGAACTCGGATTTTCAGGAGTTTGAATAGTTGGATATTTTAAAATCTAAGAAGAAAAGGCCTGTTTCCAATGTTGTTGAAGAATAATGAACTCTATTAAAAAGTGGAGAAAAAGATAATACATGTGGTCAAGGTTGACCACAAGGCCCAGGCACAACTACCTTGGCGATAATCTTCTAGATTCGTAACAGGTTAGAGCTGACTTTTTGTTTTTGTTGTTGCTGATGCTGTGTGATTCAGACTTCTCAGCCTAACCAGGAAGAGTAAGTGGAAATGGTAGATGAAGAAGGGGTAGAGCTGGTGTATCTATAACTTTCTGATATTTGTCTGCCAAACTTGATATATTAGTAATTTTTTTATCTTTAGCTAAGATCAAGTCACCCCTGAAACAACAGGAGATTCTAGTTTTAAAATAAGGCCACAAAAATCCTTACGGAATGAAGAATGGCACCCCAGTTGGTTGTATAAGTCTCATAAGATAATGATGTTGATTTTAAATATGGATGTCTCAATGCCTGTTTTCTATCAATGATTTGTTTGTTTCCAAGGTCGGGGAGGGAAAGAGGGGAGGGTTTATCTGTTTTAGAAAGTCTCAGAATACTTATAAAATACAGAAGTAGTTATTAAAATATATAGGACCTCACATAGGTAGATACAGAACTTACCATTGAGGCTGATGGGCTGTTGTGTGAATCACACAGGACCTTAAATGAGGCTCATTATTCTCACACACCAAAATGACTCTGACAGCCTGAAGCAGTTATTGCTAGAGCCCAAGCTTTCCTTGGAGGTTTTGGAGTTAGGTTGATTGGAAGTAACCAGCTAATACCTTTTCTAGTGGAGAAAAAGACATTGCTACCAGCTTGTTCATCCCATAGAAGTCTTCCACTCTGCTCCATTTTTAGCAGCAAGCATTTCATGTAGCATAAACCTTGGCAGATAAGTGTGCCTAAGGTTTATACAGTCTGTCCGCTTGGATGTATACAAATTTAGATACATATTTTAACATGTGTTCTCATAGATGACTTTATAACAACACACATTACCTATAGGTGTCTAGACTGTGTACATACAAGTGTGTACAGACAAGCTTCATACGTATATACTGTAATCCGTTACAACAAATAAATTTTAAATCATCGTTTAACATGTATGTGGTACTTCTACAGTGTACATTGTT
TTCATTATTTATTGTAACATTGAAAACCACAGTGCAGGGAAAACAAAAGTATCCCAGCATCTTCATCCTGTACACTTGGAATTAATTTCATTTGGGCATATCCAAGATAAACTCAACTTTCAAGAAATCTTGTATATTATTTAATCATCTGTGTTAGGATGACACCTATGATTGATGACTTCGGTTGAATAGCTTTATTCTGGATTTTTCATAACTAAAGCTAAATCCAAAGACCTGAAAAAGGACAAAAAGAAAAAAAAAAAAAGAAA\"\n", | |
"print(translate(a,0))\n", | |
"print(translate(a,1))\n", | |
"print(translate(a,2))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"ename": "FileNotFoundError", | |
"evalue": "[Errno 2] No such file or directory: '/home/annaldas/projects/result/ONECUT2/ONECUT2_seq.fa'", | |
"output_type": "error", | |
"traceback": [ | |
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", | |
"\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", | |
"\u001b[0;32m<ipython-input-9-28170ef6fa91>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 125\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 126\u001b[0m \u001b[0mtranscripts_filename\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"/home/annaldas/projects/result/ONECUT2/ONECUT2_seq.fa\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 127\u001b[0;31m \u001b[0mtranscripts\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mSeqIO\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtranscripts_filename\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"fasta\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 128\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 129\u001b[0m \u001b[0mgene\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"ONECUT2\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
"\u001b[0;32m/pkg/python-3.7.4-0/lib/python3.7/site-packages/Bio/SeqIO/__init__.py\u001b[0m in \u001b[0;36mindex\u001b[0;34m(filename, format, alphabet, key_function)\u001b[0m\n\u001b[1;32m 951\u001b[0m \u001b[0mrepr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"SeqIO.index(%r, %r, alphabet=%r, key_function=%r)\"\u001b[0m\u001b[0;31m \u001b[0m\u001b[0;31m\\\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 952\u001b[0m \u001b[0;34m%\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mfilename\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mformat\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0malphabet\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkey_function\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 953\u001b[0;31m return _IndexedSeqFileDict(proxy_class(filename, format, alphabet),\n\u001b[0m\u001b[1;32m 954\u001b[0m key_function, repr, \"SeqRecord\")\n\u001b[1;32m 955\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", | |
"\u001b[0;32m/pkg/python-3.7.4-0/lib/python3.7/site-packages/Bio/SeqIO/_index.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, filename, format, alphabet)\u001b[0m\n\u001b[1;32m 183\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__init__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfilename\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mformat\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0malphabet\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 184\u001b[0m \u001b[0;34m\"\"\"Initialize the class.\"\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 185\u001b[0;31m \u001b[0mSeqFileRandomAccess\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__init__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfilename\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mformat\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0malphabet\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 186\u001b[0m marker = {\"ace\": b\"CO \",\n\u001b[1;32m 187\u001b[0m \u001b[0;34m\"embl\"\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34mb\"ID \"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
"\u001b[0;32m/pkg/python-3.7.4-0/lib/python3.7/site-packages/Bio/SeqIO/_index.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, filename, format, alphabet)\u001b[0m\n\u001b[1;32m 44\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__init__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfilename\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mformat\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0malphabet\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 45\u001b[0m \u001b[0;34m\"\"\"Initialize the class.\"\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 46\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_handle\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_open_for_random_access\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilename\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 47\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_alphabet\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0malphabet\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 48\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_format\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mformat\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
"\u001b[0;32m/pkg/python-3.7.4-0/lib/python3.7/site-packages/Bio/File.py\u001b[0m in \u001b[0;36m_open_for_random_access\u001b[0;34m(filename)\u001b[0m\n\u001b[1;32m 130\u001b[0m \u001b[0mIf\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mfile\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0mgzipped\u001b[0m \u001b[0mbut\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mBGZF\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0ma\u001b[0m \u001b[0mspecific\u001b[0m \u001b[0mValueError\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0mraised\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 131\u001b[0m \"\"\"\n\u001b[0;32m--> 132\u001b[0;31m \u001b[0mhandle\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilename\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"rb\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 133\u001b[0m \u001b[0mmagic\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mhandle\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 134\u001b[0m \u001b[0mhandle\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mseek\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
"\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '/home/annaldas/projects/result/ONECUT2/ONECUT2_seq.fa'" | |
] | |
} | |
], | |
"source": [ | |
"codon_table = {\n", | |
" 'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M', \n", | |
" 'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T', \n", | |
" 'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K', \n", | |
" 'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R', \n", | |
" 'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L', \n", | |
" 'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P', \n", | |
" 'CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q', \n", | |
" 'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R', \n", | |
" 'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V', \n", | |
" 'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A', \n", | |
" 'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E', \n", | |
" 'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G', \n", | |
" 'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S', \n", | |
" 'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L', \n", | |
" 'TAC':'Y', 'TAT':'Y', 'TAA':'_', 'TAG':'_', \n", | |
" 'TGC':'C', 'TGT':'C', 'TGA':'_', 'TGG':'W', \n", | |
" }\n", | |
"\n", | |
"def determine_utrs(gene):\n", | |
" filename = \"/home/annaldas/projects/result/%s/%s_utr_regions.fa\" %(gene,gene)\n", | |
" bedfastafile = open(filename,\"r\")\n", | |
" bedfastalines = bedfastafile.readlines()\n", | |
" bedfastafile.close()\n", | |
" gene_utr = dict()\n", | |
" for line in bedfastalines:\n", | |
" if (line.startswith(\">\")):\n", | |
" trans_id = line[1:].strip()\n", | |
" if (trans_id not in gene_utr):\n", | |
" gene_utr[trans_id] = []\n", | |
" else:\n", | |
" gene_utr[trans_id].append(line.strip())\n", | |
" return gene_utr\n", | |
"\n", | |
"def score(seq,start): \n", | |
" kozak = {\n", | |
" \"A\":[0.25,0.61,0.27,0.15,1.00,0.00,0.00,0.23],\n", | |
" \"C\":[0.53,0.02,0.49,0.55,0.00,0.00,0.00,0.16],\n", | |
" \"G\":[0.15,0.36,0.13,0.21,0.00,0.00,1.00,0.46],\n", | |
" \"T\":[0.07,0.01,0.11,0.09,0.00,1.00,0.00,0.15]\n", | |
" }\n", | |
" \n", | |
" score = 1.0\n", | |
" for i in range(start,len(seq)):\n", | |
" score *= kozak[seq[i]][i]\n", | |
" return score\n", | |
" \n", | |
"\n", | |
"def translate(seq, i, utr_regions):\n", | |
" translating = True\n", | |
" aa = \"\"\n", | |
" \n", | |
" in_utr = False\n", | |
" for utr in utr_regions:\n", | |
" start,stop = utr\n", | |
" if ((start < i) and (i < stop)):\n", | |
" in_utr = True\n", | |
" \n", | |
" while(translating): \n", | |
" if ((len(seq) < 3) or (in_utr)):\n", | |
" translating = False\n", | |
" aa = \"\"\n", | |
" else:\n", | |
" codon = seq[0:3]\n", | |
" if (codon_table[codon] == \"_\"):\n", | |
" translating = False\n", | |
" else:\n", | |
" aa += codon_table[codon]\n", | |
" seq = seq[3:]\n", | |
" i += 3\n", | |
" return aa,i\n", | |
"\n", | |
"def find_utrs(seq,utr):\n", | |
" pos = seq.find(utr)\n", | |
" if (pos == -1):\n", | |
" if (len(utr) > 20): \n", | |
" for i in range(len(utr) - 1,len(utr)*5//10 - 1,-1):\n", | |
" pos = seq.find(utr[:i])\n", | |
" return pos\n", | |
"\n", | |
"def translate_aa_seq(seq,enst,gene_utrs):\n", | |
" utr_regions = []\n", | |
" for utr in gene_utrs[enst]:\n", | |
" pos = find_utrs(seq,utr)\n", | |
" if (pos != -1):\n", | |
" utr_regions.append([pos,pos + len(utr)])\n", | |
" \n", | |
" longest_aa_seq = \"M\"\n", | |
" longest_aa_seq_sc = 0\n", | |
" longest_aa_seq_sc_end = 0\n", | |
" for i in range(len(seq)):\n", | |
" if (seq[i:i+3] == \"ATG\"):\n", | |
" sc = score(seq[i-4:i+4],0)\n", | |
" aa,end = translate(seq[i:], i, utr_regions)\n", | |
" #print(i,seq[i-4:i+4],aa,sc, end)\n", | |
" if ((len(aa) > 20) and (sc > longest_aa_seq_sc) and (i > longest_aa_seq_sc_end)):\n", | |
" longest_aa_seq = aa\n", | |
" longest_aa_seq_sc = sc\n", | |
" longest_aa_seq_sc_end = end\n", | |
" return (longest_aa_seq,longest_aa_seq_sc)\n", | |
"\n", | |
"\n", | |
"\n", | |
"def translate_aa_seq_length(seq,enst,gene_utrs):\n", | |
" utr_regions = []\n", | |
" \n", | |
" longest_aa_seq = \"M\"\n", | |
" for i in range(len(seq)):\n", | |
" if (seq[i:i+3] == \"ATG\"):\n", | |
" aa,end = translate(seq[i:], i, utr_regions)\n", | |
" #print(i,seq[i-4:i+4],aa, end)\n", | |
" if (len(aa) > len(longest_aa_seq)):\n", | |
" longest_aa_seq = aa\n", | |
" return longest_aa_seq\n", | |
"\n", | |
"def find_all_aa_seqs(seq,enst,gene):\n", | |
" gene_utrs = determine_utrs(gene)\n", | |
" \n", | |
" longest_aa_seq = translate_aa_seq_length(seq,enst,gene_utrs)\n", | |
" if gene in gene_utrs:\n", | |
" for utr in gene_utrs[gene]:\n", | |
" if (find_utrs(seq,utr) != -1):\n", | |
" longest_aa_seq,longest_aa_seq_sc = translate_aa_seq(seq,enst,gene_utrs)\n", | |
" \n", | |
" return longest_aa_seq\n", | |
" \n", | |
"transcripts_filename = \"/home/annaldas/projects/result/ONECUT2/ONECUT2_seq.fa\"\n", | |
"transcripts = SeqIO.index(transcripts_filename, \"fasta\")\n", | |
"\n", | |
"gene = \"ONECUT2\"\n", | |
"\n", | |
"for transcript in transcripts:\n", | |
" seq = str(transcripts[transcript].seq).strip()\n", | |
" enst = str(transcripts[transcript].id).split(\"|\")[-1].strip()\n", | |
" protein = find_all_aa_seqs(seq,enst,gene)\n", | |
" transcript_name = str(transcripts[transcript].id)\n", | |
" transcript_name = str(transcripts[transcript].id)\n", | |
" transcript_filename = transcript_name.replace(\"|\",\"_\")\n", | |
" transcript_filename = transcript_filename.replace(\"_\",\"\")\n", | |
" print(transcript_name,transcript_filename,protein)\n", | |
" \n", | |
"#enst = \"enst\"\n", | |
"#a=\"TGGACAGCTCCCGCTCACCCAAACAGAAGACGTCGGCGCCGGAGCGGGCTCGGACATGGCGAGGCTGCGAGCCGGCCCGAGCGGCGGGGCCCGGTGATCCCTCCCTCCCTCCCCGTCCCCTCCCCTCTCCCGCACGCACGCCCCGTCCGCCCCCACCCCGCCCCCACCCCGGGCGAGCCCGCCCGCAGCCCGGGGCGCACACCCGCACGCGCACTCCTCTCCACTCACTCCCGCGCCCGCCCCCACTCCCGCAGCCGAGCCCCGCCACGCGCGCCTTGCCCGCCCGCCGGCCGCCCCCGCCGCCCCCGCCGCCCCCGGGCCCTGATGGACTGAATGAAGGCTGCCTACACCGCCTATCGATGCCTCACCAAAGACCTAGAAGGCTGCGCCATGAACCCGGAGCTGACAATGGAAAGTCTGGGCACTTTGCACGGGCCGGCCGGCGGCGGCAGTGGCGGGGGCGGCGGCGGGGGCGGCGGGGGCGGCGGCGGGGGCCCGGGCCATGAGCAGGAGCTGCTGGCCAGCCCCAGCCCCCACCACGCGGGCCGCGGCGCCGCTGGCTCGCTGCGGGGCCCTCCGCCGCCTCCAACCGCGCACCAGGAGCTGGGCACGGCGGCAGCGGCGGCAGCGGCGGCGTCGCGCTCGGCCATGGTCACCAGCATGGCCTCGATCCTGGACGGCGGCGACTACCGGCCCGAGCTCTCCATCCCGCTGCACCACGCCATGAGCATGTCCTGCGACTCGTCTCCGCCTGGCATGGGCATGAGCAACACCTACACCACGCTGACACCGCTCCAGCCGCTGCCACCCATCTCCACCGTGTCTGACAAGTTCCACCACCCTCACCCGCACCACCATCCGCACCACCACCACCACCACCACCACCAGCGCCTGTCCGGCAACGTCAGCGGCAGCTTCACCCTCATGCGCGACGAGCGCGGGCTCCCGGCCATGAACAACCTCTACAGTCCCTACAAGGAGATGCCCGGCATGAGCCAGAGCCTGTCCCCGCTGGCCGCCACGCCGCTGGGCAACGGGCTAGGCGGCCTCCACAACGCGCAGCAGAGTCTGCCCAACTACGGTCCGCCGGGCCACGACAAAATGCTCAGCCCCAACTTCGACGCGCACCACACTGCCATGCTGACCCGCGGTGAGCAACACCTGTCCCGCGGCCTGGGCACCCCACCTGCGGCCATGATGTCGCACCTGAACGGCCTGCACCACCCGGGCCACACTCAGTCTCACGGGCCGGTGCTGGCACCCAGTCGCGAGCGGCCACCCTCGTCCTCATCGGGCTCGCAGGTGGCCACGTCGGGCCAGCTGGAAGAAATCAACACCAAAGAGGTGGCCCAGCGCATCACAGCGGAGCTGAAGCGCTACAGTATCCCCCAGGCGATCTTTGCGCAGAGGGTGCTGTGCCGGTCTCAGGGGACTCTCTCCGACCTGCTCCGGAATCCAAAACCGTGGAGTAAACTCAAATCTGGCAGGGAGACCTTCCGCAGGATGTGGAAGTGGCTTCAGGAGCCCGAGTTCCAGCGCATGTCCGCCTTACGCCTGGCAGCGTGCAAACGCAAAGAGCAAGAACCAAACAAAGACAGGAACAATTCCCAGAAGAAGTCCCGCCTGGTGTTCACTGACCTCCAACGCCGAACACTCTTCGCCATCTTCAAGGAGAACAAACGCCCGTCAAAGGAGATGCAGATCACCATTTCCCAGCAGCTGGGCCTGGAGCTCACAACCGTCAGCAACTTCTTCATGAACGCCCGGCGCCGCAGCCTGGAGAAGTGGCAAGACGATCTGAGCACAGGGGGCTCCTCGTCCACCTCCAGCACGTGTACCAAAGCATGATGGAAGGACTCTCACTTGGGCACAAGTCACCTCCAAATGAGGACAACAGATACCAAAAGAAAACAAAGGAAAAAGACACCGGATTCCTAGCTGGGGCCCTTCACTGGTGATTTGAAAGCACAATTCTCTTGCAAAGAAACTTATAT
TCTAGCTGTAATCATAGGCCAGGTGTTCTTCTTTTGTTTTTAATGGCTATGGAGTCCAAGTGCAAGCTGAAAAATTAATCTCTTAGAACCAGACACTGTTCTCTGAGCATGCTAAGCATCCCAGAAACCCAAATGGGGCCTTCCTGGAGCGAGTTAATTCCAGTATGGTGTCAACCAAGCTCGGGATTGCTTAAAATATCATCCATCCCACTTCAGGTCCTGTCAGCTTCTTGCAGTCAGAGTTCCTATGAGTAACAATAGGAGTTTGGCCTATGTAAGGACTCTGAGTTTAGGCTTCCAAGATACAACAATAAGAGAAGAATCTAGCAACGAGAATGACCTCATTTGCTTTCCACATGCTTAGCCTCATTATACCATGTTATGTCCAAGTTCACAGCCACAACATCAGAATGGTAATTACTGAGCACAAGTTTTAAATATGGACGTTAAAAAAAAAAATCCAAGGACCTGTTTTTCCAACCCAGACATCTTTTCATTGAATGATTTAGAAAGCTTTAAGTTGATCCAGCTTACAATTTTTTTTTTCTTTACCTCCTGGAAATCTCATATGGTCTTGGATCCGTCAAAAAAACCAGTCAGTTCACTTGCGCTCAAAGTATCAAGCACAACAAAGATAAACAGAAGTGAGGAAGGTTCTGGGTTCACTACATCTGGATTTTCAAGACACCTATTGTGAAGTCATTAGGGAATTGATGAGAATATGGCTTCAAGCACATTTTGCAGTTTGCTACAAATTCTGTTGTACATAATGCAGACGCACACTCAGGAGGCCAATTTAACTGTTAACAGTGCATGGAGCGAATGCAGCATTTTAAAAGATCTAGGTTTTTTTAGGTCATTAATGTGTCCTTGGTTGATCAGTCATCTGGTCCCTCCTACTGTGTGTTATGACCACCACGTAATCCATTCTCGCTCTTTCTGATTTGGGGTTTTTCCTCATCCATCCCATTAGTAGGGATGTTTTCTGTGTTTTCTAGCAAGAAAAAAAAATCAATCAATCAAACCTGCATACATGTTACTCATGACTGTCATCTAGTCCTAAATCTCTTCTGTTGTTGAATCATCCTTGCAAAACAGCTGAATACATCTGGAGAAAACACAGCACACCAAAGAAGCAGAATACTGCAAACCAAAGACATTTATGACTTGTCATTTTCTAGCCTAAAAATACTGTGATTACTTTTAGAAATCAGAAAACCTCTGCAACTCCGAATGGCATTCAGCTCTTGCATTTGGCGCATCATCGGGCTGAGCGGACCAGCTACACCAAGGACATTAGCCAAGCCACCCAGAGGGGTGGCTTTGCCACACCAGTTGTCACCTTCCCATAGCAAGTGGAAGAGCGCCCACAGAACTCTGGGAGATTGCAAAGGTCACAATGTGCATATTTACCAGTGAATGGCCCCGGGTGGGGCCACGTGGGGGTGTTCAAAGCAAGCCAAACGCTGCAATCATTCTTTACAGACACTTGAGACTGACTTTTTTATGAATTACTTAGTCGAAACCAAAGAAACTTTTTCTGCACCTACTTCTGCAACAAACAAAACTGTCCCATTAAAATGAATAAATAAATCCGTAAATCAATGGAAATCACCACCAATAAGAAGGAAGCACGCCAGAAAATAAACGAAAACAAAAACAGGGAGACACACTGTGTTCAAACAGACCTCTTGGGACATTTTTTGGAAGCAGATTTTAAAGAAAGGGTTGAGACAAAGATAGAAATAAGGAAGAGCCTCAGTGGCTGCTGCTTCATTTGACAACTCACACGGTAATCTTAAAGCTGAAGATTGTCTTTAATTTGTGCCTATGCAGTTTTTCAAAAGAACACGGAACAGAGCAACAGAAACCTCAACAGCTACAATACCAAAGATGAGGATTTCTCACACCTTTTGTTTCAGTTCATTATCTCCTCTTGCCTGGCTAAAATACTAATAGCGCCATTGAACTGTATAAAGGTAATCAATTATGTTTCTCTG
AGCAACAAAAGGAAAGGGCCATTTATTTGATTTTATTGTTTCATTTCAATTTTGTCTTATGGTTTTTTGCCCCAACATGGAATCTCTCAAAAGTTTCCATGGACTCCAAGTTTAAGATGTTGGGATATTGAACAGTTCTCTCTGCTCAGCAGAGGGTAGGGAATAACATTATCACTTGAATGTTCTTTGCTTAACCCTTAGACTTGGTTCCTTCTATGTTCAGAGTCTCATCATCAGGGGAAGGAAAGGGAGTGAGGGTCAGGGATAGGGGTCTTGGTGATGCATCCTCTCCCGAGCCACAGAACCAAAGAGTTTATAGAGGAATTTACAGCCTCGTTTTCATGTGATTGCTACATCCTAACAGGGCTTCATTTGGGGGTGGGGGGAAACATGTAAAAATAATTGCCAGTTTCTACTTTTCTATTAGCTTTTTAAAAATCAGCTGTAAAGTTGCATTTCTAAAGAAAGATATATATAATATATAAAATACATATATAGATCAACTTGACATTGGTGATAACCAAAATTATTGCTGTCCAAATTCATGTCTTGTTTTGGTCCAGTGCTTCATTTGCTAAGTATTCGGTTCAGAATTTTTCTCATTTCTCATGCCATTCCAGAGTTAATTTGCCACTGTGGATGATTTGAAGTATTCAGATCTCTATGGAAGTTTCTGGGACAGGTTTAAAGTCAAGATCAAGCATTTTAGCATTTAACCTGTTGATAAATGGATCCATGGTGTACATGAGTTTTATTTGTATTCGGAGTCATCTCTATTCTATCCCTCAGCCTCGATTAAGGTGGTGAGTGAAGTGCATCCAACAGACTCGGCCCAGAACTGGGTCCTGACAGTGGGGTGCTCATCTTCTGTAACTGTTGGGAAGGCTCGGTGGTCCATTTTCACCAGTTAAAGAATATGAGGCCAGCCCAGAAATCTGTTCTCCAGGAGCTGCCCTGTCCCATCTGGGTGTGCCAGACCCCCTCAGTGAGCAGGTCCACCAAAGGGACTTCTCACAGGGGAAGCCCAACTCCTGTTGCAATGGGTTGATAGATTTCCTCAGGGTGGTAATTACCAATTCGTATTTTGACAAGCCTATGTGCAACCACAGCTGGCACTGGGGTGGGCAGTGGTGTTGGGTGGGATGGGGGAGAGTGTCTCAATCCTGAAGAGAAAATATAAAGCAGGTTTTGGGGAGACTTCTGGAGTCCTGCCCCTAGAGAGCCCCATTGTTGTTCTTTGTGCCCCCTCCTCATTCCCCCTATGTGGGTCTCCCTATGCAGGAGCTGTGAGAGAATGTGACTCTCCACAATTTTTATAATTCATCCTTCCTAGGAGATTGTTCATTGGCTCTTCCCTTGTGTCCCTTTGTCCCTTGCTCATACTCCATGTTTCCTTTGTCAAAGGACTAAGAAAAGAGCATATTTCAGCAGAGGAGTGTTCCCATGTGGGTTGATTTCAACTTGGGTATTTCTAAAAGAGTCCTTGTGACATGTGTCCAGTGGAAATGGTTGCTCTTTTCCAGACTGGATTGAGGAATGGAGCCTGTTTGATTTGGTTAGTGATTCTTTGACATACTAATCTCAGCGTTTGGGTCTCCAGCATCCTCTGAAGATGTCTAGACTAGTAGAGGCTGCCTTTGTGACCTGACATTACAACATTGGTCAAACCAGTCCTCTGATAATCAGAAGAACATGTCATAATTGTTTAAAAAAAAAAAAAAGGCAAGAATTTCTCTCCAAGGAGCTTTAATAAATGTCTCATTCCAGATAATGTCATACCAGAGAAAAGTGCTTGCTTTTAGAAAATTATTTACATACATATATAAATATATATGTGTATCTATACAGTTATGTATCAAAATTTTAAGCCCTGCAGAATTTCAATTTGTTAGAAATCTAACAGAAAAAAATTTCTATATTGAAAGGTAATAGAATTTAACCCAGTGAGTTTACTCAAGGATTTTTAAATTTAAGTTAATAATTTCAGAGAAAATAACCAT
TTGGGTGTGGTTATAGTTTAGTATCCATTACCTCAATCCAAGGAAAATTCCAGGCATTCCTCAACCATCAGGAAAAGGTACAGTGTGAAGGAACAGTTCTCAGCCAAATTTCACATTCTTGAGGCAACAGAAATCAAAACACTCAGAGCCATTGAGTGGAAAAACAATTTACTTTATTCCTTTACACAAATAGGCTTGCATTGTTTTTGTTTTAATGTGATTTTGGTACTAGGGATATAATTATTTCATTCCAGGAAATAATAAAAAAAAACAGACAGAGCCAATACATTTCTTTTTTTAAAGGAAACAGCAACAACAATAAAAACTCAGCACCAATATTTAAAAGCTTTTCCAAAATGTAAAAGAAGTGTTTAGCTTGCACCATGCATAAAGGTGCAGGCTAGTTGAACCAGGAAGCATGGCACTTCCTCTGGAGAAATCCAGAAAGAGTTGCTTCTAAGCTCCCTTTTCCCCCTGCAGGCTCTTGGCAATTGTAGGCTTTAGCAAATCCAGAATAATTTTCAATTCAAGCTAAAATAAAATCAACATTTGGAATGTAAATCTGATACACACACACTTTTCTAAGTCAAACAACATATTTCAAAACCAAAAATAAATACCTTTTAGATAATCAGTTATTTTCTTTGTCTATACTGGGCACCCACCTACTAGTGCCAGTAAATTCAAGTTGAACAGATTTTTAAAATCACTATTATCTGGGTATGGGGGAAACTTCCCCACTTTTGAAAATGTTGGTAGAATTATAGGAATGTCTGTTTGATTATCATTACCAAAGTGTCATGACAGTATGCCTTTGTAGTGAACTCGGATTTTCAGGAGTTTGAATAGTTGGATATTTTAAAATCTAAGAAGAAAAGGCCTGTTTCCAATGTTGTTGAAGAATAATGAACTCTATTAAAAAGTGGAGAAAAAGATAATACATGTGGTCAAGGTTGACCACAAGGCCCAGGCACAACTACCTTGGCGATAATCTTCTAGATTCGTAACAGGTTAGAGCTGACTTTTTGTTTTTGTTGTTGCTGATGCTGTGTGATTCAGACTTCTCAGCCTAACCAGGAAGAGTAAGTGGAAATGGTAGATGAAGAAGGGGTAGAGCTGGTGTATCTATAACTTTCTGATATTTGTCTGCCAAACTTGATATATTAGTAATTTTTTTATCTTTAGCTAAGATCAAGTCACCCCTGAAACAACAGGAGATTCTAGTTTTAAAATAAGGCCACAAAAATCCTTACGGAATGAAGAATGGCACCCCAGTTGGTTGTATAAGTCTCATAAGATAATGATGTTGATTTTAAATATGGATGTCTCAATGCCTGTTTTCTATCAATGATTTGTTTGTTTCCAAGGTCGGGGAGGGAAAGAGGGGAGGGTTTATCTGTTTTAGAAAGTCTCAGAATACTTATAAAATACAGAAGTAGTTATTAAAATATATAGGACCTCACATAGGTAGATACAGAACTTACCATTGAGGCTGATGGGCTGTTGTGTGAATCACACAGGACCTTAAATGAGGCTCATTATTCTCACACACCAAAATGACTCTGACAGCCTGAAGCAGTTATTGCTAGAGCCCAAGCTTTCCTTGGAGGTTTTGGAGTTAGGTTGATTGGAAGTAACCAGCTAATACCTTTTCTAGTGGAGAAAAAGACATTGCTACCAGCTTGTTCATCCCATAGAAGTCTTCCACTCTGCTCCATTTTTAGCAGCAAGCATTTCATGTAGCATAAACCTTGGCAGATAAGTGTGCCTAAGGTTTATACAGTCTGTCCGCTTGGATGTATACAAATTTAGATACATATTTTAACATGTGTTCTCATAGATGACTTTATAACAACACACATTACCTATAGGTGTCTAGACTGTGTACATACAAGTGTGTACAGACAAGCTTCATACGTATATACTGTAATCCGTTACAACAAATAAATTTTAAATCATCGTTTAACATGTATGTGGTACTTCTACAGTGTACATTGTTT
TCATTATTTATTGTAACATTGAAAACCACAGTGCAGGGAAAACAAAAGTATCCCAGCATCTTCATCCTGTACACTTGGAATTAATTTCATTTGGGCATATCCAAGATAAACTCAACTTTCAAGAAATCTTGTATATTATTTAATCATCTGTGTTAGGATGACACCTATGATTGATGACTTCGGTTGAATAGCTTTATTCTGGATTTTTCATAACTAAAGCTAAATCCAAAGACCTGAAAAAGGACAAAAAGAAAAAAAAAAAAAGAAA\"\n", | |
"#find_all_aa_seqs(a,enst,\"ONECUT2\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"test = determine_utrs(\"ONECUT2\")\n", | |
"for i in test:\n", | |
" for j in test[i]:\n", | |
" print(i,find_utrs(a,j), j)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"len(\"MALNGAEVDDFSWEPPTEAETKVLQARRERQDRISRLMGDYLLRGYRMLGETCADCGTILLQDKQRKIYCVACQELDSDVDKDNPALRDVVPQPLPF\") *3" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"\n", | |
"\n", | |
"domains = dict()\n", | |
"\n", | |
"file = open(\"/home/annaldas/projects/result/ZNRD2/ZNRD2_blastx.gff3\", \"r\")\n", | |
"lines = file.readlines()\n", | |
"file.close()\n", | |
"\n", | |
"\n", | |
"for line in lines:\n", | |
" if (line.startswith(\">\")):\n", | |
" break\n", | |
" \n", | |
" if (not line.startswith(\"#\")):\n", | |
" data = line.split(\"\\t\")\n", | |
" seqid,source,attr = data[0],data[1],data[8]\n", | |
" if (seqid not in domains): \n", | |
" domains[seqid] = set()\n", | |
" if (source != \".\" and \"Dbxref\" in attr):\n", | |
" Dbxref = attr.split(\";\")[-1]\n", | |
" IPR = Dbxref.split(\"=\")[-1][10:-2]\n", | |
" if (IPR != []):\n", | |
" domains[seqid].add(IPR)\n", | |
"\n", | |
"try:\n", | |
" key,value = domains.popitem()\n", | |
" \n", | |
" domains[key] = value\n", | |
" common_domains = value\n", | |
" for transcript in domains:\n", | |
" curr = domains[transcript]\n", | |
" common_domains = common_domains.intersection(curr)\n", | |
"except:\n", | |
" common_domains = []\n", | |
"\n", | |
" \n", | |
" " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"domains" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"for line in lines:\n", | |
" if (line.startswith(\">\")):\n", | |
" break\n", | |
" \n", | |
" if (not line.startswith(\"#\")):\n", | |
" data = line.split(\"\\t\")\n", | |
" seqid,source,attr = data[0],data[1],data[8]\n", | |
" if (seqid not in domains): \n", | |
" domains[seqid] = set()\n", | |
" if (source != \".\"):\n", | |
" Dbxref = attr.split(\";\")[-1]\n", | |
" IPR = Dbxref.split(\"=\")[-1][10:-2]\n", | |
" domains[seqid].add(IPR)\n", | |
"\n", | |
"key,value = domains.popitem()\n", | |
"domains[key] = value\n", | |
"common_domains = value\n", | |
"for transcript in domains:\n", | |
" curr = domains[transcript]\n", | |
" common_domains.intersection(curr)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"file = open(\"/home/annaldas/projects/result/snrnp70/snrnp70_protein_analysis.txt\",\"w+\")\n", | |
"file.write(\"Gene:\\t%s\\n\" %(\"SNRNP30\"))\n", | |
"file.write(\"Common Domains:\\t\")\n", | |
"file.write(\",\".join(list(common_domains)))\n", | |
"file.write(\"\\nSpecific Domains:\\n\")\n", | |
"for transcript in domains:\n", | |
" specific_domains = domains[transcript].difference(common_domains)\n", | |
" file.write(\"%s: %s \\n\" %(transcript,\",\".join(list(specific_domains))))\n", | |
"file.close()\n", | |
"print(common_domains)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.4" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |