Skip to content
Permalink
master
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Siddharth Annaldasula\n",
"# Last Modified: 15.10.2019"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Command-line invocation used to generate the data (BLASTX search, kept for reference)\n",
"\n",
"# /home/annaldas/ncbi-blast-2.9.0+/bin/blastx -query test.fa -db refseq_protein -out test.out -evalue 1e-5 -max_target_seqs 1 -outfmt 7"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"from Bio import SeqIO"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"annotation_filename = \"/project/owlmayerTemporary/Sid/nanopore-analysis/Results_5_1/GffCompare/nanopore.combined.gtf\"\n",
"\n",
"# Read the whole GTF; the context manager closes the handle even if reading fails.\n",
"with open(annotation_filename, \"r\") as annotate_df:\n",
"    annotate_df_lines = annotate_df.readlines()\n",
"transcripts = []\n",
"\n",
"# result: ids (value of the first attribute) of transcripts with more than one exon record.\n",
"# lines:  every parsed GTF record, stripped of surrounding whitespace.\n",
"# NOTE(review): the cell's indentation was lost, so whether `lines` is meant to hold every\n",
"# record or only exon records cannot be told from here -- confirm against the intended output.\n",
"exon = 0\n",
"transcript = None\n",
"lines = []\n",
"result = []\n",
"for line in annotate_df_lines:\n",
"    fields = line.split(\"\\t\")\n",
"    # Skip comment, blank or truncated lines that lack the nine tab-separated GTF columns.\n",
"    if line.startswith(\"#\") or len(fields) < 9:\n",
"        continue\n",
"    t = fields[2]\n",
"    if t == \"transcript\":\n",
"        # A new transcript starts: settle the previous one before resetting the exon counter.\n",
"        if transcript is not None and exon > 1:\n",
"            result.append(transcript)\n",
"        transcript = fields[8].split(\";\")[0].split(\" \")[-1][1:-1].strip()\n",
"        exon = 0\n",
"    elif t == \"exon\":\n",
"        exon += 1\n",
"    lines.append(line.strip())\n",
"\n",
"# The final transcript has no following transcript record to trigger the check, so do it here.\n",
"if transcript is not None and exon > 1:\n",
"    result.append(transcript)\n",
"\n",
"#output_filename = \"/project/owlmayerTemporary/Sid/nanopore-analysis/Results_5_1/multiple_exons.txt\"\n",
"#output = open(output_filename,\"w+\")\n",
"#output.writelines(\"\\n\".join(result))\n",
"#output.close()\n",
"\n",
"#output_filename = \"/project/owlmayerTemporary/Sid/nanopore-analysis/Results_10_1/Results_10_1/GffCompare/nanopore.combined_filt.gtf\"\n",
"#output = open(output_filename,\"w+\")\n",
"#output.writelines(\"\\n\".join(result))\n",
"#output.close()"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['6356d404-48e6-4c7a-8fbd-5f6e7f9b42bc|5',\n",
" 'a5e2697c-74a1-4ac9-a5b2-d974af83137d|4']"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"gene_oID[\"KTN1-AS1\"]"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"# Import the nanopore annotation file\n",
"\n",
"annotation_filename = \"/project/owlmayerTemporary/Sid/nanopore-analysis/Results_5_1/GffCompare/nanopore.combined_filt.gtf\"\n",
"annotate_df = pd.read_csv(annotation_filename, sep=\"\\t\", header=None)\n",
"\n",
"# Drop the exon records; every other feature type is kept.\n",
"exon_rows = annotate_df[2] == \"exon\"\n",
"annotate_df = annotate_df[~exon_rows]\n",
"\n",
"# Pull the columns used later out as plain lists (column 8 holds the attribute text).\n",
"chrms, start, stop, annotate_lines = (list(annotate_df[column]) for column in (0, 3, 4, 8))"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"# Build lookup tables from the attribute text of each annotation record:\n",
"#   gene_oID: gene name     -> list of oIDs\n",
"#   oID_gene: oID           -> gene name     (first one seen)\n",
"#   oID_tID:  oID           -> transcript id (first one seen)\n",
"#   tID_oID:  transcript id -> oID           (first one seen)\n",
"# A transcript-id -> exons table (tID_exon) was sketched by the author but is not built.\n",
"# NOTE(review): attributes are read by position (0, 2, 3, 4 after splitting on ';'), which\n",
"# presumably relies on a fixed attribute order in the GTF -- verify against the file.\n",
"\n",
"BRD4_info = []\n",
"\n",
"gene_oID = dict()\n",
"oID_gene = dict()\n",
"oID_tID = dict()\n",
"tID_oID = dict()\n",
"\n",
"for ann, attributes in enumerate(annotate_lines):\n",
"    # Records without a gene_name attribute cannot be mapped and are skipped.\n",
"    if \"gene_name\" not in attributes:\n",
"        continue\n",
"    line = attributes.split(\";\")\n",
"    # Each value is the last space-separated token of its field, with the quotes sliced off.\n",
"    tID = line[0].split(\" \")[-1][1:-1]\n",
"    gene = line[2].split(\" \")[-1][1:-1]\n",
"    oID = line[3].split(\" \")[-1][1:-1]\n",
"    transID = line[4].split(\" \")[-1][1:-1].split(\".\")[0]\n",
"\n",
"    gene_oID.setdefault(gene, []).append(oID)\n",
"\n",
"    if oID not in oID_tID:\n",
"        oID_tID[oID] = tID\n",
"    if tID not in tID_oID:\n",
"        tID_oID[tID] = oID\n",
"    else:\n",
"        # The same transcript id appeared on more than one record.\n",
"        print(\"this sucks\")\n",
"    # Fixed: this guard used to test gene_oID (keyed by gene name), so it never recognised an oID.\n",
"    if oID not in oID_gene:\n",
"        oID_gene[oID] = gene\n",
"\n",
"    if gene == \"BRD4\":\n",
"        BRD4_info.append([chrms[ann], start[ann], stop[ann], attributes])\n",
"\n",
"# Raises KeyError if the annotation contains no KDM1A record.\n",
"KDM1A = gene_oID[\"KDM1A\"]\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"oID_tID[\"142e462f-f586-42fc-97b9-b2e3bfa1fd0d\"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Import transcript isoform sequences\n",
"\n",
"transcripts_filename = \"/home/annaldas/projects/nanopore-transcriptome-analysis/Results/Pinfish/corrected_transcriptome_polished_collapsed.fas\"\n",
"transcripts = SeqIO.index(transcripts_filename, \"fasta\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Extracting isoforms from related genes\n",
"\n",
"# output alternates FASTA-style header lines ('>' + transcript id) and sequence strings.\n",
"output = []\n",
"for gene, gene_oIDs in gene_oID.items():\n",
"    for oID in gene_oIDs:\n",
"        record = transcripts[oID]\n",
"        tID = \">\" + oID_tID[record.id]\n",
"        seq = str(record.seq)\n",
"        output.extend([tID, seq])\n",
"\n",
"#output_filename = \"/project/owlmayerTemporary/Sid/blast/test/polished_transcripts_tcons.fa\"\n",
"#output_file = open(output_filename,\"w+\")\n",
"#output_file.write(\"\\n\".join(output))\n",
"#output_file.close()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Parse the GFF into two lookups:\n",
"#   oID_geneID: transcript id (text before the first '|') -> gene id\n",
"#   geneID_oID: gene id -> transcript id\n",
"# NOTE(review): geneID_oID is overwritten on every record, so a gene with several\n",
"# transcripts keeps only the last one -- confirm that this is intended.\n",
"oID_geneID = dict()\n",
"geneID_oID = dict()\n",
"\n",
"filename = \"/home/annaldas/projects/nanopore-transcriptome-analysis/Results/Pinfish/clustered_transcripts_collapsed.gff\"\n",
"# The context manager releases the handle even if reading raises.\n",
"with open(filename, \"r\") as gff:\n",
"    lines = gff.readlines()\n",
"\n",
"for line in lines:\n",
"    # Lines starting with '##' are skipped.\n",
"    if line.startswith(\"##\"):\n",
"        continue\n",
"    s = line.split(\"\\t\")[-1]\n",
"    g_t = s.split(\";\")\n",
"    # Only records whose last column splits into more than two ';' pieces are used.\n",
"    if len(g_t) > 2:\n",
"        geneid = g_t[0].split('\"')[1]\n",
"        transcriptid = g_t[1].split('\"')[1].split(\"|\")[0]\n",
"        oID_geneID[transcriptid] = geneid\n",
"        geneID_oID[geneid] = transcriptid\n",
" "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(tID_oID[\"TCONS_00032753\"],oID_gene[tID_oID[\"TCONS_00032753\"]])\n",
"\n",
"# For every BRD4 oID print the oID, its gene id from the GFF and its transcript id.\n",
"for oID in gene_oID[\"BRD4\"]:\n",
"    try:\n",
"        print(oID, oID_geneID[oID.split(\"|\")[0]], oID_tID[oID])\n",
"    except KeyError:\n",
"        # Only a missing dictionary entry is expected here; any other error should surface.\n",
"        print(\"None\")\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Load the count table and derive one frame per replicate set by dropping the other set's columns.\n",
"filename = \"/home/annaldas/projects/nanopore-transcriptome-analysis/Results/Quantification/all_counts.txt\"\n",
"df = pd.read_csv(filename)\n",
"\n",
"rep1 = [\"OJ32\", \"OJ33\", \"OJ34\"]\n",
"rep2 = [\"OJ40\", \"OJ41\", \"OJ42\"]\n",
"df_rep1, df_rep2 = df.drop(columns=rep2), df.drop(columns=rep1)\n",
"\n",
"#for name in rep1:\n",
"# df_rep1[name] = (df_rep1[name] - df_rep1[name].min())/(df_rep1[name].max() - df_rep1[name].min())\n",
" \n",
"#for name in rep2:\n",
"# df_rep2[name] = (df_rep2[name] - df_rep2[name].min())/(df_rep2[name].max() - df_rep2[name].min())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# Print the df_rep1 rows for gene ABI2, ordered by transcript id.\n",
"# Grouping by the bare column name (not a one-element list) keeps the group keys as plain\n",
"# strings; with a list key, pandas >= 2.0 yields 1-tuples and the comparison below never matches.\n",
"# (A per-gene z-score normalisation over rep1 was sketched here by the author but is disabled.)\n",
"group = df_rep1.groupby(\"gene_name\")\n",
"\n",
"count = 0\n",
"for key, item in group:\n",
"    if key == \"ABI2\":\n",
"        print(key)\n",
"        print(item.sort_values(\"transcript_id\"))\n",
"        break"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# Print the df_rep2 rows for gene FUS, ordered by transcript id.\n",
"# Grouping by the bare column name (not a one-element list) keeps the group keys as plain\n",
"# strings; with a list key, pandas >= 2.0 yields 1-tuples and the comparison below never matches.\n",
"# (A per-gene z-score normalisation / threshold check over rep2 was sketched here but is disabled.)\n",
"group = df_rep2.groupby(\"gene_name\")\n",
"count = 0\n",
"for gene, item in group:\n",
"    if gene == \"FUS\":\n",
"        print(gene)\n",
"        print(item.sort_values(\"transcript_id\"))\n",
"        break"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Print the rows of the full count table for gene TLL3, ordered by transcript id.\n",
"# Grouping by the bare column name (not a one-element list) keeps the group keys as plain\n",
"# strings; with a list key, pandas >= 2.0 yields 1-tuples and the comparison below never matches.\n",
"# (A per-gene z-score normalisation / threshold check over rep2 was sketched here but is disabled.)\n",
"group = df.groupby(\"gene_name\")\n",
"count = 0\n",
"for gene, item in group:\n",
"    if gene == \"TLL3\":\n",
"        print(gene)\n",
"        print(item.sort_values(\"transcript_id\"))\n",
"        break"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Collect FASTA-style header/sequence pairs for every isoform of one gene.\n",
"output = []\n",
"gene = \"BRD4\"\n",
"for oID in gene_oID[gene]:\n",
"    record = transcripts[oID]\n",
"    tID = \">\" + oID_tID[record.id]\n",
"    seq = str(record.seq)\n",
"    output.extend([tID, seq])\n",
"\n",
" \n",
"#output_filename = \"/project/owlmayerTemporary/Sid/blast/test/kdm1a.fa\"\n",
"#output_file = open(output_filename,\"w+\")\n",
"#output_file.write(\"\\n\".join(output))\n",
"#output_file.close()\n",
"\n",
"\n",
"#/home/annaldas/ncbi-blast-2.9.0+/bin/blastx -query /project/owlmayerTemporary/Sid/blast/test/kdm1a.fa -db /project/owlmayerTemporary/Sid/blast/test/human.protein.fa -out kdm1a.out -evalue 1e-5 -max_target_seqs 5 -max_hsps 1 -outfmt 7 -num_threads 4"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"afilename = \"/project/owlmayerTemporary/Sid/nanopore-analysis/ReferenceData/gencode.v32.primary_assembly.annotation.gtf\"\n",
"# Load every line of the reference annotation; `with` closes the file even if the read fails.\n",
"with open(afilename, \"r\") as afile:\n",
"    afile_lines = afile.readlines()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Split every non-comment annotation line into its tab-separated fields.\n",
"output = [\n",
"    raw.strip().split(\"\\t\")\n",
"    for raw in afile_lines\n",
"    if not raw.startswith(\"#\")\n",
"]\n",
"gtf_columns = [\"chr\",\"source\",\"type\",\"start\",\"stop\",\"a\",\"b\",\"c\",\"info\"]\n",
"pd_aline = pd.DataFrame(output, columns=gtf_columns)\n",
"pd_aline = pd_aline.astype({\"start\": \"int32\", \"stop\": \"int32\"})\n",
"#print(pd_aline[\"type\"].unique())\n",
"# Keep only the rows whose feature type is UTR.\n",
"utr_rows = pd_aline[\"type\"] == \"UTR\"\n",
"pd_aline = pd_aline[utr_rows]\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"info = list(pd_aline[\"info\"])\n",
"types = list(pd_aline[\"type\"])\n",
"\n",
"# Print the split attribute fields and the feature type of every record of ENST00000391839.\n",
"for ann, attributes in enumerate(info):\n",
"    if \"gene_name\" not in attributes:\n",
"        continue\n",
"    line = attributes.split(\";\")\n",
"    transID = line[1].split(\" \")[-1][1:-1].split(\".\")[0]\n",
"    gene = line[3].split(\" \")[-1][1:-1]\n",
"\n",
"    if transID == \"ENST00000391839\":\n",
"        print(line, types[ann])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# Persist the UTR rows without the index column so a later cell can reload them with read_csv.\n",
"pd_aline.to_csv(\"/project/owlmayerTemporary/Sid/nanopore-analysis/ReferenceData/df_utr_regions.csv\", index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"df_utr_regions = pd.read_csv(\"/project/owlmayerTemporary/Sid/nanopore-analysis/ReferenceData/df_utr_regions.csv\")\n",
"\n",
"info, chrms, start, stop = (\n",
"    list(df_utr_regions[column]) for column in (\"info\", \"chr\", \"start\", \"stop\")\n",
")\n",
"\n",
"# Print one tab-separated line (chromosome, start, stop, transcript id) per EBF2 record.\n",
"for ann, attributes in enumerate(info):\n",
"    if \"gene_name\" not in attributes:\n",
"        continue\n",
"    line = attributes.split(\";\")\n",
"    transID = line[1].split(\" \")[-1][1:-1].split(\".\")[0]\n",
"    gene = line[3].split(\" \")[-1][1:-1]\n",
"\n",
"    if gene == \"EBF2\":\n",
"        print(\"\\t\".join([chrms[ann], str(start[ann]), str(stop[ann]), transID]))\n",
" "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Parse the UTR FASTA into trans_utr: header text (without '>') -> list of its sequence lines.\n",
"# `with` closes the file even if the read fails.\n",
"with open(\"/home/annaldas/projects/result/RPS24/RPS24_utr_regions.fa\") as bedfastafile:\n",
"    bedfastalines = bedfastafile.readlines()\n",
"\n",
"trans_utr = dict()\n",
"# Reset explicitly so a value left over from an earlier run of this cell cannot be reused.\n",
"trans_id = None\n",
"for line in bedfastalines:\n",
"    if line.startswith(\">\"):\n",
"        trans_id = line[1:].strip()\n",
"        trans_utr.setdefault(trans_id, [])\n",
"    elif trans_id is not None:\n",
"        # Sequence lines are attached to the most recent header; lines before any header are ignored.\n",
"        trans_utr[trans_id].append(line.strip())\n",
"\n",
"transcript_id = \"ENST00000372360\"\n",
"s = '''CTCTTTTCCTCCTTGGCTGTCTGAAGATAGATCGCCATCATGAACGACACCGTAACTATCCGCACTAGAAAGTTCATGACCAACCGACTACTTCAGAGGAAACAAATGGTCATTGATGTCCTTCACCCCGGGAAGGCGACAGTGCCTAAGACAGAAATTCGGGAAAAACTAGCCAAAATGTACAAGACCACACCGGATGTCATCTTTGTATTTGGATTCAGAACTCATTTTGGTGGTGGCAAGACAACTGGCTTTGGCATGATTTATGATTCCCTGGATTATGCAAAGAAAAATGAACCCAAACATAGACTTGCAAGACATGGCCTGTATGAGAAGAAAAAGACCTCAAGAAAGCAACGAAAGGAACGCAAGAACAGAATGAAGAAAGTCAGGGGGACTGCAAAGGCCAATGTTGGTGCTGGCAAAAAGTGAGCTGGAGATTGGATCACAGCCGAAGGAGTAAAGGTGCTGCAATGATGTTAGCTGTGGCCACTGTGGATTTTTCGCAAGAACATTAATAAACTAAAAAC'''"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Remove the first occurrence of each UTR entry of `transcript_id` from a working copy of `s`,\n",
"# printing where (or -1 if not) each entry was found.\n",
"if transcript_id in trans_utr:\n",
"    seq = s\n",
"    for utr in trans_utr[transcript_id]:\n",
"        pos = seq.find(utr)\n",
"        print(pos, utr)\n",
"        # Cutting out the first match only; a no-op when the entry is absent.\n",
"        seq = seq.replace(utr, \"\", 1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# For every key in trans_utr, remove the first occurrence of each of its UTR entries from a\n",
"# fresh copy of `s`, printing the position of each match (-1 when absent).\n",
"# NOTE(review): seq restarts from `s` for each key, so `a` ends up holding only the result\n",
"# for the last key iterated -- confirm that this is intended.\n",
"for t in trans_utr:\n",
"    seq = s\n",
"    for utr in trans_utr[t]:\n",
"        pos = seq.find(utr)\n",
"        print(pos, utr)\n",
"        seq = seq.replace(utr, \"\", 1)\n",
"    #print(t, len(trans_utr[t]),trans_utr[t])\n",
"a = seq"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"a = '''GGCGGCGGCGGCGGCGGCGGCGGCGGCGGCTGGGCTGTTTGTTCTGGTCTCCCGCAGCCGAGGAGCCGAAGCAGTGGCGGCGGCAGCGGCTGCGGCGGCTGCCGGCGGTGCCCGCGGGCGAGCGCGGCCTGTGAGCTCGGCAGAGCGGCGGGCGGGCCCCGGCGCCGCGCAGGCAGCTCGGGGAGGGGGCGGCGGCAGCGGGCGGACGGCCGGCGGGGGCGGCGTGCGGCCTAGCGTCTCAGAGTGCCTGGTGAAGAATGTGATGGGATCACTAGCATGTCTGCGGAGAGCGGCCCTGGGACGAGATTGAGAAATCTGCCAGTAATGGGGGATGGACTAGAAACTTCCCAAATGTCTACAACACAGGCCCAGGCCCAACCCCAGCCAGCCAACGCAGCCAGCACCAACCCCCCGCCCCCAGAGACCTCCAACCCTAACAAGCCCAAGAGGCAGACCAACCAACTGCAATACCTGCTCAGAGTGGTGCTCAAGACACTATGGAAACACCAGTTTGCATGGCCTTTCCAGCAGCCTGTGGATGCCGTCAAGCTGAACCTCCCTGATTACTATAAGATCATTAAAACGCCTATGGATATGGGAACAATAAAGAAGCGCTTGGAAAACAACTATTACTGGAATGCTCAGGAATGTATCCAGGACTTCAACACTATGTTTACAAATTGTTACATCTACAACAAGCCTGGAGATGACATAGTCTTAATGGCAGAAGCTCTGGAAAAGCTCTTCTTGCAAAAAATAAATGAGCTACCCACAGAAGAAACCGAGATCATGATAGTCCAGGCAAAAGGAAGAGGACGTGGGAGGAAAGAAACAGGGACAGCAAAACCTGGCGTTTCCACGGTACCAAACACAACTCAAGCATCGACTCCTCCGCAGACCCAGACCCCTCAGCCGAATCCTCCTCCTGTGCAGGCCACGCCTCACCCCTTCCCTGCCGTCACCCCGGACCTCATCGTCCAGACCCCTGTCATGACAGTGGTGCCTCCCCAGCCACTGCAGACGCCCCCGCCAGTGCCCCCCCAGCCACAACCCCCACCCGCTCCAGCTCCCCAGCCCGTACAGAGCCACCCACCCATCATCGCGGCCACCCCACAGCCTGTGAAGACAAAGAAGGGAGTGAAGAGGAAAGCAGACACCACCACCCCCACCACCATTGACCCCATTCACGAGCCACCCTCGCTGCCCCCGGAGCCCAAGACCACCAAGCTGGGCCAGCGGCGGGAGAGCAGCCGGCCTGTGAAACCTCCAAAGAAGGACGTGCCCGACTCTCAGCAGCACCCAGCACCAGAGAAGAGCAGCAAGGTCTCGGAGCAGCTCAAGTGCTGCAGCGGCATCCTCAAGGAGATGTTTGCCAAGAAGCACGCCGCCTACGCCTGGCCCTTCTACAAGCCTGTGGACGTGGAGGCACTGGGCCTACACGACTACTGTGACATCATCAAGCACCCCATGGACATGAGCACAATCAAGTCTAAACTGGAGGCCCGTGAGTACCGTGATGCTCAGGAGTTTGGTGCTGACGTCCGATTGATGTTCTCCAACTGCTATAAGTACAACCCTCCTGACCATGAGGTGGTGGCCATGGCCCGCAAGCTCCAGGATGTGTTCGAAATGCGCTTTGCCAAGATGCCGGACGAGCCTGAGGAGCCAGTGGTGGCCGTGTCCTCCCCGGCAGTGCCCCCTCCCACCAAGGTTGTGGCCCCGCCCTCATCCAGCGACAGCAGCAGCGATAGCTCCTCGGACAGTGACAGTTCGACTGATGACTCTGAGGAGGAGCGAGCCCAGCGGCTGGCTGAGCTCCAGGAGCAGCTCAAAGCCGTGCACGAGCAGCTTGCAGCCCTCTCTCAGCCCCAGCAGAACAAACCAAAGAAAAAGGAGAAAGACAAGAAGGAAAAGAAAAAAGAAAAGCACAAAAGGAAAGAGGAAGTGGAAGAGAATAAAAAAAGCAAAGCCAAGGAACCTCCTCCTAAAAAG
ACGAAGAAAAATAATAGCAGCAACAGCAATGTGAGCAAGAAGGAGCCAGCGCCCATGAAGAGCAAGCCCCCTCCCACGTATGAGTCGGAGGAAGAGGACAAGTGCAAGCCTATGTCCTATGAGGAGAAGCGGCAGCTCAGCTTGGACATCAACAAGCTCCCCGGCGAGAAGCTGGGCCGCGTGGTGCACATCATCCAGTCACGGGAGCCCTCCCTGAAGAATTCCAACCCCGACGAGATTGAAATCGACTTTGAGACCCTGAAGCCGTCCACACTGCGTGAGCTGGAGCGCTATGTCACCTCCTGTTTGCGGAAGAAAAGGAAACCTCAAGCTGAGAAAGTTGATGTGATTGCCGGCTCCTCCAAGATGAAGGGCTTCTCGTCCTCAGAGTCGGAGAGCTCCAGTGAGTCCAGCTCCTCTGACAGCGAAGACTCCGAAACAGGTCCTGCCTAATCATTGGACACGGACTCTTAATAAAACGGTCTTCAGTTCCAGATTCCTTCCCAGCAAGCTATAGCTTAAGTCCATTTTCTTCCGTGAAAGGGACAGGACTCCATCAAGTTATGGAATTCCTCAGAGCCCTGGGCCTGTCCCCCGGGGTGGATTAGTCATGTCCAGCAGCACACGCCTAGTCCCGCCTTCGGGAAGGCTGCCTGCCTGGCCAGCCGCCCAGGCCTCTCTGTGTAAAGACTGCCTGGCTGTCCTGCCCAGCCTTCCTGGTTCTCTGGGGTCCTCTGGGTGGGTGGCATCTCCTGGAGGGTGATGACAATCCCCAACACATGCATTCATGTGGTGCTACTCTGTGTGCAAAGCCAGACCCCAAGTATGTTTTCTCTCTTTGTCCCATCCCTCTTTTTCTGGGACTTTGGACCCTAACTACTTCCCTCCTGAACCTTGCAGTGACATCAGTCCAGGAGAGCTCTCGTTCAGTGTGCGGAAGAACACTCTGACCTCTAGAGCTGTCCTAGATAAGGAGTGGGAGCTTTAGAGGCAAGGCCTCTAGACCCTGGAAGGCTCAGTGAGGCTCTTCCCACAGCATGCTTCTCACTGGTGCCCTGTAAGGCTCGAGCCACCGCTGACTCTGAGCCTTTTGGAGTCTTTCCTCCTTCGTCTCCATTGTTCCCGTGCATTTCCAAAAGCTTAAGTTGCCTGGTGGGCATTTCCCCAGTTTCTTTGGCCTCCGTCTTCTCAAGTCACATAGGGAAAGTACCTCCTGGAACCAGGCTGCAGTATGCAGGACCTGCCAGGCAGGCACTGGTGAAGGGCCTTGGGCCTATCATCCCCCCAACCCCACCTCACCCCACCCGCCTCCTCTAGTGGGGTGAGTCTGGGCTGGTGGACCAGAGAGGGTGTCACAGACCCTCAGGGACTGCCCCATGGACACCTCTGACTGGTGTTAACAGTGTGAACATTTTCCCCGTCTTCAGTCCCTTAGAATGACGACAGCCCCTGGGGTTGGGGCAGGCGAGTGTGGCCACATCATCCAAGCCCTCCCAGAGACACAAATAGGCTTTTTTGCTCTAAAAATAAATACCAGCCCTTTTTTGGTCACAAATCCAGCATCTCAGCAGAAAACTGCCTGACATGAAAAGTCCCCTGAGGAACTGCATCTGCGTTTCAGGGGCTTTTCATTTTTTCTCCTTTTTTAAAGTGTAGATTGTGGGTGCTTCCTAGAGGCCTGCCTTCTTCTGGAACTGGAAGTGGGCTATCACCATGGGCAAGCCCTTGGGTGCAGGCTCCCCACCTGCCTGGGAACTCTGGCAGCTCTCCTCAGCTCCTTGGGCTTGAGCAGCTGCAACTGCCCCAGATTTGCTGTGGAAGCAGGGGCTAGCCCTGGCCTCACCAGGGCCTCCCGGGGCCCTGCATTGATGCTCAGGAGTTCCTGGGCTGCTCTTGATCCTTTCTGGGCATCCAGCTTCCAGTTAAGCTCTGTTTGCCAAACAAACTATTCTCAGCTGCCCTTTGGCCTGCGCCTGATGTGTTCCTGTTGCAGTCCCGCC
TGCCTGAGACAGGAGCAGGCAGGAGAGCCTTCATGCCCAGATTCCCACAGGACAATTGGGGAGCTGCTGGCATTGTCTTTCTGGGAAGATTCTGCTTTCTTGGACCAAATGGCAGCCTGATTACCAGTGTCGGGCCTGCATGCTGCCCCCGACACACGCACGCACGCGCACACACGTGTGCACATGGGCCATAGCCACAAGCCAGCTCTCCTCCAGGGTCCTTTCAACCTCGCTGTCCAGGGACCCTGTCCTTCTTGCCCGTGGGGCTTCCATCTGGCAGAGAACGTTCAGGGCTTGTTGAACTTGAAAGCTCATTAGACTTAAGCTGTCACCTGTGCTTGGTGCCCCAGGAACAGCCAGAGAGGACAGTGCCCACTCACTTCTTGTTGGCAGCCTCCTGTGCAGGAAGTGCCAGCCGGGCCTCGACGCACCAGCTGGCTGTGGGTCCTGAGGAGGGGCGGGAGGCGGCCGCTCAGTGCAGATGGGGACTCCTCTCCTCTGCCCTGACCTTACCCTCCATTACCTCCTTCACTGGAGTGGGGCTGGGGGGTGGGTGGAATCAGTGTTTTAATCGGATTTTTAAAAAACATTTTATTTCTTTGTACAATTACCATCCTATGTAAAGATGAAATTTGTGTTGAGTTGAAGATTGTCATGGAATAAAGATCACACCGTA'''\n",
"\n",
"#a = '''GTACGCGACGGCGGTTGGCGGCGCGCGGGCAGCGTGAAGCGAGGCGAGGCAAGGCTTTTCGGACCCACGGAGCGACAGAGCGAGCGGCCCCTACGGCCGTCGGCGGCCCGGCGGCCCGAGATGTTATCTGGGAAGAAGGCGGCAGCCGCGGCGGCGGCGGCTGCAGCGGCAGCAACCGGGACGGAGGCTGGCCCTGGGACAGCAGGCGGCTCCGAGAACGGGTCTGAGGTGGCCGCGCAGCCCGCGGGCCTGTCGGGCCCAGCCGAGGTCGGGCCGGGGGCGGTGGGGGAGCGCACACCCCGCAAGAAAGAGCCTCCGCGGGCCTCGCCCCCCGGGGGCCTGGCGGAACCGCCGGGGTCCGCAGGGCCTCAGGCCGGCCCTACTGTCGTGCCTGGGTCTGCGACCCCCATGGAAACTGGAATAGCAGAGACTCCGGAGGGGCGTCGGACCAGCCGGCGCAAGCGGGCGAAGGTAGAGTACAGAGAGATGGATGAAAGCTTGGCCAACCTCTCAGAAGATGAGTATTATTCAGAAGAAGAGAGAAATGCCAAAGCAGAGAAGGAAAAGAAGCTTCCCCCACCACCCCCTCAAGCCCCACCTGAGGAAGAAAATGAAAGTGAGCCTGAAGAACCATCGGGGCAAGCAGGAGGACTTCAAGACGACAGTTCTGGAGGGTATGGAGACGGCCAAGCATCAGGTGTGGAGGGCGCAGCTTTCCAGAGCCGACTTCCTCATGACCGGATGACTTCTCAAGAAGCAGCCTGTTTTCCAGATATTATCAGTGGACCACAACAGACCCAGAAGGTTTTTCTTTTCATTAGAAACCGCACACTGCAGTTGTGGTTGGATAATCCAAAGATTCAGCTGACATTTGAGGCTACTCTCCAACAATTAGAAGCACCTTATAACAGTGATACTGTGCTTGTCCACCGAGTTCACAGTTATTTAGAGCGTCATGGTCTTATCAACTTCGGCATCTATAAGAGGATAAAACCCCTACCAACTAAAAAGACAGGAAAGGTAATTATTATAGGCTCTGGGGTCTCAGGCTTGGCAGCAGCTCGACAGTTACAAAGTTTTGGAATGGATGTCACACTTTTGGAAGCCAGGGATCGTGTGGGTGGACGAGTTGCCACATTTCGCAAAGGAAACTATGTAGCTGATCTTGGAGCCATGGTGGTAACAGGTCTTGGAGGGAATCCTATGGCTGTGGTCAGCAAACAAGTAAATATGGAACTGGCCAAGATCAAGCAAAAATGCCCACTTTATGAAGCCAACGGACAAGCTGTTCCTAAAGAGAAAGATGAAATGGTAGAGCAAGAGTTTAACCGGTTGCTAGAAGCTACATCTTACCTTAGTCATCAACTAGACTTCAATGTCCTCAATAATAAGCCTGTGTCCCTTGGCCAGGCATTGGAAGTTGTCATTCAGTTACAAGAGAAGCATGTCAAAGATGAGCAGATTGAACATTGGAAGAAGATAGTGAAAACTCAGGAAGAATTGAAAGAACTTCTTAATAAGATGGTAAATTTGAAAGAGAAAATTAAAGAACTCCATCAGCAATACAAAGAAGCATCTGAAGTAAAGCCACCCAGAGATATTACTGCCGAGTTCTTAGTGAAAAGCAAACACAGGGATCTGACCGCCCTATGCAAGGAATATGATGAATTAGCTGAAACACAAGGAAAGCTAGAAGAAAAACTTCAGGAGTTGGAAGCGAATCCCCCAAGTGATGTATATCTCTCATCAAGAGACAGACAAATACTTGATTGGCATTTTGCAAATCTTGAATTTGCTAATGCCACACCTCTCTCAACTCTCTCCCTTAAGCACTGGGATCAGGATGATGACTTTGAGTTCACTGGCAGCCACCTGACAGTAAGGAATGGCTACTCGTGTGTGCCTGTGGCTTTAGCAGAAGGCCTAGACATTAAACTGAATACAGCAGTGCGACAGGTTCGCTACACGGCTTCAGGATGTGAAGTGATAGC
TGTGAATACCCGCTCCACGAGTCAAACCTTTATTTATAAATGCGACGCAGTTCTCTGTACCCTTCCCCTGGGTGTGCTGAAGCAGCAGCCACCAGCCGTTCAGTTTGTGCCACCTCTCCCTGAGTGGAAAACATCTGCAGTCCAAAGGATGGGATTTGGCAACCTTAACAAGGTGGTGTTGTGTTTTGATCGGGTGTTCTGGGATCCAAGTGTCAATTTGTTCGGGCATGTTGGCAGTACGACTGCCAGCAGGGGTGAGCTCTTCCTCTTCTGGAACCTCTATAAAGCTCCAATACTGTTGGCACTAGTGGCAGGAGAAGCTGCTGGTATCATGGAAAACATAAGTGACGATGTGATTGTTGGCCGATGCCTGGCCATTCTCAAAGGGATTTTTGGTAGCAGTGCAGTACCTCAGCCCAAAGAAACTGTGGTGTCTCGTTGGCGTGCTGATCCCTGGGCTCGGGGCTCTTATTCCTATGTTGCTGCAGGATCATCTGGAAATGACTATGATTTAATGGCTCAGCCAATCACTCCTGGCCCCTCGATTCCAGGTGCCCCACAGATTTTAGTCATTCCATCTTCGGACCCTTTCAGATAACCAAGAGCAGAGTTAAAAGGATGGGCAGCATTTCTGATTTCTCATAATTTGGTTTTGTAAATCTAGGCAGTCTACATTCAGAATGGAGGAGTCCAGAGTATAAGTCAAATAACATTTTTCCTTATTCAGGTTTTCTCCCTAAAAAACAAAACCATTTTTAATTGCACTTCCATTTTGTAATGGCTCATAGGATTGCTGTAAGGTCTCAATTACTAGGGGCTTCCAGGGCATTTCTGAGAAATAACCCTGGGTCCTTGTCTAGACCCTTATGCCAGACCCCACTCCAAAGAGCGGTAAGAATTCCTTAGTGTCATAGCCCAGACCTGCTGAGCTGCGAGGCTTAAGTGTCCCTGATCACCAAATGTCCTGTGCTTCAGGGTAGCGAGGCTCCCTCCCAGAAGGTAGTTATACATGAGGGAAGGACGCTTTACAACTGGGTATCTAAACTGATGAGAACACATGTTAAGCATCACTTTAGGACTGAGCCTAGGTAGAGTTTTATTGTCTCATTTCTACTTGTCAATTCTGGGAAAGTGCCTACTGATAAGGGAGACTCTTCGATAGAATGATGAATAGTAATTGGGGGGGTCAGCCTTTAAAAAGGTCAACAGCAATTTAAGTACTTAGCAATTTAAGTACAAGAATAAAGGTATATGTGCAGCCTGCCAATTTTCTCTTTTTCCCCTAAAATAGCCGATTCCACGACTCTTCTTTGCGGGAGAACATACGATCCGTAACTACCCAGCCACAGTGCATGGTGCTCTGCTGAGTGGGCTGCGAGAAGCGGGAAGAATTGCAGACCAGTTTTTGGGGGCCATGTATACGCTGCCTCGCCAGGCCACACCAGGTGTTCCTGCACAGCAGTCCCCAAGCATGTGAGACAGATGCATTCTAAGGGAAGAGGCCCATGTGCCTGTTTCTGCCATGTAAGGAAGGCTCTTCTAGCAATACTAGATCCCACTGAGAAAATCCACCCTGGCATCTGGGCTCCTGATCAGCTGATGGAGCTCCTGATTTGACAAAGGAGCTTGCCTCCTTTGAATGACCTAGAGCACAGGGAGGAACTTGTCCATTAGTTTGGAATTGTGTTCTTCGTAAAGACTGAGGCAAGCAAGTGCTGTGAAATAACATCATCTTAGTCCCTTGGTGTGTGGGGTTTTTGTTTTTTTTTTATATTTTGAGAATAAAACTTCATATAAAAA'''\n",
"\n",
"#a = '''GCTTGGCGCGTGCGTACGCGACGGCGGTTGGCGGCGCGCGGGCAGCGTGAAGCGAGGCGAGGCAAGGCTTTTCGGACCCACGGAGCGACAGAGCGAGCGGCCCCTACGGCCGTCGGCGGCCCGGCGGCCCGAGATGTTATCTGGGAAGAAGGCGGCAGCCGCGGCGGCGGCGGCTGCAGCGGCAGCAACCGGGACGGAGGCTGGCCCTGGGACAGCAGGCGGCTCCGAGAACGGGTCTGAGGTGGCCGCGCAGCCCGCGGGCCTGTCGGGCCCAGCCGAGGTCGGGCCGGGGGCGGTGGGGGAGCGCACACCCCGCAAGAAAGAGCCTCCGCGGGCCTCGCCCCCCGGGGGCCTGGCGGAACCGCCGGGGTCCGCAGGGCCTCAGGCCGGCCCTACTGTCGTGCCTGGGTCTGCGACCCCCATGGAAACTGGAATAGCAGAGACTCCGGAGGGGCGTCGGACCAGCCGGCGCAAGCGGGCGAAGGTAGAGTACAGAGAGATGGATGAAAGCTTGGCCAACCTCTCAGAAGATGAGTATTATTCAGAAGAAGAGAGAAATGCCAAAGCAGAGAAGGAAAAGAAGCTTCCCCCACCACCCCCTCAAGCCCCACCTGAGGAAGAAAATGAAAGTGAGCCTGAAGAACCATCGGGGCAAGCAGGAGGACTTCAAGACGACAGTTCTGGAGGGTATGGAGACGGCCAAGCATCAGGTGTGGAGGGCGCAGCTTTCCAGAGCCGACTTCCTCATGACCGGATGACTTCTCAAGAAGCAGCCTGTTTTCCAGATATTATCAGTGGACCACAACAGACCCAGAAGGTTTTTCTTTTCATTAGAAACCGCACACTGCAGTTGTGGTTGGATAATCCAAAGATTCAGCTGACATTTGAGGCTACTCTCCAACAATTAGAAGCACCTTATAACAGTGATACTGTGCTTGTCCACCGAGTTCACAGTTATTTAGAGCGTCATGGTCTTATCAACTTCGGCATCTATAAGAGGATAAAACCCCTACCAACTAAAAAGACAGGAAAGGTAATTATTATAGGCTCTGGGGTCTCAGGCTTGGCAGCAGCTCGACAGTTACAAAGTTTTGGAATGGATGTCACACTTTTGGAAGCCAGGGATCGTGTGGGTGGACGAGTTGCCACATTTCGCAAAGGAAACTATGTAGCTGATCTTGGAGCCATGGTGGTAACAGGTCTTGGAGGGAATCCTATGGCTGTGGTCAGCAAACAAGTAAATATGGAACTGGCCAAGATCAAGCAAAAATGCCCACTTTATGAAGCCAACGGACAAGCTGTTCCTAAAGAGAAAGATGAAATGGTAGAGCAAGAGTTTAACCGGTTGCTAGAAGCTACATCTTACCTTAGTCATCAACTAGACTTCAATGTCCTCAATAATAAGCCTGTGTCCCTTGGCCAGGCATTGGAAGTTGTCATTCAGTTACAAGAGAAGCATGTCAAAGATGAGCAGATTGAACATTGGAAGAAGATAGTGAAAACTCAGGAAGAATTGAAAGAACTTCTTAATAAGATGGTAAATTTGAAAGAGAAAATTAAAGAACTCCATCAGCAATACAAAGAAGCATCTGAAGTAAAGCCACCCAGAGATATTACTGCCGAGTTCTTAGTGAAAAGCAAACACAGGGATCTGACCGCCCTATGCAAGGAATATGATGAATTAGCTGAAACACAAGGAAAGCTAGAAGAAAAACTTCAGGAGTTGGAAGCGAATCCCCCAAGTGATGTATATCTCTCATCAAGAGACAGACAAATACTTGATTGGCATTTTGCAAATCTTGAATTTGCTAATGCCACACCTCTCTCAACTCTCTCCCTTAAGCACTGGGATCAGGATGATGACTTTGAGTTCACTGGCAGCCACCTGACAGTAAGGAATGGCTACTCGTGTGTGCCTGTGGCTTTAGCAGAAGGCCTAGACATTAAACTGAATACAGCAGTGCGACAGGTTCGCTACACGGCTTCAGGAT
GTGAAGTGATAGCTGTGAATACCCGCTCCACGAGTCAAACCTTTATTTATAAATGCGACGCAGTTCTCTGTACCCTTCCCCTGGGTGTGCTGAAGCAGCAGCCACCAGCCGTTCAGTTTGTGCCACCTCTCCCTGAGTGGAAAACATCTGCAGTCCAAAGGATGGGATTTGGCAACCTTAACAAGGTGGTGTTGTGTTTTGATCGGGTGTTCTGGGATCCAAGTGTCAATTTGTTCGGGCATGTTGGCAGTACGACTGCCAGCAGGGGTGAGCTCTTCCTCTTCTGGAACCTCTATAAAGCTCCAATACTGTTGGCACTAGTGGCAGGAGAAGCTGCTGGTATCATGGAAAACATAAGTGACGATGTGATTGTTGGCCGATGCCTGGCCATTCTCAAAGGGATTTTTGGTAGCAGTGCAGTACCTCAGCCCAAAGAAACTGTGGTGTCTCGTTGGCGTGCTGATCCCTGGGCTCGGGGCTCTTATTCCTATGTTGCTGCAGGATCATCTGGAAATGACTATGATTTAATGGCTCAGCCAATCACTCCTGGCCCCTCGATTCCAGGTGCCCCACAGCCGATTCCACGACTCTTCTTTGCGGGAGAACATACGATCCGTAACTACCCAGCCACAGTGCATGGTGCTCTGCTGAGTGGGCTGCGAGAAGCGGGAAGAATTGCAGACCAGTTTTTGGGGGCCATGTATACGCTGCCTCGCCAGGCCACACCAGGTGTTCCTGCACAGCAGTCCCCAAGCATGTGAGACAGATGCATTCTAAGGGAAGAGGCCCATGTGCCTGTTTCTGCCATGTAAGGAAGGCTCTTCTAGCAATACTAGATCCCACTGAGAAAATCCACCCTGGCATCTGGGCTCCTGATCAGCTGATGGAGCTCCTGATTTGACAAAGGAGCTTGCCTCCTTTGAATGACCTAGAGCACAGGGAGGAACTTGTCCATTAGTTTGGAATTGTGTTCTTCGTAAAGACTGAGGCAAGCAAGTGCTGTGAAATAACATCATCTTAGTCCCTTGGTGTGTGGGGTTTTTGTTTTTTTTTTATATTTTGAGAATAAAACTTCATATAAAA'''\n",
"a = '''GGGAAGTGTCTGAACAAAACAAGGAACAAAAATGAGTGGGTGGGATGAGATGGAAGAAACCCCAAAGAACCTAAATACGTCACACTTGAAGTGGCACAGTGGTATATGAGAAAGTGCCTGAGTGGGCATGGACATCGAAGAAGTGGGGGGACCTCTGGCTGGATGCCTGAACCACACCCTCACAGGGAGGTTTGGCAGATCCAAAGACAATAAAGGACCTTTCAGGTCAAAAAGAGGGGATGTAGGAGAATAAACAGTGCTAGAAATGTTTCAATCAGTTTGTCTCTGCATACATATAAATTATATACTTGTATCTTACATCTTAGGGGCTCTTCTTGGGCGTCACCCGTGCCTTGTGGCTGGGGCATGTACATACACAAGTGGACACACAGGCAGAGCAGCCACCTGTGGGCTTTCTTGGACAGGAATGTGTGTGTATGTGCATGGGGGGTGAGGATCCTATTTTGGGGGATGTAGACTTATATCTAAGAGTATCTGGTTAACCCTGAGCTTAAATGAAAGGAGGAGTTAGGTTGAGGCAGGCAAGGTGGGAGAAGTGGCCCAAGTCCTTTGGTGAGTGGGGGGACAGGATGGAGTAGGGGGGACAGGATGGAGTAGGGTGGAGGGGAAGAATACTGTCTGGAGTCTGGCCAGGGTTCTGCTAGAGCACACCCTCCACCTCAGCCAGGGTCCACAAGGATGGGTACCGGGCTCTGCGTCACAGCTTCAGCTTGGGGTGGTTGCTATGAGTCTGCGTGGCTCCCGCCCAGGGCAGACAGGGACAGGTCACAGGAGAGGGGCTAGGTAATCCCTGGCAGTAGTTCCTGTACAGAGGTGGTCTGGGGTCCAGGGGGTCCCCTGGGCCTAGCCTAGGCAACAGTTGGTTCACAAAGAAATGTCAGGGAGACGCCAGCATTAAAAAAAGAGAGATGTGTTTATTCCATGATCAGTACAGACCAAATGCATATTCACCGTATGAAAGTCAAACCAGTCAGTGACTCCAGAGTTTGGCCAACACTGAGGCACCAGCGTCGTGGTGTAGAGTGGGTTCTCATGGCACGCGTAACCTCACCAGGGGCTCCAATTATAAAAATTAAAAAAAAAAAAAAAAAAAAG'''\n",
"\n",
"a = '''GCTCAGTCCTCCAGGCGTCGGTACTCAGCGGTGTTGGAACTTCGTTGCTTGCTTGCCTGTGCGCGCGTGCGCGGACATGGCCTCAAACGATTATACCCAACAAGCAACCCAAAGCTATGGGGCCTACCCCACCCAGCCCGGGCAGGGCTATTCCCAGCAGAGCAGTCAGCCCTACGGACAGCAGAGTTACAGTGGTTATAGCCAGTCCACGGACACTTCAGGCTATGGCCAGAGCAGCTATTCTTCTTATGGCCAGAGCCAGAACACAGGCTATGGAACTCAGTCAACTCCCCAGGGATATGGCTCGACTGGCGGCTATGGCAGTAGCCAGAGCTCCCAATCGTCTTACGGGCAGCAGTCCTCCTACCCTGGCTATGGCCAGCAGCCAGCTCCCAGCAGCACCTCGGGAAGTTACGGTAGCAGTTCTCAGAGCAGCAGCTATGGGCAGCCCCAGAGTGGGAGCTACAGCCAGCAGCCTAGCTATGGTGGACAGCAGCAAAGCTATGGACAGCAGCAAAGCTATAATCCCCCTCAGGGCTATGGACAGCAGAACCAGTACAACAGCAGCAGTGGTGGTGGAGGTGGAGGTGGAGGTGGAGGTAACTATGGCCAAGATCAATCCTCCATGAGTAGTGGTGGTGGCAGTGGTGGCGGTTATGGCAATCAAGACCAGAGTGGTGGAGGTGGCAGCGGTGGCTATGGACAGCAGGACCGTGGAGGCCGCGGCAGGGGTGGCAGTGGTGGCGGCGGCGGCGGCGGCGGTGGTGGTTACAACCGCAGCAGTGGTGGCTATGAACCCAGAGGTCGTGGAGGTGGCCGTGGAGGCAGAGGTGGCATGGGGTAGGTGTCTCATGAGCCAGGGAGTATCTTTGGTGGGGAGTGTGGAGGATTGCATGAATCTCCCTGAAGCCAGTCCCTAGTGCATGGTTTAGTATTCTTGTTGTCTAGGGATCTGTGAGGGCTTTGATTTGGGGGCAGTGACTTTCTTTTTACATCCCCATTTTATTTTTGTGAGAACTTGGGAGCCTGAACTCCCATCCATACCACTGAATAGAGATTTTGAGTAATGATACTTGTTTCCAAAAAAAAAGAAACCATACATAGATACGTATGGATTGGAGTCATTAATATCCTAGGCAAGAAACATGGAAGTGAAGACTTCTTTCTCTGCAAGGGAAACCGATGATCCCACTCCTGGGAAATAGTAGGGAAACTTGGTATGTGTATTCCCATGTGTCCTCTAGGGAGTTGGTAATGGTTAACCTGACTTCAGCTTCCAGGAATTGGCTACTCTTCCCGTTTTCTATAGTCATTTGAATCCACGAGCTTGATTTGCACTAATTTGACCGACATTGATTTTGTGTGTGACTTGGTTTATGGGGCCAGCTGACTGAAGTAAGCAGACCTTTTGGGCAAAAATATGCTTTGACAGTGGTCTCCCACCTATTTGTTCCACTGTCTGCCTTCCCCTGGTTACTTAAAATTCATCAGCTTGTCCAACTGGACCTTCTTTCCTTCCTGCTGAAGTTGATTTGAAGTAAAACCTTAGATTTGATGTTAAAACAGTTGTCAAATCTGTTGGTAAATAAGATTTGAAGGACCCTACTCTGTCTCCCTTGAAAAAGGGGAGGAATGTCAGTGTTACTGTTTTTGGAAAAAGTAGATTTTTAAACCGAGTTTGGAAATGGTAAGTATGCAGAGGTGGGTGGGGGCAATCTCAAAAACGTGCAAAAATGAGGAAAACAAAAATGAGGAAATGTGTGCGTGTGTTTAATGCAAAACTTTAAAAAGAAAAACAACTGTTATGTGACTGTTAACTTGCTCTGCATTTTATGTGCCACAGGTATGAAAGGTGACATTGCAAAATACTCCGCTCTTCTCGCAGTGTAGAAGGGGTGACCCCGGGGGTTGGGGGAGATCAAAAACAGCTCAGTAGTTAGGACAGAGCTTAGCTAAGTTTGTCTTGCTTTAAGGGGAAGTTGCCTTTGGTTT
TGACTTTTTATGGAATGGGGTTGGGTCTGCTTGCTGCTTTCAAAGCAAAAACCACAAAAATGTGTTCAAGGCTACCCCAGCCTGGTGTGAAATGTCTTCTGGGTAAATTGGGGTAGGGTTTTTAAACCAACTACTTGGTTGTCAACCACTTGCGACAAGAGGAAAAAAAAACATCTGCTCCATCGGAAGAACGACCAAGGAAAATGGGTTATTTTTTTTCCAGAGGAAATAGATAACGTAACCTTTTAAAGCAAAATCTTTATAAACTGTGTCTGAGAAATTGCACACGTGTGTGTGACATGCTCAAAGGTCAGACAAGGGGTGGTCAGGAAGGGATGTATTTTAGTAGCCACTTGTATCTTTTTCCAAAAACACCTACCCATGTTTGGGGAATGTTAAACAAAATCAAAAAACAACCTTTTGTAGCCGTTGGAAGCTTCATGTCCTTTCTTCTAACTTGTCTTCTCCAGCGGAAGTGACCGTGGTGGCTTCAATAAATTTGGTGGTAAGTGAACAGAGTTTCCAAAATTCCCAACTCCCAGCAATGCTTTGTCTGATTGTTCATTTGCAGATGTCTTAGCGTGTTAATTTAAATGTCAAAGGTTTTGAGGTGTCCAGAACCACCTCCAGAAAGGGGTAGGGTAGAATGCCACCTGTTGCCTGGTGTGTGCTAACCTGGAGCAGGTAGGGGTAAGACTCAATAGTCATCTTTTACCAAATGGGTTTGCCCCAGGTTAATAAGAGGGGTCTAGTAGGCCTTGGACTGGGCCGTTGCCACACCTGGCACTTAGTGACCATCATCATGAGAAACTGGAGAGTGCGTGCTGGAACACGTGGTGCCATCTTGGCTTTAGGATCCTTTTGATCGTTGTGTCCAAGGCTTGTGTGTGTGTGAGTGTGTGGGAGACAACTCCGAATGTTTAATTCTGGAAGAGGGATGTAACATTGCCCTGAGGATGGTGAAGTTGGTATACATTTATAAAGTACGGAATGGTGTCAATGAATGCAATTCTATGTATATGGACTTAACTGAGATGGGCAAATAGAAACTAGCTCTGGGAAGGAACATGTGCACTACTTCAAGAAAGATTGGAAGCATGTGTGGCTCATGGGAAATAACCAGGTCTTAAACAGCACAAACTGAATTCGTGGACCAGGAAGGTCTTAAACAGCACAAACTGAATTCATGGAAAAATGACAAATTTGAGAAGTCTCCCAGTAAGCTGGAACTTTTCTGGTTTGGTTAACAAAAGGTTTCTTGATTTGTTTCAAGATTTAAAGCCAAAGGTGTGGGTTCATGACTTAGGTGTCATTGCGTGTGGGTACAATATTTATATATGGCGAATTCAGATAAACATTGGTCAAAGATGGTCTCTGGAAAAACAAAATAGAGGCTGCATTACGGAAATAAGATTTCTGGTCTGTTCCCTGGGACATGCTTAAAAAATACAATAGCTATTATGTATGGTTTTTATTTTCATGTGGTTTCGGGGAAACAACACGGTTTTAAGGATGGTTTCTAAAGATGAAATTAAAAATTGTTCCACAAGGGTTAAGTGTCTGGTGGTAAAGTTGGGAGAAACTGGATGGATGCACATCGCATGGCTGGTGGCGAGCCCATCTCTCTTCTCTCGGGTGAGAGAACCGGGCCAAGCTGAGTTGGTTTGTTCACTTTAATGGGTCTCCGTTTCCCCTGCCACCTGTGCTGAGGACATTTCCCAGCCTGAGCTGGGGGAGGCAGCATTTGCTGAAGTGTGGAGTTGTCTCTGTGGAGACTCAAGTTACAGATCTTAAGGGGCCTGCCTAGAATTTTCTCCTCTGGGCAGGCGACCCAGGAAAGGGTTTGGAGTGAGGCTGTGAGCACTTACTTGATATTTTACAAGTTTGGATTTGGTGTTAATTTTTTTCCTTGTCCGTTTTTTCCTGTTGACTAACGGCTCATCTTTTCCTTGTTTTTGTTTTTTTTTTGTTCTTTTTTTCCATGTCACTAAAGGCCCTCG
GGACCAAGGATCACGTCATGACTCCGAACAGGATAATTCAGACAACAACACCATCTTTGTGCAAGGCCTGGGTGAGAATGTTACAATTGAGTCTGTGGCTGATTACTTCAAGCAGATTGGTATTATTAAGACAAACAAGAAAACGGGACAGCCCATGATTAATTTGTACACAGACAGGGAAACTGGCAAGCTGAAGGGAGAGGCAACGGTCTCTTTTGATGACCCACCTTCAGCTAAAGCAGCTATTGACTGGTTTGATGGTAAAGAATTCTCCGGAAATCCTATCAAGGTCTCATTTGCTACTCGCCGGGCAGACTTTAATCGGGGTGGTGGCAATGGTCGTGGAGGCCGAGGGCGAGGAGGACCCATGGGCCGTGGAGGCTATGGAGGTGGTGGCAGTGGTGGTGGTGGCCGAGGAGGATTTCCCAGTGGAGGTGGTGGCGGTGGAGGACAGCAGCGAGCTGGTGACTGGAAGTGTCCTAATCCCACCTGTGAGAATATGAACTTCTCTTGGAGGAATGAATGCAACCAGTGTAAGGCCCCTAAACCAGATGGCCCAGGAGGGGGACCAGGTGGCTCTCACATGGGGGGTAACTACGGGGATGATCGTCGTGGTGGCAGAGGAGGCTATGATCGAGGCGGCTACCGGGGCCGCGGCGGGGACCGTGGAGGCTTCCGAGGGGGCCGGGGTGGTGGGGACAGAGGTGGCTTTGGCCCTGGCAAGATGGATTCCAGGGGTGAGCACAGACAGGATCGCAGGGAGAGGCCGTATTAATTAGCCTGGCTCCCCAGGTTCTGGAACAGCTTTTTGTCCTGTACCCAGTGTTACCCTCGTTATTTTGTAACCTTCCAATTCCTGATCACCCAAGGGTTTTTTTGTGTCGGACTATGTAATTGTAACTATACCTCTGGTTCCCATTAAAAGTGACCATTTTAGTTAAA'''\n",
"\n",
"a = '''GGCTGCAGCCGGGCTCCGTGGCGCTCGCAGCCACCGCCTCCTCTCGGCTCCAGGTCTTCCCCTTCTTTTTACAACTGATCCTGTTGGGGATTTTTTTTTTTTCTAAATTGGAACGGTGGGGAGGAGCAGGGAGGGGGGACCTGGAGGAAGGGGAGAGATTAGGCAGCCATCAATTTCCTCCAGTTTCTCCCAGAACAGGTGATGCTTCTAAATTGTGATCACTTTCAGGAGGCAGCACTGCAGCTGGAAGGATGCGAGCGACCTAGGGTGGAGTGGCTGAGGCGGCAGATCTGAACTTGCGGAGGATAAGAACCCAAACTTTGACTACATCAGTCCGCACCTCGCCAGTGAAGCAAAGGACGGGTTATCTTTTTTTTTTTTCTAAGACTCAAACTTGGGCACTTGATCCCTTTTCTTGGATTGCTTTGGAGGAGACGATTTGCTGGCAACGTTGGGAACAGTCAGGACTGTGTTGTAACTCTTACTTTTAAAGCGACAGTAGAGGATCAGACTTTTTAAATGTTTGGAATTCAAGATACTTTAGGAAGAGGACCAACTCTGAAAGAGAAATCGCTGGGCGCGGAGATGGATTCGGTCAGGTCCTGGGTCCGGAATGTCGGAGTGGTGGACGCTAATGTCGCCGCGCAGAGCGGGGTCGCCCTGTCCCGGGCCCACTTTGAGAAACAGCCTCCTTCCAACTTGAGGAAATCCAACTTCTTTCACTTCGTCCTGGCGCTCTATGACAGGCAGGGCCAGCCGGTGGAGATCGAGCGGACGGCCTTCGTGGACTTTGTGGAGAATGACAAAGAACAAGGCAACGAGAAGACCAACAACGGCACTCACTACAAGTTACAGCTCCTCTACAGCAACGGTGTCCGCACGGAACAGGACCTCTATGTCAGGCTCATCGACTCGGTCACCAAGCAGCCCATCGCTTACGAGGGACAGAATAAGAATCCGGAAATGTGCCGAGTTCTCCTGACGCACGAAGTGATGTGTAGTCGATGCTGCGAAAAGAAAAGCTGTGGAAACCGAAATGAGACTCCATCGGACCCAGTCATAATTGACAGATTCTTTTTAAAATTTTTCCTCAAGTGCAATCAGAATTGTTTGAAAACAGCAGGAAACCCAAGGGACATGAGACGGTTTCAGGTTGTGTTGTCAACAACGGTGAATGTGGATGGACACGTCCTGGCTGTTTCTGACAACATGTTTGTTCATAACAACTCCAAGCATGGACGGAGAGCAAGAAGACTCGATCCATCGGAAGCTACCCCCTGCATCAAAGCCATTAGCCCGAGTGAAGGCTGGACCACAGGAGGAGCCATGGTCATCATCATCGGGGACAACTTCTTTGATGGTCTCCAAGTGGTGTTTGGGACTATGCTTGTATGGAGCGAGCTAATAACCCCTCATGCCATCAGAGTACAGACTCCTCCCCGGCACATCCCAGGCGTGGTAGAGGTGACATTATCTTATAAATCTAAACAGTTCTGCAAAGGAGCCCCAGGAAGGTTCATTTACACAGCATTAAATGAACCCACCATAGACTATGGCTTCCAGAGACTGCAGAAGGTCATCCCTAGGCATCCTGGAGATCCTGAGAGATTAGCTAAGGAGATGCTGTTGAAAAGAGCTGCAGATCTAGTGGAAGCTCTTTATGGCACACCACACAATAACCAGGACATCATTTTGAAGCGAGCCGCAGACATTGCTGAAGCTCTCTACAGCGTCCCCAGGAATCCCAGCCAGCTTCCAGCCCTCTCTAGCTCCCCAGCGCACAGTGGCATGATGGGAATCAACTCCTATGGCAGCCAGCTTGGGGTCAGCATCTCAGAGTCAACACAAGGAAATAATCAAGGGTACATCCGCAACACAAGCAGCATCTCTCCGCGGGGATACTCTTCCAGCTCCACGCCTCAACAGTCTAATTACAGTACCTCCAGCAACAGTATGAATGGCTACAGCAATGTCCCCATGGCCAACTTGGGT
GTTCCAGGTTCACCAGGATTTCTAAATGGCTCACCCACCGGCTCTCCTTATGGAATCATGTCATCAAGTCCCACCGTTGGGTCTTCCAGCACATCCTCCATCCTCCCATTTTCCTCTTCAGTTTTTCCTGCTGTCAAACAGAAGAGTGCCTTTGCCCCTGTCATCAGGCCCCAAGGCTCCCCTTCACCTGCCTGCTCCAGCGGCAATGGAAATGGATTCAGAGCCATGACCGGACTTGTTGTACCCCCGATGTAAAGAAGAACTGCTTTCTTATAGCACAAAACTACTTACTCTGATGGACCAATAATGAAGAAAGCACTAGGAGCTCTTTTGGGGGTGTAGTGGTGCCCCCACATGAACATGATGGACACCCTTGGGTCTGCAAGGAGCCAGCATCTTACTTGGTCCCACGTCCTCCTATAGCTCTGATGGTGGCTACACAAACTGACCCTCTTGGGACAAGGACAAAAGATGTCATTGACGTAGTCAGTGCTAAGAGCAGAAATGCAATTCTTTGTTATGAACATTATGAAAACCACCTTCCTATGTTTGTAAAATATTTAAGAAAAAATTGGCAAACAATTAATGCTTAATATTTTGGATACTATTTGTTTTTCTTTGTAGGAAAAAAAAGTTGAAAGTTTCTATTTTCTATGAAGCCTTTCAGATACCAATTTAGTTTATGCAGAAAAAAATTGAACAAAACAGGGTACCAGCACGGAAGACTTTCTTAAAACGCAACCTGAATTGAATGATGAAATGTTGTATGTGTGTTTGCTTATAGCTTAATCTCTTTAAAAAATGAACAAAAAAAA'''"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"a = \"TGGACAGCTCCCGCTCACCCAAACAGAAGACGTCGGCGCCGGAGCGGGCTCGGACATGGCGAGGCTGCGAGCCGGCCCGAGCGGCGGGGCCCGGTGATCCCTCCCTCCCTCCCCGTCCCCTCCCCTCTCCCGCACGCACGCCCCGTCCGCCCCCACCCCGCCCCCACCCCGGGCGAGCCCGCCCGCAGCCCGGGGCGCACACCCGCACGCGCACTCCTCTCCACTCACTCCCGCGCCCGCCCCCACTCCCGCAGCCGAGCCCCGCCACGCGCGCCTTGCCCGCCCGCCGGCCGCCCCCGCCGCCCCCGCCGCCCCCGGGCCCTGATGGACTGAATGAAGGCTGCCTACACCGCCTATCGATGCCTCACCAAAGACCTAGAAGGCTGCGCCATGAACCCGGAGCTGACAATGGAAAGTCTGGGCACTTTGCACGGGCCGGCCGGCGGCGGCAGTGGCGGGGGCGGCGGCGGGGGCGGCGGGGGCGGCGGCGGGGGCCCGGGCCATGAGCAGGAGCTGCTGGCCAGCCCCAGCCCCCACCACGCGGGCCGCGGCGCCGCTGGCTCGCTGCGGGGCCCTCCGCCGCCTCCAACCGCGCACCAGGAGCTGGGCACGGCGGCAGCGGCGGCAGCGGCGGCGTCGCGCTCGGCCATGGTCACCAGCATGGCCTCGATCCTGGACGGCGGCGACTACCGGCCCGAGCTCTCCATCCCGCTGCACCACGCCATGAGCATGTCCTGCGACTCGTCTCCGCCTGGCATGGGCATGAGCAACACCTACACCACGCTGACACCGCTCCAGCCGCTGCCACCCATCTCCACCGTGTCTGACAAGTTCCACCACCCTCACCCGCACCACCATCCGCACCACCACCACCACCACCACCACCAGCGCCTGTCCGGCAACGTCAGCGGCAGCTTCACCCTCATGCGCGACGAGCGCGGGCTCCCGGCCATGAACAACCTCTACAGTCCCTACAAGGAGATGCCCGGCATGAGCCAGAGCCTGTCCCCGCTGGCCGCCACGCCGCTGGGCAACGGGCTAGGCGGCCTCCACAACGCGCAGCAGAGTCTGCCCAACTACGGTCCGCCGGGCCACGACAAAATGCTCAGCCCCAACTTCGACGCGCACCACACTGCCATGCTGACCCGCGGTGAGCAACACCTGTCCCGCGGCCTGGGCACCCCACCTGCGGCCATGATGTCGCACCTGAACGGCCTGCACCACCCGGGCCACACTCAGTCTCACGGGCCGGTGCTGGCACCCAGTCGCGAGCGGCCACCCTCGTCCTCATCGGGCTCGCAGGTGGCCACGTCGGGCCAGCTGGAAGAAATCAACACCAAAGAGGTGGCCCAGCGCATCACAGCGGAGCTGAAGCGCTACAGTATCCCCCAGGCGATCTTTGCGCAGAGGGTGCTGTGCCGGTCTCAGGGGACTCTCTCCGACCTGCTCCGGAATCCAAAACCGTGGAGTAAACTCAAATCTGGCAGGGAGACCTTCCGCAGGATGTGGAAGTGGCTTCAGGAGCCCGAGTTCCAGCGCATGTCCGCCTTACGCCTGGCAGCGTGCAAACGCAAAGAGCAAGAACCAAACAAAGACAGGAACAATTCCCAGAAGAAGTCCCGCCTGGTGTTCACTGACCTCCAACGCCGAACACTCTTCGCCATCTTCAAGGAGAACAAACGCCCGTCAAAGGAGATGCAGATCACCATTTCCCAGCAGCTGGGCCTGGAGCTCACAACCGTCAGCAACTTCTTCATGAACGCCCGGCGCCGCAGCCTGGAGAAGTGGCAAGACGATCTGAGCACAGGGGGCTCCTCGTCCACCTCCAGCACGTGTACCAAAGCATGATGGAAGGACTCTCACTTGGGCACAAGTCACCTCCAAATGAGGACAACAGATACCAAAAGAAAACAAAGGAAAAAGACACCGGATTCCTAGCTGGGGCCCTTCACTGGTGATTTGAAAGCACAATTCTCTTGCAAAGAAACTTATA
TTCTAGCTGTAATCATAGGCCAGGTGTTCTTCTTTTGTTTTTAATGGCTATGGAGTCCAAGTGCAAGCTGAAAAATTAATCTCTTAGAACCAGACACTGTTCTCTGAGCATGCTAAGCATCCCAGAAACCCAAATGGGGCCTTCCTGGAGCGAGTTAATTCCAGTATGGTGTCAACCAAGCTCGGGATTGCTTAAAATATCATCCATCCCACTTCAGGTCCTGTCAGCTTCTTGCAGTCAGAGTTCCTATGAGTAACAATAGGAGTTTGGCCTATGTAAGGACTCTGAGTTTAGGCTTCCAAGATACAACAATAAGAGAAGAATCTAGCAACGAGAATGACCTCATTTGCTTTCCACATGCTTAGCCTCATTATACCATGTTATGTCCAAGTTCACAGCCACAACATCAGAATGGTAATTACTGAGCACAAGTTTTAAATATGGACGTTAAAAAAAAAAATCCAAGGACCTGTTTTTCCAACCCAGACATCTTTTCATTGAATGATTTAGAAAGCTTTAAGTTGATCCAGCTTACAATTTTTTTTTTCTTTACCTCCTGGAAATCTCATATGGTCTTGGATCCGTCAAAAAAACCAGTCAGTTCACTTGCGCTCAAAGTATCAAGCACAACAAAGATAAACAGAAGTGAGGAAGGTTCTGGGTTCACTACATCTGGATTTTCAAGACACCTATTGTGAAGTCATTAGGGAATTGATGAGAATATGGCTTCAAGCACATTTTGCAGTTTGCTACAAATTCTGTTGTACATAATGCAGACGCACACTCAGGAGGCCAATTTAACTGTTAACAGTGCATGGAGCGAATGCAGCATTTTAAAAGATCTAGGTTTTTTTAGGTCATTAATGTGTCCTTGGTTGATCAGTCATCTGGTCCCTCCTACTGTGTGTTATGACCACCACGTAATCCATTCTCGCTCTTTCTGATTTGGGGTTTTTCCTCATCCATCCCATTAGTAGGGATGTTTTCTGTGTTTTCTAGCAAGAAAAAAAAATCAATCAATCAAACCTGCATACATGTTACTCATGACTGTCATCTAGTCCTAAATCTCTTCTGTTGTTGAATCATCCTTGCAAAACAGCTGAATACATCTGGAGAAAACACAGCACACCAAAGAAGCAGAATACTGCAAACCAAAGACATTTATGACTTGTCATTTTCTAGCCTAAAAATACTGTGATTACTTTTAGAAATCAGAAAACCTCTGCAACTCCGAATGGCATTCAGCTCTTGCATTTGGCGCATCATCGGGCTGAGCGGACCAGCTACACCAAGGACATTAGCCAAGCCACCCAGAGGGGTGGCTTTGCCACACCAGTTGTCACCTTCCCATAGCAAGTGGAAGAGCGCCCACAGAACTCTGGGAGATTGCAAAGGTCACAATGTGCATATTTACCAGTGAATGGCCCCGGGTGGGGCCACGTGGGGGTGTTCAAAGCAAGCCAAACGCTGCAATCATTCTTTACAGACACTTGAGACTGACTTTTTTATGAATTACTTAGTCGAAACCAAAGAAACTTTTTCTGCACCTACTTCTGCAACAAACAAAACTGTCCCATTAAAATGAATAAATAAATCCGTAAATCAATGGAAATCACCACCAATAAGAAGGAAGCACGCCAGAAAATAAACGAAAACAAAAACAGGGAGACACACTGTGTTCAAACAGACCTCTTGGGACATTTTTTGGAAGCAGATTTTAAAGAAAGGGTTGAGACAAAGATAGAAATAAGGAAGAGCCTCAGTGGCTGCTGCTTCATTTGACAACTCACACGGTAATCTTAAAGCTGAAGATTGTCTTTAATTTGTGCCTATGCAGTTTTTCAAAAGAACACGGAACAGAGCAACAGAAACCTCAACAGCTACAATACCAAAGATGAGGATTTCTCACACCTTTTGTTTCAGTTCATTATCTCCTCTTGCCTGGCTAAAATACTAATAGCGCCATTGAACTGTATAAAGGTAATCAATTATGTTTCTCT
GAGCAACAAAAGGAAAGGGCCATTTATTTGATTTTATTGTTTCATTTCAATTTTGTCTTATGGTTTTTTGCCCCAACATGGAATCTCTCAAAAGTTTCCATGGACTCCAAGTTTAAGATGTTGGGATATTGAACAGTTCTCTCTGCTCAGCAGAGGGTAGGGAATAACATTATCACTTGAATGTTCTTTGCTTAACCCTTAGACTTGGTTCCTTCTATGTTCAGAGTCTCATCATCAGGGGAAGGAAAGGGAGTGAGGGTCAGGGATAGGGGTCTTGGTGATGCATCCTCTCCCGAGCCACAGAACCAAAGAGTTTATAGAGGAATTTACAGCCTCGTTTTCATGTGATTGCTACATCCTAACAGGGCTTCATTTGGGGGTGGGGGGAAACATGTAAAAATAATTGCCAGTTTCTACTTTTCTATTAGCTTTTTAAAAATCAGCTGTAAAGTTGCATTTCTAAAGAAAGATATATATAATATATAAAATACATATATAGATCAACTTGACATTGGTGATAACCAAAATTATTGCTGTCCAAATTCATGTCTTGTTTTGGTCCAGTGCTTCATTTGCTAAGTATTCGGTTCAGAATTTTTCTCATTTCTCATGCCATTCCAGAGTTAATTTGCCACTGTGGATGATTTGAAGTATTCAGATCTCTATGGAAGTTTCTGGGACAGGTTTAAAGTCAAGATCAAGCATTTTAGCATTTAACCTGTTGATAAATGGATCCATGGTGTACATGAGTTTTATTTGTATTCGGAGTCATCTCTATTCTATCCCTCAGCCTCGATTAAGGTGGTGAGTGAAGTGCATCCAACAGACTCGGCCCAGAACTGGGTCCTGACAGTGGGGTGCTCATCTTCTGTAACTGTTGGGAAGGCTCGGTGGTCCATTTTCACCAGTTAAAGAATATGAGGCCAGCCCAGAAATCTGTTCTCCAGGAGCTGCCCTGTCCCATCTGGGTGTGCCAGACCCCCTCAGTGAGCAGGTCCACCAAAGGGACTTCTCACAGGGGAAGCCCAACTCCTGTTGCAATGGGTTGATAGATTTCCTCAGGGTGGTAATTACCAATTCGTATTTTGACAAGCCTATGTGCAACCACAGCTGGCACTGGGGTGGGCAGTGGTGTTGGGTGGGATGGGGGAGAGTGTCTCAATCCTGAAGAGAAAATATAAAGCAGGTTTTGGGGAGACTTCTGGAGTCCTGCCCCTAGAGAGCCCCATTGTTGTTCTTTGTGCCCCCTCCTCATTCCCCCTATGTGGGTCTCCCTATGCAGGAGCTGTGAGAGAATGTGACTCTCCACAATTTTTATAATTCATCCTTCCTAGGAGATTGTTCATTGGCTCTTCCCTTGTGTCCCTTTGTCCCTTGCTCATACTCCATGTTTCCTTTGTCAAAGGACTAAGAAAAGAGCATATTTCAGCAGAGGAGTGTTCCCATGTGGGTTGATTTCAACTTGGGTATTTCTAAAAGAGTCCTTGTGACATGTGTCCAGTGGAAATGGTTGCTCTTTTCCAGACTGGATTGAGGAATGGAGCCTGTTTGATTTGGTTAGTGATTCTTTGACATACTAATCTCAGCGTTTGGGTCTCCAGCATCCTCTGAAGATGTCTAGACTAGTAGAGGCTGCCTTTGTGACCTGACATTACAACATTGGTCAAACCAGTCCTCTGATAATCAGAAGAACATGTCATAATTGTTTAAAAAAAAAAAAAAGGCAAGAATTTCTCTCCAAGGAGCTTTAATAAATGTCTCATTCCAGATAATGTCATACCAGAGAAAAGTGCTTGCTTTTAGAAAATTATTTACATACATATATAAATATATATGTGTATCTATACAGTTATGTATCAAAATTTTAAGCCCTGCAGAATTTCAATTTGTTAGAAATCTAACAGAAAAAAATTTCTATATTGAAAGGTAATAGAATTTAACCCAGTGAGTTTACTCAAGGATTTTTAAATTTAAGTTAATAATTTCAGAGAAAATAACCA
TTTGGGTGTGGTTATAGTTTAGTATCCATTACCTCAATCCAAGGAAAATTCCAGGCATTCCTCAACCATCAGGAAAAGGTACAGTGTGAAGGAACAGTTCTCAGCCAAATTTCACATTCTTGAGGCAACAGAAATCAAAACACTCAGAGCCATTGAGTGGAAAAACAATTTACTTTATTCCTTTACACAAATAGGCTTGCATTGTTTTTGTTTTAATGTGATTTTGGTACTAGGGATATAATTATTTCATTCCAGGAAATAATAAAAAAAAACAGACAGAGCCAATACATTTCTTTTTTTAAAGGAAACAGCAACAACAATAAAAACTCAGCACCAATATTTAAAAGCTTTTCCAAAATGTAAAAGAAGTGTTTAGCTTGCACCATGCATAAAGGTGCAGGCTAGTTGAACCAGGAAGCATGGCACTTCCTCTGGAGAAATCCAGAAAGAGTTGCTTCTAAGCTCCCTTTTCCCCCTGCAGGCTCTTGGCAATTGTAGGCTTTAGCAAATCCAGAATAATTTTCAATTCAAGCTAAAATAAAATCAACATTTGGAATGTAAATCTGATACACACACACTTTTCTAAGTCAAACAACATATTTCAAAACCAAAAATAAATACCTTTTAGATAATCAGTTATTTTCTTTGTCTATACTGGGCACCCACCTACTAGTGCCAGTAAATTCAAGTTGAACAGATTTTTAAAATCACTATTATCTGGGTATGGGGGAAACTTCCCCACTTTTGAAAATGTTGGTAGAATTATAGGAATGTCTGTTTGATTATCATTACCAAAGTGTCATGACAGTATGCCTTTGTAGTGAACTCGGATTTTCAGGAGTTTGAATAGTTGGATATTTTAAAATCTAAGAAGAAAAGGCCTGTTTCCAATGTTGTTGAAGAATAATGAACTCTATTAAAAAGTGGAGAAAAAGATAATACATGTGGTCAAGGTTGACCACAAGGCCCAGGCACAACTACCTTGGCGATAATCTTCTAGATTCGTAACAGGTTAGAGCTGACTTTTTGTTTTTGTTGTTGCTGATGCTGTGTGATTCAGACTTCTCAGCCTAACCAGGAAGAGTAAGTGGAAATGGTAGATGAAGAAGGGGTAGAGCTGGTGTATCTATAACTTTCTGATATTTGTCTGCCAAACTTGATATATTAGTAATTTTTTTATCTTTAGCTAAGATCAAGTCACCCCTGAAACAACAGGAGATTCTAGTTTTAAAATAAGGCCACAAAAATCCTTACGGAATGAAGAATGGCACCCCAGTTGGTTGTATAAGTCTCATAAGATAATGATGTTGATTTTAAATATGGATGTCTCAATGCCTGTTTTCTATCAATGATTTGTTTGTTTCCAAGGTCGGGGAGGGAAAGAGGGGAGGGTTTATCTGTTTTAGAAAGTCTCAGAATACTTATAAAATACAGAAGTAGTTATTAAAATATATAGGACCTCACATAGGTAGATACAGAACTTACCATTGAGGCTGATGGGCTGTTGTGTGAATCACACAGGACCTTAAATGAGGCTCATTATTCTCACACACCAAAATGACTCTGACAGCCTGAAGCAGTTATTGCTAGAGCCCAAGCTTTCCTTGGAGGTTTTGGAGTTAGGTTGATTGGAAGTAACCAGCTAATACCTTTTCTAGTGGAGAAAAAGACATTGCTACCAGCTTGTTCATCCCATAGAAGTCTTCCACTCTGCTCCATTTTTAGCAGCAAGCATTTCATGTAGCATAAACCTTGGCAGATAAGTGTGCCTAAGGTTTATACAGTCTGTCCGCTTGGATGTATACAAATTTAGATACATATTTTAACATGTGTTCTCATAGATGACTTTATAACAACACACATTACCTATAGGTGTCTAGACTGTGTACATACAAGTGTGTACAGACAAGCTTCATACGTATATACTGTAATCCGTTACAACAAATAAATTTTAAATCATCGTTTAACATGTATGTGGTACTTCTACAGTGTACATTGTT
TTCATTATTTATTGTAACATTGAAAACCACAGTGCAGGGAAAACAAAAGTATCCCAGCATCTTCATCCTGTACACTTGGAATTAATTTCATTTGGGCATATCCAAGATAAACTCAACTTTCAAGAAATCTTGTATATTATTTAATCATCTGTGTTAGGATGACACCTATGATTGATGACTTCGGTTGAATAGCTTTATTCTGGATTTTTCATAACTAAAGCTAAATCCAAAGACCTGAAAAAGGACAAAAAGAAAAAAAAAAAAAGAAA\"\n",
"# Normalise the pasted sequence: upper-case it and drop embedded line breaks.\n",
"a = a.upper().replace(\"\\n\", \"\")\n",
"\n",
"# Standard genetic code keyed by DNA codon. ATG is tagged 'M+' so candidate\n",
"# start codons stand out in the printed frames; '_' marks a stop codon.\n",
"table = {\n",
"    'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M+',\n",
"    'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T',\n",
"    'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K',\n",
"    'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R',\n",
"    'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L',\n",
"    'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P',\n",
"    'CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q',\n",
"    'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R',\n",
"    'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V',\n",
"    'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A',\n",
"    'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E',\n",
"    'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G',\n",
"    'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S',\n",
"    'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L',\n",
"    'TAC':'Y', 'TAT':'Y', 'TAA':'_', 'TAG':'_',\n",
"    'TGC':'C', 'TGT':'C', 'TGA':'_', 'TGG':'W',\n",
"}\n",
"\n",
"# Translate the three forward reading frames. The frame starting at `offset`\n",
"# has complete codons at offset, offset + 3, ... up to index len(a) - 3, hence\n",
"# the exclusive bound len(a) - 2: every full codon is translated and a\n",
"# truncated trailing codon is never looked up.\n",
"frames = []\n",
"for offset in range(3):\n",
"    codons = [a[i:i + 3] for i in range(offset, len(a) - 2, 3)]\n",
"    # 'X' stands in for any codon missing from the table (e.g. one containing N).\n",
"    frames.append(\"\".join(table.get(codon, \"X\") for codon in codons))\n",
"protein, protein2, protein3 = frames\n",
"\n",
"print(protein)\n",
"print(protein2)\n",
"print(protein3)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def translate(seq, orf):\n",
"    \"\"\"Translate the first open reading frame met in one reading frame of ``seq``.\n",
"\n",
"    Codons are read three bases at a time starting at index ``orf``. Codons\n",
"    ahead of the first in-frame ATG are skipped; from that ATG onwards each\n",
"    codon is translated until an in-frame stop codon is reached or no further\n",
"    codon can be looked up (end of sequence, trailing partial codon, or a\n",
"    codon outside the table such as one containing N).\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    seq : str\n",
"        Nucleotide sequence; it is upper-cased and newlines are removed first.\n",
"    orf : int\n",
"        Index of the first base to read (0, 1 or 2 selects the reading frame).\n",
"\n",
"    Returns\n",
"    -------\n",
"    str\n",
"        One-letter amino-acid string beginning with ``M``, or ``\"\"`` when the\n",
"        frame contains no ATG. The stop codon is not included.\n",
"\n",
"    NOTE: a later cell of this notebook defines ``translate(seq, i, utr_regions)``;\n",
"    once that cell has run, this two-argument version is shadowed.\n",
"    \"\"\"\n",
"    seq = seq.upper().replace(\"\\n\", \"\")\n",
"\n",
"    table = {\n",
"        'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M',\n",
"        'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T',\n",
"        'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K',\n",
"        'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R',\n",
"        'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L',\n",
"        'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P',\n",
"        'CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q',\n",
"        'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R',\n",
"        'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V',\n",
"        'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A',\n",
"        'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E',\n",
"        'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G',\n",
"        'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S',\n",
"        'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L',\n",
"        'TAC':'Y', 'TAT':'Y', 'TAA':'_', 'TAG':'_',\n",
"        'TGC':'C', 'TGT':'C', 'TGA':'_', 'TGG':'W',\n",
"    }\n",
"\n",
"    residues = []\n",
"    in_orf = False\n",
"    for pos in range(orf, len(seq), 3):\n",
"        # An explicit lookup replaces the former bare try/except: only a\n",
"        # missing codon ends the scan, other errors are no longer swallowed.\n",
"        residue = table.get(seq[pos:pos + 3])\n",
"        if residue is None:\n",
"            break\n",
"        if residue == \"M\":\n",
"            in_orf = True\n",
"        if not in_orf:\n",
"            # Still upstream of the first start codon (stops here are ignored).\n",
"            continue\n",
"        if residue == \"_\":\n",
"            break\n",
"        residues.append(residue)\n",
"\n",
"    return \"\".join(residues)\n",
"\n",
"a = \"TGGACAGCTCCCGCTCACCCAAACAGAAGACGTCGGCGCCGGAGCGGGCTCGGACATGGCGAGGCTGCGAGCCGGCCCGAGCGGCGGGGCCCGGTGATCCCTCCCTCCCTCCCCGTCCCCTCCCCTCTCCCGCACGCACGCCCCGTCCGCCCCCACCCCGCCCCCACCCCGGGCGAGCCCGCCCGCAGCCCGGGGCGCACACCCGCACGCGCACTCCTCTCCACTCACTCCCGCGCCCGCCCCCACTCCCGCAGCCGAGCCCCGCCACGCGCGCCTTGCCCGCCCGCCGGCCGCCCCCGCCGCCCCCGCCGCCCCCGGGCCCTGATGGACTGAATGAAGGCTGCCTACACCGCCTATCGATGCCTCACCAAAGACCTAGAAGGCTGCGCCATGAACCCGGAGCTGACAATGGAAAGTCTGGGCACTTTGCACGGGCCGGCCGGCGGCGGCAGTGGCGGGGGCGGCGGCGGGGGCGGCGGGGGCGGCGGCGGGGGCCCGGGCCATGAGCAGGAGCTGCTGGCCAGCCCCAGCCCCCACCACGCGGGCCGCGGCGCCGCTGGCTCGCTGCGGGGCCCTCCGCCGCCTCCAACCGCGCACCAGGAGCTGGGCACGGCGGCAGCGGCGGCAGCGGCGGCGTCGCGCTCGGCCATGGTCACCAGCATGGCCTCGATCCTGGACGGCGGCGACTACCGGCCCGAGCTCTCCATCCCGCTGCACCACGCCATGAGCATGTCCTGCGACTCGTCTCCGCCTGGCATGGGCATGAGCAACACCTACACCACGCTGACACCGCTCCAGCCGCTGCCACCCATCTCCACCGTGTCTGACAAGTTCCACCACCCTCACCCGCACCACCATCCGCACCACCACCACCACCACCACCACCAGCGCCTGTCCGGCAACGTCAGCGGCAGCTTCACCCTCATGCGCGACGAGCGCGGGCTCCCGGCCATGAACAACCTCTACAGTCCCTACAAGGAGATGCCCGGCATGAGCCAGAGCCTGTCCCCGCTGGCCGCCACGCCGCTGGGCAACGGGCTAGGCGGCCTCCACAACGCGCAGCAGAGTCTGCCCAACTACGGTCCGCCGGGCCACGACAAAATGCTCAGCCCCAACTTCGACGCGCACCACACTGCCATGCTGACCCGCGGTGAGCAACACCTGTCCCGCGGCCTGGGCACCCCACCTGCGGCCATGATGTCGCACCTGAACGGCCTGCACCACCCGGGCCACACTCAGTCTCACGGGCCGGTGCTGGCACCCAGTCGCGAGCGGCCACCCTCGTCCTCATCGGGCTCGCAGGTGGCCACGTCGGGCCAGCTGGAAGAAATCAACACCAAAGAGGTGGCCCAGCGCATCACAGCGGAGCTGAAGCGCTACAGTATCCCCCAGGCGATCTTTGCGCAGAGGGTGCTGTGCCGGTCTCAGGGGACTCTCTCCGACCTGCTCCGGAATCCAAAACCGTGGAGTAAACTCAAATCTGGCAGGGAGACCTTCCGCAGGATGTGGAAGTGGCTTCAGGAGCCCGAGTTCCAGCGCATGTCCGCCTTACGCCTGGCAGCGTGCAAACGCAAAGAGCAAGAACCAAACAAAGACAGGAACAATTCCCAGAAGAAGTCCCGCCTGGTGTTCACTGACCTCCAACGCCGAACACTCTTCGCCATCTTCAAGGAGAACAAACGCCCGTCAAAGGAGATGCAGATCACCATTTCCCAGCAGCTGGGCCTGGAGCTCACAACCGTCAGCAACTTCTTCATGAACGCCCGGCGCCGCAGCCTGGAGAAGTGGCAAGACGATCTGAGCACAGGGGGCTCCTCGTCCACCTCCAGCACGTGTACCAAAGCATGATGGAAGGACTCTCACTTGGGCACAAGTCACCTCCAAATGAGGACAACAGATACCAAAAGAAAACAAAGGAAAAAGACACCGGATTCCTAGCTGGGGCCCTTCACTGGTGATTTGAAAGCACAATTCTCTTGCAAAGAAACTTATA
TTCTAGCTGTAATCATAGGCCAGGTGTTCTTCTTTTGTTTTTAATGGCTATGGAGTCCAAGTGCAAGCTGAAAAATTAATCTCTTAGAACCAGACACTGTTCTCTGAGCATGCTAAGCATCCCAGAAACCCAAATGGGGCCTTCCTGGAGCGAGTTAATTCCAGTATGGTGTCAACCAAGCTCGGGATTGCTTAAAATATCATCCATCCCACTTCAGGTCCTGTCAGCTTCTTGCAGTCAGAGTTCCTATGAGTAACAATAGGAGTTTGGCCTATGTAAGGACTCTGAGTTTAGGCTTCCAAGATACAACAATAAGAGAAGAATCTAGCAACGAGAATGACCTCATTTGCTTTCCACATGCTTAGCCTCATTATACCATGTTATGTCCAAGTTCACAGCCACAACATCAGAATGGTAATTACTGAGCACAAGTTTTAAATATGGACGTTAAAAAAAAAAATCCAAGGACCTGTTTTTCCAACCCAGACATCTTTTCATTGAATGATTTAGAAAGCTTTAAGTTGATCCAGCTTACAATTTTTTTTTTCTTTACCTCCTGGAAATCTCATATGGTCTTGGATCCGTCAAAAAAACCAGTCAGTTCACTTGCGCTCAAAGTATCAAGCACAACAAAGATAAACAGAAGTGAGGAAGGTTCTGGGTTCACTACATCTGGATTTTCAAGACACCTATTGTGAAGTCATTAGGGAATTGATGAGAATATGGCTTCAAGCACATTTTGCAGTTTGCTACAAATTCTGTTGTACATAATGCAGACGCACACTCAGGAGGCCAATTTAACTGTTAACAGTGCATGGAGCGAATGCAGCATTTTAAAAGATCTAGGTTTTTTTAGGTCATTAATGTGTCCTTGGTTGATCAGTCATCTGGTCCCTCCTACTGTGTGTTATGACCACCACGTAATCCATTCTCGCTCTTTCTGATTTGGGGTTTTTCCTCATCCATCCCATTAGTAGGGATGTTTTCTGTGTTTTCTAGCAAGAAAAAAAAATCAATCAATCAAACCTGCATACATGTTACTCATGACTGTCATCTAGTCCTAAATCTCTTCTGTTGTTGAATCATCCTTGCAAAACAGCTGAATACATCTGGAGAAAACACAGCACACCAAAGAAGCAGAATACTGCAAACCAAAGACATTTATGACTTGTCATTTTCTAGCCTAAAAATACTGTGATTACTTTTAGAAATCAGAAAACCTCTGCAACTCCGAATGGCATTCAGCTCTTGCATTTGGCGCATCATCGGGCTGAGCGGACCAGCTACACCAAGGACATTAGCCAAGCCACCCAGAGGGGTGGCTTTGCCACACCAGTTGTCACCTTCCCATAGCAAGTGGAAGAGCGCCCACAGAACTCTGGGAGATTGCAAAGGTCACAATGTGCATATTTACCAGTGAATGGCCCCGGGTGGGGCCACGTGGGGGTGTTCAAAGCAAGCCAAACGCTGCAATCATTCTTTACAGACACTTGAGACTGACTTTTTTATGAATTACTTAGTCGAAACCAAAGAAACTTTTTCTGCACCTACTTCTGCAACAAACAAAACTGTCCCATTAAAATGAATAAATAAATCCGTAAATCAATGGAAATCACCACCAATAAGAAGGAAGCACGCCAGAAAATAAACGAAAACAAAAACAGGGAGACACACTGTGTTCAAACAGACCTCTTGGGACATTTTTTGGAAGCAGATTTTAAAGAAAGGGTTGAGACAAAGATAGAAATAAGGAAGAGCCTCAGTGGCTGCTGCTTCATTTGACAACTCACACGGTAATCTTAAAGCTGAAGATTGTCTTTAATTTGTGCCTATGCAGTTTTTCAAAAGAACACGGAACAGAGCAACAGAAACCTCAACAGCTACAATACCAAAGATGAGGATTTCTCACACCTTTTGTTTCAGTTCATTATCTCCTCTTGCCTGGCTAAAATACTAATAGCGCCATTGAACTGTATAAAGGTAATCAATTATGTTTCTCT
GAGCAACAAAAGGAAAGGGCCATTTATTTGATTTTATTGTTTCATTTCAATTTTGTCTTATGGTTTTTTGCCCCAACATGGAATCTCTCAAAAGTTTCCATGGACTCCAAGTTTAAGATGTTGGGATATTGAACAGTTCTCTCTGCTCAGCAGAGGGTAGGGAATAACATTATCACTTGAATGTTCTTTGCTTAACCCTTAGACTTGGTTCCTTCTATGTTCAGAGTCTCATCATCAGGGGAAGGAAAGGGAGTGAGGGTCAGGGATAGGGGTCTTGGTGATGCATCCTCTCCCGAGCCACAGAACCAAAGAGTTTATAGAGGAATTTACAGCCTCGTTTTCATGTGATTGCTACATCCTAACAGGGCTTCATTTGGGGGTGGGGGGAAACATGTAAAAATAATTGCCAGTTTCTACTTTTCTATTAGCTTTTTAAAAATCAGCTGTAAAGTTGCATTTCTAAAGAAAGATATATATAATATATAAAATACATATATAGATCAACTTGACATTGGTGATAACCAAAATTATTGCTGTCCAAATTCATGTCTTGTTTTGGTCCAGTGCTTCATTTGCTAAGTATTCGGTTCAGAATTTTTCTCATTTCTCATGCCATTCCAGAGTTAATTTGCCACTGTGGATGATTTGAAGTATTCAGATCTCTATGGAAGTTTCTGGGACAGGTTTAAAGTCAAGATCAAGCATTTTAGCATTTAACCTGTTGATAAATGGATCCATGGTGTACATGAGTTTTATTTGTATTCGGAGTCATCTCTATTCTATCCCTCAGCCTCGATTAAGGTGGTGAGTGAAGTGCATCCAACAGACTCGGCCCAGAACTGGGTCCTGACAGTGGGGTGCTCATCTTCTGTAACTGTTGGGAAGGCTCGGTGGTCCATTTTCACCAGTTAAAGAATATGAGGCCAGCCCAGAAATCTGTTCTCCAGGAGCTGCCCTGTCCCATCTGGGTGTGCCAGACCCCCTCAGTGAGCAGGTCCACCAAAGGGACTTCTCACAGGGGAAGCCCAACTCCTGTTGCAATGGGTTGATAGATTTCCTCAGGGTGGTAATTACCAATTCGTATTTTGACAAGCCTATGTGCAACCACAGCTGGCACTGGGGTGGGCAGTGGTGTTGGGTGGGATGGGGGAGAGTGTCTCAATCCTGAAGAGAAAATATAAAGCAGGTTTTGGGGAGACTTCTGGAGTCCTGCCCCTAGAGAGCCCCATTGTTGTTCTTTGTGCCCCCTCCTCATTCCCCCTATGTGGGTCTCCCTATGCAGGAGCTGTGAGAGAATGTGACTCTCCACAATTTTTATAATTCATCCTTCCTAGGAGATTGTTCATTGGCTCTTCCCTTGTGTCCCTTTGTCCCTTGCTCATACTCCATGTTTCCTTTGTCAAAGGACTAAGAAAAGAGCATATTTCAGCAGAGGAGTGTTCCCATGTGGGTTGATTTCAACTTGGGTATTTCTAAAAGAGTCCTTGTGACATGTGTCCAGTGGAAATGGTTGCTCTTTTCCAGACTGGATTGAGGAATGGAGCCTGTTTGATTTGGTTAGTGATTCTTTGACATACTAATCTCAGCGTTTGGGTCTCCAGCATCCTCTGAAGATGTCTAGACTAGTAGAGGCTGCCTTTGTGACCTGACATTACAACATTGGTCAAACCAGTCCTCTGATAATCAGAAGAACATGTCATAATTGTTTAAAAAAAAAAAAAAGGCAAGAATTTCTCTCCAAGGAGCTTTAATAAATGTCTCATTCCAGATAATGTCATACCAGAGAAAAGTGCTTGCTTTTAGAAAATTATTTACATACATATATAAATATATATGTGTATCTATACAGTTATGTATCAAAATTTTAAGCCCTGCAGAATTTCAATTTGTTAGAAATCTAACAGAAAAAAATTTCTATATTGAAAGGTAATAGAATTTAACCCAGTGAGTTTACTCAAGGATTTTTAAATTTAAGTTAATAATTTCAGAGAAAATAACCA
TTTGGGTGTGGTTATAGTTTAGTATCCATTACCTCAATCCAAGGAAAATTCCAGGCATTCCTCAACCATCAGGAAAAGGTACAGTGTGAAGGAACAGTTCTCAGCCAAATTTCACATTCTTGAGGCAACAGAAATCAAAACACTCAGAGCCATTGAGTGGAAAAACAATTTACTTTATTCCTTTACACAAATAGGCTTGCATTGTTTTTGTTTTAATGTGATTTTGGTACTAGGGATATAATTATTTCATTCCAGGAAATAATAAAAAAAAACAGACAGAGCCAATACATTTCTTTTTTTAAAGGAAACAGCAACAACAATAAAAACTCAGCACCAATATTTAAAAGCTTTTCCAAAATGTAAAAGAAGTGTTTAGCTTGCACCATGCATAAAGGTGCAGGCTAGTTGAACCAGGAAGCATGGCACTTCCTCTGGAGAAATCCAGAAAGAGTTGCTTCTAAGCTCCCTTTTCCCCCTGCAGGCTCTTGGCAATTGTAGGCTTTAGCAAATCCAGAATAATTTTCAATTCAAGCTAAAATAAAATCAACATTTGGAATGTAAATCTGATACACACACACTTTTCTAAGTCAAACAACATATTTCAAAACCAAAAATAAATACCTTTTAGATAATCAGTTATTTTCTTTGTCTATACTGGGCACCCACCTACTAGTGCCAGTAAATTCAAGTTGAACAGATTTTTAAAATCACTATTATCTGGGTATGGGGGAAACTTCCCCACTTTTGAAAATGTTGGTAGAATTATAGGAATGTCTGTTTGATTATCATTACCAAAGTGTCATGACAGTATGCCTTTGTAGTGAACTCGGATTTTCAGGAGTTTGAATAGTTGGATATTTTAAAATCTAAGAAGAAAAGGCCTGTTTCCAATGTTGTTGAAGAATAATGAACTCTATTAAAAAGTGGAGAAAAAGATAATACATGTGGTCAAGGTTGACCACAAGGCCCAGGCACAACTACCTTGGCGATAATCTTCTAGATTCGTAACAGGTTAGAGCTGACTTTTTGTTTTTGTTGTTGCTGATGCTGTGTGATTCAGACTTCTCAGCCTAACCAGGAAGAGTAAGTGGAAATGGTAGATGAAGAAGGGGTAGAGCTGGTGTATCTATAACTTTCTGATATTTGTCTGCCAAACTTGATATATTAGTAATTTTTTTATCTTTAGCTAAGATCAAGTCACCCCTGAAACAACAGGAGATTCTAGTTTTAAAATAAGGCCACAAAAATCCTTACGGAATGAAGAATGGCACCCCAGTTGGTTGTATAAGTCTCATAAGATAATGATGTTGATTTTAAATATGGATGTCTCAATGCCTGTTTTCTATCAATGATTTGTTTGTTTCCAAGGTCGGGGAGGGAAAGAGGGGAGGGTTTATCTGTTTTAGAAAGTCTCAGAATACTTATAAAATACAGAAGTAGTTATTAAAATATATAGGACCTCACATAGGTAGATACAGAACTTACCATTGAGGCTGATGGGCTGTTGTGTGAATCACACAGGACCTTAAATGAGGCTCATTATTCTCACACACCAAAATGACTCTGACAGCCTGAAGCAGTTATTGCTAGAGCCCAAGCTTTCCTTGGAGGTTTTGGAGTTAGGTTGATTGGAAGTAACCAGCTAATACCTTTTCTAGTGGAGAAAAAGACATTGCTACCAGCTTGTTCATCCCATAGAAGTCTTCCACTCTGCTCCATTTTTAGCAGCAAGCATTTCATGTAGCATAAACCTTGGCAGATAAGTGTGCCTAAGGTTTATACAGTCTGTCCGCTTGGATGTATACAAATTTAGATACATATTTTAACATGTGTTCTCATAGATGACTTTATAACAACACACATTACCTATAGGTGTCTAGACTGTGTACATACAAGTGTGTACAGACAAGCTTCATACGTATATACTGTAATCCGTTACAACAAATAAATTTTAAATCATCGTTTAACATGTATGTGGTACTTCTACAGTGTACATTGTT
TTCATTATTTATTGTAACATTGAAAACCACAGTGCAGGGAAAACAAAAGTATCCCAGCATCTTCATCCTGTACACTTGGAATTAATTTCATTTGGGCATATCCAAGATAAACTCAACTTTCAAGAAATCTTGTATATTATTTAATCATCTGTGTTAGGATGACACCTATGATTGATGACTTCGGTTGAATAGCTTTATTCTGGATTTTTCATAACTAAAGCTAAATCCAAAGACCTGAAAAAGGACAAAAAGAAAAAAAAAAAAAGAAA\"\n",
"# Print the first ORF of each of the three forward reading frames.\n",
"for frame in range(3):\n",
"    print(translate(a, frame))"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"ename": "FileNotFoundError",
"evalue": "[Errno 2] No such file or directory: '/home/annaldas/projects/result/ONECUT2/ONECUT2_seq.fa'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-9-28170ef6fa91>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 125\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 126\u001b[0m \u001b[0mtranscripts_filename\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"/home/annaldas/projects/result/ONECUT2/ONECUT2_seq.fa\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 127\u001b[0;31m \u001b[0mtranscripts\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mSeqIO\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtranscripts_filename\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"fasta\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 128\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 129\u001b[0m \u001b[0mgene\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"ONECUT2\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/pkg/python-3.7.4-0/lib/python3.7/site-packages/Bio/SeqIO/__init__.py\u001b[0m in \u001b[0;36mindex\u001b[0;34m(filename, format, alphabet, key_function)\u001b[0m\n\u001b[1;32m 951\u001b[0m \u001b[0mrepr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"SeqIO.index(%r, %r, alphabet=%r, key_function=%r)\"\u001b[0m\u001b[0;31m \u001b[0m\u001b[0;31m\\\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 952\u001b[0m \u001b[0;34m%\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mfilename\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mformat\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0malphabet\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkey_function\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 953\u001b[0;31m return _IndexedSeqFileDict(proxy_class(filename, format, alphabet),\n\u001b[0m\u001b[1;32m 954\u001b[0m key_function, repr, \"SeqRecord\")\n\u001b[1;32m 955\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/pkg/python-3.7.4-0/lib/python3.7/site-packages/Bio/SeqIO/_index.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, filename, format, alphabet)\u001b[0m\n\u001b[1;32m 183\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__init__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfilename\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mformat\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0malphabet\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 184\u001b[0m \u001b[0;34m\"\"\"Initialize the class.\"\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 185\u001b[0;31m \u001b[0mSeqFileRandomAccess\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__init__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfilename\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mformat\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0malphabet\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 186\u001b[0m marker = {\"ace\": b\"CO \",\n\u001b[1;32m 187\u001b[0m \u001b[0;34m\"embl\"\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34mb\"ID \"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/pkg/python-3.7.4-0/lib/python3.7/site-packages/Bio/SeqIO/_index.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, filename, format, alphabet)\u001b[0m\n\u001b[1;32m 44\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__init__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfilename\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mformat\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0malphabet\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 45\u001b[0m \u001b[0;34m\"\"\"Initialize the class.\"\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 46\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_handle\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_open_for_random_access\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilename\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 47\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_alphabet\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0malphabet\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 48\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_format\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mformat\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/pkg/python-3.7.4-0/lib/python3.7/site-packages/Bio/File.py\u001b[0m in \u001b[0;36m_open_for_random_access\u001b[0;34m(filename)\u001b[0m\n\u001b[1;32m 130\u001b[0m \u001b[0mIf\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mfile\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0mgzipped\u001b[0m \u001b[0mbut\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mBGZF\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0ma\u001b[0m \u001b[0mspecific\u001b[0m \u001b[0mValueError\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0mraised\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 131\u001b[0m \"\"\"\n\u001b[0;32m--> 132\u001b[0;31m \u001b[0mhandle\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilename\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"rb\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 133\u001b[0m \u001b[0mmagic\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mhandle\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 134\u001b[0m \u001b[0mhandle\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mseek\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '/home/annaldas/projects/result/ONECUT2/ONECUT2_seq.fa'"
]
}
],
"source": [
"# Standard genetic code: DNA codon -> one-letter amino acid, '_' for stop.\n",
"# The residue string follows the conventional TCAG ordering, with the first\n",
"# base varying slowest and the third base fastest (TTT, TTC, TTA, TTG, TCT, ...).\n",
"codon_table = {\n",
"    first + second + third: residue\n",
"    for (first, second, third), residue in zip(\n",
"        ((x, y, z) for x in \"TCAG\" for y in \"TCAG\" for z in \"TCAG\"),\n",
"        \"FFLLSSSSYY__CC_W\"    # T..\n",
"        \"LLLLPPPPHHQQRRRR\"    # C..\n",
"        \"IIIMTTTTNNKKSSRR\"    # A..\n",
"        \"VVVVAAAADDEEGGGG\",   # G..\n",
"    )\n",
"}\n",
"\n",
"def determine_utrs(gene, base_dir=\"/home/annaldas/projects/result\"):\n",
"    \"\"\"Load the UTR sequences of one gene, grouped by FASTA record id.\n",
"\n",
"    Reads ``<base_dir>/<gene>/<gene>_utr_regions.fa``. Every header line\n",
"    (starting with ``>``) opens, or re-opens, the entry named by the rest of\n",
"    that line; every following non-blank line is appended to that entry as\n",
"    one sequence.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    gene : str\n",
"        Gene name used for both the sub-directory and the file prefix.\n",
"    base_dir : str, optional\n",
"        Directory holding the per-gene result folders.\n",
"\n",
"    Returns\n",
"    -------\n",
"    dict\n",
"        Maps each record id to the list of sequence lines found under it.\n",
"\n",
"    Raises\n",
"    ------\n",
"    OSError\n",
"        If the file cannot be opened (e.g. ``FileNotFoundError``).\n",
"    \"\"\"\n",
"    filename = \"%s/%s/%s_utr_regions.fa\" % (base_dir, gene, gene)\n",
"    gene_utr = dict()\n",
"    trans_id = None\n",
"    # The context manager closes the handle even if parsing fails part-way.\n",
"    with open(filename, \"r\") as bedfastafile:\n",
"        for line in bedfastafile:\n",
"            if (line.startswith(\">\")):\n",
"                trans_id = line[1:].strip()\n",
"                gene_utr.setdefault(trans_id, [])\n",
"                continue\n",
"            sequence = line.strip()\n",
"            # Skip blank lines (an empty UTR would match every transcript) and\n",
"            # sequence lines that precede the first header.\n",
"            if sequence and trans_id is not None:\n",
"                gene_utr[trans_id].append(sequence)\n",
"    return gene_utr\n",
"\n",
"def score(seq, start):\n",
"    \"\"\"Score a start-codon context window against a position weight matrix.\n",
"\n",
"    The matrix has eight columns per base; columns 4, 5 and 6 carry weight 1.0\n",
"    for A, T and G respectively (and 0.0 for every other base). The result is\n",
"    the product of the weights of ``seq[p]`` at column ``p`` for every ``p``\n",
"    from ``start`` to ``len(seq) - 1``.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    seq : str\n",
"        Window of upper-case A/C/G/T bases, at most eight long.\n",
"    start : int\n",
"        First window position (and matrix column) to include.\n",
"\n",
"    Returns\n",
"    -------\n",
"    float\n",
"        Product of the selected weights; 1.0 when no position is scored.\n",
"\n",
"    Raises\n",
"    ------\n",
"    KeyError\n",
"        If a scored base is not one of A, C, G, T.\n",
"    IndexError\n",
"        If a scored position lies beyond the eighth column.\n",
"    \"\"\"\n",
"    kozak_weights = {\n",
"        \"A\": [0.25, 0.61, 0.27, 0.15, 1.00, 0.00, 0.00, 0.23],\n",
"        \"C\": [0.53, 0.02, 0.49, 0.55, 0.00, 0.00, 0.00, 0.16],\n",
"        \"G\": [0.15, 0.36, 0.13, 0.21, 0.00, 0.00, 1.00, 0.46],\n",
"        \"T\": [0.07, 0.01, 0.11, 0.09, 0.00, 1.00, 0.00, 0.15],\n",
"    }\n",
"\n",
"    product = 1.0\n",
"    for position in range(start, len(seq)):\n",
"        base = seq[position]\n",
"        product *= kozak_weights[base][position]\n",
"    return product\n",
" \n",
"\n",
"def translate(seq, i, utr_regions):\n",
"    \"\"\"Translate ``seq`` from its first codon up to the first stop codon.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    seq : str\n",
"        Nucleotide sequence whose first base is the first base to translate.\n",
"    i : int\n",
"        Position that ``seq[0]`` has in the caller's coordinate system.\n",
"    utr_regions : list\n",
"        ``[start, stop]`` pairs; a start position strictly between ``start``\n",
"        and ``stop`` of any pair is rejected.\n",
"\n",
"    Returns\n",
"    -------\n",
"    tuple\n",
"        ``(aa, end)``. ``aa`` is the translation without the stop codon, or\n",
"        ``\"\"`` when ``i`` lies inside a UTR region or the sequence runs out\n",
"        before a stop codon. ``end`` is ``i`` advanced by three for every\n",
"        codon that was translated.\n",
"\n",
"    Raises\n",
"    ------\n",
"    KeyError\n",
"        If a codon is not a key of the module-level ``codon_table``.\n",
"\n",
"    NOTE(review): the notebook's indentation was lost in this copy; this\n",
"    assumes the position is not advanced past the stop codon itself, so\n",
"    ``end`` is the index of the stop codon's first base - confirm.\n",
"    \"\"\"\n",
"    # A start strictly inside any UTR interval yields no protein at all.\n",
"    if any(start < i < stop for start, stop in utr_regions):\n",
"        return \"\", i\n",
"\n",
"    residues = []\n",
"    offset = 0\n",
"    while len(seq) - offset >= 3:\n",
"        residue = codon_table[seq[offset:offset + 3]]\n",
"        if residue == \"_\":\n",
"            return \"\".join(residues), i + offset\n",
"        residues.append(residue)\n",
"        offset += 3\n",
"\n",
"    # Fewer than three bases remain and no stop codon was met: discard.\n",
"    return \"\", i + offset\n",
"\n",
"def find_utrs(seq, utr):\n",
"    \"\"\"Locate ``utr`` in ``seq``, falling back to its longest matching prefix.\n",
"\n",
"    An exact match is tried first. If that fails and ``utr`` is longer than\n",
"    20 bases, prefixes of ``utr`` are tried from the longest (one base short)\n",
"    down to half its length, and the first one found decides the position.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    seq : str\n",
"        Sequence to search in.\n",
"    utr : str\n",
"        Sequence to search for.\n",
"\n",
"    Returns\n",
"    -------\n",
"    int\n",
"        Index in ``seq`` where the match starts, or ``-1`` if neither ``utr``\n",
"        nor any of the tried prefixes occurs.\n",
"    \"\"\"\n",
"    pos = seq.find(utr)\n",
"    if pos != -1 or len(utr) <= 20:\n",
"        return pos\n",
"\n",
"    shortest_prefix = len(utr) * 5 // 10\n",
"    for length in range(len(utr) - 1, shortest_prefix - 1, -1):\n",
"        pos = seq.find(utr[:length])\n",
"        if pos != -1:\n",
"            # Stop at the longest prefix that matches; continuing would let a\n",
"            # shorter prefix overwrite the position with an earlier, weaker hit.\n",
"            break\n",
"    return pos\n",
"\n",
"def translate_aa_seq(seq, enst, gene_utrs):\n",
"    \"\"\"Pick the ORF of ``seq`` whose start codon has the best context score.\n",
"\n",
"    Every ATG in ``seq`` is scored with ``score`` on the window of up to four\n",
"    bases before it, the ATG itself and one base after it, and translated\n",
"    with ``translate``. A candidate replaces the current best when its\n",
"    protein is longer than 20 residues, its score is strictly higher, and it\n",
"    starts after the end position reported for the current best.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    seq : str\n",
"        Transcript sequence.\n",
"    enst : str\n",
"        Key of this transcript in ``gene_utrs``; an unknown key means no UTRs.\n",
"    gene_utrs : dict\n",
"        Maps transcript keys to lists of UTR sequences.\n",
"\n",
"    Returns\n",
"    -------\n",
"    tuple\n",
"        ``(protein, score)``; ``(\"M\", 0)`` when no candidate qualifies.\n",
"    \"\"\"\n",
"    utr_regions = []\n",
"    for utr in gene_utrs.get(enst, []):\n",
"        pos = find_utrs(seq, utr)\n",
"        if (pos != -1):\n",
"            utr_regions.append([pos, pos + len(utr)])\n",
"\n",
"    longest_aa_seq = \"M\"\n",
"    longest_aa_seq_sc = 0\n",
"    # NOTE(review): starting at 0 means an ATG at index 0 can never be chosen\n",
"    # (0 > 0 is false); kept as is - confirm whether that is intended.\n",
"    longest_aa_seq_sc_end = 0\n",
"\n",
"    i = seq.find(\"ATG\")\n",
"    while i != -1:\n",
"        # Near the 5' end fewer than four upstream bases exist. A negative\n",
"        # slice start would wrap around and hand score() an empty or wrong\n",
"        # window, so pad on the left and score only the positions that are\n",
"        # really present (the padding character is never looked up).\n",
"        # Fewer factors give a higher product, so such windows remain\n",
"        # somewhat favoured over complete ones.\n",
"        upstream = min(i, 4)\n",
"        padding = 4 - upstream\n",
"        window = \"N\" * padding + seq[i - upstream:i + 4]\n",
"        sc = score(window, padding)\n",
"        aa, end = translate(seq[i:], i, utr_regions)\n",
"        if ((len(aa) > 20) and (sc > longest_aa_seq_sc) and (i > longest_aa_seq_sc_end)):\n",
"            longest_aa_seq = aa\n",
"            longest_aa_seq_sc = sc\n",
"            longest_aa_seq_sc_end = end\n",
"        i = seq.find(\"ATG\", i + 1)\n",
"    return (longest_aa_seq, longest_aa_seq_sc)\n",
"\n",
"\n",
"\n",
"def translate_aa_seq_length(seq, enst, gene_utrs):\n",
"    \"\"\"Return the longest translation obtainable from any ATG in ``seq``.\n",
"\n",
"    Each ATG is translated with ``translate`` using an empty UTR list; the\n",
"    first translation of maximal length wins. ``enst`` and ``gene_utrs`` are\n",
"    accepted but not used.\n",
"\n",
"    Returns\n",
"    -------\n",
"    str\n",
"        The longest protein, or ``\"M\"`` when no translation is longer than\n",
"        one residue.\n",
"    \"\"\"\n",
"    no_utr_regions = []\n",
"    best = \"M\"\n",
"\n",
"    start = seq.find(\"ATG\")\n",
"    while start != -1:\n",
"        candidate, _ = translate(seq[start:], start, no_utr_regions)\n",
"        if len(candidate) > len(best):\n",
"            best = candidate\n",
"        start = seq.find(\"ATG\", start + 1)\n",
"    return best\n",
"\n",
"def find_all_aa_seqs(seq, enst, gene):\n",
"    \"\"\"Predict the protein of one transcript.\n",
"\n",
"    The longest ORF (``translate_aa_seq_length``) is the default answer. When\n",
"    at least one UTR recorded for this transcript can be located in ``seq``,\n",
"    the context-scored ORF from ``translate_aa_seq`` is returned instead.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    seq : str\n",
"        Transcript sequence.\n",
"    enst : str\n",
"        Transcript key used to look up UTRs.\n",
"    gene : str\n",
"        Gene name; selects the UTR file read by ``determine_utrs``.\n",
"\n",
"    Returns\n",
"    -------\n",
"    str\n",
"        Predicted amino-acid sequence.\n",
"    \"\"\"\n",
"    gene_utrs = determine_utrs(gene)\n",
"\n",
"    longest_aa_seq = translate_aa_seq_length(seq, enst, gene_utrs)\n",
"\n",
"    # The UTR map is keyed by FASTA record id and translate_aa_seq indexes it\n",
"    # with `enst`, so the lookup here uses `enst` as well (it used `gene`).\n",
"    # NOTE(review): confirm the UTR FASTA headers really are transcript ids.\n",
"    transcript_utrs = gene_utrs.get(enst, [])\n",
"    if any(find_utrs(seq, utr) != -1 for utr in transcript_utrs):\n",
"        longest_aa_seq, _ = translate_aa_seq(seq, enst, gene_utrs)\n",
"\n",
"    return longest_aa_seq\n",
" \n",
"# Index the ONECUT2 transcript FASTA and predict a protein for every record.\n",
"transcripts_filename = \"/home/annaldas/projects/result/ONECUT2/ONECUT2_seq.fa\"\n",
"transcripts = SeqIO.index(transcripts_filename, \"fasta\")\n",
"\n",
"gene = \"ONECUT2\"\n",
"\n",
"for transcript in transcripts:\n",
"    record = transcripts[transcript]\n",
"    transcript_name = str(record.id)\n",
"    seq = str(record.seq).strip()\n",
"    # The transcript key is the last '|'-separated field of the record id.\n",
"    enst = transcript_name.split(\"|\")[-1].strip()\n",
"    protein = find_all_aa_seqs(seq, enst, gene)\n",
"    # File-safe name: '|' becomes '_' and then every '_' is removed.\n",
"    transcript_filename = transcript_name.replace(\"|\", \"_\").replace(\"_\", \"\")\n",
"    print(transcript_name, transcript_filename, protein)\n",
" \n",
"#enst = \"enst\"\n",
"#a=\"TGGACAGCTCCCGCTCACCCAAACAGAAGACGTCGGCGCCGGAGCGGGCTCGGACATGGCGAGGCTGCGAGCCGGCCCGAGCGGCGGGGCCCGGTGATCCCTCCCTCCCTCCCCGTCCCCTCCCCTCTCCCGCACGCACGCCCCGTCCGCCCCCACCCCGCCCCCACCCCGGGCGAGCCCGCCCGCAGCCCGGGGCGCACACCCGCACGCGCACTCCTCTCCACTCACTCCCGCGCCCGCCCCCACTCCCGCAGCCGAGCCCCGCCACGCGCGCCTTGCCCGCCCGCCGGCCGCCCCCGCCGCCCCCGCCGCCCCCGGGCCCTGATGGACTGAATGAAGGCTGCCTACACCGCCTATCGATGCCTCACCAAAGACCTAGAAGGCTGCGCCATGAACCCGGAGCTGACAATGGAAAGTCTGGGCACTTTGCACGGGCCGGCCGGCGGCGGCAGTGGCGGGGGCGGCGGCGGGGGCGGCGGGGGCGGCGGCGGGGGCCCGGGCCATGAGCAGGAGCTGCTGGCCAGCCCCAGCCCCCACCACGCGGGCCGCGGCGCCGCTGGCTCGCTGCGGGGCCCTCCGCCGCCTCCAACCGCGCACCAGGAGCTGGGCACGGCGGCAGCGGCGGCAGCGGCGGCGTCGCGCTCGGCCATGGTCACCAGCATGGCCTCGATCCTGGACGGCGGCGACTACCGGCCCGAGCTCTCCATCCCGCTGCACCACGCCATGAGCATGTCCTGCGACTCGTCTCCGCCTGGCATGGGCATGAGCAACACCTACACCACGCTGACACCGCTCCAGCCGCTGCCACCCATCTCCACCGTGTCTGACAAGTTCCACCACCCTCACCCGCACCACCATCCGCACCACCACCACCACCACCACCACCAGCGCCTGTCCGGCAACGTCAGCGGCAGCTTCACCCTCATGCGCGACGAGCGCGGGCTCCCGGCCATGAACAACCTCTACAGTCCCTACAAGGAGATGCCCGGCATGAGCCAGAGCCTGTCCCCGCTGGCCGCCACGCCGCTGGGCAACGGGCTAGGCGGCCTCCACAACGCGCAGCAGAGTCTGCCCAACTACGGTCCGCCGGGCCACGACAAAATGCTCAGCCCCAACTTCGACGCGCACCACACTGCCATGCTGACCCGCGGTGAGCAACACCTGTCCCGCGGCCTGGGCACCCCACCTGCGGCCATGATGTCGCACCTGAACGGCCTGCACCACCCGGGCCACACTCAGTCTCACGGGCCGGTGCTGGCACCCAGTCGCGAGCGGCCACCCTCGTCCTCATCGGGCTCGCAGGTGGCCACGTCGGGCCAGCTGGAAGAAATCAACACCAAAGAGGTGGCCCAGCGCATCACAGCGGAGCTGAAGCGCTACAGTATCCCCCAGGCGATCTTTGCGCAGAGGGTGCTGTGCCGGTCTCAGGGGACTCTCTCCGACCTGCTCCGGAATCCAAAACCGTGGAGTAAACTCAAATCTGGCAGGGAGACCTTCCGCAGGATGTGGAAGTGGCTTCAGGAGCCCGAGTTCCAGCGCATGTCCGCCTTACGCCTGGCAGCGTGCAAACGCAAAGAGCAAGAACCAAACAAAGACAGGAACAATTCCCAGAAGAAGTCCCGCCTGGTGTTCACTGACCTCCAACGCCGAACACTCTTCGCCATCTTCAAGGAGAACAAACGCCCGTCAAAGGAGATGCAGATCACCATTTCCCAGCAGCTGGGCCTGGAGCTCACAACCGTCAGCAACTTCTTCATGAACGCCCGGCGCCGCAGCCTGGAGAAGTGGCAAGACGATCTGAGCACAGGGGGCTCCTCGTCCACCTCCAGCACGTGTACCAAAGCATGATGGAAGGACTCTCACTTGGGCACAAGTCACCTCCAAATGAGGACAACAGATACCAAAAGAAAACAAAGGAAAAAGACACCGGATTCCTAGCTGGGGCCCTTCACTGGTGATTTGAAAGCACAATTCTCTTGCAAAGAAACTTATAT
TCTAGCTGTAATCATAGGCCAGGTGTTCTTCTTTTGTTTTTAATGGCTATGGAGTCCAAGTGCAAGCTGAAAAATTAATCTCTTAGAACCAGACACTGTTCTCTGAGCATGCTAAGCATCCCAGAAACCCAAATGGGGCCTTCCTGGAGCGAGTTAATTCCAGTATGGTGTCAACCAAGCTCGGGATTGCTTAAAATATCATCCATCCCACTTCAGGTCCTGTCAGCTTCTTGCAGTCAGAGTTCCTATGAGTAACAATAGGAGTTTGGCCTATGTAAGGACTCTGAGTTTAGGCTTCCAAGATACAACAATAAGAGAAGAATCTAGCAACGAGAATGACCTCATTTGCTTTCCACATGCTTAGCCTCATTATACCATGTTATGTCCAAGTTCACAGCCACAACATCAGAATGGTAATTACTGAGCACAAGTTTTAAATATGGACGTTAAAAAAAAAAATCCAAGGACCTGTTTTTCCAACCCAGACATCTTTTCATTGAATGATTTAGAAAGCTTTAAGTTGATCCAGCTTACAATTTTTTTTTTCTTTACCTCCTGGAAATCTCATATGGTCTTGGATCCGTCAAAAAAACCAGTCAGTTCACTTGCGCTCAAAGTATCAAGCACAACAAAGATAAACAGAAGTGAGGAAGGTTCTGGGTTCACTACATCTGGATTTTCAAGACACCTATTGTGAAGTCATTAGGGAATTGATGAGAATATGGCTTCAAGCACATTTTGCAGTTTGCTACAAATTCTGTTGTACATAATGCAGACGCACACTCAGGAGGCCAATTTAACTGTTAACAGTGCATGGAGCGAATGCAGCATTTTAAAAGATCTAGGTTTTTTTAGGTCATTAATGTGTCCTTGGTTGATCAGTCATCTGGTCCCTCCTACTGTGTGTTATGACCACCACGTAATCCATTCTCGCTCTTTCTGATTTGGGGTTTTTCCTCATCCATCCCATTAGTAGGGATGTTTTCTGTGTTTTCTAGCAAGAAAAAAAAATCAATCAATCAAACCTGCATACATGTTACTCATGACTGTCATCTAGTCCTAAATCTCTTCTGTTGTTGAATCATCCTTGCAAAACAGCTGAATACATCTGGAGAAAACACAGCACACCAAAGAAGCAGAATACTGCAAACCAAAGACATTTATGACTTGTCATTTTCTAGCCTAAAAATACTGTGATTACTTTTAGAAATCAGAAAACCTCTGCAACTCCGAATGGCATTCAGCTCTTGCATTTGGCGCATCATCGGGCTGAGCGGACCAGCTACACCAAGGACATTAGCCAAGCCACCCAGAGGGGTGGCTTTGCCACACCAGTTGTCACCTTCCCATAGCAAGTGGAAGAGCGCCCACAGAACTCTGGGAGATTGCAAAGGTCACAATGTGCATATTTACCAGTGAATGGCCCCGGGTGGGGCCACGTGGGGGTGTTCAAAGCAAGCCAAACGCTGCAATCATTCTTTACAGACACTTGAGACTGACTTTTTTATGAATTACTTAGTCGAAACCAAAGAAACTTTTTCTGCACCTACTTCTGCAACAAACAAAACTGTCCCATTAAAATGAATAAATAAATCCGTAAATCAATGGAAATCACCACCAATAAGAAGGAAGCACGCCAGAAAATAAACGAAAACAAAAACAGGGAGACACACTGTGTTCAAACAGACCTCTTGGGACATTTTTTGGAAGCAGATTTTAAAGAAAGGGTTGAGACAAAGATAGAAATAAGGAAGAGCCTCAGTGGCTGCTGCTTCATTTGACAACTCACACGGTAATCTTAAAGCTGAAGATTGTCTTTAATTTGTGCCTATGCAGTTTTTCAAAAGAACACGGAACAGAGCAACAGAAACCTCAACAGCTACAATACCAAAGATGAGGATTTCTCACACCTTTTGTTTCAGTTCATTATCTCCTCTTGCCTGGCTAAAATACTAATAGCGCCATTGAACTGTATAAAGGTAATCAATTATGTTTCTCTG
AGCAACAAAAGGAAAGGGCCATTTATTTGATTTTATTGTTTCATTTCAATTTTGTCTTATGGTTTTTTGCCCCAACATGGAATCTCTCAAAAGTTTCCATGGACTCCAAGTTTAAGATGTTGGGATATTGAACAGTTCTCTCTGCTCAGCAGAGGGTAGGGAATAACATTATCACTTGAATGTTCTTTGCTTAACCCTTAGACTTGGTTCCTTCTATGTTCAGAGTCTCATCATCAGGGGAAGGAAAGGGAGTGAGGGTCAGGGATAGGGGTCTTGGTGATGCATCCTCTCCCGAGCCACAGAACCAAAGAGTTTATAGAGGAATTTACAGCCTCGTTTTCATGTGATTGCTACATCCTAACAGGGCTTCATTTGGGGGTGGGGGGAAACATGTAAAAATAATTGCCAGTTTCTACTTTTCTATTAGCTTTTTAAAAATCAGCTGTAAAGTTGCATTTCTAAAGAAAGATATATATAATATATAAAATACATATATAGATCAACTTGACATTGGTGATAACCAAAATTATTGCTGTCCAAATTCATGTCTTGTTTTGGTCCAGTGCTTCATTTGCTAAGTATTCGGTTCAGAATTTTTCTCATTTCTCATGCCATTCCAGAGTTAATTTGCCACTGTGGATGATTTGAAGTATTCAGATCTCTATGGAAGTTTCTGGGACAGGTTTAAAGTCAAGATCAAGCATTTTAGCATTTAACCTGTTGATAAATGGATCCATGGTGTACATGAGTTTTATTTGTATTCGGAGTCATCTCTATTCTATCCCTCAGCCTCGATTAAGGTGGTGAGTGAAGTGCATCCAACAGACTCGGCCCAGAACTGGGTCCTGACAGTGGGGTGCTCATCTTCTGTAACTGTTGGGAAGGCTCGGTGGTCCATTTTCACCAGTTAAAGAATATGAGGCCAGCCCAGAAATCTGTTCTCCAGGAGCTGCCCTGTCCCATCTGGGTGTGCCAGACCCCCTCAGTGAGCAGGTCCACCAAAGGGACTTCTCACAGGGGAAGCCCAACTCCTGTTGCAATGGGTTGATAGATTTCCTCAGGGTGGTAATTACCAATTCGTATTTTGACAAGCCTATGTGCAACCACAGCTGGCACTGGGGTGGGCAGTGGTGTTGGGTGGGATGGGGGAGAGTGTCTCAATCCTGAAGAGAAAATATAAAGCAGGTTTTGGGGAGACTTCTGGAGTCCTGCCCCTAGAGAGCCCCATTGTTGTTCTTTGTGCCCCCTCCTCATTCCCCCTATGTGGGTCTCCCTATGCAGGAGCTGTGAGAGAATGTGACTCTCCACAATTTTTATAATTCATCCTTCCTAGGAGATTGTTCATTGGCTCTTCCCTTGTGTCCCTTTGTCCCTTGCTCATACTCCATGTTTCCTTTGTCAAAGGACTAAGAAAAGAGCATATTTCAGCAGAGGAGTGTTCCCATGTGGGTTGATTTCAACTTGGGTATTTCTAAAAGAGTCCTTGTGACATGTGTCCAGTGGAAATGGTTGCTCTTTTCCAGACTGGATTGAGGAATGGAGCCTGTTTGATTTGGTTAGTGATTCTTTGACATACTAATCTCAGCGTTTGGGTCTCCAGCATCCTCTGAAGATGTCTAGACTAGTAGAGGCTGCCTTTGTGACCTGACATTACAACATTGGTCAAACCAGTCCTCTGATAATCAGAAGAACATGTCATAATTGTTTAAAAAAAAAAAAAAGGCAAGAATTTCTCTCCAAGGAGCTTTAATAAATGTCTCATTCCAGATAATGTCATACCAGAGAAAAGTGCTTGCTTTTAGAAAATTATTTACATACATATATAAATATATATGTGTATCTATACAGTTATGTATCAAAATTTTAAGCCCTGCAGAATTTCAATTTGTTAGAAATCTAACAGAAAAAAATTTCTATATTGAAAGGTAATAGAATTTAACCCAGTGAGTTTACTCAAGGATTTTTAAATTTAAGTTAATAATTTCAGAGAAAATAACCAT
TTGGGTGTGGTTATAGTTTAGTATCCATTACCTCAATCCAAGGAAAATTCCAGGCATTCCTCAACCATCAGGAAAAGGTACAGTGTGAAGGAACAGTTCTCAGCCAAATTTCACATTCTTGAGGCAACAGAAATCAAAACACTCAGAGCCATTGAGTGGAAAAACAATTTACTTTATTCCTTTACACAAATAGGCTTGCATTGTTTTTGTTTTAATGTGATTTTGGTACTAGGGATATAATTATTTCATTCCAGGAAATAATAAAAAAAAACAGACAGAGCCAATACATTTCTTTTTTTAAAGGAAACAGCAACAACAATAAAAACTCAGCACCAATATTTAAAAGCTTTTCCAAAATGTAAAAGAAGTGTTTAGCTTGCACCATGCATAAAGGTGCAGGCTAGTTGAACCAGGAAGCATGGCACTTCCTCTGGAGAAATCCAGAAAGAGTTGCTTCTAAGCTCCCTTTTCCCCCTGCAGGCTCTTGGCAATTGTAGGCTTTAGCAAATCCAGAATAATTTTCAATTCAAGCTAAAATAAAATCAACATTTGGAATGTAAATCTGATACACACACACTTTTCTAAGTCAAACAACATATTTCAAAACCAAAAATAAATACCTTTTAGATAATCAGTTATTTTCTTTGTCTATACTGGGCACCCACCTACTAGTGCCAGTAAATTCAAGTTGAACAGATTTTTAAAATCACTATTATCTGGGTATGGGGGAAACTTCCCCACTTTTGAAAATGTTGGTAGAATTATAGGAATGTCTGTTTGATTATCATTACCAAAGTGTCATGACAGTATGCCTTTGTAGTGAACTCGGATTTTCAGGAGTTTGAATAGTTGGATATTTTAAAATCTAAGAAGAAAAGGCCTGTTTCCAATGTTGTTGAAGAATAATGAACTCTATTAAAAAGTGGAGAAAAAGATAATACATGTGGTCAAGGTTGACCACAAGGCCCAGGCACAACTACCTTGGCGATAATCTTCTAGATTCGTAACAGGTTAGAGCTGACTTTTTGTTTTTGTTGTTGCTGATGCTGTGTGATTCAGACTTCTCAGCCTAACCAGGAAGAGTAAGTGGAAATGGTAGATGAAGAAGGGGTAGAGCTGGTGTATCTATAACTTTCTGATATTTGTCTGCCAAACTTGATATATTAGTAATTTTTTTATCTTTAGCTAAGATCAAGTCACCCCTGAAACAACAGGAGATTCTAGTTTTAAAATAAGGCCACAAAAATCCTTACGGAATGAAGAATGGCACCCCAGTTGGTTGTATAAGTCTCATAAGATAATGATGTTGATTTTAAATATGGATGTCTCAATGCCTGTTTTCTATCAATGATTTGTTTGTTTCCAAGGTCGGGGAGGGAAAGAGGGGAGGGTTTATCTGTTTTAGAAAGTCTCAGAATACTTATAAAATACAGAAGTAGTTATTAAAATATATAGGACCTCACATAGGTAGATACAGAACTTACCATTGAGGCTGATGGGCTGTTGTGTGAATCACACAGGACCTTAAATGAGGCTCATTATTCTCACACACCAAAATGACTCTGACAGCCTGAAGCAGTTATTGCTAGAGCCCAAGCTTTCCTTGGAGGTTTTGGAGTTAGGTTGATTGGAAGTAACCAGCTAATACCTTTTCTAGTGGAGAAAAAGACATTGCTACCAGCTTGTTCATCCCATAGAAGTCTTCCACTCTGCTCCATTTTTAGCAGCAAGCATTTCATGTAGCATAAACCTTGGCAGATAAGTGTGCCTAAGGTTTATACAGTCTGTCCGCTTGGATGTATACAAATTTAGATACATATTTTAACATGTGTTCTCATAGATGACTTTATAACAACACACATTACCTATAGGTGTCTAGACTGTGTACATACAAGTGTGTACAGACAAGCTTCATACGTATATACTGTAATCCGTTACAACAAATAAATTTTAAATCATCGTTTAACATGTATGTGGTACTTCTACAGTGTACATTGTTT
TCATTATTTATTGTAACATTGAAAACCACAGTGCAGGGAAAACAAAAGTATCCCAGCATCTTCATCCTGTACACTTGGAATTAATTTCATTTGGGCATATCCAAGATAAACTCAACTTTCAAGAAATCTTGTATATTATTTAATCATCTGTGTTAGGATGACACCTATGATTGATGACTTCGGTTGAATAGCTTTATTCTGGATTTTTCATAACTAAAGCTAAATCCAAAGACCTGAAAAAGGACAAAAAGAAAAAAAAAAAAAGAAA\"\n",
"#find_all_aa_seqs(a,enst,\"ONECUT2\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Print the find_utrs() result for every entry that determine_utrs() returns for ONECUT2.\n",
"# NOTE(review): `a` is not assigned in this cell; the only nearby assignment is the\n",
"# commented-out `#a=...` line in the preceding cell, so it must already exist in the kernel.\n",
"test = determine_utrs(\"ONECUT2\")\n",
"for i in test:\n",
"    entries = test[i]\n",
"    for j in entries:\n",
"        utr_result = find_utrs(a, j)\n",
"        print(i, utr_result, j)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Three times the length of this amino-acid string\n",
"# (presumably the matching nucleotide count at 3 bases per residue -- confirm).\n",
"3 * len(\"MALNGAEVDDFSWEPPTEAETKVLQARRERQDRISRLMGDYLLRGYRMLGETCADCGTILLQDKQRKIYCVACQELDSDVDKDNPALRDVVPQPLPF\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Collect, for every sequence ID in the GFF3 file, the set of accessions sliced\n",
"# out of its Dbxref attributes, then intersect those sets across all sequences.\n",
"domains = dict()\n",
"\n",
"# The context manager closes the handle even if reading raises.\n",
"with open(\"/home/annaldas/projects/result/ZNRD2/ZNRD2_blastx.gff3\", \"r\") as file:\n",
"    lines = file.readlines()\n",
"\n",
"for line in lines:\n",
"    # Stop at the first line starting with '>' (presumably the embedded FASTA section -- confirm).\n",
"    if line.startswith(\">\"):\n",
"        break\n",
"    if line.startswith(\"#\"):\n",
"        continue\n",
"\n",
"    data = line.split(\"\\t\")\n",
"    # Blank or truncated rows have no ninth (attributes) column; skip them\n",
"    # instead of failing with an IndexError.\n",
"    if len(data) < 9:\n",
"        continue\n",
"\n",
"    seqid, source, attr = data[0], data[1], data[8]\n",
"    if seqid not in domains:\n",
"        domains[seqid] = set()\n",
"    if source != \".\" and \"Dbxref\" in attr:\n",
"        Dbxref = attr.split(\";\")[-1]\n",
"        # Drops the first 10 and last 2 characters of the value; the line still carries\n",
"        # its newline here. Presumably that removes an InterPro prefix and the closing\n",
"        # quote plus newline -- confirm against the file.\n",
"        IPR = Dbxref.split(\"=\")[-1][10:-2]\n",
"        # IPR is a string, so the previous `IPR != []` comparison was always true and\n",
"        # let empty slices through; test for emptiness directly.\n",
"        if IPR:\n",
"            domains[seqid].add(IPR)\n",
"\n",
"# Accessions shared by every sequence. An empty `domains` keeps the original\n",
"# empty-list fallback, now chosen explicitly instead of via a bare `except`.\n",
"if domains:\n",
"    common_domains = set.intersection(*domains.values())\n",
"else:\n",
"    common_domains = []"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Show the `domains` mapping as this cell's output.\n",
"domains"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Re-scan `lines` and add each row's Dbxref accession to the existing `domains`\n",
"# mapping, then compute the accessions shared by every sequence.\n",
"# NOTE(review): `lines` and `domains` are not created here; they must already\n",
"# exist in the kernel (the preceding parsing cell assigns both).\n",
"for line in lines:\n",
"    # Stop at the first line starting with '>' (presumably the embedded FASTA section -- confirm).\n",
"    if line.startswith(\">\"):\n",
"        break\n",
"    if line.startswith(\"#\"):\n",
"        continue\n",
"\n",
"    data = line.split(\"\\t\")\n",
"    # Blank or truncated rows have no ninth (attributes) column; skip them\n",
"    # instead of failing with an IndexError.\n",
"    if len(data) < 9:\n",
"        continue\n",
"\n",
"    seqid, source, attr = data[0], data[1], data[8]\n",
"    if seqid not in domains:\n",
"        domains[seqid] = set()\n",
"    # Same `Dbxref` guard as the preceding parsing cell: without it the slice below\n",
"    # is applied to whatever attribute happens to come last.\n",
"    if source != \".\" and \"Dbxref\" in attr:\n",
"        Dbxref = attr.split(\";\")[-1]\n",
"        # Drops the first 10 and last 2 characters of the value (the line still\n",
"        # carries its newline here).\n",
"        IPR = Dbxref.split(\"=\")[-1][10:-2]\n",
"        if IPR:\n",
"            domains[seqid].add(IPR)\n",
"\n",
"# set.intersection returns a new set. The previous loop called\n",
"# common_domains.intersection(curr) without keeping the result, so common_domains\n",
"# stayed equal to one sequence's full set.\n",
"if domains:\n",
"    common_domains = set.intersection(*domains.values())\n",
"else:\n",
"    # popitem() on an empty dict raised KeyError; use the same empty-list\n",
"    # fallback as the preceding parsing cell.\n",
"    common_domains = []"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Write the shared and per-sequence domain lists to a plain-text report.\n",
"# Accessions are sorted because set iteration order can differ between runs,\n",
"# which made the written file non-reproducible.\n",
"# The context manager closes the file even if a write fails.\n",
"with open(\"/home/annaldas/projects/result/snrnp70/snrnp70_protein_analysis.txt\", \"w+\") as file:\n",
"    # NOTE(review): label changed from SNRNP30 to agree with the snrnp70 output path -- confirm.\n",
"    file.write(\"Gene:\\t%s\\n\" % (\"SNRNP70\"))\n",
"    file.write(\"Common Domains:\\t\")\n",
"    file.write(\",\".join(sorted(common_domains)))\n",
"    file.write(\"\\nSpecific Domains:\\n\")\n",
"    for transcript in domains:\n",
"        # Accessions this sequence has beyond the shared set.\n",
"        specific_domains = domains[transcript].difference(common_domains)\n",
"        file.write(\"%s: %s \\n\" % (transcript, \",\".join(sorted(specific_domains))))\n",
"print(common_domains)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}