diff --git a/Snakefile b/Snakefile index 931e0fb..3678212 100644 --- a/Snakefile +++ b/Snakefile @@ -1,13 +1,26 @@ +import pandas as pd + +configfile: "config.yaml" + +gene_file = config["gene_file"] + +GENES = pd.read_table(gene_file)["gene_symbol"] +NanoporeGTF = config["nanopore_gtf"] +Transcripts = config["polished_reads"] + rule all: input: - db = "/project/owlmayerTemporary/Sid/blast/test/human.protein.fa" + expand("/home/annaldas/projects/result/{gene}/{gene}_map_protein_analysis.txt", gene = GENES), + expand("/home/annaldas/projects/result/{gene}/{gene}_blastx_protein_analysis.txt", gene = GENES) + #expand("/home/annaldas/projects/result/{gene}/{gene}_sashimi.pdf", gene = GENES) rule gene_transcript: input: - "/project/owlmayerTemporary/Sid/Nanopore_Results_Strict/Results/GffCompare/nanopore.combined.gtf", - "/project/owlmayerTemporary/Sid/Nanopore_Results_Strict/Results/Pinfish/corrected_transcriptome_polished_collapsed.fas" + NanoporeGTF, + Transcripts output: - "/home/annaldas/projects/result/{gene}/{gene}_seq.fa" + "/home/annaldas/projects/result/{gene}/{gene}_seq.fa", + "/home/annaldas/projects/result/{gene}/{gene}_sashimi.sh" params: gene = "{gene}" script: @@ -15,26 +28,56 @@ rule gene_transcript: rule blastx: input: - "/home/annaldas/projects/result/{gene}/{gene}_seq.fa" + gene_fa = "/home/annaldas/projects/result/{gene}/{gene}_seq.fa", + db = config["human_protein"] output: "/home/annaldas/projects/result/{gene}/{gene}_blastx.out" threads: 4 shell: - "/home/annaldas/ncbi-blast-2.9.0+/bin/blastx -query {input} -db {rules.all.input.db} -out {output} -evalue 1e-5 -max_target_seqs 1 -max_hsps 1 -outfmt '6 qseqid sseqid evalue' -num_threads {threads} -soft_masking false" + "/home/annaldas/ncbi-blast-2.9.0+/bin/blastx -query {input.gene_fa} -db {input.db} -out {output} -evalue 1e-5 -max_target_seqs 1 -max_hsps 1 -outfmt '6 qseqid sseqid evalue' -num_threads {threads} -soft_masking false" rule protein_sequence: input: - "/home/annaldas/projects/result/{gene}/{gene}_blastx.out" + blastx = "/home/annaldas/projects/result/{gene}/{gene}_blastx.out", + db = config["human_protein"] output: "/home/annaldas/projects/result/{gene}/{gene}_blastx_protein.fa" shell: - "sh protein_transcript_sequences.sh {input} {rules.all.input.db} {output}" + "sh protein_transcript_sequences.sh {input.blastx} {input.db} {output}" + +rule utr_regions: + input: + config["utr_regions"] + params: + gene = "{gene}" + output: + "/home/annaldas/projects/result/{gene}/{gene}_utr_regions.bed" + script: + "utr_regions.py" + +rule utr_sequences: + input: + hg = config["human_genome"], + utr = "/home/annaldas/projects/result/{gene}/{gene}_utr_regions.bed" + output: + "/home/annaldas/projects/result/{gene}/{gene}_utr_regions.fa" + shell: + "bedtools getfasta -fi {input.hg} -bed {input.utr} -fo {output} -name" + +rule transcript_filter_utr: + input: + utr = "/home/annaldas/projects/result/{gene}/{gene}_utr_regions.fa", + seq = "/home/annaldas/projects/result/{gene}/{gene}_seq.fa" + output: + "/home/annaldas/projects/result/{gene}/{gene}_seq_filt.fa" + script: + "filter_utr.py" rule mapping: input: - "/home/annaldas/projects/result/{gene}/{gene}_seq.fa" + "/home/annaldas/projects/result/{gene}/{gene}_seq_filt.fa" output: "/home/annaldas/projects/result/{gene}/{gene}_map_protein.fa" script: @@ -46,7 +89,7 @@ rule interpro_scan: output: "/home/annaldas/projects/result/{gene}/{gene}_{type}.gff3" params: - db = "Pfam,ProDom,Gene3D" + db = "Pfam,ProDom,Gene3D,CDD,Coils,MobiDBLite" shell: "sh /home/annaldas/my_interproscan/interproscan-5.38-76.0/interproscan.sh -i {input} -o {output} -f GFF3 -appl {params.db} -dra" @@ -58,4 +101,15 @@ rule protein_domain_analysis: params: gene = "{gene}" script: - "interproscan_analysis.py" \ No newline at end of file + "interproscan_analysis.py" + +rule sashimi_plot: + input: + sashimi_sh = "/home/annaldas/projects/result/{gene}/{gene}_sashimi.sh", + sashimi_py = config["sashimi"], + bams = config["input_bams"], + gtf = NanoporeGTF + output: + "/home/annaldas/projects/result/{gene}/{gene}_sashimi.pdf" + shell: + "sh {input.sashimi_sh} {input.sashimi_py} {input.bams} {input.gtf} {output}" \ No newline at end of file diff --git a/gene_transcripts.py b/gene_transcripts.py index b743291..be78531 100644 --- a/gene_transcripts.py +++ b/gene_transcripts.py @@ -9,6 +9,9 @@ annotate_df = pd.read_csv(annotation_filename,sep = "\t", header = None) annotate_df = annotate_df[annotate_df[2] != "exon"] annotate_lines = list(annotate_df[8]) +chrms = list(annotate_df[0]) +start = list(annotate_df[3]) +stop = list(annotate_df[4]) # Mapping gene name to oID @@ -19,20 +22,31 @@ gene_oID = dict() oID_tID = dict() +gene_pos = dict() +tID_pos = dict() #tID_exon = dict() -for ann in annotate_lines: - if "gene_name" in ann: - line = ann.split(";") +for ann in range(len(annotate_lines)): + if "gene_name" in annotate_lines[ann]: + line = annotate_lines[ann].split(";") tID = line[0].split(" ")[-1][1:-1] gene = line[2].split(" ")[-1][1:-1] oID = line[3].split(" ")[-1][1:-1] + transID = line[4].split(" ")[-1][1:-1].split(".")[0] if (gene not in gene_oID): gene_oID[gene] = [oID] else: gene_oID[gene].append(oID) + if (gene not in gene_pos): + gene_pos[gene] = [chrms[ann],start[ann],stop[ann]] + else: + if (start[ann] < gene_pos[gene][1]): gene_pos[gene][1] = start[ann] + if (gene_pos[gene][2] < stop[ann]) : gene_pos[gene][2] = stop[ann] + if (oID not in oID_tID): oID_tID[oID] = tID + if (tID not in tID_pos): tID_pos[tID] = transID#,chrms[ann],start[ann],stop[ann]] + # Import transcript isoform sequences @@ -45,7 +59,9 @@ output = [] gene = snakemake.params[0].upper() for oID in gene_oID[gene]: - tID = ">" + oID_tID[transcripts[oID].id] + tID = oID_tID[transcripts[oID].id] + transID = tID_pos[tID] + tID = ">" + tID + "|" + transID #+ "," + chrm + "," + str(start) + "," + str(stop) output.append(tID) seq = str(transcripts[oID].seq) output.append(seq) @@ -55,4 +71,14 @@ output_file.write("\n".join(output)) output_file.close() +# Create sashmimi sh + +output_filename = snakemake.output[1] +output_file = open(output_filename,"w+") + +chrm,start,stop = gene_pos[gene][0],gene_pos[gene][1],gene_pos[gene][2] +binbash = "#!/bin/bash" +sashimi = "python $1 -b $2 -c %s:%d-%d -g $3 -M 10 -C 3 -O 3 --shrink --alpha 1 --base-size=20 --ann-height=5 --height=7 --width=18 -S both -o $4" %(chrm,start,stop) +output_file.write("\n".join([binbash,sashimi])) +output_file.close() diff --git a/isoform_transcripts.ipynb b/isoform_transcripts.ipynb index d7a5e2e..d574150 100644 --- a/isoform_transcripts.ipynb +++ b/isoform_transcripts.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -12,7 +12,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -23,7 +23,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 26, "metadata": {}, "outputs": [], "source": [ @@ -34,76 +34,145 @@ }, { "cell_type": "code", - "execution_count": 320, + "execution_count": 38, + "metadata": {}, + "outputs": [], + "source": [ + "annotation_filename = \"/project/owlmayerTemporary/Sid/nanopore-analysis/Results_5_1/GffCompare/nanopore.combined.gtf\"\n", + "annotate_df = open(annotation_filename,\"r\")\n", + "annotate_df_lines = annotate_df.readlines()\n", + "annotate_df.close()\n", + "transcripts = []\n", + "\n", + "exon = 1\n", + "lines = []\n", + "result = []\n", + "for line in annotate_df_lines:\n", + " t = line.split(\"\\t\")[2]\n", + " if (t == \"transcript\"):\n", + " if (exon != 1):\n", + " result.append(transcript)\n", + " transcript = line.split(\"\\t\")[8].split(\";\")[0].split(\" \")[-1][1:-1].strip()\n", + " exon = 0\n", + " elif (t == \"exon\"):\n", + " exon += 1\n", + " lines.append(line.strip())\n", + "\n", + "#output_filename = \"/project/owlmayerTemporary/Sid/nanopore-analysis/Results_5_1/multiple_exons.txt\"\n", + "#output = open(output_filename,\"w+\")\n", + "#output.writelines(\"\\n\".join(result))\n", + "#output.close()\n", + "\n", + "#output_filename = \"/project/owlmayerTemporary/Sid/nanopore-analysis/Results_10_1/Results_10_1/GffCompare/nanopore.combined_filt.gtf\"\n", + "#output = open(output_filename,\"w+\")\n", + "#output.writelines(\"\\n\".join(result))\n", + "#output.close()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 39, "metadata": {}, "outputs": [], "source": [ "# Import the nanopore annotation file\n", "\n", - "annotation_filename = \"/home/annaldas/projects/nanopore-transcriptome-analysis/Results/GffCompare/nanopore.combined.gtf\"\n", + "annotation_filename = \"/project/owlmayerTemporary/Sid/nanopore-analysis/Results_5_1/GffCompare/nanopore.combined_filt.gtf\"\n", "annotate_df = pd.read_csv(annotation_filename,sep = \"\\t\", header = None)\n", "annotate_df = annotate_df[annotate_df[2] != \"exon\"]\n", - "annotate_lines = list(annotate_df[8])" + "annotate_lines = list(annotate_df[8])\n", + "chrms = list(annotate_df[0])\n", + "start = list(annotate_df[3])\n", + "stop = list(annotate_df[4])" ] }, { "cell_type": "code", - "execution_count": 321, + "execution_count": 40, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['9df75831-2fcc-4219-a690-ec7550c440a8|254', '84e0c7ed-e242-445b-a898-6175b282735c|21', 'e1776fe5-112b-4663-a6ef-b7952b91023b|401', '66a4c7e2-5ddb-4daf-8250-63e36d7832ce|11', 'daaefcee-2df7-463b-8bfa-8dd0328c64eb|3', '8cc51f63-25d0-44e0-af17-924fab90c341|3', '6f4da30c-34b3-492e-929a-c392d748c9ba|9', '2a5059e7-9262-4c68-9717-2a1beaa0f0d4|33']\n" - ] - } - ], + "outputs": [], "source": [ "# Mapping gene name to oID\n", "# Mapping oID to transcript id\n", "# Mapping transcript id to exons\n", "\n", - "KDM1A_info = []\n", + "BRD4_info = []\n", "\n", "gene_oID = dict()\n", + "oID_gene = dict()\n", "oID_tID = dict()\n", + "tID_oID = dict()\n", + "\n", "#tID_exon = dict()\n", "\n", - "for ann in annotate_lines: \n", - " if \"gene_name\" in ann:\n", - " line = ann.split(\";\")\n", + "for ann in range(len(annotate_lines)): \n", + " if \"gene_name\" in annotate_lines[ann]:\n", + " line = annotate_lines[ann].split(\";\")\n", " tID = line[0].split(\" \")[-1][1:-1]\n", " gene = line[2].split(\" \")[-1][1:-1]\n", " oID = line[3].split(\" \")[-1][1:-1]\n", + " transID = line[4].split(\" \")[-1][1:-1].split(\".\")[0]\n", " \n", " if (gene not in gene_oID): gene_oID[gene] = [oID]\n", " else: gene_oID[gene].append(oID)\n", " \n", " if (oID not in oID_tID): oID_tID[oID] = tID\n", + " if (tID not in tID_oID): \n", + " tID_oID[tID] = oID\n", + " else:\n", + " print(\"this sucks\")\n", + " if (oID not in gene_oID): oID_gene[oID] = gene\n", " \n", - " if (gene == \"KDM1A\"): ABI2_info.append(ann) \n", + " if (gene == \"BRD4\"): \n", + " BRD4_info.append([chrms[ann],start[ann],stop[ann],annotate_lines[ann]]) \n", " #if (tID not in tID_exon): tID_exon[tID] = []\n", "\n", - "KDM1A = gene_oID[\"KDM1A\"]\n", - "print(KDM1A)\n" + "KDM1A = gene_oID[\"KDM1A\"]\n" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "ename": "KeyError", + "evalue": "'cc2e8b66-6da7-4245-9544-c06d33b50252'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0moID_tID\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"cc2e8b66-6da7-4245-9544-c06d33b50252\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;31mKeyError\u001b[0m: 'cc2e8b66-6da7-4245-9544-c06d33b50252'" + ] + } + ], + "source": [ + "oID_tID[\"cc2e8b66-6da7-4245-9544-c06d33b50252\"]" ] }, { "cell_type": "code", - "execution_count": 271, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# Import transcript isoform sequences\n", "\n", - "transcripts_filename = \"/project/Neurodifferentiation_System/NanoporeResults/Results/Pinfish/corrected_transcriptome_polished_collapsed.fas\"\n", + "transcripts_filename = \"/home/annaldas/projects/nanopore-transcriptome-analysis/Results/Pinfish/corrected_transcriptome_polished_collapsed.fas\"\n", "transcripts = SeqIO.index(transcripts_filename, \"fasta\")" ] }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -117,15 +186,15 @@ " seq = str(transcripts[oID].seq)\n", " output.append(seq)\n", "\n", - "output_filename = \"/project/owlmayerTemporary/Sid/blast/test/polished_transcripts_tcons.fa\"\n", - "output_file = open(output_filename,\"w+\")\n", - "output_file.write(\"\\n\".join(output))\n", - "output_file.close()" + "#output_filename = \"/project/owlmayerTemporary/Sid/blast/test/polished_transcripts_tcons.fa\"\n", + "#output_file = open(output_filename,\"w+\")\n", + "#output_file.write(\"\\n\".join(output))\n", + "#output_file.close()" ] }, { "cell_type": "code", - "execution_count": 350, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -151,30 +220,42 @@ }, { "cell_type": "code", - "execution_count": 349, + "execution_count": 35, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "None\n", - "None\n", - "None\n", - "None\n", - "None\n", - "None\n", - "None\n", - "None\n" + "92b2df50-a034-43a9-a75a-ad52cbccf075|6 AKAP8L\n", + "78c3fe18-5c83-402c-8269-c07aedbd40a4|3 c1554cca-de07-418f-9818-eca090c80b38 TCONS_00030383\n", + "9a2337b7-740a-4725-838a-8847f42a61bc|6 392ae6cc-c2f2-47a3-bb89-3fdf6caa6719 TCONS_00030384\n", + "fd563dd9-374b-4439-a696-517bcfe44fad|3 79b8683e-5bbc-4bdd-a545-b464ff3564e3 TCONS_00030385\n", + "e172e6af-1c34-43dd-bdb6-3656d795a229|3 4a88e4c4-0501-49dc-b89a-1990cdbaf23c TCONS_00030386\n", + "fc150c74-c6c6-4e61-b4d5-d3639c0dee91|33 89d79286-527c-427b-bb19-83b110feb370 TCONS_00030387\n", + "84a1fcd2-ec02-488e-b1e0-706944cb8fc5|3 9f9ce2f8-92d1-4a3c-90f7-68fcda8fb148 TCONS_00030388\n", + "e0325806-c4fd-4d16-a8a3-3cd3ee13cdfc|6 388b7956-d60f-42cc-948c-f607b14cab9d TCONS_00030389\n", + "b655aec1-864d-418b-b78a-f19b975b80fb|21 6be26262-4cff-4aca-8f54-9c7b3e5bdf22 TCONS_00030390\n", + "bdc78543-c0d2-4ef3-b74e-e60e1c97e9d0|8 ab1a0ddb-1b97-47bc-b004-6c34f2f09b6a TCONS_00030392\n", + "db5576fd-1921-46ce-a0d2-412728ab9db8|3 c62aeaa1-b7c1-47ad-b0a3-9815ad439460 TCONS_00032730\n", + "3fe42f15-a4d1-49ae-a646-64d71a8c43e6|3 36436437-fd4b-4802-b9da-0f048cb4c048 TCONS_00032731\n", + "287bdde2-eb53-4bbc-b96a-5b26773c3dd8|9 36436437-fd4b-4802-b9da-0f048cb4c048 TCONS_00032733\n", + "5a5a062a-2f6c-46d2-aea4-a5e80bbab920|3 21c301c2-0b56-4ee7-8b6b-27f35c89cb08 TCONS_00032734\n", + "6464d1df-7486-4008-a0bb-40d6227027a5|5 fb469fcf-d099-4192-a71a-d5c4c4ae37ba TCONS_00032735\n", + "1a232563-721a-4f76-83af-18a01a9cf221|10 fb469fcf-d099-4192-a71a-d5c4c4ae37ba TCONS_00032736\n", + "43396b69-e18d-4578-903b-782887dd7340|4 fb469fcf-d099-4192-a71a-d5c4c4ae37ba TCONS_00032737\n", + "8004538e-da37-43d1-b8cc-2e8cffc77383|3 fb469fcf-d099-4192-a71a-d5c4c4ae37ba TCONS_00032738\n", + "3debe6b9-8564-4b1f-a8a5-8d3587e3a789|3 fb469fcf-d099-4192-a71a-d5c4c4ae37ba TCONS_00032739\n", + "7ccaec48-68b4-416b-b2b8-79d9e200e14b|11 fb469fcf-d099-4192-a71a-d5c4c4ae37ba TCONS_00032740\n" ] } ], "source": [ + "print(tID_oID[\"TCONS_00032753\"],oID_gene[tID_oID[\"TCONS_00032753\"]])\n", "\n", - "\n", - "for oID in gene_oID[\"KDM1A\"]:\n", + "for oID in gene_oID[\"BRD4\"]:\n", " try:\n", - " print(oID,oID_geneID[oID.split(\"|\")[0]])\n", + " print(oID,oID_geneID[oID.split(\"|\")[0]],oID_tID[oID])\n", " except:\n", " print(\"None\")\n", "\n" @@ -182,88 +263,57 @@ }, { "cell_type": "code", - "execution_count": 343, + "execution_count": 42, "metadata": {}, - "outputs": [ - { - "ename": "KeyError", - "evalue": "\"['day_0.Rep2' 'day_3.Rep2' 'day_5.Rep2'] not found in axis\"", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mrep1\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m\"day_0.Rep1\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"day_3.Rep1\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"day_5.Rep1\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mrep2\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m\"day_0.Rep2\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"day_3.Rep2\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"day_5.Rep2\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0mdf_rep1\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdrop\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcolumns\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mrep2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6\u001b[0m \u001b[0mdf_rep2\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdrop\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcolumns\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mrep1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/pkg/python-3.7.4-0/lib/python3.7/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36mdrop\u001b[0;34m(self, labels, axis, index, columns, level, inplace, errors)\u001b[0m\n\u001b[1;32m 3938\u001b[0m \u001b[0mindex\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3939\u001b[0m \u001b[0mlevel\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mlevel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minplace\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0minplace\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 3940\u001b[0;31m errors=errors)\n\u001b[0m\u001b[1;32m 3941\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3942\u001b[0m @rewrite_axis_style_signature('mapper', [('copy', True),\n", - "\u001b[0;32m/pkg/python-3.7.4-0/lib/python3.7/site-packages/pandas/core/generic.py\u001b[0m in \u001b[0;36mdrop\u001b[0;34m(self, labels, axis, index, columns, level, inplace, errors)\u001b[0m\n\u001b[1;32m 3778\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlabels\u001b[0m \u001b[0;32min\u001b[0m \u001b[0maxes\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mitems\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3779\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlabels\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 3780\u001b[0;31m \u001b[0mobj\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mobj\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_drop_axis\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlabels\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlevel\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mlevel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merrors\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0merrors\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3781\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3782\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0minplace\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/pkg/python-3.7.4-0/lib/python3.7/site-packages/pandas/core/generic.py\u001b[0m in \u001b[0;36m_drop_axis\u001b[0;34m(self, labels, axis, level, errors)\u001b[0m\n\u001b[1;32m 3810\u001b[0m \u001b[0mnew_axis\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdrop\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlabels\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlevel\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mlevel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merrors\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0merrors\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3811\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 3812\u001b[0;31m \u001b[0mnew_axis\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdrop\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlabels\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merrors\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0merrors\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3813\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreindex\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0maxis_name\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mnew_axis\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3814\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/pkg/python-3.7.4-0/lib/python3.7/site-packages/pandas/core/indexes/base.py\u001b[0m in \u001b[0;36mdrop\u001b[0;34m(self, labels, errors)\u001b[0m\n\u001b[1;32m 4963\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0merrors\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0;34m'ignore'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4964\u001b[0m raise KeyError(\n\u001b[0;32m-> 4965\u001b[0;31m '{} not found in axis'.format(labels[mask]))\n\u001b[0m\u001b[1;32m 4966\u001b[0m \u001b[0mindexer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mindexer\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m~\u001b[0m\u001b[0mmask\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4967\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdelete\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mindexer\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mKeyError\u001b[0m: \"['day_0.Rep2' 'day_3.Rep2' 'day_5.Rep2'] not found in axis\"" - ] - } - ], + "outputs": [], "source": [ - "filename = \"//home/annaldas/projects/nanopore-transcriptome-analysis/Results/Quantification/all_counts.txt\"\n", + "filename = \"/home/annaldas/projects/nanopore-transcriptome-analysis/Results/Quantification/all_counts.txt\"\n", "df = pd.read_csv(filename)\n", - "rep1 = [\"day_0.Rep1\",\"day_3.Rep1\",\"day_5.Rep1\"]\n", - "rep2 = [\"day_0.Rep2\",\"day_3.Rep2\",\"day_5.Rep2\"]\n", + "\n", + "rep1 = [\"OJ32\",\"OJ33\",\"OJ34\"]\n", + "rep2 = [\"OJ40\",\"OJ41\",\"OJ42\"]\n", "df_rep1 = df.drop(columns = rep2)\n", "df_rep2 = df.drop(columns = rep1)\n", "\n", - "for name in rep1:\n", - " df_rep1[name] = (df_rep1[name] - df_rep1[name].min())/(df_rep1[name].max() - df_rep1[name].min())\n", + "#for name in rep1:\n", + "# df_rep1[name] = (df_rep1[name] - df_rep1[name].min())/(df_rep1[name].max() - df_rep1[name].min())\n", " \n", - "for name in rep2:\n", - " df_rep2[name] = (df_rep2[name] - df_rep2[name].min())/(df_rep2[name].max() - df_rep2[name].min())" + "#for name in rep2:\n", + "# df_rep2[name] = (df_rep2[name] - df_rep2[name].min())/(df_rep2[name].max() - df_rep2[name].min())" ] }, { "cell_type": "code", - "execution_count": 345, - "metadata": {}, + "execution_count": 60, + "metadata": { + "scrolled": true + }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "KDM1A\n", - " OJ32 OJ33 OJ34 OJ40 OJ41 OJ42 \\\n", - "45406 15.793588 17.162132 15.678439 28.903236 16.041181 21.654887 \n", - "53070 39.053235 43.576893 35.157712 44.376686 43.043835 62.950252 \n", - "74482 47.380763 49.397094 47.035318 56.054761 47.856190 78.897649 \n", - "34085 15.506431 15.520537 13.302918 21.020535 12.565592 17.122468 \n", - "71657 4.307342 2.984719 3.642466 3.503423 1.871471 4.532418 \n", - "47771 2.297249 1.790831 4.117570 3.503423 2.138824 3.189479 \n", - "37454 0.574312 0.746180 0.475104 1.167808 2.406177 1.510806 \n", - "12760 1.722937 0.149236 1.583681 0.291952 4.812354 5.539622 \n", - "\n", - " class_code gene_id gene_name ref_transcript \\\n", - "45406 j ENSG00000004487.16 KDM1A ENST00000400181.8 \n", - "53070 j ENSG00000004487.16 KDM1A ENST00000400181.8 \n", - "74482 = ENSG00000004487.16 KDM1A ENST00000356634.7 \n", - "34085 j ENSG00000004487.16 KDM1A ENST00000400181.8 \n", - "71657 c ENSG00000004487.16 KDM1A ENST00000400181.8 \n", - "47771 j ENSG00000004487.16 KDM1A ENST00000400181.8 \n", - "37454 x ENSG00000004487.16 KDM1A ENST00000400181.8 \n", - "12760 x ENSG00000004487.16 KDM1A ENST00000400181.8 \n", + "ABI2\n", + " OJ32 OJ33 OJ34 class_code gene_id \\\n", + "31847 12.061978 19.252866 14.095533 = ENSG00000138443.16 \n", + "22086 16.082637 13.730726 17.263069 j ENSG00000138443.16 \n", + "79409 11.200408 5.671387 5.384810 = ENSG00000138443.16 \n", + "64102 3.159089 2.387952 1.583768 = ENSG00000138443.16 \n", "\n", - " transcript_id \n", - "45406 TCONS_00000595 \n", - "53070 TCONS_00000596 \n", - "74482 TCONS_00000598 \n", - "34085 TCONS_00000599 \n", - "71657 TCONS_00000600 \n", - "47771 TCONS_00000601 \n", - "37454 TCONS_00003846 \n", - "12760 TCONS_00003847 \n" + " gene_name ref_transcript transcript_id \n", + "31847 ABI2 ENST00000261017.9 TCONS_00035868 \n", + "22086 ABI2 ENST00000295851.10 TCONS_00035869 \n", + "79409 ABI2 ENST00000261018.11 TCONS_00035870 \n", + "64102 ABI2 ENST00000424558.5 TCONS_00035871 \n" ] } ], "source": [ - "group = df.groupby([\"gene_name\"])\n", + "group = df_rep1.groupby([\"gene_name\"])\n", "\n", "count = 0\n", "for key,item in group:\n", - " if (\"KDM1A\" == key):\n", + " if (\"ABI2\" == key):\n", " #for name in rep1:\n", " # item[name] = (item[name] - item[name].mean())/item[name].std()\n", " print(key)\n", @@ -273,7 +323,7 @@ }, { "cell_type": "code", - "execution_count": 342, + "execution_count": 56, "metadata": { "scrolled": true }, @@ -282,50 +332,32 @@ "name": "stdout", "output_type": "stream", "text": [ - "KDM1A\n", - " class_code day_0.Rep2 day_3.Rep2 day_5.Rep2 gene_id \\\n", - "83203 j 0.288084 0.049621 -0.062497 ENSG00000004487.16 \n", - "101253 j 1.453223 1.664798 1.979082 ENSG00000004487.16 \n", - "20724 j 0.332897 -0.045900 0.302070 ENSG00000004487.16 \n", - "28776 o -0.832242 -0.705864 -0.815937 ENSG00000004487.16 \n", - "26584 = 2.595956 2.689480 2.173518 ENSG00000004487.16 \n", - "16355 j -0.361705 -0.454036 -0.354152 ENSG00000004487.16 \n", - "74621 j -0.003201 -0.106686 -0.184020 ENSG00000004487.16 \n", - "7790 j -0.630584 -0.592976 -0.670110 ENSG00000004487.16 \n", - "11817 j 0.579369 0.423022 0.885379 ENSG00000004487.16 \n", - "96671 j -0.720210 -0.671129 -0.767328 ENSG00000004487.16 \n", - "102290 c -0.652990 -0.627711 -0.718719 ENSG00000004487.16 \n", - "48051 j -0.496144 -0.558241 -0.743024 ENSG00000004487.16 \n", - "41377 x -0.742616 -0.636394 -0.621501 ENSG00000004487.16 \n", - "32537 x -0.809836 -0.427985 -0.402761 ENSG00000004487.16 \n", + "FUS\n", + " OJ40 OJ41 OJ42 class_code gene_id \\\n", + "45283 14.890654 7.486137 11.079592 j ENSG00000089280.18 \n", + "53160 17.810391 4.277792 8.897248 = ENSG00000089280.18 \n", + "68181 131.388127 31.816081 114.992736 = ENSG00000089280.18 \n", + "55151 93.431557 28.073012 56.405196 j ENSG00000089280.18 \n", + "58630 61.606433 41.173751 59.426903 = ENSG00000089280.18 \n", + "64460 13.722760 3.208344 5.707669 j ENSG00000089280.18 \n", + "47855 6.131446 2.138896 6.043414 c ENSG00000089280.18 \n", + "84426 14.014734 6.951413 14.101299 c ENSG00000089280.18 \n", + "36211 16.058549 6.416689 8.393630 c ENSG00000089280.18 \n", + "35581 0.875921 0.267362 1.175108 x ENSG00000089280.18 \n", + "9268 0.000000 0.267362 0.671490 s ENSG00000089280.18 \n", "\n", - " gene_name ref_transcript transcript_id \n", - "83203 KDM1A ENST00000400181.8 TCONS_00000647 \n", - "101253 KDM1A ENST00000400181.8 TCONS_00000648 \n", - "20724 KDM1A ENST00000400181.8 TCONS_00000649 \n", - "28776 KDM1A ENST00000465864.2 TCONS_00000650 \n", - "26584 KDM1A ENST00000356634.7 TCONS_00000653 \n", - "16355 KDM1A ENST00000400181.8 TCONS_00000654 \n", - "74621 KDM1A ENST00000400181.8 TCONS_00000655 \n", - "7790 KDM1A ENST00000400181.8 TCONS_00000656 \n", - "11817 KDM1A ENST00000400181.8 TCONS_00000657 \n", - "96671 KDM1A ENST00000400181.8 TCONS_00000658 \n", - "102290 KDM1A ENST00000400181.8 TCONS_00000661 \n", - "48051 KDM1A ENST00000400181.8 TCONS_00000662 \n", - "41377 KDM1A ENST00000400181.8 TCONS_00004737 \n", - "32537 KDM1A ENST00000400181.8 TCONS_00004738 \n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/pkg/python-3.7.4-0/lib/python3.7/site-packages/ipykernel_launcher.py:7: SettingWithCopyWarning: \n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", - " import sys\n" + " gene_name ref_transcript transcript_id \n", + "45283 FUS ENST00000254108.11 TCONS_00022528 \n", + "53160 FUS ENST00000566605.5 TCONS_00022529 \n", + "68181 FUS ENST00000254108.11 TCONS_00022530 \n", + "55151 FUS ENST00000254108.11 TCONS_00022531 \n", + "58630 FUS ENST00000487509.6 TCONS_00022532 \n", + "64460 FUS ENST00000254108.11 TCONS_00022533 \n", + "47855 FUS ENST00000487045.6 TCONS_00022534 \n", + "84426 FUS ENST00000487509.6 TCONS_00022535 \n", + "36211 FUS ENST00000254108.11 TCONS_00022536 \n", + "35581 FUS ENST00000254108.11 TCONS_00023974 \n", + "9268 FUS ENST00000254108.11 TCONS_00023975 \n" ] } ], @@ -333,10 +365,10 @@ "group = df_rep2.groupby([\"gene_name\"])\n", "count = 0\n", "for gene,item in group:\n", - " if(gene == \"KDM1A\"):\n", + " if(gene == \"FUS\"):\n", " print(gene)\n", - " for name in rep2:\n", - " item[name] = (item[name] - item[name].mean())/item[name].std()\n", + " #for name in rep2:\n", + " # item[name] = (item[name] - item[name].mean())/item[name].std()\n", " #for name in rep2:\n", " # if (item[name].max() > 3):\n", " # print(gene)\n", @@ -346,26 +378,35 @@ }, { "cell_type": "code", - "execution_count": 325, + "execution_count": 3, "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "68ac40bc-b938-4bd6-b6ab-9c578ba0ac69\n", - "9df75831-2fcc-4219-a690-ec7550c440a8|254\n", - "84e0c7ed-e242-445b-a898-6175b282735c|21\n", - "e1776fe5-112b-4663-a6ef-b7952b91023b|401\n", - "66a4c7e2-5ddb-4daf-8250-63e36d7832ce|11\n", - "daaefcee-2df7-463b-8bfa-8dd0328c64eb|3\n", - "8cc51f63-25d0-44e0-af17-924fab90c341|3\n", - "6f4da30c-34b3-492e-929a-c392d748c9ba|9\n", - "2a5059e7-9262-4c68-9717-2a1beaa0f0d4|33\n" + "ename": "NameError", + "evalue": "name 'df' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mgroup\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgroupby\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"gene_name\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mcount\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mgene\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mitem\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mgroup\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;32mif\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mgene\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m\"TLL3\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mgene\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mNameError\u001b[0m: name 'df' is not defined" ] } ], - "source": [] + "source": [ + "group = df.groupby([\"gene_name\"])\n", + "count = 0\n", + "for gene,item in group:\n", + " if(gene == \"TLL3\"):\n", + " print(gene)\n", + " #for name in rep2:\n", + " # item[name] = (item[name] - item[name].mean())/item[name].std()\n", + " #for name in rep2:\n", + " # if (item[name].max() > 3):\n", + " # print(gene)\n", + " print(item.sort_values(\"transcript_id\"))\n", + " break" + ] }, { "cell_type": "code", @@ -376,27 +417,18 @@ }, { "cell_type": "code", - "execution_count": 273, + "execution_count": 76, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['>TCONS_00000648', 'GCTTGGCGCGTGCGTACGCGACGGCGGTTGGCGGCGCGCGGGCAGCGTGAAGCGAGGCGAGGCAAGGCTTTTCGGACCCACGGAGCGACAGAGCGAGCGGCCCCTACGGCCGTCGGCGGCCCGGCGGCCCGAGATGTTATCTGGGAAGAAGGCGGCAGCCGCGGCGGCGGCGGCTGCAGCGGCAGCAACCGGGACGGAGGCTGGCCCTGGGACAGCAGGCGGCTCCGAGAACGGGTCTGAGGTGGCCGCGCAGCCCGCGGGCCTGTCGGGCCCAGCCGAGGTCGGGCCGGGGGCGGTGGGGGAGCGCACACCCCGCAAGAAAGAGCCTCCGCGGGCCTCGCCCCCCGGGGGCCTGGCGGAACCGCCGGGGTCCGCAGGGCCTCAGGCCGGCCCTACTGTCGTGCCTGGGTCTGCGACCCCCATGGAAACTGGAATAGCAGAGACTCCGGAGGGGCGTCGGACCAGCCGGCGCAAGCGGGCGAAGGTAGAGTACAGAGAGATGGATGAAAGCTTGGCCAACCTCTCAGAAGATGAGTATTATTCAGAAGAAGAGAGAAATGCCAAAGCAGAGAAGGAAAAGAAGCTTCCCCCACCACCCCCTCAAGCCCCACCTGAGGAAGAAAATGAAAGTGAGCCTGAAGAACCATCGGGGCAAGCAGGAGGACTTCAAGACGACAGTTCTGGAGGGTATGGAGACGGCCAAGCATCAGGTGTGGAGGGCGCAGCTTTCCAGAGCCGACTTCCTCATGACCGGATGACTTCTCAAGAAGCAGCCTGTTTTCCAGATATTATCAGTGGACCACAACAGACCCAGAAGGTTTTTCTTTTCATTAGAAACCGCACACTGCAGTTGTGGTTGGATAATCCAAAGATTCAGCTGACATTTGAGGCTACTCTCCAACAATTAGAAGCACCTTATAACAGTGATACTGTGCTTGTCCACCGAGTTCACAGTTATTTAGAGCGTCATGGTCTTATCAACTTCGGCATCTATAAGAGGATAAAACCCCTACCAACTAAAAAGACAGGAAAGGTAATTATTATAGGCTCTGGGGTCTCAGGCTTGGCAGCAGCTCGACAGTTACAAAGTTTTGGAATGGATGTCACACTTTTGGAAGCCAGGGATCGTGTGGGTGGACGAGTTGCCACATTTCGCAAAGGAAACTATGTAGCTGATCTTGGAGCCATGGTGGTAACAGGTCTTGGAGGGAATCCTATGGCTGTGGTCAGCAAACAAGTAAATATGGAACTGGCCAAGATCAAGCAAAAATGCCCACTTTATGAAGCCAACGGACAAGCTGTTCCTAAAGAGAAAGATGAAATGGTAGAGCAAGAGTTTAACCGGTTGCTAGAAGCTACATCTTACCTTAGTCATCAACTAGACTTCAATGTCCTCAATAATAAGCCTGTGTCCCTTGGCCAGGCATTGGAAGTTGTCATTCAGTTACAAGAGAAGCATGTCAAAGATGAGCAGATTGAACATTGGAAGAAGATAGTGAAAACTCAGGAAGAATTGAAAGAACTTCTTAATAAGATGGTAAATTTGAAAGAGAAAATTAAAGAACTCCATCAGCAATACAAAGAAGCATCTGAAGTAAAGCCACCCAGAGATATTACTGCCGAGTTCTTAGTGAAAAGCAAACACAGGGATCTGACCGCCCTATGCAAGGAATATGATGAATTAGCTGAAACACAAGGAAAGCTAGAAGAAAAACTTCAGGAGTTGGAAGCGAATCCCCCAAGTGATGTATATCTCTCATCAAGAGACAGACAAATACTTGATTGGCATTTTGCAAATCTTGAATTTGCTAATGCCACACCTCTCTCAACTCTCTCCCTTAAGCACTGGGATCAGGATGATGACTTTGAGTTCACTGGCAGCCACCTGACAGTAAGGAATGGCTACTCGTGTGTGCCTGTGGCTTTAGCAGAAGGCCTAGACATTAAACTGAATACAGCAGTGCGACAGGTTCGCTACACGGCTTCAGGATGTGAAGTGATAGCTGTGAATACCCGCTCCACGAGTCAAACCTTTATTTATAAATGCGACGCAGTTCTCTGTACCCTTCCCCTGGGTGTGCTGAAGCAGCAGCCACCAGCCGTTCAGTTTGTGCCACCTCTCCCTGAGTGGAAAACATCTGCAGTCCAAAGGATGGGATTTGGCAACCTTAACAAGGTGGTGTTGTGTTTTGATCGGGTGTTCTGGGATCCAAGTGTCAATTTGTTCGGGCATGTTGGCAGTACGACTGCCAGCAGGGGTGAGCTCTTCCTCTTCTGGAACCTCTATAAAGCTCCAATACTGTTGGCACTAGTGGCAGGAGAAGCTGCTGGTATCATGGAAAACATAAGTGACGATGTGATTGTTGGCCGATGCCTGGCCATTCTCAAAGGGATTTTTGGTAGCAGTGCAGTACCTCAGCCCAAAGAAACTGTGGTGTCTCGTTGGCGTGCTGATCCCTGGGCTCGGGGCTCTTATTCCTATGTTGCTGCAGGATCATCTGGAAATGACTATGATTTAATGGCTCAGCCAATCACTCCTGGCCCCTCGATTCCAGGTGCCCCACAGCCGATTCCACGACTCTTCTTTGCGGGAGAACATACGATCCGTAACTACCCAGCCACAGTGCATGGTGCTCTGCTGAGTGGGCTGCGAGAAGCGGGAAGAATTGCAGACCAGTTTTTGGGGGCCATGTATACGCTGCCTCGCCAGGCCACACCAGGTGTTCCTGCACAGCAGTCCCCAAGCATGTGAGACAGATGCATTCTAAGGGAAGAGGCCCATGTGCCTGTTTCTGCCATGTAAGGAAGGCTCTTCTAGCAATACTAGATCCCACTGAGAAAATCCACCCTGGCATCTGGGCTCCTGATCAGCTGATGGAGCTCCTGATTTGACAAAGGAGCTTGCCTCCTTTGAATGACCTAGAGCACAGGGAGGAACTTGTCCATTAGTTTGGAATTGTGTTCTTCGTAAAGACTGAGGCAAGCAAGTGCTGTGAAATAACATCATCTTAGTCCCTTGGTGTGTGGGGTTTTTGTTTTTTTTTTATATTTTGAGAATAAAACTTCATATAAAA', '>TCONS_00000647', 'GCTTGGCGCGTGCGTACGCGACGGCGGTTGGCGGCGCGCGGGCAGCGTGAAGCGAGGCGAGGCAAGGCTTTTCGGACCCACGGAGCGACAGAGCGAGCGGCCCCTACGGCCGTCGGCGGCCCGGCGGCCCGAGATGTTATCTGGGAAGAAGGCGGCAGCCGCGGCGGCGGCGGCTGCAGCGGCAGCAACCGGGACGGAGGCTGGCCCTGGGACAGCAGGCGGCTCCGAGAACGGGTCTGAGGTGGCCGCGCAGCCCGCGGGCCTGTCGGGCCCAGCCGAGGTCGGGCCGGGGGCGGTGGGGGAGCGCACACCCCGCAAGAAAGAGCCTCCGCGGGCCTCGCCCCCCGGGGGCCTGGCGGAACCGCCGGGGTCCGCAGGGCCTCAGGCCGGCCCTACTGTCGTGCCTGGGTCTGCGACCCCCATGGAAACTGGAATAGCAGAGACTCCGGAGGGGCGTCGGACCAGCCGGCGCAAGCGGGCGAAGGTAGAGTACAGAGAGATGGATGAAAGCTTGGCCAACCTCTCAGAAGATGAGTATTATTCAGAAGAAGAGAGAAATGCCAAAGCAGAGAAGGAAAAGAAGCTTCCCCCACCACCCCCTCAAGCCCCACCTGAGGAAGAAAATGAAAGTGAGCCTGAAGAACCATCGGGTGTGGAGGGCGCAGCTTTCCAGAGCCGACTTCCTCATGACCGGATGACTTCTCAAGAAGCAGCCTGTTTTCCAGATATTATCAGTGGACCACAACAGACCCAGAAGGTTTTTCTTTTCATTAGAAACCGCACACTGCAGTTGTGGTTGGATAATCCAAAGATTCAGCTGACATTTGAGGCTACTCTCCAACAATTAGAAGCACCTTATAACAGTGATACTGTGCTTGTCCACCGAGTTCACAGTTATTTAGAGCGTCATGGTCTTATCAACTTCGGCATCTATAAGAGGATAAAACCCCTACCAACTAAAAAGACAGGAAAGGTAATTATTATAGGCTCTGGGGTCTCAGGCTTGGCAGCAGCTCGACAGTTACAAAGTTTTGGAATGGATGTCACACTTTTGGAAGCCAGGGATCGTGTGGGTGGACGAGTTGCCACATTTCGCAAAGGAAACTATGTAGCTGATCTTGGAGCCATGGTGGTAACAGGTCTTGGAGGGAATCCTATGGCTGTGGTCAGCAAACAAGTAAATATGGAACTGGCCAAGATCAAGCAAAAATGCCCACTTTATGAAGCCAACGGACAAGCTGTTCCTAAAGAGAAAGATGAAATGGTAGAGCAAGAGTTTAACCGGTTGCTAGAAGCTACATCTTACCTTAGTCATCAACTAGACTTCAATGTCCTCAATAATAAGCCTGTGTCCCTTGGCCAGGCATTGGAAGTTGTCATTCAGTTACAAGAGAAGCATGTCAAAGATGAGCAGATTGAACATTGGAAGAAGATAGTGAAAACTCAGGAAGAATTGAAAGAACTTCTTAATAAGATGGTAAATTTGAAAGAGAAAATTAAAGAACTCCATCAGCAATACAAAGAAGCATCTGAAGTAAAGCCACCCAGAGATATTACTGCCGAGTTCTTAGTGAAAAGCAAACACAGGGATCTGACCGCCCTATGCAAGGAATATGATGAATTAGCTGAAACACAAGGAAAGCTAGAAGAAAAACTTCAGGAGTTGGAAGCGAATCCCCCAAGTGATGTATATCTCTCATCAAGAGACAGACAAATACTTGATTGGCATTTTGCAAATCTTGAATTTGCTAATGCCACACCTCTCTCAACTCTCTCCCTTAAGCACTGGGATCAGGATGATGACTTTGAGTTCACTGGCAGCCACCTGACAGTAAGGAATGGCTACTCGTGTGTGCCTGTGGCTTTAGCAGAAGGCCTAGACATTAAACTGAATACAGCAGTGCGACAGGTTCGCTACACGGCTTCAGGATGTGAAGTGATAGCTGTGAATACCCGCTCCACGAGTCAAACCTTTATTTATAAATGCGACGCAGTTCTCTGTACCCTTCCCCTGGGTGTGCTGAAGCAGCAGCCACCAGCCGTTCAGTTTGTGCCACCTCTCCCTGAGTGGAAAACATCTGCAGTCCAAAGGATGGGATTTGGCAACCTTAACAAGGTGGTGTTGTGTTTTGATCGGGTGTTCTGGGATCCAAGTGTCAATTTGTTCGGGCATGTTGGCAGTACGACTGCCAGCAGGGGTGAGCTCTTCCTCTTCTGGAACCTCTATAAAGCTCCAATACTGTTGGCACTAGTGGCAGGAGAAGCTGCTGGTATCATGGAAAACATAAGTGACGATGTGATTGTTGGCCGATGCCTGGCCATTCTCAAAGGGATTTTTGGTAGCAGTGCAGTACCTCAGCCCAAAGAAACTGTGGTGTCTCGTTGGCGTGCTGATCCCTGGGCTCGGGGCTCTTATTCCTATGTTGCTGCAGGATCATCTGGAAATGACTATGATTTAATGGCTCAGCCAATCACTCCTGGCCCCTCGATTCCAGGTGCCCCACAGATTTTAGTCATTCCATCTTCGGACCCTTTCAGATAACCAAGAGCAGAGTTAAAAGGATGGGCAGCATTTCTGATTTCTCATAATTTGGTTTTGTAAATCTAGGCAGTCTACATTCAGAATGGAGGAGTCCAGAGTATAAGTCAAATAACATTTTTCCTTATTCAGGTTTTCTCCCTAAAAAACAAAACCATTTTTAATTGCACTTCCATTTTGTAATGGCTCATAGGATTGCTGTAAGGTCTCAATTACTAGGGGCTTCCAGGGCATTTCTGAGAAATAACCCTGGGTCCTTGTCTAGACCCTTATGCCAGACCCCACTCCAAAGAGCGGTAAGAATTCCTTAGTGTCATAGCCCAGACCTGCTGAGCTGCGAGGCTTAAGTGTCCCTGATCACCAAATGTCCTGTGCTTCAGGGTAGCGAGGCTCCCTCCCAGAAGGTAGTTATACATGAGGGAAGGACGCTTTACAACTGGGTATCTAAACTGATGAGAACACATGTTAAGCATCACTTTAGGACTGAGCCTAGGTAGAGTTTTATTGTCTCATTTCTACTTGTCAATTCTGGGAAAGTGCCTACTGATAAGGGAGACTCTTCGATAGAATGATGAATAGTAATTGGGGGGGTCAGCCTTTAAAAAGGTCAACAGCAATTTAAGTACTTAGCAATTTAAGTACAAGAATAAAGGTATATGTGCAGCCTGCCAATTTTCTCTTTTTCCCCTAAAATAGCCGATTCCACGACTCTTCTTTGCGGGAGAACATACGATCCGTAACTACCCAGCCACAGTGCATGGTGCTCTGCTGAGTGGGCTGCGAGAAGCGGGAAGAATTGCAGACCAGTTTTTGGGGGCCATGTATACGCTGCCTCGCCAGGCCACACCAGGTGTTCCTGCACAGCAGTCCCCAAGCATGTGAGACAGATGCATTCTAAGGGAAGAGGCCCATGTGCCTGTTTCTGCCATGTAAGGAAGGCTCTTCTAGCAATACTAGATCCCACTGAGAAAATCCACCCTGGCATCTGGGCTCCTGATCAGCTGATGGAGCTCCTGATTTGACAAAGGAGCTTGCCTCCTTTGAATGACCTAGAGCACAGGGAGGAACTTGTCCATTAGTTTGGAATTGTGTTCTTCGTAAAGACTGAGGCAAGCAAGTGCTGTGAAATAACATCATCTTAGTCCCTTGGTGTGTGGGGTTTTTGTTTTTTTTTTATATTTTGAGAATAAAACTTCATATAAAA', '>TCONS_00000649', 'GCTTGGCGCGTGCGTACGCGACGGCGGTTGGCGGCGCGCGGGCAGCGTGAAGCGAGGCGAGGCAAGGCTTTTCGGACCCACGGAGCGACAGAGCGAGCGGCCCCTACGGCCGTCGGCGGCCCGGCGGCCCGAGATGTTATCTGGGAAGAAGGCGGCAGCCGCGGCGGCGGCGGCTGCAGCGGCAGCAACCGGGACGGAGGCTGGCCCTGGGACAGCAGGCGGCTCCGAGAACGGGTCTGAGGTGGCCGCGCAGCCCGCGGGCCTGTCGGGCCCAGCCGAGGTCGGGCCGGGGGCGGTGGGGGAGCGCACACCCCGCAAGAAAGAGCCTCCGCGGGCCTCGCCCCCCGGGGGCCTGGCGGAACCGCCGGGGTCCGCAGGGCCTCAGGCCGGCCCTACTGTCGTGCCTGGGTCTGCGACCCCCATGGAAACTGGAATAGCAGAGACTCCGGAGGGGCGTCGGACCAGCCGGCGCAAGCGGGCGAAGGTAGAGTACAGAGAGATGGATGAAAGCTTGGCCAACCTCTCAGAAGATGAGTATTATTCAGAAGAAGAGAGAAATGCCAAAGCAGAGAAGGAAAAGAAGCTTCCCCCACCACCCCCTCAAGCCCCACCTGAGGAAGAAAATGAAAGTGAGCCTGAAGAACCATCGGGTGTGGAGGGCGCAGCTTTCCAGAGCCGACTTCCTCATGACCGGATGACTTCTCAAGAAGCAGCCTGTTTTCCAGATATTATCAGTGGACCACAACAGACCCAGAAGGTTTTTCTTTTCATTAGAAACCGCACACTGCAGTTGTGGTTGGATAATCCAAAGATTCAGCTGACATTTGAGGCTACTCTCCAACAATTAGAAGCACCTTATAACAGTGATACTGTGCTTGTCCACCGAGTTCACAGTTATTTAGAGCGTCATGGTCTTATCAACTTCGGCATCTATAAGAGGATAAAACCCCTACCAACTAAAAAGACAGGAAAGGTAATTATTATAGGCTCTGGGGTCTCAGGCTTGGCAGCAGCTCGACAGTTACAAAGTTTTGGAATGGATGTCACACTTTTGGAAGCCAGGGATCGTGTGGGTGGACGAGTTGCCACATTTCGCAAAGGAAACTATGTAGCTGATCTTGGAGCCATGGTGGTAACAGGTCTTGGAGGGAATCCTATGGCTGTGGTCAGCAAACAAGTAAATATGGAACTGGCCAAGATCAAGCAAAAATGCCCACTTTATGAAGCCAACGGACAAGCTGTAAGTCGAGGTTCCTAAAGAGAAAGATGAAATGGTAGAGCAAGAGTTTAACCGGTTGCTAGAAGCTACATCTTACCTTAGTCATCAACTAGACTTCAATGTCCTCAATAATAAGCCTGTGTCCCTTGGCCAGGCATTGGAAGTTGTCATTCAGTTACAAGAGAAGCATGTCAAAGATGAGCAGATTGAACATTGGAAGAAGATAGTGAAAACTCAGGAAGAATTGAAAGAACTTCTTAATAAGATGGTAAATTTGAAAGAGAAAATTAAAGAACTCCATCAGCAATACAAAGAAGCATCTGAAGTAAAGCCACCCAGAGATATTACTGCCGAGTTCTTAGTGAAAAGCAAACACAGGGATCTGACCGCCCTATGCAAGGAATATGATGAATTAGCTGAAACACAAGGAAAGCTAGAAGAAAAACTTCAGGAGTTGGAAGCGAATCCCCCAAGTGATGTATATCTCTCATCAAGAGACAGACAAATACTTGATTGGCATTTTGCAAATCTTGAATTTGCTAATGCCACACCTCTCTCAACTCTCTCCCTTAAGCACTGGGATCAGGATGATGACTTTGAGTTCACTGGCAGCCACCTGACAGTAAGGAATGGCTACTCGTGTGTGCCTGTGGCTTTAGCAGAAGGCCTAGACATTAAACTGAATACAGCAGTGCGACAGGTTCGCTACACGGCTTCAGGATGTGAAGTGATAGCTGTGAATACCCGCTCCACGAGTCAAACCTTTATTTATAAATGCGACGCAGTTCTCTGTACCCTTCCCCTGGGTGTGCTGAAGCAGCAGCCACCAGCCGTTCAGTTTGTGCCACCTCTCCCTGAGTGGAAAACATCTGCAGTCCAAAGGATGGGATTTGGCAACCTTAACAAGGTGGTGTTGTGTTTTGATCGGGTGTTCTGGGATCCAAGTGTCAATTTGTTCGGGCATGTTGGCAGTACGACTGCCAGCAGGGGTGAGCTCTTCCTCTTCTGGAACCTCTATAAAGCTCCAATACTGTTGGCACTAGTGGCAGGAGAAGCTGCTGGTATCATGGAAAACATAAGTGACGATGTGATTGTTGGCCGATGCCTGGCCATTCTCAAAGGGATTTTTGGTAGCAGTGCAGTACCTCAGCCCAAAGAAACTGTGGTGTCTCGTTGGCGTGCTGATCCCTGGGCTCGGGGCTCTTATTCCTATGTTGCTGCAGGATCATCTGGAAATGACTATGATTTAATGGCTCAGCCAATCACTCCTGGCCCCTCGATTCCAGGTGCCCCACAGCCGATTCCACGACTCTTCTTTGCGGGAGAACATACGATCCGTAACTACCCAGCCACAGTGCATGGTGCTCTGCTGAGTGGGCTGCGAGAAGCGGGAAGAATTGCAGACCAGTTTTTGGGGGCCATGTATACGCTGCCTCGCCAGGCCACACCAGGTGTTCCTGCACAGCAGTCCCCAAGCATGTGAGACAGATGCATTCTAAGGGAAGAGGCCCATGTGCCTGTTTCTGCCATGTAAGGAAGGCTCTTCTAGCAATACTAGATCCCACTGAGAAAATCCACCCTGGCATCTGGGCTCCTGATCAGCTGATGGAGCTCCTGATTTGACAAAGGAGCTTGCCTCCTTTGAATGACCTAGAGCACAGGGAGGAACTTGTCCATTAGTTTGGAATTGTGTTCTTCGTAAAGACTGAGGCAAGCAAGTGCTGTGAAATAACATCATCTTAGTCCCTTGGTGTGTGGGGTTTTTGTTTTTTTTTTATATTTTGAGAATAAAACTTCATATAAAATT', '>TCONS_00000650', 'GGCGCGTGCGTACGCGACGGCGGTTGGCGGCGCGCGGGCAGCGTGAAGCGAGGCGAGGCAAGGCTTTTCGGACCCACGGAGCGACAGAGCGAGCGGCCCCTACGGCCGTCGGCGGCCCGGCGGCCCGAGATGTTATCTGGGAAGAAGGCGGCAGCCGCGGCGGCGGCGGCTGCAGCGGCAGCAACCGGGACGGAGGCTGGCCCTGGGACAGCAGGCGGCTCCGAGAACGGGTCTGAGGTGGCCGCGCAGCCCGCGGGCCTGTCGGGCCCAGCCGAGGTCGGGCCGGGGGCGGTGGGGGAGCGCACACCCCGCAAGAAAGAGCCTCCGCGGGCCTCGCCCCCCGGGGGCCTGGCGGAACCGCCGGGGTCCGCAGGGCCTCAGGCCGGCCCTACTGTCGTGCCTGGGTCTGCGACCCCCATGGAAACTGGAATAGCAGAGACTCCGGAGGGGCGTCGGACCAGCCGGCGCAAGCGGGCGAAGGTAAG', '>TCONS_00000653', 'GCGTGCGTACGCGACGGCGGTTGGCGGCGCGCGGGCAGCGTGAAGCGAGGCGAGGCAAGGCTTTTCGGACCCACGGAGCGACAGAGCGAGCGGCCCCTACGGCCGTCGGCGGCCCGGCGGCCCGAGATGTTATCTGGGAAGAAGGCGGCAGCCGCGGCGGCGGCGGCTGCAGCGGCAGCAACCGGGACGGAGGCTGGCCCTGGGACAGCAGGCGGCTCCGAGAACGGGTCTGAGGTGGCCGCGCAGCCCGCGGGCCTGTCGGGCCCAGCCGAGGTCGGGCCGGGGGCGGTGGGGGAGCGCACACCCCGCAAGAAAGAGCCTCCGCGGGCCTCGCCCCCCGGGGGCCTGGCGGAACCGCCGGGGTCCGCAGGGCCTCAGGCCGGCCCTACTGTCGTGCCTGGGTCTGCGACCCCCATGGAAACTGGAATAGCAGAGACTCCGGAGGGGCGTCGGACCAGCCGGCGCAAGCGGGCGAAGGTAGAGTACAGAGAGATGGATGAAAGCTTGGCCAACCTCTCAGAAGATGAGTATTATTCAGAAGAAGAGAGAAATGCCAAAGCAGAGAAGGAAAAGAAGCTTCCCCCACCACCCCCTCAAGCCCCACCTGAGGAAGAAAATGAAAGTGAGCCTGAAGAACCATCGGGTGTGGAGGGCGCAGCTTTCCAGAGCCGACTTCCTCATGACCGGATGACTTCTCAAGAAGCAGCCTGTTTTCCAGATATTATCAGTGGACCACAACAGACCCAGAAGGTTTTTCTTTTCATTAGAAACCGCACACTGCAGTTGTGGTTGGATAATCCAAAGATTCAGCTGACATTTGAGGCTACTCTCCAACAATTAGAAGCACCTTATAACAGTGATACTGTGCTTGTCCACCGAGTTCACAGTTATTTAGAGCGTCATGGTCTTATCAACTTCGGCATCTATAAGAGGATAAAACCCCTACCAACTAAAAAGACAGGAAAGGTAATTATTATAGGCTCTGGGGTCTCAGGCTTGGCAGCAGCTCGACAGTTACAAAGTTTTGGAATGGATGTCACACTTTTGGAAGCCAGGGATCGTGTGGGTGGACGAGTTGCCACATTTCGCAAAGGAAACTATGTAGCTGATCTTGGAGCCATGGTGGTAACAGGTCTTGGAGGGAATCCTATGGCTGTGGTCAGCAAACAAGTAAATATGGAACTGGCCAAGATCAAGCAAAAATGCCCACTTTATGAAGCCAACGGACAAGCTGTTCCTAAAGAGAAAGATGAAATGGTAGAGCAAGAGTTTAACCGGTTGCTAGAAGCTACATCTTACCTTAGTCATCAACTAGACTTCAATGTCCTCAATAATAAGCCTGTGTCCCTTGGCCAGGCATTGGAAGTTGTCATTCAGTTACAAGAGAAGCATGTCAAAGATGAGCAGATTGAACATTGGAAGAAGATAGTGAAAACTCAGGAAGAATTGAAAGAACTTCTTAATAAGATGGTAAATTTGAAAGAGAAAATTAAAGAACTCCATCAGCAATACAAAGAAGCATCTGAAGTAAAGCCACCCAGAGATATTACTGCCGAGTTCTTAGTGAAAAGCAAACACAGGGATCTGACCGCCCTATGCAAGGAATATGATGAATTAGCTGAAACACAAGGAAAGCTAGAAGAAAAACTTCAGGAGTTGGAAGCGAATCCCCCAAGTGATGTATATCTCTCATCAAGAGACAGACAAATACTTGATTGGCATTTTGCAAATCTTGAATTTGCTAATGCCACACCTCTCTCAACTCTCTCCCTTAAGCACTGGGATCAGGATGATGACTTTGAGTTCACTGGCAGCCACCTGACAGTAAGGAATGGCTACTCGTGTGTGCCTGTGGCTTTAGCAGAAGGCCTAGACATTAAACTGAATACAGCAGTGCGACAGGTTCGCTACACGGCTTCAGGATGTGAAGTGATAGCTGTGAATACCCGCTCCACGAGTCAAACCTTTATTTATAAATGCGACGCAGTTCTCTGTACCCTTCCCCTGGGTGTGCTGAAGCAGCAGCCACCAGCCGTTCAGTTTGTGCCACCTCTCCCTGAGTGGAAAACATCTGCAGTCCAAAGGATGGGATTTGGCAACCTTAACAAGGTGGTGTTGTGTTTTGATCGGGTGTTCTGGGATCCAAGTGTCAATTTGTTCGGGCATGTTGGCAGTACGACTGCCAGCAGGGGTGAGCTCTTCCTCTTCTGGAACCTCTATAAAGCTCCAATACTGTTGGCACTAGTGGCAGGAGAAGCTGCTGGTATCATGGAAAACATAAGTGACGATGTGATTGTTGGCCGATGCCTGGCCATTCTCAAAGGGATTTTTGGTAGCAGTGCAGTACCTCAGCCCAAAGAAACTGTGGTGTCTCGTTGGCGTGCTGATCCCTGGGCTCGGGGCTCTTATTCCTATGTTGCTGCAGGATCATCTGGAAATGACTATGATTTAATGGCTCAGCCAATCACTCCTGGCCCCTCGATTCCAGGTGCCCCACAGCCGATTCCACGACTCTTCTTTGCGGGAGAACATACGATCCGTAACTACCCAGCCACAGTGCATGGTGCTCTGCTGAGTGGGCTGCGAGAAGCGGGAAGAATTGCAGACCAGTTTTTGGGGGCCATGTATACGCTGCCTCGCCAGGCCACACCAGGTGTTCCTGCACAGCAGTCCCCAAGCATGTGAGACAGATGCATTCTAAGGGAAGAGGCCCATGTGCCTGTTTCTGCCATGTAAGGAAGGCTCTTCTAGCAATACTAGATCCCACTGAGAAAATCCACCCTGGCATCTGGGCTCCTGATCAGCTGATGGAGCTCCTGATTTGACAAAGGAGCTTGCCTCCTTTGAATGACCTAGAGCACAGGGAGGAACTTGTCCATTAGTTTGGAATTGTGTTCTTCGTAAAGACTGAGGCAAGCAAGTGCTGTGAAATAACATCATCTTAGTCCCTTGGTGTGTGGGGTTTTTGTTTTTTTTTTATATTTTGAGAATAAAACTTCATATAAAA', '>TCONS_00000654', 'GTACGCGACGGCGGTTGGCGGCGCGCGGGCAGCGTGAAGCGAGGCGAGGCAAGGCTTTTCGGACCCACGGAGCGACAGAGCGAGCGGCCCCTACGGCCGTCGGCGGCCCGGCGGCCCGAGATGTTATCTGGGAAGAAGGCGGCAGCCGCGGCGGCGGCGGCTGCAGCGGCAGCAACCGGGACGGAGGCTGGCCCTGGGACAGCAGGCGGCTCCGAGAACGGGTCTGAGGTGGCCGCGCAGCCCGCGGGCCTGTCGGGCCCAGCCGAGGTCGGGCCGGGGGCGGTGGGGGAGCGCACACCCCGCAAGAAAGAGCCTCCGCGGGCCTCGCCCCCCGGGGGCCTGGCGGAACCGCCGGGGTCCGCAGGGCCTCAGGCCGGCCCTACTGTCGTGCCTGGGTCTGCGACCCCCATGGAAACTGGAATAGCAGAGACTCCGGAGGGGCGTCGGACCAGCCGGCGCAAGCGGGCGAAGGTAGAGTACAGAGAGATGGATGAAAGCTTGGCCAACCTCTCAGAAGATGAGTATTATTCAGAAGAAGAGAGAAATGCCAAAGCAGAGAAGGAAAAGAAGCTTCCCCCACCACCCCCTCAAGCCCCACCTGAGGAAGAAAATGAAAGTGAGCCTGAAGAACCATCGGGGCAAGCAGGAGGACTTCAAGACGACAGTTCTGGAGGGTATGGAGACGGCCAAGCATCAGGTGTGGAGGGCGCAGCTTTCCAGAGCCGACTTCCTCATGACCGGATGACTTCTCAAGAAGCAGCCTGTTTTCCAGATATTATCAGTGGACCACAACAGACCCAGAAGGTTTTTCTTTTCATTAGAAACCGCACACTGCAGTTGTGGTTGGATAATCCAAAGATTCAGCTGACATTTGAGGCTACTCTCCAACAATTAGAAGCACCTTATAACAGTGATACTGTGCTTGTCCACCGAGTTCACAGTTATTTAGAGCGTCATGGTCTTATCAACTTCGGCATCTATAAGAGGATAAAACCCCTACCAACTAAAAAGACAGGAAAGGTAATTATTATAGGCTCTGGGGTCTCAGGCTTGGCAGCAGCTCGACAGTTACAAAGTTTTGGAATGGATGTCACACTTTTGGAAGCCAGGGATCGTGTGGGTGGACGAGTTGCCACATTTCGCAAAGGAAACTATGTAGCTGATCTTGGAGCCATGGTGGTAACAGGTCTTGGAGGGAATCCTATGGCTGTGGTCAGCAAACAAGTAAATATGGAACTGGCCAAGATCAAGCAAAAATGCCCACTTTATGAAGCCAACGGACAAGCTGTTCCTAAAGAGAAAGATGAAATGGTAGAGCAAGAGTTTAACCGGTTGCTAGAAGCTACATCTTACCTTAGTCATCAACTAGACTTCAATGTCCTCAATAATAAGCCTGTGTCCCTTGGCCAGGCATTGGAAGTTGTCATTCAGTTACAAGAGAAGCATGTCAAAGATGAGCAGATTGAACATTGGAAGAAGATAGTGAAAACTCAGGAAGAATTGAAAGAACTTCTTAATAAGATGGTAAATTTGAAAGAGAAAATTAAAGAACTCCATCAGCAATACAAAGAAGCATCTGAAGTAAAGCCACCCAGAGATATTACTGCCGAGTTCTTAGTGAAAAGCAAACACAGGGATCTGACCGCCCTATGCAAGGAATATGATGAATTAGCTGAAACACAAGGAAAGCTAGAAGAAAAACTTCAGGAGTTGGAAGCGAATCCCCCAAGTGATGTATATCTCTCATCAAGAGACAGACAAATACTTGATTGGCATTTTGCAAATCTTGAATTTGCTAATGCCACACCTCTCTCAACTCTCTCCCTTAAGCACTGGGATCAGACAAGGTCTTGCTTTGTTGCCCAGGCTGGAGTGGTGTGATCATGGCTCCCTTGCAGCCTTTACCTCCTGGGCTCAAGTGATCTTCCTGCCTTGGACTCCCATTCCTATTTATAAAATTTTTCTTTTAAACCATGTTCTTTTTCTATCTTGAAAAGAATCTTTGTCTTACCTGTGGTCACAAAGATGTTATCCTGTGTTCTCCTGCAGAAATTGTATGGGTTTAGCTCTTATGTTTAGGTCTATACTCCATTTTGTGTTAATTTTTATGTTTAAGGAAAGGTAAGTGAAGGGTTCATGTTTTTCCATATAGTTATGTAATTATTATTCCAGCACTATCCTTTTCTTCATTTGTCTTGGCACCTTTGTTGAAAATTAATTGGCATGTAAGCATGAGTATTTAGGGTTGTCTTTTCTGTTAATTTATCTGTCTATCCATATGCCACTATCATACTGTCTTGTAAGTTCTCCAACTTTGTTCATTTTTGAAATTGTTTTGGCTATTTTAAGGTACGTATTTCCATAACAGTTTTAGAATCAACTTGTCAATTTCTACAAAAATCACCTTCTGGGATTTTGATTAGGAGTGCATTAAATTTCTAGGTCAATTTAGGGTGAATTGCCATCTTAACAATATTGAATATTCTTATCCTTAAATACAGTATACTTCTCCAAAAATAGTTTTCTTTCGTTTATGTCAGTGATGTTTTATAGTTCTCAGTGAGCAGGGCTATACCATCTTTTCTTAAATTTCAGTGTGATTTACTCTTCAAGTGAGAAATCACAAGGCTTCTTGGACCAATCTCTAAATCTGAGACTTTGGCCAGAATTAAGACTGTATAGGCTGGGTACGGTGGCTCACGCCTTTTTAAACCCAGCACTTTGGGAGGCCAGAGCAGGAGGATCAATTGAGTCAGGAGTTCAAGACCAGCCTGGCCCTGTCTCTACTAAAAATACAAAAATTAGCCGGGCATGGTGGCGGGCACCTGTAATCCCAGCTACTTGGGAGGCTGAGGCACGAGAATTGCCTGAACCCAGGAGGCAGAGGCTGCAGTGAGCGGAGATCGCGCCATTGCACTCCAGCCTGGGCAACAGAGCAAGACTCCATCTCAAAAAAAAAAAAAATGTATAGAAGTAAATTATTAAATTTTTATTTATATATTTTAGTTTACATTATACATTTTCCTAATAAAAATTATTTTGACAGGCTGGGCATGGTGGCTCATGCCTGTAATCCCAGCACTTTGGGAGGCCAAGGCAGGCGGATCACCTCAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCATCTCTACTAAAAATACAAAATTAGCCTGGCGTGGTGGCGCATGCCTGTAGTCTCAGCTGCTTGGGAGACTGAGGCAAGAGAATCGCTTGAACCCAGGAGGCGGAGGTTGCAGTGAGCCGAGATCACGCCACTGCACTGCAGCCTGGGCAACAAGAGCAAAACTCCATCTCAAAAAAAAAATTATTTTGACTTCTTAATCTTTAGATTATAATCTGATAGTTTTAGTATTTGAAGAGTTGGACTTGTTAGAGATAAAGTCTTGCAGAACTACCAGTTCCTTTGTGGTTTTTCATGTAAATACTAACTTAGGCCCTTCTAGTCATTGATTTATCATTTGATGTTTCAATATGTGTCTATTTGTAATAGTATTAAATTTTAAATGAGTGATTTCAGCTGAAGTAGAGTGTTGCTGCCAGCCCAGTATTTTGATCTAATTTCCCCAAGAATTTCATGAAATTAGAAATCCTTACCAGCTTACAAAAAAAATGTTACATAGAAAAATTTTCAGTAAATTCAAAGAAGTAAGATCATTTTGGGGATCTTTAAATTCCTGGTCAATTCCTGACTTTTTTTTTACTGTATGGACTGTCTTGGGCCACTCTTTATGGTCCTAGATTACCTTCATCTAGTTTGACTGACTTAGTTTTGGAAATCTTCCTCAACTTCAAACCTGTAAATTTAAAGAATAAAAACTGATTTTGCTCTCAGGTAAAAGAAAATGTAATCAGGACTGTGACCCTGATATCCTTCAAGTCTGTAATGTATTCAAAAGCCAGATAAATAAACTTGAATGATGTTTAATC', '>TCONS_00000655', 'GTACGCGACGGCGGTTGGCGGCGCGCGGGCAGCGTGAAGCGAGGCGAGGCAAGGCTTTTCGGACCCACGGAGCGACAGAGCGAGCGGCCCCTACGGCCGTCGGCGGCCCGGCGGCCCGAGATGTTATCTGGGAAGAAGGCGGCAGCCGCGGCGGCGGCGGCTGCAGCGGCAGCAACCGGGACGGAGGCTGGCCCTGGGACAGCAGGCGGCTCCGAGAACGGGTCTGAGGTGGCCGCGCAGCCCGCGGGCCTGTCGGGCCCAGCCGAGGTCGGGCCGGGGGCGGTGGGGGAGCGCACACCCCGCAAGAAAGAGCCTCCGCGGGCCTCGCCCCCCGGGGGCCTGGCGGAACCGCCGGGGTCCGCAGGGCCTCAGGCCGGCCCTACTGTCGTGCCTGGGTCTGCGACCCCCATGGAAACTGGAATAGCAGAGACTCCGGAGGGGCGTCGGACCAGCCGGCGCAAGCGGGCGAAGGTAGAGTACAGAGAGATGGATGAAAGCTTGGCCAACCTCTCAGAAGATGAGTATTATTCAGAAGAAGAGAGAAATGCCAAAGCAGAGAAGGAAAAGAAGCTTCCCCCACCACCCCCTCAAGCCCCACCTGAGGAAGAAAATGAAAGTGAGCCTGAAGAACCATCGGGGCAAGCAGGAGGACTTCAAGACGACAGTTCTGGAGGGTATGGAGACGGCCAAGCATCAGGTGTGGAGGGCGCAGCTTTCCAGAGCCGACTTCCTCATGACCGGATGACTTCTCAAGAAGCAGCCTGTTTTCCAGATATTATCAGTGGACCACAACAGACCCAGAAGGTTTTTCTTTTCATTAGAAACCGCACACTGCAGTTGTGGTTGGATAATCCAAAGATTCAGCTGACATTTGAGGCTACTCTCCAACAATTAGAAGCACCTTATAACAGTGATACTGTGCTTGTCCACCGAGTTCACAGTTATTTAGAGCGTCATGGTCTTATCAACTTCGGCATCTATAAGAGGATAAAACCCCTACCAACTAAAAAGACAGGAAAGGTAATTATTATAGGCTCTGGGGTCTCAGGCTTGGCAGCAGCTCGACAGTTACAAAGTTTTGGAATGGATGTCACACTTTTGGAAGCCAGGGATCGTGTGGGTGGACGAGTTGCCACATTTCGCAAAGGAAACTATGTAGCTGATCTTGGAGCCATGGTGGTAACAGGTCTTGGAGGGAATCCTATGGCTGTGGTCAGCAAACAAGTAAATATGGAACTGGCCAAGATCAAGCAAAAATGCCCACTTTATGAAGCCAACGGACAAGCTGTTCCTAAAGAGAAAGATGAAATGGTAGAGCAAGAGTTTAACCGGTTGCTAGAAGCTACATCTTACCTTAGTCATCAACTAGACTTCAATGTCCTCAATAATAAGCCTGTGTCCCTTGGCCAGGCATTGGAAGTTGTCATTCAGTTACAAGAGAAGCATGTCAAAGATGAGCAGATTGAACATTGGAAGAAGATAGTGAAAACTCAGGAAGAATTGAAAGAACTTCTTAATAAGATGGTAAATTTGAAAGAGAAAATTAAAGAACTCCATCAGCAATACAAAGAAGCATCTGAAGTAAAGCCACCCAGAGATATTACTGCCGAGTTCTTAGTGAAAAGCAAACACAGGGATCTGACCGCCCTATGCAAGGAATATGATGAATTAGCTGAAACACAAGGAAAGCTAGAAGAAAAACTTCAGGAGTTGGAAGCGAATCCCCCAAGTGATGTATATCTCTCATCAAGAGACAGACAAATACTTGATTGGCATTTTGCAAATCTTGAATTTGCTAATGCCACACCTCTCTCAACTCTCTCCCTTAAGCACTGGGATCAGGATGATGACTTTGAGTTCACTGGCAGCCACCTGACAGTAAGGAATGGCTACTCGTGTGTGCCTGTGGCTTTAGCAGAAGGCCTAGACATTAAACTGAATACAGCAGTGCGACAGGTTCGCTACACGGCTTCAGGATGTGAAGTGATAGCTGTGAATACCCGCTCCACGAGTCAAACCTTTATTTATAAATGCGACGCAGTTCTCTGTACCCTTCCCCTGGGTGTGCTGAAGCAGCAGCCACCAGCCGTTCAGTTTGTGCCACCTCTCCCTGAGTGGAAAACATCTGCAGTCCAAAGGATGGGATTTGGCAACCTTAACAAGGTGGTGTTGTGTTTTGATCGGGTGTTCTGGGATCCAAGTGTCAATTTGTTCGGGCATGTTGGCAGTACGACTGCCAGCAGGGGTGAGCTCTTCCTCTTCTGGAACCTCTATAAAGCTCCAATACTGTTGGCACTAGTGGCAGGAGAAGCTGCTGGTATCATGGAAAACATAAGTGACGATGTGATTGTTGGCCGATGCCTGGCCATTCTCAAAGGGATTTTTGGTAGCAGTGCAGTACCTCAGCCCAAAGAAACTGTGGTGTCTCGTTGGCGTGCTGATCCCTGGGCTCGGGGCTCTTATTCCTATGTTGCTGCAGGATCATCTGGAAATGACTATGATTTAATGGCTCAGCCAATCACTCCTGGCCCCTCGATTCCAGGTGCCCCACAGATTTTAGTCATTCCATCTTCGGACCCTTTCAGATAACCAAGAGCAGAGTTAAAAGGATGGGCAGCATTTCTGATTTCTCATAATTTGGTTTTGTAAATCTAGGCAGTCTACATTCAGAATGGAGGAGTCCAGAGTATAAGTCAAATAACATTTTTCCTTATTCAGGTTTTCTCCCTAAAAAACAAAACCATTTTTAATTGCACTTCCATTTTGTAATGGCTCATAGGATTGCTGTAAGGTCTCAATTACTAGGGGCTTCCAGGGCATTTCTGAGAAATAACCCTGGGTCCTTGTCTAGACCCTTATGCCAGACCCCACTCCAAAGAGCGGTAAGAATTCCTTAGTGTCATAGCCCAGACCTGCTGAGCTGCGAGGCTTAAGTGTCCCTGATCACCAAATGTCCTGTGCTTCAGGGTAGCGAGGCTCCCTCCCAGAAGGTAGTTATACATGAGGGAAGGACGCTTTACAACTGGGTATCTAAACTGATGAGAACACATGTTAAGCATCACTTTAGGACTGAGCCTAGGTAGAGTTTTATTGTCTCATTTCTACTTGTCAATTCTGGGAAAGTGCCTACTGATAAGGGAGACTCTTCGATAGAATGATGAATAGTAATTGGGGGGGTCAGCCTTTAAAAAGGTCAACAGCAATTTAAGTACTTAGCAATTTAAGTACAAGAATAAAGGTATATGTGCAGCCTGCCAATTTTCTCTTTTTCCCCTAAAATAGCCGATTCCACGACTCTTCTTTGCGGGAGAACATACGATCCGTAACTACCCAGCCACAGTGCATGGTGCTCTGCTGAGTGGGCTGCGAGAAGCGGGAAGAATTGCAGACCAGTTTTTGGGGGCCATGTATACGCTGCCTCGCCAGGCCACACCAGGTGTTCCTGCACAGCAGTCCCCAAGCATGTGAGACAGATGCATTCTAAGGGAAGAGGCCCATGTGCCTGTTTCTGCCATGTAAGGAAGGCTCTTCTAGCAATACTAGATCCCACTGAGAAAATCCACCCTGGCATCTGGGCTCCTGATCAGCTGATGGAGCTCCTGATTTGACAAAGGAGCTTGCCTCCTTTGAATGACCTAGAGCACAGGGAGGAACTTGTCCATTAGTTTGGAATTGTGTTCTTCGTAAAGACTGAGGCAAGCAAGTGCTGTGAAATAACATCATCTTAGTCCCTTGGTGTGTGGGGTTTTTGTTTTTTTTTTATATTTTGAGAATAAAACTTCATATAAAA', '>TCONS_00000656', 'GCGACGGCGGTTGGCGGCGCGCGGGCAGCGTGAAGCGAGGCGAGGCAAGGCTTTTCGGACCCACGGAGCGACAGAGCGAGCGGCCCCTACGGCCGTCGGCGGCCCGGCGGCCCGAGATGTTATCTGGGAAGAAGGCGGCAGCCGCGGCGGCGGCGGCTGCAGCGGCAGCAACCGGGACGGAGGCTGGCCCTGGGACAGCAGGCGGCTCCGAGAACGGGTCTGAGGTGGCCGCGCAGCCCGCGGGCCTGTCGGGCCCAGCCGAGGTCGGGCCGGGGGCGGTGGGGGAGCGCACACCCCGCAAGAAAGAGCCTCCGCGGGCCTCGCCCCCCGGGGGCCTGGCGGAACCGCCGGGGTCCGCAGGGCCTCAGGCCGGCCCTACTGTCGTGCCTGGGTCTGCGACCCCCATGGAAACTGGAATAGCAGAGACTCCGGAGGGGCGTCGGACCAGCCGGCGCAAGCGGGCGAAGGTAGAGTACAGAGAGATGGATGAAAGCTTGGCCAACCTCTCAGAAGATGAGTATTATTCAGAAGAAGAGAGAAATGCCAAAGCAGAGAAGGAAAAGAAGCTTCCCCCACCACCCCCTCAAGCCCCACCTGAGGAAGAAAATGAAAGTGAGCCTGAAGAACCATCGGGTGTGGAGGGCGCAGCTTTCCAGAGCCGACTTCCTCATGACCGGATGACTTCTCAAGAAGCAGCCTGTTTTCCAGATATTATCAGTGGACCACAACAGACCCAGAAGGTTTTTCTTTTCATTAGAAACCGCACACTGCAGTTGTGGTTGGATAATCCAAAGATTCAGCTGACATTTGAGGCTACTCTCCAACAATTAGAAGCACCTTATAACAGTGATACTGTGCTTGTCCACCGAGTTCACAGTTATTTAGAGCGTCATGGTCTTATCAACTTCGGCATCTATAAGAGGATAAAACCCCTACCAACTAAAAAGACAGGAAAGGTAATTATTATAGGCTCTGGGGTCTCAGGCTTGGCAGCAGCTCGACAGTTACAAAGTTTTGGAATGGATGTCACACTTTTGGAAGCCAGGGATCGTGTGGGTGGACGAGTTGCCACATTTCGCAAAGGAAACTATGTAGCTGATCTTGGAGCCATGGTGGTAACAGGTCTTGGAGGGAATCCTATGGCTGTGGTCAGCAAACAAGTAAATATGGAACTGGCCAAGATCAAGCAAAAATGCCCACTTTATGAAGCCAACGGACAAGCTGTTCCTAAAGAGAAAGATGAAATGGTAGAGCAAGAGTTTAACCGGTTGCTAGAAGCTACATCTTACCTTAGTCATCAACTAGACTTCAATGTCCTCAATAATAAGCCTGTGTCCCTTGGCCAGGCATTGGAAGTTGTCATTCAGTTACAAGAGAAGCATGTCAAAGATGAGCAGATTGAACATTGGAAGAAGATAGTGAAAACTCAGGAAGAATTGAAAGAACTTCTTAATAAGGTGAAATT', '>TCONS_00000657', 'GGCGCGCGGGCAGCGTGAAGCGAGGCGAGGCAAGGCTTTTCGGACCCACGGAGCGACAGAGCGAGCGGCCCCTACGGCCGTCGGCGGCCCGGCGGCCCGAGATGTTATCTGGGAAGAAGGCGGCAGCCGCGGCGGCGGCGGCTGCAGCGGCAGCAACCGGGACGGAGGCTGGCCCTGGGACAGCAGGCGGCTCCGAGAACGGGTCTGAGGTGGCCGCGCAGCCCGCGGGCCTGTCGGGCCCAGCCGAGGTCGGGCCGGGGGCGGTGGGGGAGCGCACACCCCGCAAGAAAGAGCCTCCGCGGGCCTCGCCCCCCGGGGGCCTGGCGGAACCGCCGGGGTCCGCAGGGCCTCAGGCCGGCCCTACTGTCGTGCCTGGGTCTGCGACCCCCATGGAAACTGGAATAGCAGAGACTCCGGAGGGGCGTCGGACCAGCCGGCGCAAGCGGGCGAAGGTAGAGTACAGAGAGATGGATGAAAGCTTGGCCAACCTCTCAGAAGATGAGTATTATTCAGAAGAAGAGAGAAATGCCAAAGCAGAGAAGGAAAAGAAGCTTCCCCCACCACCCCCTCAAGCCCCACCTGAGGAAGAAAATGAAAGTGAGCCTGAAGAACCATCGGGTGTGGAGGGCGCAGCTTTCCAGAGCCGACTTCCTCATGACCGGATGACTTCTCAAGAAGCAGCCTGTTTTCCAGATATTATCAGTGGACCACAACAGACCCAGAAGGTTTTTCTTTTCATTAGAAACCGCACACTGCAGTTGTGGTTGGATAATCCAAAGATTCAGCTGACATTTGAGGCTACTCTCCAACAATTAGAAGCACCTTATAACAGTGATACTGTGCTTGTCCACCGAGTTCACAGTTATTTAGAGCGTCATGGTCTTATCAACTTCGGCATCTATAAGAGGATAAAACCCCTACCAACTAAAAAGACAGGAAAGGTAATTATTATAGGCTCTGGGGTCTCAGGCTTGGCAGCAGCTCGACAGTTACAAAGTTTTGGAATGGATGTCACACTTTTGGAAGCCAGGGATCGTGTGGGTGGACGAGTTGCCACATTTCGCAAAGGAAACTATGTAGCTGATCTTGGAGCCATGGTGGTAACAGGTCTTGGAGGGAATCCTATGGCTGTGGTCAGCAAACAAGTAAATATGGAACTGGCCAAGATCAAGCAAAAATGCCCACTTTATGAAGCCAACGGACAAGCTATGGTAAATTTGAAAGAGAAAATTAAAGAACTCCATCAGCAATACAAAGAAGCATCTGAAGTAAAGCCACCCAGAGATATTACTGCCGAGTTCTTAGTGAAAAGCAAACACAGGGATCTGACCGCCCTATGCAAGGAATATGATGAATTAGCTGAAACACAAGGAAAGCTAGAAGAAAAACTTCAGGAGTTGGAAGCGAATCCCCCAAGTGATGTATATCTCTCATCAAGAGACAGACAAATACTTGATTGGCATTTTGCAAATCTTGAATTTGCTAATGCCACACCTCTCTCAACTCTCTCCCTTAAGCACTGGGATCAGGATGATGACTTTGAGTTCACTGGCAGCCACCTGACAGTAAGGAATGGCTACTCGTGTGTGCCTGTGGCTTTAGCAGAAGGCCTAGACATTAAACTGAATACAGCAGTGCGACAGGTTCGCTACACGGCTTCAGGATGTGAAGTGATAGCTGTGAATACCCGCTCCACGAGTCAAACCTTTATTTATAAATGCGACGCAGTTCTCTGTACCCTTCCCCTGGGTGTGCTGAAGCAGCAGCCACCAGCCGTTCAGTTTGTGCCACCTCTCCCTGAGTGGAAAACATCTGCAGTCCAAAGGATGGGATTTGGCAACCTTAACAAGGTGGTGTTGTGTTTTGATCGGGTGTTCTGGGATCCAAGTGTCAATTTGTTCGGGCATGTTGGCAGTACGACTGCCAGCAGGGGTGAGCTCTTCCTCTTCTGGAACCTCTATAAAGCTCCAATACTGTTGGCACTAGTGGCAGGAGAAGCTGCTGGTATCATGGAAAACATAAGTGACGATGTGATTGTTGGCCGATGCCTGGCCATTCTCAAAGGGATTTTTGGTAGCAGTGCAGTACCTCAGCCCAAAGAAACTGTGGTGTCTCGTTGGCGTGCTGATCCCTGGGCTCGGGGCTCTTATTCCTATGTTGCTGCAGGATCATCTGGAAATGACTATGATTTAATGGCTCAGCCAATCACTCCTGGCCCCTCGATTCCAGGTGCCCCACAGCCGATTCCACGACTCTTCTTTGCGGGAGAACATACGATCCGTAACTACCCAGCCACAGTGCATGGTGCTCTGCTGAGTGGGCTGCGAGAAGCGGGAAGAATTGCAGACCAGTTTTTGGGGGCCATGTATACGCTGCCTCGCCAGGCCACACCAGGTGTTCCTGCACAGCAGTCCCCAAGCATGTGAGACAGATGCATTCTAAGGGAAGAGGCCCATGTGCCTGTTTCTGCCATGTAAGGAAGGCTCTTCTAGCAATACTAGATCCCACTGAGAAAATCCACCCTGGCATCTGGGCTCCTGATCAGCTGATGGAGCTCCTGATTTGACAAAGGAGCTTGCCTCCTTTGAATGACCTAGAGCACAGGGAGGAACTTGTCCATTAGTTTGGAATTGTGTTCTTCGTAAAGACTGAGGCAAGCAAGTGCTGTGAAATAACATCATCTTAGTCCCTTGGTGTGTGGGGTTTTTGTTTTTTTTTTATATTTTGAGAATAAAACTTCATATAAAATTGG', '>TCONS_00000658', 'GCGCGGGCAGCGTGAAGCGAGGCGAGGCAAGGCTTTTCGGACCCACGGAGCGACAGAGCGAGCGGCCCCTACGGCCGTCGGCGGCCCGGCGGCCCGAGATGTTATCTGGGAAGAAGGCGGCAGCCGCGGCGGCGGCGGCTGCAGCGGCAGCAACCGGGACGGAGGCTGGCCCTGGGACAGCAGGCGGCTCCGAGAACGGGTCTGAGGTGGCCGCGCAGCCCGCGGGCCTGTCGGGCCCAGCCGAGGTCGGGCCGGGGGCGGTGGGGGAGCGCACACCCCGCAAGAAAGAGCCTCCGCGGGCCTCGCCCCCCGGGGGCCTGGCGGAACCGCCGGGGTCCGCAGGGCCTCAGGCCGGCCCTACTGTCGTGCCTGGGTCTGCGACCCCCATGGAAACTGGAATAGCAGAGACTCCGGAGGGGCGTCGGACCAGCCGGCGCAAGCGGGCGAAGGTAGAGTACAGAGAGATGGATGAAAGCTTGGCCAACCTCTCAGAAGATGAGTATTATTCAGAAGAAGAGAGAAATGCCAAAGCAGAGAAGGAAAAGAAGCTTCCCCCACCACCCCCTCAAGCCCCACCTGAGGAAGAAAATGAAAGTGAGCCTGAAGAACCATCGGGGCAAGCAGGAGGACTTCAAGACGACAGTTCTGGAGGGTATGGAGACGGCCAAGCATCAGGTGTGGAGGGCGCAGCTTTCCAGAGCCGACTTCCTCATGACCGGATGACTTCTCAAGAAGCAGCCTGTTTTCCAGATATTATCAGTGGACCACAACAGACCCAGAAGGTTTTTCTTTTCATTAGAAACCGCACACTGCAGTTGTGGTTGGATAATCCAAAGATTCAGCTGACATTTGAGGCTACTCTCCAACAATTAGAAGCACCTTATAACAGTGATACTGTGCTTGTCCACCGAGTTCACAGTTATTTAGAGCGTCATGGTCTTATCAACTTCGGCATCTATAAGAGGATAAAACCCCTACCAAGTAA', '>TCONS_00000661', 'TGTTGGCAGTACGACTGCCAGCAGGGGTGAGCTCTTCCTCTTCTGGAACCTCTATAAAGCTCCAATACTGTTGGCACTAGTGGCAGGAGAAGCTGCTGGTATCATGGAAAACATAAGTGACGATGTGATTGTTGGCCGATGCCTGGCCATTCTCAAAGGGATTTTTGGTAGCAGTGCAGTACCTCAGCCCAAAGAAACTGTGGTGTCTCGTTGGCGTGCTGATCCCTGGGCTCGGGGCTCTTATTCCTATGTTGCTGCAGGATCATCTGGAAATGACTATGATTTAATGGCTCAGCCAATCACTCCTGGCCCCTCGATTCCAGGTGCCCCACAGCCGATTCCACGACTCTTCTTTGCGGGAGAACATACGATCCGTAACTACCCAGCCACAGTGCATGGTGCTCTGCTGAGTGGGCTGCGAGAAGCGGGAAGAATTGCAGACCAGTTTTTGGGGGCCATGTATACGCTGCCTCGCCAGGCCACACCAGGTGTTCCTGCACAGCAGTCCCCAAGCATGTGAGACAGATGCATTCTAAGGGAAGAGGCCCATGTGCCTGTTTCTGCCATGTAAGGAAGGCTCTTCTAGCAATACTAGATCCCACTGAGAAAATCCACCCTGGCATCTGGGCTCCTGATCAGCTGATGGAGCTCCTGATTTGACAAAGGAGCTTGCCTCCTTTGAATGACCTAGAGCACAGGGAGGAACTTGTCCATTAGTTTGGAATTGTGTTCTTCGTAAAGACTGAGGCAAGCAAGTGCTGTGAAATAACATCATCTTAGTCCCTTG', '>TCONS_00000662', 'CTGGGCTCGGGGCTCTTATTCCTATGTTGCTGCAGGATCATCTGGAAATGACTATGATTTAATGGCTCAGCCAATCACTCCTGGCCCCTCGATTCCAGGTGCCCCACAGATTTTAGTCATTCCATCTTCGGACCCTTTCAGATAACCAAGAGCAGAGTTAAAAGGATGGGCAGCATTTCTGATTTCTCATAATTTGGTTTTGTAAATCTAGGCAGTCTACATTCAGAATGGAGGAGTCCAGAGTATAAGTCAAATAACATTTTTCCTTATTCAGGTTTTCTCCCTAAAAAACAAAACCATTTTTAATTGCACTTCCATTTTGTAATGGCTCATAGGATTGCTGTAAGGTCTCAATTACTAGGGGCTTCCAGGGCATTTCTGAGAAATAACCCTGGGTCCTTGTCTAGACCCTTATGCCAGACCCCACTCCAAAGAGCGGTAAGAATTCCTTAGTGTCATAGCCCAGACCTGCTGAGCTGCGAGGCTTAAGTGTCCCTGATCACCAAATGTCCTGTGCTTCAGGGTAGCGAGGCTCCCTCCCAGAAGGTAGTTATACATGAGGGAAGGACGCTTTACAACTGGGTATCTAAACTGATGAGAACACATGTTAAGCATCACTTTAGGACTGAGCCTAGGTAGAGTTTTATTGTCTCATTTCTACTTGTCAATTCTGGGAAAGTGCCTACTGATAAGGGAGACTCTTCGATAGAATGATGAATAGTAATTGGGGGGGTCAGCCTTTAAAAAGGTCAACAGCAATTTAAGTACTTAGCAATTTAAGTACAAGAATAAAGGTATATGTGCAGCCTGCCAATTTTCTCTTTTTCCCCTAAAATAGCCGATTCCACGACTCTTCTTTGCGGGAGAACATACGATCCGTAACTACCCAGCCACAGTGCATGGTGCTCTGCTGAGTGGGCTGCGAGAAGCGGGAAGAATTGCAGACCAGTTTTTGGGGGCCATGTATACGCTGCCTCGCCAGGCCACACCAGGTGTTCCTGCACAGCAGTCCCCAAGCATGTGAGACAGATGCATTCTAAGGGAAGAGGCCCATGTGCCTGTTTCTGCCATGTAAGGAAGGCTCTTCTAGCAATACTAGATCCCACTGAGAAAATCCACCCTGGCATCTGGGCTCCTGATCAGCTGATGGAGCTCCTGATTTGACAAAGGAGCTTGCCTCCTTTGAATGACCTAGAGCACAGGGAGGAACTTGTCCATTAGTTTGGAATTGTGTTCTTCGTAAAGACTGAGGCAAGCAAGTGCTGTGAAATAACATCATCTTAGTCCCTTGGTGTGTGGGGTTTTTGTTTTTTTTTTATATTTTGAGAATAAAACTT', '>TCONS_00004737', 'TGCTGGGACAACATGCAAAATAATGAAACTGAACCCCTACCTCACACCATATACAAAAATTAACTGAGAAAAGATCATCGACATAAATGCAAGAGCTAAAACTAAAAAACTCTTAGAAGAAAACATAGGAGTAAATCCTCGTGACATTGGATAAGGCAATGGTTTATTAGATATAACACCAGAAGTAAAGGGAGCAAAAGAATAAATTGGATCTCATCAAAATCACAAATTTCCGTAACCAGAATATGTAAAGAACTCTTACTAGAAAATGATGAAGACAACTCAATTTTTAAAAATGAGCCAAGGATTTGAATAGTCATTTCAACAAAAATATAAAAATGGCTAATAAACACATGAAAAGATGCCCAACATCATTAGTCATTAGGGAACTGCATATCAAAACCACAATGAGATACCACTTTACCCCCAATAGGATGGCTACAGTAAAAGACAAACAATATTAAATGTCAAGAAGGATATGGAGGCTCAACAAGGTGGCTCCACCTGTAATCCCAGCACTTTGGGAGGCCGATGTGGGTGGATGGCTTGAGGCCAGGAGTTTAAGACCATCCTGGCCAACATGGTGAAACCCTGTCTCTACTAAAAATACAAAAACTGGCCAGAGGTGGTGGCAGGTGCCTGTAATCCCAGCTACTCGGCTGGCTAAGGCAGGAGAATCGCTTGAACCCAGGAGGAGGAGGCTGTAGTAAGCCGAGATCACATCACTGCACTCCAGCCTGGGTGGTGACAGAGAGAGACTGTCTCAGCAAAAAAAAAAAAAAAAA', '>TCONS_00004738', 'GCTGGGCGTGGTGGCTCATGCCTGTAATCCCAGCACTTTGGGAGGCTGAGGCAGGCAGATGCCAAGGCAGGAGGACCGCTTATGCCCAGGAGGCCAAGGGTGCAATCTGTGATCGCACCACTGTGCTCCAGCCTGAGCAACAGAGACAGACCCTGTCTCAAAAAAATAGAAAATAAAGGTACTGCTAAGTTTTGCATCAAAAACAATCTTTTCAACATTGTTTAATCCAATGCCTTGACAAAGAAATAATTAACAAGGAATACACTTGGAAAAGGGTGGTCTAGAAAGGGTTCTTAGTCTTTAGGAACAAAAGAGATGAGAAAAATACCTACAGGTAGAAGCAATAATATCTGATGGTATTATTAAAAACAAAACTGTTTCACAGAACCCTAGGATTCTTCAAGATTTAGAATTTTAATTTTTTAAATGTATTACACATTCATGTGTTAAAACAAATCTTAATCCTTTAGAGACAACCAACACAATTAACTTGTGCTGCTTGGGTAACTCAATTTGGGGTAGACTTCAGAATGAAGTTTCTTCTGCCATGTCTGTTCAACTGAGAGCATCCTGAGATGACAAAGCAAGCTTAAAAAAAAAAAAAAAA']\n" - ] - } - ], + "outputs": [], "source": [ "output = []\n", - "gene = \"KDM1A\"\n", + "gene = \"BRD4\"\n", "for oID in gene_oID[gene]:\n", " tID = \">\" + oID_tID[transcripts[oID].id]\n", " output.append(tID)\n", " seq = str(transcripts[oID].seq)\n", " output.append(seq)\n", "\n", - "print(output) \n", " \n", "#output_filename = \"/project/owlmayerTemporary/Sid/blast/test/kdm1a.fa\"\n", "#output_file = open(output_filename,\"w+\")\n", @@ -409,16 +441,206 @@ }, { "cell_type": "code", - "execution_count": 288, + "execution_count": 222, + "metadata": {}, + "outputs": [], + "source": [ + "afilename = \"/home/annaldas/projects/nanopore-transcriptome-analysis/ReferenceData/gencode.v32.primary_assembly.annotation.gtf\"\n", + "afile = open(afilename,\"r\")\n", + "afile_lines = afile.readlines()\n", + "afile.close()" + ] + }, + { + "cell_type": "code", + "execution_count": 298, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['gene' 'transcript' 'exon' 'CDS' 'start_codon' 'stop_codon' 'UTR'\n", + " 'Selenocysteine']\n" + ] + } + ], + "source": [ + "output = []\n", + "for line in afile_lines:\n", + " if (not line.startswith(\"#\")):\n", + " output.append(line.strip().split(\"\\t\"))\n", + "pd_aline = pd.DataFrame(output,columns=[\"chr\",\"source\",\"type\",\"start\",\"stop\",\"a\",\"b\",\"c\",\"info\"])\n", + "pd_aline = pd_aline.astype({'start': 'int32', \"stop\":\"int32\"})\n", + "print(pd_aline[\"type\"].unique())\n", + "#pd_aline = pd_aline[pd_aline[\"type\"] == \"UTR\"]\n" + ] + }, + { + "cell_type": "code", + "execution_count": 302, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['gene_id \"ENSG00000203667.10\"', ' transcript_id \"ENST00000391839.6\"', ' gene_type \"protein_coding\"', ' gene_name \"COX20\"', ' transcript_type \"lncRNA\"', ' transcript_name \"COX20-202\"', ' level 2', ' transcript_support_level \"1\"', ' hgnc_id \"HGNC:26970\"', ' havana_gene \"OTTHUMG00000040401.2\"', ' havana_transcript \"OTTHUMT00000097176.1\"', ''] transcript\n", + "['gene_id \"ENSG00000203667.10\"', ' transcript_id \"ENST00000391839.6\"', ' gene_type \"protein_coding\"', ' gene_name \"COX20\"', ' transcript_type \"lncRNA\"', ' transcript_name \"COX20-202\"', ' exon_number 1', ' exon_id \"ENSE00001899074.1\"', ' level 2', ' transcript_support_level \"1\"', ' hgnc_id \"HGNC:26970\"', ' havana_gene \"OTTHUMG00000040401.2\"', ' havana_transcript \"OTTHUMT00000097176.1\"', ''] exon\n", + "['gene_id \"ENSG00000203667.10\"', ' transcript_id \"ENST00000391839.6\"', ' gene_type \"protein_coding\"', ' gene_name \"COX20\"', ' transcript_type \"lncRNA\"', ' transcript_name \"COX20-202\"', ' exon_number 2', ' exon_id \"ENSE00003677820.1\"', ' level 2', ' transcript_support_level \"1\"', ' hgnc_id \"HGNC:26970\"', ' havana_gene \"OTTHUMG00000040401.2\"', ' havana_transcript \"OTTHUMT00000097176.1\"', ''] exon\n", + "['gene_id \"ENSG00000203667.10\"', ' transcript_id \"ENST00000391839.6\"', ' gene_type \"protein_coding\"', ' gene_name \"COX20\"', ' transcript_type \"lncRNA\"', ' transcript_name \"COX20-202\"', ' exon_number 3', ' exon_id \"ENSE00001889952.1\"', ' level 2', ' transcript_support_level \"1\"', ' hgnc_id \"HGNC:26970\"', ' havana_gene \"OTTHUMG00000040401.2\"', ' havana_transcript \"OTTHUMT00000097176.1\"', ''] exon\n" + ] + } + ], + "source": [ + "info = list(pd_aline[\"info\"])\n", + "types = list(pd_aline[\"type\"])\n", + "for ann in range(len(info)): \n", + " if \"gene_name\" in info[ann]:\n", + " line = info[ann].split(\";\")\n", + " transID = line[1].split(\" \")[-1][1:-1].split(\".\")[0]\n", + " gene = line[3].split(\" \")[-1][1:-1]\n", + " \n", + " if (transID == \"ENST00000391839\"):\n", + " print(line,types[ann])" + ] + }, + { + "cell_type": "code", + "execution_count": 224, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "pd_aline.to_csv(path_or_buf = \"/home/annaldas/projects/nanopore-transcriptome-analysis/df_utr_regions.csv\",index = False)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "chr11\t65570460\t65570695\tENST00000531405\n", + "chr11\t65571732\t65571888\tENST00000531405\n", + "chr11\t65570477\t65570491\tENST00000309328\n", + "chr11\t65571732\t65571888\tENST00000309328\n", + "chr11\t65570485\t65570695\tENST00000527920\n", + "chr11\t65572478\t65572892\tENST00000527920\n", + "chr11\t65570487\t65570491\tENST00000526877\n", + "chr11\t65571394\t65571858\tENST00000526877\n", + "chr11\t65571732\t65571779\tENST00000533115\n", + "chr11\t65572443\t65572892\tENST00000533115\n", + "chr11\t65573524\t65573942\tENST00000526433\n" + ] + } + ], + "source": [ + "df_utr_regions = pd.read_csv(\"/home/annaldas/projects/nanopore-transcriptome-analysis/df_utr_regions.csv\")\n", + "\n", + "info = list(df_utr_regions[\"info\"])\n", + "chrms = list(df_utr_regions[\"chr\"])\n", + "start = list(df_utr_regions[\"start\"])\n", + "stop = list(df_utr_regions[\"stop\"])\n", + "\n", + "for ann in range(len(info)): \n", + " if \"gene_name\" in info[ann]:\n", + " line = info[ann].split(\";\")\n", + " transID = line[1].split(\" \")[-1][1:-1].split(\".\")[0]\n", + " gene = line[3].split(\" \")[-1][1:-1]\n", + " \n", + " \n", + " if (gene == \"ZNRD2\"):\n", + " print(chrms[ann] + \"\\t\" + str(start[ann]) + \"\\t\" + str(stop[ann]) + \"\\t\" + transID)\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "bedfastafile = open(\"/home/annaldas/projects/result/ZNRD2/ZNRD2_utr_regions.fa\")\n", + "bedfastalines = bedfastafile.readlines()\n", + "bedfastafile.close()\n", + "trans_utr = dict()\n", + "for line in bedfastalines:\n", + " if (line.startswith(\">\")):\n", + " trans_id = line[1:].strip()\n", + " if (trans_id not in trans_utr):\n", + " trans_utr[trans_id] = []\n", + " else:\n", + " trans_utr[trans_id].append(line.strip())\n", + "\n", + "transcript_id = \"ENST00000533115\"\n", + "s = '''GGTGACAACGGCAACATGGCCCTGAACGGAGCTGGTGAGGACCTGGGCGGCAGGGGTTTGTGGCTGTGAGGTACGGGAGGCAGCCCACTCCGGCAAGACCCCCAGTCCCTATGCCTCTCTTCCCCAGAAGTCGACGACTTCTCCTGGGAGCCCCCGACTGAGGCGGAGACGAAGACGATCCTCCTCCAAGACAAACAGCGGAAAATCTACTGCGTGGCTTGTCAGGAACTCGACTCAGACGTGGATAAAGATAATCCCGCTCTGAATGCCCAGGCTGCCCTCTCCCAAGCTCGGGAGCACCAGCTGGCCTCAGCCTCAGAGCTCCCCCTGGGCTCTCGACCTGCGCCCCAGCCCCCAGTACCTCGTCCGGAGCACTGTGAGGGAGCTGCAGCAGGACTCAAGGCAGCCCAGGGGCCACCTGCTCCTGCTGTGCCTCCAAATACAGATGTCATGGCCTGCACACAGACAGCCCTCTTGCAGAAGCTGACCTGGGCCTCTGCTGAACTGGGCTCTAGCACCTCCCTGGAGACTAGCATCCAGCTGTGTGGCCTTATCCGCGCATGTGCGGAGGCCCTGCGCAGCCTGCAGCAGCTACAGCACTAAGAGAAGCCCCTGAGAAAAACCCTCTAGAAAA'''" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "-1 TTCCTGTGAGCCCGGCGGTGACAACGGCAACATGGCCCTGAACGGAGCTGGTGAGGACCTGGGCGGCAGGGGTTTGTGGCTGTGAGGTACGGGAGGCAGCCCACTCCGGCAAGACCCCCAGTCCCTATGCCTCTCTTCCCCAGAAGTCGACGACTTCTCCTGGGAGCCCCCGACTGAGGCGGAGACGAAGGTGCTGCAGGCGCGACGGGAGCGGCAAGATCGCATCTCCCGGCTC\n", + "-1 AAGAGAAGCCCCTGAGAAAAACCCTCTAGAAAAACAGCTGTTCCTCTGTGTGGTTTGTTTTTTTCCTGGTTCCAAGTGTGCATGCCAGCCCCAGCTCCACTCACCTTTTTCCAGCTTTTGGCCTCTTCACCTCTCCACTCTGCTCTCCTTGACGCC\n", + "1 GTGACAACGGCAAC\n", + "-1 AAGAGAAGCCCCTGAGAAAAACCCTCTAGAAAAACAGCTGTTCCTCTGTGTGGTTTGTTTTTTTCCTGGTTCCAAGTGTGCATGCCAGCCCCAGCTCCACTCACCTTTTTCCAGCTTTTGGCCTCTTCACCTCTCCACTCTGCTCTCCTTGACGCC\n", + "-1 GGCAACATGGCCCTGAACGGAGCTGGTGAGGACCTGGGCGGCAGGGGTTTGTGGCTGTGAGGTACGGGAGGCAGCCCACTCCGGCAAGACCCCCAGTCCCTATGCCTCTCTTCCCCAGAAGTCGACGACTTCTCCTGGGAGCCCCCGACTGAGGCGGAGACGAAGGTGCTGCAGGCGCGACGGGAGCGGCAAGATCGCATCTCCCGGCTC\n", + "-1 AAGAGAGGCCCGGCCCATCCAGAGGGGGTGGGGCAGAGGCGGAGTCTGAGGAGCTGGGGAAGGAACAAAGCGAGGCCTGCGGGCGGCGGCTGGGCTCCGGCGGGGCCGCGGGGTGCGGGGCCTGCGGGCGGCGGCCCGGGCGGAGCGTTGGAGGGAAGGAGGTGGCATCGCCGTCCGCGCCGGCCCCGGCCATGAACGGGCTGCCCTCGGCAGAGGCGCCGGGCGGGGCGGGCTGCGCTTTGGCCGGGCTCCCACCGCTGCCGCGCGGCCTCAGCGGCCTCCTTAATGCGAGCGGGGGCTCGTGGCGGGAGCTGGAGCGCGTCTACAGCCAGCGCAGCCGCATCCACGACGAGCTGAGCCGCGCCGCCCGCGCCCCGGACGGGCCCCGCCACGCCGCCGGCGCCGCCAACGCGG\n", + "5 CAAC\n", + "-1 GAATGCCCAGGCTGCCCTCTCCCAAGCTCGGGAGCACCAGCTGGCCTCAGCCTCAGAGCTCCCCCTGGGCTCTCGACCTGCGCCCCAGCCCCCAGTACCTCGTCCGGAGCACTGTGAGGGAGCTGCAGCAGGACTCAAGGCAGCCCAGGGGCCACCTGCTCCTGCTGTGCCTCCAAATACAGATGTCATGGCCTGCACACAGACAGCCCTCTTGCAGAAGCTGACCTGGGCCTCTGCTGAACTGGGCTCTAGCACCTCCCTGGAGACTAGCATCCAGCTGTGTGGCCTTATCCGCGCATGTGCGGAGGCCCTGCGCAGCCTGCAGCAGCTACAGCACTAAGAGAAGCCCCTGAGAAAAACCCTCTAGAAAAACAGCTGTTCCTCTGTGTGGTTTGTTTTTTTCCTGGTTCCAAGTGTGCATGCCAGCCCCAGCTCCACTCACCTTTTTCCAGCTTTTGGCCTCT\n", + "-1 AAGAGAAGCCCCTGAGAAAAACCCTCTAGAAAAACAGCTGTTCCTCT\n", + "-1 TTTGCGCGACGTGGTCCCACAACCGTTGCCTTTTTAAGAGAGGCCCGGCCCATCCAGAGGGGGTGGGGCAGAGGCGGAGTCTGAGGAGCTGGGGAAGGAACAAAGCGAGGCCTGCGGGCGGCGGCTGGGCTCCGGCGGGGCCGCGGGGTGCGGGGCCTGCGGGCGGCGGCCCGGGCGGAGCGTTGGAGGGAAGGAGGTGGCATCGCCGTCCGCGCCGGCCCCGGCCATGAACGGGCTGCCCTCGGCAGAGGCGCCGGGCGGGGCGGGCTGCGCTTTGGCCGGGCTCCCACCGCTGCCGCGCGGCCTCAGCGGCCTCCTTAATGCGAGCGGGGGCTCGTGGCGGGAGCTGGAGCGCGTCTACAGCCAGCGCAGCCGCATCCACGACGAGCTGAGCCGCGCCGCCCGCGCCCCGGACGGGCCCCGCCACGCCGCCGGCGCCGCCAACGCGG\n", + "ENST00000533115 2 ['AAGAGAAGCCCCTGAGAAAAACCCTCTAGAAAAACAGCTGTTCCTCT', 'TTTGCGCGACGTGGTCCCACAACCGTTGCCTTTTTAAGAGAGGCCCGGCCCATCCAGAGGGGGTGGGGCAGAGGCGGAGTCTGAGGAGCTGGGGAAGGAACAAAGCGAGGCCTGCGGGCGGCGGCTGGGCTCCGGCGGGGCCGCGGGGTGCGGGGCCTGCGGGCGGCGGCCCGGGCGGAGCGTTGGAGGGAAGGAGGTGGCATCGCCGTCCGCGCCGGCCCCGGCCATGAACGGGCTGCCCTCGGCAGAGGCGCCGGGCGGGGCGGGCTGCGCTTTGGCCGGGCTCCCACCGCTGCCGCGCGGCCTCAGCGGCCTCCTTAATGCGAGCGGGGGCTCGTGGCGGGAGCTGGAGCGCGTCTACAGCCAGCGCAGCCGCATCCACGACGAGCTGAGCCGCGCCGCCCGCGCCCCGGACGGGCCCCGCCACGCCGCCGGCGCCGCCAACGCGG']\n", + "-1 GACGACGAGGAGCCTCCCGATGCCAGCCTGCCTCCTGACCCGCCACCCCTTACTGTGCCCCAGACGCACAATGCCCGTGACCAGTGGCTGCAGGATGCCTTCCACATCAGCCTCTGAAGGGCTGGGGGGCAGGGGGCATGCACCCATGCAAAAGGCTCAGAAACTCCCCCTCCGGCAAGCCCTCAGACTTCGGAGCCTGCGCCTTCCCCCCTACCGCCTCACCTCACAGGAGGGCCAGGCATGTATTCCTCAGAGGCGAAACTGCCAAACTCTTTCTCCTGTCTTGGGTTGGCTGGCACTGGGGCGGGCATCTAGGGTACAGCCTCTGCTCATGGCACTGGGCCTCCAGTTCTTCCACATGTGTGCACCCCCAGCTTGGCCAACCCTCAGCCTTGCGGTGGGGCCCGAAGCATCTTCC\n" + ] + } + ], + "source": [ + "for t in trans_utr:\n", + " seq = s\n", + " for utr in trans_utr[t]: \n", + " pos = seq.find(utr)\n", + " print(pos, utr)\n", + " if (pos != -1):\n", + " seq = seq[:pos] + seq[pos + len(utr):]\n", + " if (t == transcript_id):\n", + " print(t, len(trans_utr[t]),trans_utr[t])\n", + "a = seq" + ] + }, + { + "cell_type": "code", + "execution_count": 23, "metadata": {}, "outputs": [], "source": [ - "a = '''TGTTGGCAGTACGACTGCCAGCAGGGGTGAGCTCTTCCTCTTCTGGAACCTCTATAAAGCTCCAATACTGTTGGCACTAGTGGCAGGAGAAGCTGCTGGTATCATGGAAAACATAAGTGACGATGTGATTGTTGGCCGATGCCTGGCCATTCTCAAAGGGATTTTTGGTAGCAGTGCAGTACCTCAGCCCAAAGAAACTGTGGTGTCTCGTTGGCGTGCTGATCCCTGGGCTCGGGGCTCTTATTCCTATGTTGCTGCAGGATCATCTGGAAATGACTATGATTTAATGGCTCAGCCAATCACTCCTGGCCCCTCGATTCCAGGTGCCCCACAGCCGATTCCACGACTCTTCTTTGCGGGAGAACATACGATCCGTAACTACCCAGCCACAGTGCATGGTGCTCTGCTGAGTGGGCTGCGAGAAGCGGGAAGAATTGCAGACCAGTTTTTGGGGGCCATGTATACGCTGCCTCGCCAGGCCACACCAGGTGTTCCTGCACAGCAGTCCCCAAGCATGTGAGACAGATGCATTCTAAGGGAAGAGGCCCATGTGCCTGTTTCTGCCATGTAAGGAAGGCTCTTCTAGCAATACTAGATCCCACTGAGAAAATCCACCCTGGCATCTGGGCTCCTGATCAGCTGATGGAGCTCCTGATTTGACAAAGGAGCTTGCCTCCTTTGAATGACCTAGAGCACAGGGAGGAACTTGTCCATTAGTTTGGAATTGTGTTCTTCGTAAAGACTGAGGCAAGCAAGTGCTGTGAAATAACATCATCTTAGTCCCTTGA'''" + "a = '''GGCGGCGGCGGCGGCGGCGGCGGCGGCGGCTGGGCTGTTTGTTCTGGTCTCCCGCAGCCGAGGAGCCGAAGCAGTGGCGGCGGCAGCGGCTGCGGCGGCTGCCGGCGGTGCCCGCGGGCGAGCGCGGCCTGTGAGCTCGGCAGAGCGGCGGGCGGGCCCCGGCGCCGCGCAGGCAGCTCGGGGAGGGGGCGGCGGCAGCGGGCGGACGGCCGGCGGGGGCGGCGTGCGGCCTAGCGTCTCAGAGTGCCTGGTGAAGAATGTGATGGGATCACTAGCATGTCTGCGGAGAGCGGCCCTGGGACGAGATTGAGAAATCTGCCAGTAATGGGGGATGGACTAGAAACTTCCCAAATGTCTACAACACAGGCCCAGGCCCAACCCCAGCCAGCCAACGCAGCCAGCACCAACCCCCCGCCCCCAGAGACCTCCAACCCTAACAAGCCCAAGAGGCAGACCAACCAACTGCAATACCTGCTCAGAGTGGTGCTCAAGACACTATGGAAACACCAGTTTGCATGGCCTTTCCAGCAGCCTGTGGATGCCGTCAAGCTGAACCTCCCTGATTACTATAAGATCATTAAAACGCCTATGGATATGGGAACAATAAAGAAGCGCTTGGAAAACAACTATTACTGGAATGCTCAGGAATGTATCCAGGACTTCAACACTATGTTTACAAATTGTTACATCTACAACAAGCCTGGAGATGACATAGTCTTAATGGCAGAAGCTCTGGAAAAGCTCTTCTTGCAAAAAATAAATGAGCTACCCACAGAAGAAACCGAGATCATGATAGTCCAGGCAAAAGGAAGAGGACGTGGGAGGAAAGAAACAGGGACAGCAAAACCTGGCGTTTCCACGGTACCAAACACAACTCAAGCATCGACTCCTCCGCAGACCCAGACCCCTCAGCCGAATCCTCCTCCTGTGCAGGCCACGCCTCACCCCTTCCCTGCCGTCACCCCGGACCTCATCGTCCAGACCCCTGTCATGACAGTGGTGCCTCCCCAGCCACTGCAGACGCCCCCGCCAGTGCCCCCCCAGCCACAACCCCCACCCGCTCCAGCTCCCCAGCCCGTACAGAGCCACCCACCCATCATCGCGGCCACCCCACAGCCTGTGAAGACAAAGAAGGGAGTGAAGAGGAAAGCAGACACCACCACCCCCACCACCATTGACCCCATTCACGAGCCACCCTCGCTGCCCCCGGAGCCCAAGACCACCAAGCTGGGCCAGCGGCGGGAGAGCAGCCGGCCTGTGAAACCTCCAAAGAAGGACGTGCCCGACTCTCAGCAGCACCCAGCACCAGAGAAGAGCAGCAAGGTCTCGGAGCAGCTCAAGTGCTGCAGCGGCATCCTCAAGGAGATGTTTGCCAAGAAGCACGCCGCCTACGCCTGGCCCTTCTACAAGCCTGTGGACGTGGAGGCACTGGGCCTACACGACTACTGTGACATCATCAAGCACCCCATGGACATGAGCACAATCAAGTCTAAACTGGAGGCCCGTGAGTACCGTGATGCTCAGGAGTTTGGTGCTGACGTCCGATTGATGTTCTCCAACTGCTATAAGTACAACCCTCCTGACCATGAGGTGGTGGCCATGGCCCGCAAGCTCCAGGATGTGTTCGAAATGCGCTTTGCCAAGATGCCGGACGAGCCTGAGGAGCCAGTGGTGGCCGTGTCCTCCCCGGCAGTGCCCCCTCCCACCAAGGTTGTGGCCCCGCCCTCATCCAGCGACAGCAGCAGCGATAGCTCCTCGGACAGTGACAGTTCGACTGATGACTCTGAGGAGGAGCGAGCCCAGCGGCTGGCTGAGCTCCAGGAGCAGCTCAAAGCCGTGCACGAGCAGCTTGCAGCCCTCTCTCAGCCCCAGCAGAACAAACCAAAGAAAAAGGAGAAAGACAAGAAGGAAAAGAAAAAAGAAAAGCACAAAAGGAAAGAGGAAGTGGAAGAGAATAAAAAAAGCAAAGCCAAGGAACCTCCTCCTAAAAAGACGAAGAAAAATAATAGCAGCAACAGCAATGTGAGCAAGAAGGAGCCAGCGCCCATGAAGAGCAAGCCCCCTCCCACGTATGAGTCGGAGGAAGAGGACAAGTGCAAGCCTATGTCCTATGAGGAGAAGCGGCAGCTCAGCTTGGACATCAACAAGCTCCCCGGCGAGAAGCTGGGCCGCGTGGTGCACATCATCCAGTCACGGGAGCCCTCCCTGAAGAATTCCAACCCCGACGAGATTGAAATCGACTTTGAGACCCTGAAGCCGTCCACACTGCGTGAGCTGGAGCGCTATGTCACCTCCTGTTTGCGGAAGAAAAGGAAACCTCAAGCTGAGAAAGTTGATGTGATTGCCGGCTCCTCCAAGATGAAGGGCTTCTCGTCCTCAGAGTCGGAGAGCTCCAGTGAGTCCAGCTCCTCTGACAGCGAAGACTCCGAAACAGGTCCTGCCTAATCATTGGACACGGACTCTTAATAAAACGGTCTTCAGTTCCAGATTCCTTCCCAGCAAGCTATAGCTTAAGTCCATTTTCTTCCGTGAAAGGGACAGGACTCCATCAAGTTATGGAATTCCTCAGAGCCCTGGGCCTGTCCCCCGGGGTGGATTAGTCATGTCCAGCAGCACACGCCTAGTCCCGCCTTCGGGAAGGCTGCCTGCCTGGCCAGCCGCCCAGGCCTCTCTGTGTAAAGACTGCCTGGCTGTCCTGCCCAGCCTTCCTGGTTCTCTGGGGTCCTCTGGGTGGGTGGCATCTCCTGGAGGGTGATGACAATCCCCAACACATGCATTCATGTGGTGCTACTCTGTGTGCAAAGCCAGACCCCAAGTATGTTTTCTCTCTTTGTCCCATCCCTCTTTTTCTGGGACTTTGGACCCTAACTACTTCCCTCCTGAACCTTGCAGTGACATCAGTCCAGGAGAGCTCTCGTTCAGTGTGCGGAAGAACACTCTGACCTCTAGAGCTGTCCTAGATAAGGAGTGGGAGCTTTAGAGGCAAGGCCTCTAGACCCTGGAAGGCTCAGTGAGGCTCTTCCCACAGCATGCTTCTCACTGGTGCCCTGTAAGGCTCGAGCCACCGCTGACTCTGAGCCTTTTGGAGTCTTTCCTCCTTCGTCTCCATTGTTCCCGTGCATTTCCAAAAGCTTAAGTTGCCTGGTGGGCATTTCCCCAGTTTCTTTGGCCTCCGTCTTCTCAAGTCACATAGGGAAAGTACCTCCTGGAACCAGGCTGCAGTATGCAGGACCTGCCAGGCAGGCACTGGTGAAGGGCCTTGGGCCTATCATCCCCCCAACCCCACCTCACCCCACCCGCCTCCTCTAGTGGGGTGAGTCTGGGCTGGTGGACCAGAGAGGGTGTCACAGACCCTCAGGGACTGCCCCATGGACACCTCTGACTGGTGTTAACAGTGTGAACATTTTCCCCGTCTTCAGTCCCTTAGAATGACGACAGCCCCTGGGGTTGGGGCAGGCGAGTGTGGCCACATCATCCAAGCCCTCCCAGAGACACAAATAGGCTTTTTTGCTCTAAAAATAAATACCAGCCCTTTTTTGGTCACAAATCCAGCATCTCAGCAGAAAACTGCCTGACATGAAAAGTCCCCTGAGGAACTGCATCTGCGTTTCAGGGGCTTTTCATTTTTTCTCCTTTTTTAAAGTGTAGATTGTGGGTGCTTCCTAGAGGCCTGCCTTCTTCTGGAACTGGAAGTGGGCTATCACCATGGGCAAGCCCTTGGGTGCAGGCTCCCCACCTGCCTGGGAACTCTGGCAGCTCTCCTCAGCTCCTTGGGCTTGAGCAGCTGCAACTGCCCCAGATTTGCTGTGGAAGCAGGGGCTAGCCCTGGCCTCACCAGGGCCTCCCGGGGCCCTGCATTGATGCTCAGGAGTTCCTGGGCTGCTCTTGATCCTTTCTGGGCATCCAGCTTCCAGTTAAGCTCTGTTTGCCAAACAAACTATTCTCAGCTGCCCTTTGGCCTGCGCCTGATGTGTTCCTGTTGCAGTCCCGCCTGCCTGAGACAGGAGCAGGCAGGAGAGCCTTCATGCCCAGATTCCCACAGGACAATTGGGGAGCTGCTGGCATTGTCTTTCTGGGAAGATTCTGCTTTCTTGGACCAAATGGCAGCCTGATTACCAGTGTCGGGCCTGCATGCTGCCCCCGACACACGCACGCACGCGCACACACGTGTGCACATGGGCCATAGCCACAAGCCAGCTCTCCTCCAGGGTCCTTTCAACCTCGCTGTCCAGGGACCCTGTCCTTCTTGCCCGTGGGGCTTCCATCTGGCAGAGAACGTTCAGGGCTTGTTGAACTTGAAAGCTCATTAGACTTAAGCTGTCACCTGTGCTTGGTGCCCCAGGAACAGCCAGAGAGGACAGTGCCCACTCACTTCTTGTTGGCAGCCTCCTGTGCAGGAAGTGCCAGCCGGGCCTCGACGCACCAGCTGGCTGTGGGTCCTGAGGAGGGGCGGGAGGCGGCCGCTCAGTGCAGATGGGGACTCCTCTCCTCTGCCCTGACCTTACCCTCCATTACCTCCTTCACTGGAGTGGGGCTGGGGGGTGGGTGGAATCAGTGTTTTAATCGGATTTTTAAAAAACATTTTATTTCTTTGTACAATTACCATCCTATGTAAAGATGAAATTTGTGTTGAGTTGAAGATTGTCATGGAATAAAGATCACACCGTA'''\n", + "\n", + "#a = '''GTACGCGACGGCGGTTGGCGGCGCGCGGGCAGCGTGAAGCGAGGCGAGGCAAGGCTTTTCGGACCCACGGAGCGACAGAGCGAGCGGCCCCTACGGCCGTCGGCGGCCCGGCGGCCCGAGATGTTATCTGGGAAGAAGGCGGCAGCCGCGGCGGCGGCGGCTGCAGCGGCAGCAACCGGGACGGAGGCTGGCCCTGGGACAGCAGGCGGCTCCGAGAACGGGTCTGAGGTGGCCGCGCAGCCCGCGGGCCTGTCGGGCCCAGCCGAGGTCGGGCCGGGGGCGGTGGGGGAGCGCACACCCCGCAAGAAAGAGCCTCCGCGGGCCTCGCCCCCCGGGGGCCTGGCGGAACCGCCGGGGTCCGCAGGGCCTCAGGCCGGCCCTACTGTCGTGCCTGGGTCTGCGACCCCCATGGAAACTGGAATAGCAGAGACTCCGGAGGGGCGTCGGACCAGCCGGCGCAAGCGGGCGAAGGTAGAGTACAGAGAGATGGATGAAAGCTTGGCCAACCTCTCAGAAGATGAGTATTATTCAGAAGAAGAGAGAAATGCCAAAGCAGAGAAGGAAAAGAAGCTTCCCCCACCACCCCCTCAAGCCCCACCTGAGGAAGAAAATGAAAGTGAGCCTGAAGAACCATCGGGGCAAGCAGGAGGACTTCAAGACGACAGTTCTGGAGGGTATGGAGACGGCCAAGCATCAGGTGTGGAGGGCGCAGCTTTCCAGAGCCGACTTCCTCATGACCGGATGACTTCTCAAGAAGCAGCCTGTTTTCCAGATATTATCAGTGGACCACAACAGACCCAGAAGGTTTTTCTTTTCATTAGAAACCGCACACTGCAGTTGTGGTTGGATAATCCAAAGATTCAGCTGACATTTGAGGCTACTCTCCAACAATTAGAAGCACCTTATAACAGTGATACTGTGCTTGTCCACCGAGTTCACAGTTATTTAGAGCGTCATGGTCTTATCAACTTCGGCATCTATAAGAGGATAAAACCCCTACCAACTAAAAAGACAGGAAAGGTAATTATTATAGGCTCTGGGGTCTCAGGCTTGGCAGCAGCTCGACAGTTACAAAGTTTTGGAATGGATGTCACACTTTTGGAAGCCAGGGATCGTGTGGGTGGACGAGTTGCCACATTTCGCAAAGGAAACTATGTAGCTGATCTTGGAGCCATGGTGGTAACAGGTCTTGGAGGGAATCCTATGGCTGTGGTCAGCAAACAAGTAAATATGGAACTGGCCAAGATCAAGCAAAAATGCCCACTTTATGAAGCCAACGGACAAGCTGTTCCTAAAGAGAAAGATGAAATGGTAGAGCAAGAGTTTAACCGGTTGCTAGAAGCTACATCTTACCTTAGTCATCAACTAGACTTCAATGTCCTCAATAATAAGCCTGTGTCCCTTGGCCAGGCATTGGAAGTTGTCATTCAGTTACAAGAGAAGCATGTCAAAGATGAGCAGATTGAACATTGGAAGAAGATAGTGAAAACTCAGGAAGAATTGAAAGAACTTCTTAATAAGATGGTAAATTTGAAAGAGAAAATTAAAGAACTCCATCAGCAATACAAAGAAGCATCTGAAGTAAAGCCACCCAGAGATATTACTGCCGAGTTCTTAGTGAAAAGCAAACACAGGGATCTGACCGCCCTATGCAAGGAATATGATGAATTAGCTGAAACACAAGGAAAGCTAGAAGAAAAACTTCAGGAGTTGGAAGCGAATCCCCCAAGTGATGTATATCTCTCATCAAGAGACAGACAAATACTTGATTGGCATTTTGCAAATCTTGAATTTGCTAATGCCACACCTCTCTCAACTCTCTCCCTTAAGCACTGGGATCAGGATGATGACTTTGAGTTCACTGGCAGCCACCTGACAGTAAGGAATGGCTACTCGTGTGTGCCTGTGGCTTTAGCAGAAGGCCTAGACATTAAACTGAATACAGCAGTGCGACAGGTTCGCTACACGGCTTCAGGATGTGAAGTGATAGCTGTGAATACCCGCTCCACGAGTCAAACCTTTATTTATAAATGCGACGCAGTTCTCTGTACCCTTCCCCTGGGTGTGCTGAAGCAGCAGCCACCAGCCGTTCAGTTTGTGCCACCTCTCCCTGAGTGGAAAACATCTGCAGTCCAAAGGATGGGATTTGGCAACCTTAACAAGGTGGTGTTGTGTTTTGATCGGGTGTTCTGGGATCCAAGTGTCAATTTGTTCGGGCATGTTGGCAGTACGACTGCCAGCAGGGGTGAGCTCTTCCTCTTCTGGAACCTCTATAAAGCTCCAATACTGTTGGCACTAGTGGCAGGAGAAGCTGCTGGTATCATGGAAAACATAAGTGACGATGTGATTGTTGGCCGATGCCTGGCCATTCTCAAAGGGATTTTTGGTAGCAGTGCAGTACCTCAGCCCAAAGAAACTGTGGTGTCTCGTTGGCGTGCTGATCCCTGGGCTCGGGGCTCTTATTCCTATGTTGCTGCAGGATCATCTGGAAATGACTATGATTTAATGGCTCAGCCAATCACTCCTGGCCCCTCGATTCCAGGTGCCCCACAGATTTTAGTCATTCCATCTTCGGACCCTTTCAGATAACCAAGAGCAGAGTTAAAAGGATGGGCAGCATTTCTGATTTCTCATAATTTGGTTTTGTAAATCTAGGCAGTCTACATTCAGAATGGAGGAGTCCAGAGTATAAGTCAAATAACATTTTTCCTTATTCAGGTTTTCTCCCTAAAAAACAAAACCATTTTTAATTGCACTTCCATTTTGTAATGGCTCATAGGATTGCTGTAAGGTCTCAATTACTAGGGGCTTCCAGGGCATTTCTGAGAAATAACCCTGGGTCCTTGTCTAGACCCTTATGCCAGACCCCACTCCAAAGAGCGGTAAGAATTCCTTAGTGTCATAGCCCAGACCTGCTGAGCTGCGAGGCTTAAGTGTCCCTGATCACCAAATGTCCTGTGCTTCAGGGTAGCGAGGCTCCCTCCCAGAAGGTAGTTATACATGAGGGAAGGACGCTTTACAACTGGGTATCTAAACTGATGAGAACACATGTTAAGCATCACTTTAGGACTGAGCCTAGGTAGAGTTTTATTGTCTCATTTCTACTTGTCAATTCTGGGAAAGTGCCTACTGATAAGGGAGACTCTTCGATAGAATGATGAATAGTAATTGGGGGGGTCAGCCTTTAAAAAGGTCAACAGCAATTTAAGTACTTAGCAATTTAAGTACAAGAATAAAGGTATATGTGCAGCCTGCCAATTTTCTCTTTTTCCCCTAAAATAGCCGATTCCACGACTCTTCTTTGCGGGAGAACATACGATCCGTAACTACCCAGCCACAGTGCATGGTGCTCTGCTGAGTGGGCTGCGAGAAGCGGGAAGAATTGCAGACCAGTTTTTGGGGGCCATGTATACGCTGCCTCGCCAGGCCACACCAGGTGTTCCTGCACAGCAGTCCCCAAGCATGTGAGACAGATGCATTCTAAGGGAAGAGGCCCATGTGCCTGTTTCTGCCATGTAAGGAAGGCTCTTCTAGCAATACTAGATCCCACTGAGAAAATCCACCCTGGCATCTGGGCTCCTGATCAGCTGATGGAGCTCCTGATTTGACAAAGGAGCTTGCCTCCTTTGAATGACCTAGAGCACAGGGAGGAACTTGTCCATTAGTTTGGAATTGTGTTCTTCGTAAAGACTGAGGCAAGCAAGTGCTGTGAAATAACATCATCTTAGTCCCTTGGTGTGTGGGGTTTTTGTTTTTTTTTTATATTTTGAGAATAAAACTTCATATAAAAA'''\n", + "\n", + "#a = '''GCTTGGCGCGTGCGTACGCGACGGCGGTTGGCGGCGCGCGGGCAGCGTGAAGCGAGGCGAGGCAAGGCTTTTCGGACCCACGGAGCGACAGAGCGAGCGGCCCCTACGGCCGTCGGCGGCCCGGCGGCCCGAGATGTTATCTGGGAAGAAGGCGGCAGCCGCGGCGGCGGCGGCTGCAGCGGCAGCAACCGGGACGGAGGCTGGCCCTGGGACAGCAGGCGGCTCCGAGAACGGGTCTGAGGTGGCCGCGCAGCCCGCGGGCCTGTCGGGCCCAGCCGAGGTCGGGCCGGGGGCGGTGGGGGAGCGCACACCCCGCAAGAAAGAGCCTCCGCGGGCCTCGCCCCCCGGGGGCCTGGCGGAACCGCCGGGGTCCGCAGGGCCTCAGGCCGGCCCTACTGTCGTGCCTGGGTCTGCGACCCCCATGGAAACTGGAATAGCAGAGACTCCGGAGGGGCGTCGGACCAGCCGGCGCAAGCGGGCGAAGGTAGAGTACAGAGAGATGGATGAAAGCTTGGCCAACCTCTCAGAAGATGAGTATTATTCAGAAGAAGAGAGAAATGCCAAAGCAGAGAAGGAAAAGAAGCTTCCCCCACCACCCCCTCAAGCCCCACCTGAGGAAGAAAATGAAAGTGAGCCTGAAGAACCATCGGGGCAAGCAGGAGGACTTCAAGACGACAGTTCTGGAGGGTATGGAGACGGCCAAGCATCAGGTGTGGAGGGCGCAGCTTTCCAGAGCCGACTTCCTCATGACCGGATGACTTCTCAAGAAGCAGCCTGTTTTCCAGATATTATCAGTGGACCACAACAGACCCAGAAGGTTTTTCTTTTCATTAGAAACCGCACACTGCAGTTGTGGTTGGATAATCCAAAGATTCAGCTGACATTTGAGGCTACTCTCCAACAATTAGAAGCACCTTATAACAGTGATACTGTGCTTGTCCACCGAGTTCACAGTTATTTAGAGCGTCATGGTCTTATCAACTTCGGCATCTATAAGAGGATAAAACCCCTACCAACTAAAAAGACAGGAAAGGTAATTATTATAGGCTCTGGGGTCTCAGGCTTGGCAGCAGCTCGACAGTTACAAAGTTTTGGAATGGATGTCACACTTTTGGAAGCCAGGGATCGTGTGGGTGGACGAGTTGCCACATTTCGCAAAGGAAACTATGTAGCTGATCTTGGAGCCATGGTGGTAACAGGTCTTGGAGGGAATCCTATGGCTGTGGTCAGCAAACAAGTAAATATGGAACTGGCCAAGATCAAGCAAAAATGCCCACTTTATGAAGCCAACGGACAAGCTGTTCCTAAAGAGAAAGATGAAATGGTAGAGCAAGAGTTTAACCGGTTGCTAGAAGCTACATCTTACCTTAGTCATCAACTAGACTTCAATGTCCTCAATAATAAGCCTGTGTCCCTTGGCCAGGCATTGGAAGTTGTCATTCAGTTACAAGAGAAGCATGTCAAAGATGAGCAGATTGAACATTGGAAGAAGATAGTGAAAACTCAGGAAGAATTGAAAGAACTTCTTAATAAGATGGTAAATTTGAAAGAGAAAATTAAAGAACTCCATCAGCAATACAAAGAAGCATCTGAAGTAAAGCCACCCAGAGATATTACTGCCGAGTTCTTAGTGAAAAGCAAACACAGGGATCTGACCGCCCTATGCAAGGAATATGATGAATTAGCTGAAACACAAGGAAAGCTAGAAGAAAAACTTCAGGAGTTGGAAGCGAATCCCCCAAGTGATGTATATCTCTCATCAAGAGACAGACAAATACTTGATTGGCATTTTGCAAATCTTGAATTTGCTAATGCCACACCTCTCTCAACTCTCTCCCTTAAGCACTGGGATCAGGATGATGACTTTGAGTTCACTGGCAGCCACCTGACAGTAAGGAATGGCTACTCGTGTGTGCCTGTGGCTTTAGCAGAAGGCCTAGACATTAAACTGAATACAGCAGTGCGACAGGTTCGCTACACGGCTTCAGGATGTGAAGTGATAGCTGTGAATACCCGCTCCACGAGTCAAACCTTTATTTATAAATGCGACGCAGTTCTCTGTACCCTTCCCCTGGGTGTGCTGAAGCAGCAGCCACCAGCCGTTCAGTTTGTGCCACCTCTCCCTGAGTGGAAAACATCTGCAGTCCAAAGGATGGGATTTGGCAACCTTAACAAGGTGGTGTTGTGTTTTGATCGGGTGTTCTGGGATCCAAGTGTCAATTTGTTCGGGCATGTTGGCAGTACGACTGCCAGCAGGGGTGAGCTCTTCCTCTTCTGGAACCTCTATAAAGCTCCAATACTGTTGGCACTAGTGGCAGGAGAAGCTGCTGGTATCATGGAAAACATAAGTGACGATGTGATTGTTGGCCGATGCCTGGCCATTCTCAAAGGGATTTTTGGTAGCAGTGCAGTACCTCAGCCCAAAGAAACTGTGGTGTCTCGTTGGCGTGCTGATCCCTGGGCTCGGGGCTCTTATTCCTATGTTGCTGCAGGATCATCTGGAAATGACTATGATTTAATGGCTCAGCCAATCACTCCTGGCCCCTCGATTCCAGGTGCCCCACAGCCGATTCCACGACTCTTCTTTGCGGGAGAACATACGATCCGTAACTACCCAGCCACAGTGCATGGTGCTCTGCTGAGTGGGCTGCGAGAAGCGGGAAGAATTGCAGACCAGTTTTTGGGGGCCATGTATACGCTGCCTCGCCAGGCCACACCAGGTGTTCCTGCACAGCAGTCCCCAAGCATGTGAGACAGATGCATTCTAAGGGAAGAGGCCCATGTGCCTGTTTCTGCCATGTAAGGAAGGCTCTTCTAGCAATACTAGATCCCACTGAGAAAATCCACCCTGGCATCTGGGCTCCTGATCAGCTGATGGAGCTCCTGATTTGACAAAGGAGCTTGCCTCCTTTGAATGACCTAGAGCACAGGGAGGAACTTGTCCATTAGTTTGGAATTGTGTTCTTCGTAAAGACTGAGGCAAGCAAGTGCTGTGAAATAACATCATCTTAGTCCCTTGGTGTGTGGGGTTTTTGTTTTTTTTTTATATTTTGAGAATAAAACTTCATATAAAA'''\n", + "a = '''GGGAAGTGTCTGAACAAAACAAGGAACAAAAATGAGTGGGTGGGATGAGATGGAAGAAACCCCAAAGAACCTAAATACGTCACACTTGAAGTGGCACAGTGGTATATGAGAAAGTGCCTGAGTGGGCATGGACATCGAAGAAGTGGGGGGACCTCTGGCTGGATGCCTGAACCACACCCTCACAGGGAGGTTTGGCAGATCCAAAGACAATAAAGGACCTTTCAGGTCAAAAAGAGGGGATGTAGGAGAATAAACAGTGCTAGAAATGTTTCAATCAGTTTGTCTCTGCATACATATAAATTATATACTTGTATCTTACATCTTAGGGGCTCTTCTTGGGCGTCACCCGTGCCTTGTGGCTGGGGCATGTACATACACAAGTGGACACACAGGCAGAGCAGCCACCTGTGGGCTTTCTTGGACAGGAATGTGTGTGTATGTGCATGGGGGGTGAGGATCCTATTTTGGGGGATGTAGACTTATATCTAAGAGTATCTGGTTAACCCTGAGCTTAAATGAAAGGAGGAGTTAGGTTGAGGCAGGCAAGGTGGGAGAAGTGGCCCAAGTCCTTTGGTGAGTGGGGGGACAGGATGGAGTAGGGGGGACAGGATGGAGTAGGGTGGAGGGGAAGAATACTGTCTGGAGTCTGGCCAGGGTTCTGCTAGAGCACACCCTCCACCTCAGCCAGGGTCCACAAGGATGGGTACCGGGCTCTGCGTCACAGCTTCAGCTTGGGGTGGTTGCTATGAGTCTGCGTGGCTCCCGCCCAGGGCAGACAGGGACAGGTCACAGGAGAGGGGCTAGGTAATCCCTGGCAGTAGTTCCTGTACAGAGGTGGTCTGGGGTCCAGGGGGTCCCCTGGGCCTAGCCTAGGCAACAGTTGGTTCACAAAGAAATGTCAGGGAGACGCCAGCATTAAAAAAAGAGAGATGTGTTTATTCCATGATCAGTACAGACCAAATGCATATTCACCGTATGAAAGTCAAACCAGTCAGTGACTCCAGAGTTTGGCCAACACTGAGGCACCAGCGTCGTGGTGTAGAGTGGGTTCTCATGGCACGCGTAACCTCACCAGGGGCTCCAATTATAAAAATTAAAAAAAAAAAAAAAAAAAAG'''\n", + "\n", + "a = '''GCTCAGTCCTCCAGGCGTCGGTACTCAGCGGTGTTGGAACTTCGTTGCTTGCTTGCCTGTGCGCGCGTGCGCGGACATGGCCTCAAACGATTATACCCAACAAGCAACCCAAAGCTATGGGGCCTACCCCACCCAGCCCGGGCAGGGCTATTCCCAGCAGAGCAGTCAGCCCTACGGACAGCAGAGTTACAGTGGTTATAGCCAGTCCACGGACACTTCAGGCTATGGCCAGAGCAGCTATTCTTCTTATGGCCAGAGCCAGAACACAGGCTATGGAACTCAGTCAACTCCCCAGGGATATGGCTCGACTGGCGGCTATGGCAGTAGCCAGAGCTCCCAATCGTCTTACGGGCAGCAGTCCTCCTACCCTGGCTATGGCCAGCAGCCAGCTCCCAGCAGCACCTCGGGAAGTTACGGTAGCAGTTCTCAGAGCAGCAGCTATGGGCAGCCCCAGAGTGGGAGCTACAGCCAGCAGCCTAGCTATGGTGGACAGCAGCAAAGCTATGGACAGCAGCAAAGCTATAATCCCCCTCAGGGCTATGGACAGCAGAACCAGTACAACAGCAGCAGTGGTGGTGGAGGTGGAGGTGGAGGTGGAGGTAACTATGGCCAAGATCAATCCTCCATGAGTAGTGGTGGTGGCAGTGGTGGCGGTTATGGCAATCAAGACCAGAGTGGTGGAGGTGGCAGCGGTGGCTATGGACAGCAGGACCGTGGAGGCCGCGGCAGGGGTGGCAGTGGTGGCGGCGGCGGCGGCGGCGGTGGTGGTTACAACCGCAGCAGTGGTGGCTATGAACCCAGAGGTCGTGGAGGTGGCCGTGGAGGCAGAGGTGGCATGGGGTAGGTGTCTCATGAGCCAGGGAGTATCTTTGGTGGGGAGTGTGGAGGATTGCATGAATCTCCCTGAAGCCAGTCCCTAGTGCATGGTTTAGTATTCTTGTTGTCTAGGGATCTGTGAGGGCTTTGATTTGGGGGCAGTGACTTTCTTTTTACATCCCCATTTTATTTTTGTGAGAACTTGGGAGCCTGAACTCCCATCCATACCACTGAATAGAGATTTTGAGTAATGATACTTGTTTCCAAAAAAAAAGAAACCATACATAGATACGTATGGATTGGAGTCATTAATATCCTAGGCAAGAAACATGGAAGTGAAGACTTCTTTCTCTGCAAGGGAAACCGATGATCCCACTCCTGGGAAATAGTAGGGAAACTTGGTATGTGTATTCCCATGTGTCCTCTAGGGAGTTGGTAATGGTTAACCTGACTTCAGCTTCCAGGAATTGGCTACTCTTCCCGTTTTCTATAGTCATTTGAATCCACGAGCTTGATTTGCACTAATTTGACCGACATTGATTTTGTGTGTGACTTGGTTTATGGGGCCAGCTGACTGAAGTAAGCAGACCTTTTGGGCAAAAATATGCTTTGACAGTGGTCTCCCACCTATTTGTTCCACTGTCTGCCTTCCCCTGGTTACTTAAAATTCATCAGCTTGTCCAACTGGACCTTCTTTCCTTCCTGCTGAAGTTGATTTGAAGTAAAACCTTAGATTTGATGTTAAAACAGTTGTCAAATCTGTTGGTAAATAAGATTTGAAGGACCCTACTCTGTCTCCCTTGAAAAAGGGGAGGAATGTCAGTGTTACTGTTTTTGGAAAAAGTAGATTTTTAAACCGAGTTTGGAAATGGTAAGTATGCAGAGGTGGGTGGGGGCAATCTCAAAAACGTGCAAAAATGAGGAAAACAAAAATGAGGAAATGTGTGCGTGTGTTTAATGCAAAACTTTAAAAAGAAAAACAACTGTTATGTGACTGTTAACTTGCTCTGCATTTTATGTGCCACAGGTATGAAAGGTGACATTGCAAAATACTCCGCTCTTCTCGCAGTGTAGAAGGGGTGACCCCGGGGGTTGGGGGAGATCAAAAACAGCTCAGTAGTTAGGACAGAGCTTAGCTAAGTTTGTCTTGCTTTAAGGGGAAGTTGCCTTTGGTTTTGACTTTTTATGGAATGGGGTTGGGTCTGCTTGCTGCTTTCAAAGCAAAAACCACAAAAATGTGTTCAAGGCTACCCCAGCCTGGTGTGAAATGTCTTCTGGGTAAATTGGGGTAGGGTTTTTAAACCAACTACTTGGTTGTCAACCACTTGCGACAAGAGGAAAAAAAAACATCTGCTCCATCGGAAGAACGACCAAGGAAAATGGGTTATTTTTTTTCCAGAGGAAATAGATAACGTAACCTTTTAAAGCAAAATCTTTATAAACTGTGTCTGAGAAATTGCACACGTGTGTGTGACATGCTCAAAGGTCAGACAAGGGGTGGTCAGGAAGGGATGTATTTTAGTAGCCACTTGTATCTTTTTCCAAAAACACCTACCCATGTTTGGGGAATGTTAAACAAAATCAAAAAACAACCTTTTGTAGCCGTTGGAAGCTTCATGTCCTTTCTTCTAACTTGTCTTCTCCAGCGGAAGTGACCGTGGTGGCTTCAATAAATTTGGTGGTAAGTGAACAGAGTTTCCAAAATTCCCAACTCCCAGCAATGCTTTGTCTGATTGTTCATTTGCAGATGTCTTAGCGTGTTAATTTAAATGTCAAAGGTTTTGAGGTGTCCAGAACCACCTCCAGAAAGGGGTAGGGTAGAATGCCACCTGTTGCCTGGTGTGTGCTAACCTGGAGCAGGTAGGGGTAAGACTCAATAGTCATCTTTTACCAAATGGGTTTGCCCCAGGTTAATAAGAGGGGTCTAGTAGGCCTTGGACTGGGCCGTTGCCACACCTGGCACTTAGTGACCATCATCATGAGAAACTGGAGAGTGCGTGCTGGAACACGTGGTGCCATCTTGGCTTTAGGATCCTTTTGATCGTTGTGTCCAAGGCTTGTGTGTGTGTGAGTGTGTGGGAGACAACTCCGAATGTTTAATTCTGGAAGAGGGATGTAACATTGCCCTGAGGATGGTGAAGTTGGTATACATTTATAAAGTACGGAATGGTGTCAATGAATGCAATTCTATGTATATGGACTTAACTGAGATGGGCAAATAGAAACTAGCTCTGGGAAGGAACATGTGCACTACTTCAAGAAAGATTGGAAGCATGTGTGGCTCATGGGAAATAACCAGGTCTTAAACAGCACAAACTGAATTCGTGGACCAGGAAGGTCTTAAACAGCACAAACTGAATTCATGGAAAAATGACAAATTTGAGAAGTCTCCCAGTAAGCTGGAACTTTTCTGGTTTGGTTAACAAAAGGTTTCTTGATTTGTTTCAAGATTTAAAGCCAAAGGTGTGGGTTCATGACTTAGGTGTCATTGCGTGTGGGTACAATATTTATATATGGCGAATTCAGATAAACATTGGTCAAAGATGGTCTCTGGAAAAACAAAATAGAGGCTGCATTACGGAAATAAGATTTCTGGTCTGTTCCCTGGGACATGCTTAAAAAATACAATAGCTATTATGTATGGTTTTTATTTTCATGTGGTTTCGGGGAAACAACACGGTTTTAAGGATGGTTTCTAAAGATGAAATTAAAAATTGTTCCACAAGGGTTAAGTGTCTGGTGGTAAAGTTGGGAGAAACTGGATGGATGCACATCGCATGGCTGGTGGCGAGCCCATCTCTCTTCTCTCGGGTGAGAGAACCGGGCCAAGCTGAGTTGGTTTGTTCACTTTAATGGGTCTCCGTTTCCCCTGCCACCTGTGCTGAGGACATTTCCCAGCCTGAGCTGGGGGAGGCAGCATTTGCTGAAGTGTGGAGTTGTCTCTGTGGAGACTCAAGTTACAGATCTTAAGGGGCCTGCCTAGAATTTTCTCCTCTGGGCAGGCGACCCAGGAAAGGGTTTGGAGTGAGGCTGTGAGCACTTACTTGATATTTTACAAGTTTGGATTTGGTGTTAATTTTTTTCCTTGTCCGTTTTTTCCTGTTGACTAACGGCTCATCTTTTCCTTGTTTTTGTTTTTTTTTTGTTCTTTTTTTCCATGTCACTAAAGGCCCTCGGGACCAAGGATCACGTCATGACTCCGAACAGGATAATTCAGACAACAACACCATCTTTGTGCAAGGCCTGGGTGAGAATGTTACAATTGAGTCTGTGGCTGATTACTTCAAGCAGATTGGTATTATTAAGACAAACAAGAAAACGGGACAGCCCATGATTAATTTGTACACAGACAGGGAAACTGGCAAGCTGAAGGGAGAGGCAACGGTCTCTTTTGATGACCCACCTTCAGCTAAAGCAGCTATTGACTGGTTTGATGGTAAAGAATTCTCCGGAAATCCTATCAAGGTCTCATTTGCTACTCGCCGGGCAGACTTTAATCGGGGTGGTGGCAATGGTCGTGGAGGCCGAGGGCGAGGAGGACCCATGGGCCGTGGAGGCTATGGAGGTGGTGGCAGTGGTGGTGGTGGCCGAGGAGGATTTCCCAGTGGAGGTGGTGGCGGTGGAGGACAGCAGCGAGCTGGTGACTGGAAGTGTCCTAATCCCACCTGTGAGAATATGAACTTCTCTTGGAGGAATGAATGCAACCAGTGTAAGGCCCCTAAACCAGATGGCCCAGGAGGGGGACCAGGTGGCTCTCACATGGGGGGTAACTACGGGGATGATCGTCGTGGTGGCAGAGGAGGCTATGATCGAGGCGGCTACCGGGGCCGCGGCGGGGACCGTGGAGGCTTCCGAGGGGGCCGGGGTGGTGGGGACAGAGGTGGCTTTGGCCCTGGCAAGATGGATTCCAGGGGTGAGCACAGACAGGATCGCAGGGAGAGGCCGTATTAATTAGCCTGGCTCCCCAGGTTCTGGAACAGCTTTTTGTCCTGTACCCAGTGTTACCCTCGTTATTTTGTAACCTTCCAATTCCTGATCACCCAAGGGTTTTTTTGTGTCGGACTATGTAATTGTAACTATACCTCTGGTTCCCATTAAAAGTGACCATTTTAGTTAAA'''\n", + "\n", + "a = '''GGTGACAACGGCAACATGGCCCTGAACGGAGCTGGTGAGGACCTGGGCGGCAGGGGTTTGTGGCTGTGAGGTACGGGAGGCAGCCCACTCCGGCAAGACCCCCAGTCCCTATGCCTCTCTTCCCCAGAAGTCGACGACTTCTCCTGGGAGCCCCCGACTGAGGCGGAGACGAAGACGATCCTCCTCCAAGACAAACAGCGGAAAATCTACTGCGTGGCTTGTCAGGAACTCGACTCAGACGTGGATAAAGATAATCCCGCTCTGAATGCCCAGGCTGCCCTCTCCCAAGCTCGGGAGCACCAGCTGGCCTCAGCCTCAGAGCTCCCCCTGGGCTCTCGACCTGCGCCCCAGCCCCCAGTACCTCGTCCGGAGCACTGTGAGGGAGCTGCAGCAGGACTCAAGGCAGCCCAGGGGCCACCTGCTCCTGCTGTGCCTCCAAATACAGATGTCATGGCCTGCACACAGACAGCCCTCTTGCAGAAGCTGACCTGGGCCTCTGCTGAACTGGGCTCTAGCACCTCCCTGGAGACTAGCATCCAGCTGTGTGGCCTTATCCGCGCATGTGCGGAGGCCCTGCGCAGCCTGCAGCAGCTACAGCACTAAGAGAAGCCCCTGAGAAAAACCCTCTAGAAAAA'''" ] }, { "cell_type": "code", - "execution_count": 291, + "execution_count": 24, "metadata": { "scrolled": true }, @@ -427,9 +649,9 @@ "name": "stdout", "output_type": "stream", "text": [ - "CWQYDCQQG_ALPLLEPL_SSNTVGTSGRRSCWYHGKHK_RCDCWPM+PGHSQRDFW_QCSTSAQRNCGVSLAC_SLGSGLLFLCCCRIIWK_L_FNGSANHSWPLDSRCPTADSTTLLCGRTYDP_LPSHSAWCSAEWAARSGKNCRPVFGGHVYAASPGHTRCSCTAVPKHVRQM+HSKGRGPCACFCHVRKALLAILDPTEKIHPGIWAPDQLM+ELLI_QRSLPPLNDLEHREELVH_FGIVFFVKTEASKCCEITSS_SL\n", - "VGSTTASRGELFLFWNLYKAPILLALVAGEAAGIM+ENISDDVIVGRCLAILKGIFGSSAVPQPKETVVSRWRADPWARGSYSYVAAGSSGNDYDLM+AQPITPGPSIPGAPQPIPRLFFAGEHTIRNYPATVHGALLSGLREAGRIADQFLGAM+YTLPRQATPGVPAQQSPSM+_DRCILREEAHVPVSAM+_GRLF_QY_IPLRKSTLASGLLIS_WSS_FDKGACLL_M+T_STGRNLSISLELCSS_RLRQASAVK_HHLSPL\n", - "LAVRLPAGVSSSSSGTSIKLQYCWH_WQEKLLVSWKT_VTM+_LLADAWPFSKGFLVAVQYLSPKKLWCLVGVLIPGLGALIPM+LLQDHLEM+TM+I_WLSQSLLAPRFQVPHSRFHDSSLRENIRSVTTQPQCM+VLC_VGCEKREELQTSFWGPCIRCLARPHQVFLHSSPQACETDAF_GKRPM+CLFLPCKEGSSSNTRSH_ENPPWHLGS_SADGAPDLTKELASFE_PRAQGGTCPLVWNCVLRKD_GKQVL_NNIILVP_\n" + "GDNGNM+ALNGAGEDLGGRGLWL_GTGGSPLRQDPQSLCLSSPEVDDFSWEPPTEAETKTILLQDKQRKIYCVACQELDSDVDKDNPALNAQAALSQAREHQLASASELPLGSRPAPQPPVPRPEHCEGAAAGLKAAQGPPAPAVPPNTDVM+ACTQTALLQKLTWASAELGSSTSLETSIQLCGLIRACAEALRSLQQLQH_EKPLRKTL_K\n", + "VTTATWP_TELVRTWAAGVCGCEVREAAHSGKTPSPYASLPQKSTTSPGSPRLRRRRRRSSSKTNSGKSTAWLVRNSTQTWIKIIPL_M+PRLPSPKLGSTSWPQPQSSPWALDLRPSPQYLVRSTVRELQQDSRQPRGHLLLLCLQIQM+SWPAHRQPSCRS_PGPLLNWALAPPWRLASSCVALSAHVRRPCAACSSYSTKRSP_EKPSRK\n", + "_QRQHGPERSW_GPGRQGFVAVRYGRQPTPARPPVPM+PLFPRSRRLLLGAPD_GGDEDDPPPRQTAENLLRGLSGTRLRRG_R_SRSECPGCPLPSSGAPAGLSLRAPPGLSTCAPAPSTSSGAL_GSCSRTQGSPGATCSCCASKYRCHGLHTDSPLAEADLGLC_TGL_HLPGD_HPAVWPYPRM+CGGPAQPAAATALREAPEKNPLEK\n" ] } ], @@ -478,28 +700,102 @@ }, { "cell_type": "code", - "execution_count": 217, + "execution_count": 184, "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'IPR000504', 'IPR012677'}\n" - ] + "data": { + "text/plain": [ + "4437" + ] + }, + "execution_count": 184, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "M+SAESGPGTRLRNLPVM+GDGLETSQM+STTQAQAQPQPANAASTNPPPPETSNPNKPKRQTNQLQYLLRVVLKTLWKHQFAWPFQQPVDAVKLNLPDYYKIIKTPM+DM+GTIKKRLENNYYWNAQECIQDFNTM+FTNCYIYNKPGDDIVLM+AEALEKLFLQKINELPTEETEIM+IVQAKGRGRGRKETGTAKPGVSTVPNTTQASTPPQTQTPQPNPPPVQATPHPFPAVTPDLIVQTPVM+TVVP\n", + "a.find(\"\n", + " \n", + " \")" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ "\n", "\n", "domains = dict()\n", "\n", - "file = open(\"/home/annaldas/projects/result/snrnp70/snrnp70.gff3\", \"r\")\n", + "file = open(\"/home/annaldas/projects/result/ZNRD2/ZNRD2_blastx.gff3\", \"r\")\n", "lines = file.readlines()\n", "file.close()\n", "\n", "\n", + "for line in lines:\n", + " if (line.startswith(\">\")):\n", + " break\n", + " \n", + " if (not line.startswith(\"#\")):\n", + " data = line.split(\"\\t\")\n", + " seqid,source,attr = data[0],data[1],data[8]\n", + " if (seqid not in domains): \n", + " domains[seqid] = set()\n", + " if (source != \".\" and \"Dbxref\" in attr):\n", + " Dbxref = attr.split(\";\")[-1]\n", + " IPR = Dbxref.split(\"=\")[-1][10:-2]\n", + " if (IPR != []):\n", + " domains[seqid].add(IPR)\n", + "\n", + "try:\n", + " key,value = domains.popitem()\n", + " \n", + " domains[key] = value\n", + " common_domains = value\n", + " for transcript in domains:\n", + " curr = domains[transcript]\n", + " common_domains = common_domains.intersection(curr)\n", + "except:\n", + " common_domains = []\n", + "\n", + " \n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'TCONS_00010063|ENST00000533115': {'IPR039499'},\n", + " 'TCONS_00010061|TCONS_00010060': {'IPR009563'},\n", + " 'TCONS_00010064|ENST00000533115': {'IPR009563'},\n", + " 'TCONS_00011857|ENST00000531405': {'IPR009563'},\n", + " 'TCONS_00010062|ENST00000533115': {'IPR009563'},\n", + " 'TCONS_00010060|ENST00000533115': {'IPR009563'}}" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "domains" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ "for line in lines:\n", " if (line.startswith(\">\")):\n", " break\n", @@ -519,8 +815,15 @@ "common_domains = value\n", "for transcript in domains:\n", " curr = domains[transcript]\n", - " common_domains.intersection(curr)\n", - "\n", + " common_domains.intersection(curr)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ "file = open(\"/home/annaldas/projects/result/snrnp70/snrnp70_protein_analysis.txt\",\"w+\")\n", "file.write(\"Gene:\\t%s\\n\" %(\"SNRNP30\"))\n", "file.write(\"Common Domains:\\t\")\n", @@ -530,24 +833,8 @@ " specific_domains = domains[transcript].difference(common_domains)\n", " file.write(\"%s: %s \\n\" %(transcript,\",\".join(list(specific_domains))))\n", "file.close()\n", - "print(common_domains)\n", - " \n", - " " + "print(common_domains)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/translation_protein.py b/translation_protein.py index d59213b..79c4fdb 100644 --- a/translation_protein.py +++ b/translation_protein.py @@ -1,6 +1,6 @@ from Bio import SeqIO -def translate(seq): +def translate(seq,orf): seq = seq.upper() seq = seq.replace("\n","") @@ -26,7 +26,7 @@ def translate(seq): protein = "" exon = False translating = True - i = 0 + i = orf while (translating): codon = seq[i:i+3] @@ -45,7 +45,7 @@ def translate(seq): protein += table[codon] i += 3 else: - i += 1 + i += 3 return protein @@ -56,8 +56,20 @@ def translate(seq): output = [] for transcript in transcripts: - output.append(">" + str(transcripts[transcript].id)) - output.append(translate(str(transcripts[transcript].seq))) + protein = translate(str(transcripts[transcript].seq),0) + if (protein != ""): + output.append(">" + str(transcripts[transcript].id) + "_1") + output.append(protein) + + protein = translate(str(transcripts[transcript].seq),1) + if (protein != ""): + output.append(">" + str(transcripts[transcript].id) + "_2") + output.append(protein) + + protein = translate(str(transcripts[transcript].seq),2) + if (protein != ""): + output.append(">" + str(transcripts[transcript].id) + "_3") + output.append(protein) output_filename = snakemake.output[0] output_file = open(output_filename,"w+")