Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
isoform_differentiation/scratchwork.ipynb
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
553 lines (553 sloc)
14.9 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import pandas as pd\n", | |
"import numpy as np\n", | |
"from Bio import SeqIO" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Import the nanopore annotation file\n", | |
"\n", | |
"annotation_filename = \"/home/annaldas/projects/nanopore-transcriptome-analysis/Results/GffCompare/nanopore.combined.gtf\"\n", | |
"annotate_df = pd.read_csv(annotation_filename,sep = \"\\t\", header = None)\n", | |
"annotate_df = annotate_df[annotate_df[2] != \"exon\"]\n", | |
"annotate_lines = list(annotate_df[8])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Mapping gene name to oID\n", | |
"# Mapping oID to transcript id\n", | |
"# Mapping transcript id to exons\n", | |
"\n", | |
"ABI2_info = []\n", | |
"\n", | |
"gene_oID = dict()\n", | |
"oID_tID = dict()\n", | |
"#tID_exon = dict()\n", | |
"\n", | |
"for ann in annotate_lines: \n", | |
" if \"gene_name\" in ann:\n", | |
" line = ann.split(\";\")\n", | |
" tID = line[0].split(\" \")[-1][1:-1]\n", | |
" gene = line[2].split(\" \")[-1][1:-1]\n", | |
" oID = line[3].split(\" \")[-1][1:-1]\n", | |
" \n", | |
" if (gene not in gene_oID): gene_oID[gene] = [oID]\n", | |
" else: gene_oID[gene].append(oID)\n", | |
" \n", | |
" if (oID not in oID_tID): oID_tID[oID] = tID\n", | |
" " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Import transcript isoform sequences\n", | |
"\n", | |
"transcripts_filename = \"/project/Neurodifferentiation_System/NanoporeResults/Results/Pinfish/polished_transcripts.fas\"\n", | |
"transcripts = SeqIO.index(transcripts_filename, \"fasta\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"filename = \"/home/annaldas/projects/nanopore-transcriptome-analysis/Results/Quantification/all_counts.txt\"\n", | |
"df = pd.read_csv(filename)\n", | |
"group = df.groupby([\"gene_name\"])\n", | |
"for gene,item in group:\n", | |
" if(gene == \"BRD4\"):\n", | |
" print(item.sort_values(\"transcript_id\"))\n", | |
" break" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"l=[]\n", | |
"for fn in [\"/home/annaldas/projects/nanopore-transcriptome-analysis/Direct/Quantification/OJ32.directed.counts\"]:\n", | |
" df=pd.read_csv(fn,delimiter='\\t',usecols=[0,1],index_col=[1])\n", | |
" df['counts']=df['counts']*1000000/df['counts'].sum()\n", | |
" df=df.rename(columns={'counts':fn.split('.counts')[0].split('/')[-1]})\n", | |
" l.append(df)\n", | |
"df=l[0]\n", | |
"for i in range(1,len(l)):\n", | |
" df=df.join(l[i],how='outer') \n", | |
"df=df.fillna(0)\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"\n", | |
"\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"annotation_filename = \"/project/owlmayerTemporary/Sid/nanopore-analysis/Results_5_1/GffCompare/nanopore.combined_filt.gtf\"\n", | |
"annotate_df = pd.read_csv(annotation_filename,sep = \"\\t\", header = None)\n", | |
"annotate_df = annotate_df[annotate_df[2] != \"exon\"]\n", | |
"annotate_lines = list(annotate_df[8])\n", | |
"chrms = list(annotate_df[0])\n", | |
"start = list(annotate_df[3])\n", | |
"stop = list(annotate_df[4])\n", | |
"\n", | |
"gene_oID = dict()\n", | |
"oID_gene = dict()\n", | |
"oID_tID = dict()\n", | |
"tID_oID = dict()\n", | |
"\n", | |
"#tID_exon = dict()\n", | |
"\n", | |
"for ann in range(len(annotate_lines)): \n", | |
" if \"gene_name\" in annotate_lines[ann]:\n", | |
" line = annotate_lines[ann].split(\";\")\n", | |
" tID = line[0].split(\" \")[-1][1:-1]\n", | |
" gene = line[2].split(\" \")[-1][1:-1]\n", | |
" oID = line[3].split(\" \")[-1][1:-1]\n", | |
" transID = line[4].split(\" \")[-1][1:-1].split(\".\")[0]\n", | |
" \n", | |
" if (gene not in gene_oID): gene_oID[gene] = [oID]\n", | |
" else: gene_oID[gene].append(oID)\n", | |
" \n", | |
" if (oID not in oID_tID): oID_tID[oID] = tID\n", | |
" if (tID not in tID_oID): \n", | |
" tID_oID[tID] = oID\n", | |
" else:\n", | |
" print(\"this sucks\")\n", | |
" if (oID not in gene_oID): oID_gene[oID] = gene" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"transcripts_filename = \"/project/owlmayerTemporary/Sid/nanopore-analysis/Results_5_1/Pinfish/corrected_transcriptome_polished_collapsed.fas\"\n", | |
"transcripts = SeqIO.index(transcripts_filename, \"fasta\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"output = []\n", | |
"reads = dict()\n", | |
"\n", | |
"for gene in gene_oID:\n", | |
" for oID in gene_oID[gene]:\n", | |
" #reads[transcripts[oID].id] = \"\"\n", | |
" print(oID)\n", | |
" break\n", | |
" break\n", | |
" tID = \">\" + oID_tID[transcripts[oID].id]\n", | |
" output.append(tID)\n", | |
" seq = str(transcripts[oID].seq)\n", | |
" output.append(seq)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 29, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"'TCONS_00003260'" | |
] | |
}, | |
"execution_count": 29, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"oID_tID[\"47acab47-d123-4db4-a724-04c6cc495805|9\"]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 49, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# bam -> sam : samtools view -h -o out.sam day3_2.genome.bam\n", | |
"# sam -> bam: samtools view -bS test.sam > test.bam\n", | |
"# samtools index test.bam\n", | |
"output = []\n", | |
"t_r = dict()\n", | |
"count1 = 0\n", | |
"count2 = 0" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 55, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"filename = \"/project/owlmayerTemporary/Sid/nanopore-analysis/IGV/out.sam\"\n", | |
"file = open(filename,\"r\")\n", | |
"lines = file.readlines()\n", | |
"file.close()\n", | |
"\n", | |
"\n", | |
"for line in lines:\n", | |
" if (not line.startswith(\"@\")): #and (\"chr\" not in line.split(\"\\t\")[2]) and (\"GL\" not in line.split(\"\\t\")[2])):\n", | |
" t = line.split(\"\\t\")[0]\n", | |
" r = line.split(\"\\t\")[2]\n", | |
" try:\n", | |
" t_r[t] = oID_tID[r]\n", | |
" count1 += 1\n", | |
" except:\n", | |
" count2 += 0\n", | |
" \n", | |
" #new_line = line.strip() + \"\\tzz:\" + read\n", | |
" # output.append(new_line)\n", | |
" #else:\n", | |
" # output.append(line.strip())\n", | |
"\n", | |
"#filename = \"/project/owlmayerTemporary/Sid/nanopore-analysis/IGV/out_tag.sam\"\n", | |
"#file = open(filename,\"w+\")\n", | |
"#lines = file.readlines()\n", | |
"#file.close()\n", | |
" \n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 22, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 62, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"filename = \"/project/owlmayerTemporary/Sid/nanopore-analysis/IGV/out.sam\"\n", | |
"file = open(filename,\"r\")\n", | |
"genome = file.readlines()\n", | |
"file.close()\n", | |
"\n", | |
"output = []\n", | |
"count = 0\n", | |
"for line in genome:\n", | |
" if (not line.startswith(\"@\")):\n", | |
" t = line.split(\"\\t\")[0]\n", | |
" try: \n", | |
" new_line = line.strip() + \"\\tzz:Z:\" + t_r[t]\n", | |
" output.append(new_line)\n", | |
" except:\n", | |
" output.append(line.strip())\n", | |
" else:\n", | |
" output.append(line.strip())\n", | |
" \n", | |
"filename = \"/project/owlmayerTemporary/Sid/nanopore-analysis/IGV/out.sam\"\n", | |
"file = open(filename,\"w+\")\n", | |
"file.write(\"\\n\".join(output))\n", | |
"file.close()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 42, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"output = []\n", | |
"t_r = dict()\n", | |
"count1 = 0\n", | |
"count2 = 0\n", | |
"for line in lines:\n", | |
" if (not line.startswith(\"@\")): #and (\"chr\" not in line.split(\"\\t\")[2]) and (\"GL\" not in line.split(\"\\t\")[2])):\n", | |
" t = line.split(\"\\t\")[0]\n", | |
" r = line.split(\"\\t\")[2]\n", | |
" try:\n", | |
" t_r[t] = oID_tID[r]\n", | |
" count1 += 1\n", | |
" except:\n", | |
" count2 += 1\n", | |
" \n", | |
" #new_line = line.strip() + \"\\tzz:\" + read\n", | |
" # output.append(new_line)\n", | |
" #else:\n", | |
" # output.append(line.strip())\n", | |
"\n", | |
"#filename = \"/project/owlmayerTemporary/Sid/nanopore-analysis/IGV/out_tag.sam\"\n", | |
"#file = open(filename,\"w+\")\n", | |
"#lines = file.readlines()\n", | |
"#file.close()\n", | |
" \n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 40, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"7ed9c5b9-665b-44d6-baad-ac3e275323dd\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"'TCONS_00000001'" | |
] | |
}, | |
"execution_count": 40, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"oID_tID[\"cc2e8b66-6da7-4245-9544-c06d33b50252|3\"]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"oID_geneID = dict()\n", | |
"geneID_oID = dict()\n", | |
"\n", | |
"filename = \"/project/owlmayerTemporary/Sid/nanopore-analysis/Results_5_1/Pinfish/raw_transcripts.gff\"\n", | |
"gff = open(filename,\"r\")\n", | |
"lines = gff.readlines()\n", | |
"gff.close()\n", | |
"for line in lines:\n", | |
" if (not line.startswith(\"##\")):\n", | |
" chrm = line.split(\"\\t\")[0]\n", | |
" start = line.split(\"\\t\")[3]\n", | |
" stop = line.split(\"\\t\")[4]\n", | |
" s = line.split(\"\\t\")[-1]\n", | |
" g_t = s.split(\";\")\n", | |
" if (len(g_t) > 2):\n", | |
" geneid = g_t[0].split('\"')[1]\n", | |
" transcriptid = g_t[1].split('\"')[1]\n", | |
" transcriptid = transcriptid.split(\"|\")[0]\n", | |
" oID_geneID[transcriptid] = [chrm,start,stop] #geneid\n", | |
" #if geneid not in geneID_oID:\n", | |
" # geneID_oID[geneid] = transcriptid\n", | |
" #else:\n", | |
" # geneID_oID[geneid]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"for oID in gene_oID[\"EBF2\"]:\n", | |
" try:\n", | |
" print(oID,oID_geneID[oID.split(\"|\")[0]])\n", | |
" except:\n", | |
" print(oID)\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"read_cluster = dict()\n", | |
"cluster_read = dict()\n", | |
"\n", | |
"\n", | |
"filename = \"/project/owlmayerTemporary/Sid/nanopore-analysis/Results_5_1/Pinfish/cluster_memberships.tsv\"\n", | |
"gff = open(filename,\"r\")\n", | |
"lines = gff.readlines()\n", | |
"gff.close()\n", | |
"for line in lines:\n", | |
" read = line.strip().split(\"\\t\")[0]\n", | |
" cluster = line.strip().split(\"\\t\")[1]\n", | |
" \n", | |
" read_cluster[read] = cluster\n", | |
" \n", | |
" if cluster not in cluster_read:\n", | |
" cluster_read[cluster] = [read]\n", | |
" else:\n", | |
" cluster_read[cluster].append(read)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"read_cluster[\"47acab47-d123-4db4-a724-04c6cc495805\"]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"scrolled": false | |
}, | |
"outputs": [], | |
"source": [ | |
"cluster_read[\"8938ec8c-85dc-4bc8-a107-13d5281ba933\"]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"oID_geneID[\"142e462f-f586-42fc-97b9-b2e3bfa1fd0d9\"]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"geneID_oID[\"142e462f-f586-42fc-97b9-b2e3bfa1fd0d\"]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"filename = \"/project/owlmayerTemporary/Sid/nanopore-analysis/Results_5_1/Pinfish/clustered_pol_transcripts.gff\"\n", | |
"gff = open(filename,\"r\")\n", | |
"lines = gff.readlines()\n", | |
"gff.close()\n", | |
"count = 0\n", | |
"for line in lines:\n", | |
" if (not line.startswith(\"##\")):\n", | |
" chrm = line.split(\"\\t\")[0]\n", | |
" start = line.split(\"\\t\")[3]\n", | |
" stop = line.split(\"\\t\")[4]\n", | |
" s = line.split(\"\\t\")[-1]\n", | |
" g_t = s.split(\";\")\n", | |
" if (len(g_t) > 2) and (chrm == \"chr1\") and (int(start) >= 10000) and (int(stop) <= 100000):\n", | |
" print(g_t,chrm,start,stop)\n", | |
" count += 1\n", | |
" if (count > 1000):\n", | |
" break\n", | |
" #if (len(g_t) > 2):\n", | |
" # geneid = g_t[0].split('\"')[1]\n", | |
" # transcriptid = g_t[1].split('\"')[1]\n", | |
" # transcriptid = transcriptid.split(\"|\")[0]\n", | |
" # if ()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import misopy" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.4" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |