Skip to content
Permalink
master
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"from Bio import SeqIO"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Import the nanopore annotation file\n",
"\n",
"annotation_filename = \"/home/annaldas/projects/nanopore-transcriptome-analysis/Results/GffCompare/nanopore.combined.gtf\"\n",
"annotate_df = pd.read_csv(annotation_filename,sep = \"\\t\", header = None)\n",
"annotate_df = annotate_df[annotate_df[2] != \"exon\"]\n",
"annotate_lines = list(annotate_df[8])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Mapping gene name to oID\n",
"# Mapping oID to transcript id\n",
"# Mapping transcript id to exons\n",
"\n",
"ABI2_info = []\n",
"\n",
"gene_oID = dict()\n",
"oID_tID = dict()\n",
"#tID_exon = dict()\n",
"\n",
"for ann in annotate_lines: \n",
" if \"gene_name\" in ann:\n",
" line = ann.split(\";\")\n",
" tID = line[0].split(\" \")[-1][1:-1]\n",
" gene = line[2].split(\" \")[-1][1:-1]\n",
" oID = line[3].split(\" \")[-1][1:-1]\n",
" \n",
" if (gene not in gene_oID): gene_oID[gene] = [oID]\n",
" else: gene_oID[gene].append(oID)\n",
" \n",
" if (oID not in oID_tID): oID_tID[oID] = tID\n",
" "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Import transcript isoform sequences\n",
"\n",
"transcripts_filename = \"/project/Neurodifferentiation_System/NanoporeResults/Results/Pinfish/polished_transcripts.fas\"\n",
"transcripts = SeqIO.index(transcripts_filename, \"fasta\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"filename = \"/home/annaldas/projects/nanopore-transcriptome-analysis/Results/Quantification/all_counts.txt\"\n",
"df = pd.read_csv(filename)\n",
"group = df.groupby([\"gene_name\"])\n",
"for gene,item in group:\n",
" if(gene == \"BRD4\"):\n",
" print(item.sort_values(\"transcript_id\"))\n",
" break"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"l=[]\n",
"for fn in [\"/home/annaldas/projects/nanopore-transcriptome-analysis/Direct/Quantification/OJ32.directed.counts\"]:\n",
" df=pd.read_csv(fn,delimiter='\\t',usecols=[0,1],index_col=[1])\n",
" df['counts']=df['counts']*1000000/df['counts'].sum()\n",
" df=df.rename(columns={'counts':fn.split('.counts')[0].split('/')[-1]})\n",
" l.append(df)\n",
"df=l[0]\n",
"for i in range(1,len(l)):\n",
" df=df.join(l[i],how='outer') \n",
"df=df.fillna(0)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"annotation_filename = \"/project/owlmayerTemporary/Sid/nanopore-analysis/Results_5_1/GffCompare/nanopore.combined_filt.gtf\"\n",
"annotate_df = pd.read_csv(annotation_filename,sep = \"\\t\", header = None)\n",
"annotate_df = annotate_df[annotate_df[2] != \"exon\"]\n",
"annotate_lines = list(annotate_df[8])\n",
"chrms = list(annotate_df[0])\n",
"start = list(annotate_df[3])\n",
"stop = list(annotate_df[4])\n",
"\n",
"gene_oID = dict()\n",
"oID_gene = dict()\n",
"oID_tID = dict()\n",
"tID_oID = dict()\n",
"\n",
"#tID_exon = dict()\n",
"\n",
"for ann in range(len(annotate_lines)): \n",
" if \"gene_name\" in annotate_lines[ann]:\n",
" line = annotate_lines[ann].split(\";\")\n",
" tID = line[0].split(\" \")[-1][1:-1]\n",
" gene = line[2].split(\" \")[-1][1:-1]\n",
" oID = line[3].split(\" \")[-1][1:-1]\n",
" transID = line[4].split(\" \")[-1][1:-1].split(\".\")[0]\n",
" \n",
" if (gene not in gene_oID): gene_oID[gene] = [oID]\n",
" else: gene_oID[gene].append(oID)\n",
" \n",
" if (oID not in oID_tID): oID_tID[oID] = tID\n",
" if (tID not in tID_oID): \n",
" tID_oID[tID] = oID\n",
" else:\n",
" print(\"this sucks\")\n",
" if (oID not in gene_oID): oID_gene[oID] = gene"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"transcripts_filename = \"/project/owlmayerTemporary/Sid/nanopore-analysis/Results_5_1/Pinfish/corrected_transcriptome_polished_collapsed.fas\"\n",
"transcripts = SeqIO.index(transcripts_filename, \"fasta\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"output = []\n",
"reads = dict()\n",
"\n",
"for gene in gene_oID:\n",
" for oID in gene_oID[gene]:\n",
" #reads[transcripts[oID].id] = \"\"\n",
" print(oID)\n",
" break\n",
" break\n",
" tID = \">\" + oID_tID[transcripts[oID].id]\n",
" output.append(tID)\n",
" seq = str(transcripts[oID].seq)\n",
" output.append(seq)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'TCONS_00003260'"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"oID_tID[\"47acab47-d123-4db4-a724-04c6cc495805|9\"]"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [],
"source": [
"# bam -> sam : samtools view -h -o out.sam day3_2.genome.bam\n",
"# sam -> bam: samtools view -bS test.sam > test.bam\n",
"# samtools index test.bam\n",
"output = []\n",
"t_r = dict()\n",
"count1 = 0\n",
"count2 = 0"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {},
"outputs": [],
"source": [
"filename = \"/project/owlmayerTemporary/Sid/nanopore-analysis/IGV/out.sam\"\n",
"file = open(filename,\"r\")\n",
"lines = file.readlines()\n",
"file.close()\n",
"\n",
"\n",
"for line in lines:\n",
" if (not line.startswith(\"@\")): #and (\"chr\" not in line.split(\"\\t\")[2]) and (\"GL\" not in line.split(\"\\t\")[2])):\n",
" t = line.split(\"\\t\")[0]\n",
" r = line.split(\"\\t\")[2]\n",
" try:\n",
" t_r[t] = oID_tID[r]\n",
" count1 += 1\n",
" except:\n",
" count2 += 0\n",
" \n",
" #new_line = line.strip() + \"\\tzz:\" + read\n",
" # output.append(new_line)\n",
" #else:\n",
" # output.append(line.strip())\n",
"\n",
"#filename = \"/project/owlmayerTemporary/Sid/nanopore-analysis/IGV/out_tag.sam\"\n",
"#file = open(filename,\"w+\")\n",
"#lines = file.readlines()\n",
"#file.close()\n",
" \n"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {},
"outputs": [],
"source": [
"filename = \"/project/owlmayerTemporary/Sid/nanopore-analysis/IGV/out.sam\"\n",
"file = open(filename,\"r\")\n",
"genome = file.readlines()\n",
"file.close()\n",
"\n",
"output = []\n",
"count = 0\n",
"for line in genome:\n",
" if (not line.startswith(\"@\")):\n",
" t = line.split(\"\\t\")[0]\n",
" try: \n",
" new_line = line.strip() + \"\\tzz:Z:\" + t_r[t]\n",
" output.append(new_line)\n",
" except:\n",
" output.append(line.strip())\n",
" else:\n",
" output.append(line.strip())\n",
" \n",
"filename = \"/project/owlmayerTemporary/Sid/nanopore-analysis/IGV/out.sam\"\n",
"file = open(filename,\"w+\")\n",
"file.write(\"\\n\".join(output))\n",
"file.close()"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [],
"source": [
"output = []\n",
"t_r = dict()\n",
"count1 = 0\n",
"count2 = 0\n",
"for line in lines:\n",
" if (not line.startswith(\"@\")): #and (\"chr\" not in line.split(\"\\t\")[2]) and (\"GL\" not in line.split(\"\\t\")[2])):\n",
" t = line.split(\"\\t\")[0]\n",
" r = line.split(\"\\t\")[2]\n",
" try:\n",
" t_r[t] = oID_tID[r]\n",
" count1 += 1\n",
" except:\n",
" count2 += 1\n",
" \n",
" #new_line = line.strip() + \"\\tzz:\" + read\n",
" # output.append(new_line)\n",
" #else:\n",
" # output.append(line.strip())\n",
"\n",
"#filename = \"/project/owlmayerTemporary/Sid/nanopore-analysis/IGV/out_tag.sam\"\n",
"#file = open(filename,\"w+\")\n",
"#lines = file.readlines()\n",
"#file.close()\n",
" \n"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"7ed9c5b9-665b-44d6-baad-ac3e275323dd\n"
]
},
{
"data": {
"text/plain": [
"'TCONS_00000001'"
]
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"oID_tID[\"cc2e8b66-6da7-4245-9544-c06d33b50252|3\"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"oID_geneID = dict()\n",
"geneID_oID = dict()\n",
"\n",
"filename = \"/project/owlmayerTemporary/Sid/nanopore-analysis/Results_5_1/Pinfish/raw_transcripts.gff\"\n",
"gff = open(filename,\"r\")\n",
"lines = gff.readlines()\n",
"gff.close()\n",
"for line in lines:\n",
" if (not line.startswith(\"##\")):\n",
" chrm = line.split(\"\\t\")[0]\n",
" start = line.split(\"\\t\")[3]\n",
" stop = line.split(\"\\t\")[4]\n",
" s = line.split(\"\\t\")[-1]\n",
" g_t = s.split(\";\")\n",
" if (len(g_t) > 2):\n",
" geneid = g_t[0].split('\"')[1]\n",
" transcriptid = g_t[1].split('\"')[1]\n",
" transcriptid = transcriptid.split(\"|\")[0]\n",
" oID_geneID[transcriptid] = [chrm,start,stop] #geneid\n",
" #if geneid not in geneID_oID:\n",
" # geneID_oID[geneid] = transcriptid\n",
" #else:\n",
" # geneID_oID[geneid]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"for oID in gene_oID[\"EBF2\"]:\n",
" try:\n",
" print(oID,oID_geneID[oID.split(\"|\")[0]])\n",
" except:\n",
" print(oID)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"read_cluster = dict()\n",
"cluster_read = dict()\n",
"\n",
"\n",
"filename = \"/project/owlmayerTemporary/Sid/nanopore-analysis/Results_5_1/Pinfish/cluster_memberships.tsv\"\n",
"gff = open(filename,\"r\")\n",
"lines = gff.readlines()\n",
"gff.close()\n",
"for line in lines:\n",
" read = line.strip().split(\"\\t\")[0]\n",
" cluster = line.strip().split(\"\\t\")[1]\n",
" \n",
" read_cluster[read] = cluster\n",
" \n",
" if cluster not in cluster_read:\n",
" cluster_read[cluster] = [read]\n",
" else:\n",
" cluster_read[cluster].append(read)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"read_cluster[\"47acab47-d123-4db4-a724-04c6cc495805\"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"cluster_read[\"8938ec8c-85dc-4bc8-a107-13d5281ba933\"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"oID_geneID[\"142e462f-f586-42fc-97b9-b2e3bfa1fd0d9\"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"geneID_oID[\"142e462f-f586-42fc-97b9-b2e3bfa1fd0d\"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"filename = \"/project/owlmayerTemporary/Sid/nanopore-analysis/Results_5_1/Pinfish/clustered_pol_transcripts.gff\"\n",
"gff = open(filename,\"r\")\n",
"lines = gff.readlines()\n",
"gff.close()\n",
"count = 0\n",
"for line in lines:\n",
" if (not line.startswith(\"##\")):\n",
" chrm = line.split(\"\\t\")[0]\n",
" start = line.split(\"\\t\")[3]\n",
" stop = line.split(\"\\t\")[4]\n",
" s = line.split(\"\\t\")[-1]\n",
" g_t = s.split(\";\")\n",
" if (len(g_t) > 2) and (chrm == \"chr1\") and (int(start) >= 10000) and (int(stop) <= 100000):\n",
" print(g_t,chrm,start,stop)\n",
" count += 1\n",
" if (count > 1000):\n",
" break\n",
" #if (len(g_t) > 2):\n",
" # geneid = g_t[0].split('\"')[1]\n",
" # transcriptid = g_t[1].split('\"')[1]\n",
" # transcriptid = transcriptid.split(\"|\")[0]\n",
" # if ()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import misopy"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}