scratchwork.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from Bio import SeqIO"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import the nanopore annotation file\n",
    "\n",
    "annotation_filename = \"/home/annaldas/projects/nanopore-transcriptome-analysis/Results/GffCompare/nanopore.combined.gtf\"\n",
    "annotate_df = pd.read_csv(annotation_filename,sep = \"\\t\", header = None)\n",
    "annotate_df = annotate_df[annotate_df[2]  != \"exon\"]\n",
    "annotate_lines = list(annotate_df[8])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Mapping gene name to oID\n",
    "# Mapping oID to transcript id\n",
    "# Mapping transcript id to exons\n",
    "\n",
    "ABI2_info = []\n",
    "\n",
    "gene_oID = dict()\n",
    "oID_tID = dict()\n",
    "#tID_exon = dict()\n",
    "\n",
    "for ann in annotate_lines:        \n",
    "    if \"gene_name\" in ann:\n",
    "        line = ann.split(\";\")\n",
    "        tID = line[0].split(\" \")[-1][1:-1]\n",
    "        gene = line[2].split(\" \")[-1][1:-1]\n",
    "        oID = line[3].split(\" \")[-1][1:-1]\n",
    "        \n",
    "        if (gene not in gene_oID): gene_oID[gene] = [oID]\n",
    "        else: gene_oID[gene].append(oID)\n",
    "            \n",
    "        if (oID not in oID_tID): oID_tID[oID] = tID\n",
    "            "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import transcript isoform sequences\n",
    "\n",
    "transcripts_filename = \"/project/Neurodifferentiation_System/NanoporeResults/Results/Pinfish/polished_transcripts.fas\"\n",
    "transcripts = SeqIO.index(transcripts_filename, \"fasta\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "filename = \"/home/annaldas/projects/nanopore-transcriptome-analysis/Results/Quantification/all_counts.txt\"\n",
    "df = pd.read_csv(filename)\n",
    "group = df.groupby([\"gene_name\"])\n",
    "for gene,item in group:\n",
    "    if(gene == \"BRD4\"):\n",
    "        print(item.sort_values(\"transcript_id\"))\n",
    "        break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "l=[]\n",
    "for fn in [\"/home/annaldas/projects/nanopore-transcriptome-analysis/Direct/Quantification/OJ32.directed.counts\"]:\n",
    "    df=pd.read_csv(fn,delimiter='\\t',usecols=[0,1],index_col=[1])\n",
    "    df['counts']=df['counts']*1000000/df['counts'].sum()\n",
    "    df=df.rename(columns={'counts':fn.split('.counts')[0].split('/')[-1]})\n",
    "    l.append(df)\n",
    "df=l[0]\n",
    "for i in range(1,len(l)):\n",
    "    df=df.join(l[i],how='outer')        \n",
    "df=df.fillna(0)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "annotation_filename = \"/project/owlmayerTemporary/Sid/nanopore-analysis/Results_5_1/GffCompare/nanopore.combined_filt.gtf\"\n",
    "annotate_df = pd.read_csv(annotation_filename,sep = \"\\t\", header = None)\n",
    "annotate_df = annotate_df[annotate_df[2]  != \"exon\"]\n",
    "annotate_lines = list(annotate_df[8])\n",
    "chrms = list(annotate_df[0])\n",
    "start = list(annotate_df[3])\n",
    "stop = list(annotate_df[4])\n",
    "\n",
    "gene_oID = dict()\n",
    "oID_gene = dict()\n",
    "oID_tID = dict()\n",
    "tID_oID = dict()\n",
    "\n",
    "#tID_exon = dict()\n",
    "\n",
    "for ann in range(len(annotate_lines)):        \n",
    "    if \"gene_name\" in annotate_lines[ann]:\n",
    "        line = annotate_lines[ann].split(\";\")\n",
    "        tID = line[0].split(\" \")[-1][1:-1]\n",
    "        gene = line[2].split(\" \")[-1][1:-1]\n",
    "        oID = line[3].split(\" \")[-1][1:-1]\n",
    "        transID = line[4].split(\" \")[-1][1:-1].split(\".\")[0]\n",
    "        \n",
    "        if (gene not in gene_oID): gene_oID[gene] = [oID]\n",
    "        else: gene_oID[gene].append(oID)\n",
    "            \n",
    "        if (oID not in oID_tID): oID_tID[oID] = tID\n",
    "        if (tID not in tID_oID): \n",
    "            tID_oID[tID] = oID\n",
    "        else:\n",
    "            print(\"this sucks\")\n",
    "        if (oID not in gene_oID): oID_gene[oID] = gene"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "transcripts_filename = \"/project/owlmayerTemporary/Sid/nanopore-analysis/Results_5_1/Pinfish/corrected_transcriptome_polished_collapsed.fas\"\n",
    "transcripts = SeqIO.index(transcripts_filename, \"fasta\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "output = []\n",
    "reads = dict()\n",
    "\n",
    "for gene in gene_oID:\n",
    "    for oID in gene_oID[gene]:\n",
    "        #reads[transcripts[oID].id] = \"\"\n",
    "        print(oID)\n",
    "        break\n",
    "    break\n",
    "    tID = \">\" + oID_tID[transcripts[oID].id]\n",
    "    output.append(tID)\n",
    "    seq = str(transcripts[oID].seq)\n",
    "    output.append(seq)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'TCONS_00003260'"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "oID_tID[\"47acab47-d123-4db4-a724-04c6cc495805|9\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [],
   "source": [
    "# bam -> sam : samtools view -h -o out.sam day3_2.genome.bam\n",
    "# sam -> bam: samtools view -bS test.sam > test.bam\n",
    "# samtools index test.bam\n",
    "output = []\n",
    "t_r = dict()\n",
    "count1 = 0\n",
    "count2 = 0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [],
   "source": [
    "filename = \"/project/owlmayerTemporary/Sid/nanopore-analysis/IGV/out.sam\"\n",
    "file = open(filename,\"r\")\n",
    "lines = file.readlines()\n",
    "file.close()\n",
    "\n",
    "\n",
    "for line in lines:\n",
    "    if (not line.startswith(\"@\")): #and (\"chr\" not in line.split(\"\\t\")[2]) and (\"GL\" not in line.split(\"\\t\")[2])):\n",
    "        t = line.split(\"\\t\")[0]\n",
    "        r = line.split(\"\\t\")[2]\n",
    "        try:\n",
    "            t_r[t] = oID_tID[r]\n",
    "            count1 += 1\n",
    "        except:\n",
    "            count2 += 0\n",
    "        \n",
    "        #new_line = line.strip() + \"\\tzz:\" + read\n",
    "    #    output.append(new_line)\n",
    "    #else:\n",
    "    #    output.append(line.strip())\n",
    "\n",
    "#filename = \"/project/owlmayerTemporary/Sid/nanopore-analysis/IGV/out_tag.sam\"\n",
    "#file = open(filename,\"w+\")\n",
    "#lines = file.readlines()\n",
    "#file.close()\n",
    "    \n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [],
   "source": [
    "filename = \"/project/owlmayerTemporary/Sid/nanopore-analysis/IGV/out.sam\"\n",
    "file = open(filename,\"r\")\n",
    "genome = file.readlines()\n",
    "file.close()\n",
    "\n",
    "output = []\n",
    "count = 0\n",
    "for line in genome:\n",
    "    if (not line.startswith(\"@\")):\n",
    "        t = line.split(\"\\t\")[0]\n",
    "        try: \n",
    "            new_line = line.strip() + \"\\tzz:Z:\" + t_r[t]\n",
    "            output.append(new_line)\n",
    "        except:\n",
    "            output.append(line.strip())\n",
    "    else:\n",
    "        output.append(line.strip())\n",
    "    \n",
    "filename = \"/project/owlmayerTemporary/Sid/nanopore-analysis/IGV/out.sam\"\n",
    "file = open(filename,\"w+\")\n",
    "file.write(\"\\n\".join(output))\n",
    "file.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
    "output = []\n",
    "t_r = dict()\n",
    "count1 = 0\n",
    "count2 = 0\n",
    "for line in lines:\n",
    "    if (not line.startswith(\"@\")): #and (\"chr\" not in line.split(\"\\t\")[2]) and (\"GL\" not in line.split(\"\\t\")[2])):\n",
    "        t = line.split(\"\\t\")[0]\n",
    "        r = line.split(\"\\t\")[2]\n",
    "        try:\n",
    "            t_r[t] = oID_tID[r]\n",
    "            count1 += 1\n",
    "        except:\n",
    "            count2 += 1\n",
    "        \n",
    "        #new_line = line.strip() + \"\\tzz:\" + read\n",
    "    #    output.append(new_line)\n",
    "    #else:\n",
    "    #    output.append(line.strip())\n",
    "\n",
    "#filename = \"/project/owlmayerTemporary/Sid/nanopore-analysis/IGV/out_tag.sam\"\n",
    "#file = open(filename,\"w+\")\n",
    "#lines = file.readlines()\n",
    "#file.close()\n",
    "    \n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "7ed9c5b9-665b-44d6-baad-ac3e275323dd\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'TCONS_00000001'"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "oID_tID[\"cc2e8b66-6da7-4245-9544-c06d33b50252|3\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "oID_geneID = dict()\n",
    "geneID_oID = dict()\n",
    "\n",
    "filename = \"/project/owlmayerTemporary/Sid/nanopore-analysis/Results_5_1/Pinfish/raw_transcripts.gff\"\n",
    "gff = open(filename,\"r\")\n",
    "lines = gff.readlines()\n",
    "gff.close()\n",
    "for line in lines:\n",
    "    if (not line.startswith(\"##\")):\n",
    "        chrm = line.split(\"\\t\")[0]\n",
    "        start = line.split(\"\\t\")[3]\n",
    "        stop = line.split(\"\\t\")[4]\n",
    "        s = line.split(\"\\t\")[-1]\n",
    "        g_t = s.split(\";\")\n",
    "        if (len(g_t) > 2):\n",
    "            geneid = g_t[0].split('\"')[1]\n",
    "            transcriptid = g_t[1].split('\"')[1]\n",
    "            transcriptid = transcriptid.split(\"|\")[0]\n",
    "            oID_geneID[transcriptid] = [chrm,start,stop] #geneid\n",
    "            #if geneid not in geneID_oID:\n",
    "            #    geneID_oID[geneid] = transcriptid\n",
    "            #else:\n",
    "             #   geneID_oID[geneid]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for oID in gene_oID[\"EBF2\"]:\n",
    "    try:\n",
    "        print(oID,oID_geneID[oID.split(\"|\")[0]])\n",
    "    except:\n",
    "        print(oID)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "read_cluster = dict()\n",
    "cluster_read = dict()\n",
    "\n",
    "\n",
    "filename = \"/project/owlmayerTemporary/Sid/nanopore-analysis/Results_5_1/Pinfish/cluster_memberships.tsv\"\n",
    "gff = open(filename,\"r\")\n",
    "lines = gff.readlines()\n",
    "gff.close()\n",
    "for line in lines:\n",
    "    read = line.strip().split(\"\\t\")[0]\n",
    "    cluster = line.strip().split(\"\\t\")[1]\n",
    "    \n",
    "    read_cluster[read] = cluster\n",
    "    \n",
    "    if cluster not in cluster_read:\n",
    "        cluster_read[cluster] = [read]\n",
    "    else:\n",
    "        cluster_read[cluster].append(read)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "read_cluster[\"47acab47-d123-4db4-a724-04c6cc495805\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "cluster_read[\"8938ec8c-85dc-4bc8-a107-13d5281ba933\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "oID_geneID[\"142e462f-f586-42fc-97b9-b2e3bfa1fd0d9\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "geneID_oID[\"142e462f-f586-42fc-97b9-b2e3bfa1fd0d\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "filename = \"/project/owlmayerTemporary/Sid/nanopore-analysis/Results_5_1/Pinfish/clustered_pol_transcripts.gff\"\n",
    "gff = open(filename,\"r\")\n",
    "lines = gff.readlines()\n",
    "gff.close()\n",
    "count = 0\n",
    "for line in lines:\n",
    "    if (not line.startswith(\"##\")):\n",
    "        chrm = line.split(\"\\t\")[0]\n",
    "        start = line.split(\"\\t\")[3]\n",
    "        stop = line.split(\"\\t\")[4]\n",
    "        s = line.split(\"\\t\")[-1]\n",
    "        g_t = s.split(\";\")\n",
    "        if (len(g_t) > 2) and (chrm == \"chr1\") and (int(start) >= 10000) and (int(stop) <= 100000):\n",
    "            print(g_t,chrm,start,stop)\n",
    "            count += 1\n",
    "            if (count > 1000):\n",
    "                break\n",
    "        #if (len(g_t) > 2):\n",
    "        #    geneid = g_t[0].split('\"')[1]\n",
    "        #    transcriptid = g_t[1].split('\"')[1]\n",
    "        #    transcriptid = transcriptid.split(\"|\")[0]\n",
    "        #    if ()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import misopy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": []
	},
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {},
	"outputs": [],
	"source": [
	"import pandas as pd\n",
	"import numpy as np\n",
	"from Bio import SeqIO"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Import the nanopore annotation file\n",
	"\n",
	"annotation_filename = \"/home/annaldas/projects/nanopore-transcriptome-analysis/Results/GffCompare/nanopore.combined.gtf\"\n",
	"annotate_df = pd.read_csv(annotation_filename,sep = \"\\t\", header = None)\n",
	"annotate_df = annotate_df[annotate_df[2] != \"exon\"]\n",
	"annotate_lines = list(annotate_df[8])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Mapping gene name to oID\n",
	"# Mapping oID to transcript id\n",
	"# Mapping transcript id to exons\n",
	"\n",
	"ABI2_info = []\n",
	"\n",
	"gene_oID = dict()\n",
	"oID_tID = dict()\n",
	"#tID_exon = dict()\n",
	"\n",
	"for ann in annotate_lines: \n",
	" if \"gene_name\" in ann:\n",
	" line = ann.split(\";\")\n",
	" tID = line[0].split(\" \")[-1][1:-1]\n",
	" gene = line[2].split(\" \")[-1][1:-1]\n",
	" oID = line[3].split(\" \")[-1][1:-1]\n",
	" \n",
	" if (gene not in gene_oID): gene_oID[gene] = [oID]\n",
	" else: gene_oID[gene].append(oID)\n",
	" \n",
	" if (oID not in oID_tID): oID_tID[oID] = tID\n",
	" "
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Import transcript isoform sequences\n",
	"\n",
	"transcripts_filename = \"/project/Neurodifferentiation_System/NanoporeResults/Results/Pinfish/polished_transcripts.fas\"\n",
	"transcripts = SeqIO.index(transcripts_filename, \"fasta\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"filename = \"/home/annaldas/projects/nanopore-transcriptome-analysis/Results/Quantification/all_counts.txt\"\n",
	"df = pd.read_csv(filename)\n",
	"group = df.groupby([\"gene_name\"])\n",
	"for gene,item in group:\n",
	" if(gene == \"BRD4\"):\n",
	" print(item.sort_values(\"transcript_id\"))\n",
	" break"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": []
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": []
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": []
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"l=[]\n",
	"for fn in [\"/home/annaldas/projects/nanopore-transcriptome-analysis/Direct/Quantification/OJ32.directed.counts\"]:\n",
	" df=pd.read_csv(fn,delimiter='\\t',usecols=[0,1],index_col=[1])\n",
	" df['counts']=df['counts']*1000000/df['counts'].sum()\n",
	" df=df.rename(columns={'counts':fn.split('.counts')[0].split('/')[-1]})\n",
	" l.append(df)\n",
	"df=l[0]\n",
	"for i in range(1,len(l)):\n",
	" df=df.join(l[i],how='outer') \n",
	"df=df.fillna(0)\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"\n",
	"\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {},
	"outputs": [],
	"source": [
	"annotation_filename = \"/project/owlmayerTemporary/Sid/nanopore-analysis/Results_5_1/GffCompare/nanopore.combined_filt.gtf\"\n",
	"annotate_df = pd.read_csv(annotation_filename,sep = \"\\t\", header = None)\n",
	"annotate_df = annotate_df[annotate_df[2] != \"exon\"]\n",
	"annotate_lines = list(annotate_df[8])\n",
	"chrms = list(annotate_df[0])\n",
	"start = list(annotate_df[3])\n",
	"stop = list(annotate_df[4])\n",
	"\n",
	"gene_oID = dict()\n",
	"oID_gene = dict()\n",
	"oID_tID = dict()\n",
	"tID_oID = dict()\n",
	"\n",
	"#tID_exon = dict()\n",
	"\n",
	"for ann in range(len(annotate_lines)): \n",
	" if \"gene_name\" in annotate_lines[ann]:\n",
	" line = annotate_lines[ann].split(\";\")\n",
	" tID = line[0].split(\" \")[-1][1:-1]\n",
	" gene = line[2].split(\" \")[-1][1:-1]\n",
	" oID = line[3].split(\" \")[-1][1:-1]\n",
	" transID = line[4].split(\" \")[-1][1:-1].split(\".\")[0]\n",
	" \n",
	" if (gene not in gene_oID): gene_oID[gene] = [oID]\n",
	" else: gene_oID[gene].append(oID)\n",
	" \n",
	" if (oID not in oID_tID): oID_tID[oID] = tID\n",
	" if (tID not in tID_oID): \n",
	" tID_oID[tID] = oID\n",
	" else:\n",
	" print(\"this sucks\")\n",
	" if (oID not in gene_oID): oID_gene[oID] = gene"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {},
	"outputs": [],
	"source": [
	"transcripts_filename = \"/project/owlmayerTemporary/Sid/nanopore-analysis/Results_5_1/Pinfish/corrected_transcriptome_polished_collapsed.fas\"\n",
	"transcripts = SeqIO.index(transcripts_filename, \"fasta\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"output = []\n",
	"reads = dict()\n",
	"\n",
	"for gene in gene_oID:\n",
	" for oID in gene_oID[gene]:\n",
	" #reads[transcripts[oID].id] = \"\"\n",
	" print(oID)\n",
	" break\n",
	" break\n",
	" tID = \">\" + oID_tID[transcripts[oID].id]\n",
	" output.append(tID)\n",
	" seq = str(transcripts[oID].seq)\n",
	" output.append(seq)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 29,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"'TCONS_00003260'"
	]
	},
	"execution_count": 29,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"oID_tID[\"47acab47-d123-4db4-a724-04c6cc495805\|9\"]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 49,
	"metadata": {},
	"outputs": [],
	"source": [
	"# bam -> sam : samtools view -h -o out.sam day3_2.genome.bam\n",
	"# sam -> bam: samtools view -bS test.sam > test.bam\n",
	"# samtools index test.bam\n",
	"output = []\n",
	"t_r = dict()\n",
	"count1 = 0\n",
	"count2 = 0"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 55,
	"metadata": {},
	"outputs": [],
	"source": [
	"filename = \"/project/owlmayerTemporary/Sid/nanopore-analysis/IGV/out.sam\"\n",
	"file = open(filename,\"r\")\n",
	"lines = file.readlines()\n",
	"file.close()\n",
	"\n",
	"\n",
	"for line in lines:\n",
	" if (not line.startswith(\"@\")): #and (\"chr\" not in line.split(\"\\t\")[2]) and (\"GL\" not in line.split(\"\\t\")[2])):\n",
	" t = line.split(\"\\t\")[0]\n",
	" r = line.split(\"\\t\")[2]\n",
	" try:\n",
	" t_r[t] = oID_tID[r]\n",
	" count1 += 1\n",
	" except:\n",
	" count2 += 0\n",
	" \n",
	" #new_line = line.strip() + \"\\tzz:\" + read\n",
	" # output.append(new_line)\n",
	" #else:\n",
	" # output.append(line.strip())\n",
	"\n",
	"#filename = \"/project/owlmayerTemporary/Sid/nanopore-analysis/IGV/out_tag.sam\"\n",
	"#file = open(filename,\"w+\")\n",
	"#lines = file.readlines()\n",
	"#file.close()\n",
	" \n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 22,
	"metadata": {},
	"outputs": [],
	"source": []
	},
	{
	"cell_type": "code",
	"execution_count": 62,
	"metadata": {},
	"outputs": [],
	"source": [
	"filename = \"/project/owlmayerTemporary/Sid/nanopore-analysis/IGV/out.sam\"\n",
	"file = open(filename,\"r\")\n",
	"genome = file.readlines()\n",
	"file.close()\n",
	"\n",
	"output = []\n",
	"count = 0\n",
	"for line in genome:\n",
	" if (not line.startswith(\"@\")):\n",
	" t = line.split(\"\\t\")[0]\n",
	" try: \n",
	" new_line = line.strip() + \"\\tzz:Z:\" + t_r[t]\n",
	" output.append(new_line)\n",
	" except:\n",
	" output.append(line.strip())\n",
	" else:\n",
	" output.append(line.strip())\n",
	" \n",
	"filename = \"/project/owlmayerTemporary/Sid/nanopore-analysis/IGV/out.sam\"\n",
	"file = open(filename,\"w+\")\n",
	"file.write(\"\\n\".join(output))\n",
	"file.close()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 42,
	"metadata": {},
	"outputs": [],
	"source": [
	"output = []\n",
	"t_r = dict()\n",
	"count1 = 0\n",
	"count2 = 0\n",
	"for line in lines:\n",
	" if (not line.startswith(\"@\")): #and (\"chr\" not in line.split(\"\\t\")[2]) and (\"GL\" not in line.split(\"\\t\")[2])):\n",
	" t = line.split(\"\\t\")[0]\n",
	" r = line.split(\"\\t\")[2]\n",
	" try:\n",
	" t_r[t] = oID_tID[r]\n",
	" count1 += 1\n",
	" except:\n",
	" count2 += 1\n",
	" \n",
	" #new_line = line.strip() + \"\\tzz:\" + read\n",
	" # output.append(new_line)\n",
	" #else:\n",
	" # output.append(line.strip())\n",
	"\n",
	"#filename = \"/project/owlmayerTemporary/Sid/nanopore-analysis/IGV/out_tag.sam\"\n",
	"#file = open(filename,\"w+\")\n",
	"#lines = file.readlines()\n",
	"#file.close()\n",
	" \n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 40,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"7ed9c5b9-665b-44d6-baad-ac3e275323dd\n"
	]
	},
	{
	"data": {
	"text/plain": [
	"'TCONS_00000001'"
	]
	},
	"execution_count": 40,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"oID_tID[\"cc2e8b66-6da7-4245-9544-c06d33b50252\|3\"]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"oID_geneID = dict()\n",
	"geneID_oID = dict()\n",
	"\n",
	"filename = \"/project/owlmayerTemporary/Sid/nanopore-analysis/Results_5_1/Pinfish/raw_transcripts.gff\"\n",
	"gff = open(filename,\"r\")\n",
	"lines = gff.readlines()\n",
	"gff.close()\n",
	"for line in lines:\n",
	" if (not line.startswith(\"##\")):\n",
	" chrm = line.split(\"\\t\")[0]\n",
	" start = line.split(\"\\t\")[3]\n",
	" stop = line.split(\"\\t\")[4]\n",
	" s = line.split(\"\\t\")[-1]\n",
	" g_t = s.split(\";\")\n",
	" if (len(g_t) > 2):\n",
	" geneid = g_t[0].split('\"')[1]\n",
	" transcriptid = g_t[1].split('\"')[1]\n",
	" transcriptid = transcriptid.split(\"\|\")[0]\n",
	" oID_geneID[transcriptid] = [chrm,start,stop] #geneid\n",
	" #if geneid not in geneID_oID:\n",
	" # geneID_oID[geneid] = transcriptid\n",
	" #else:\n",
	" # geneID_oID[geneid]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"for oID in gene_oID[\"EBF2\"]:\n",
	" try:\n",
	" print(oID,oID_geneID[oID.split(\"\|\")[0]])\n",
	" except:\n",
	" print(oID)\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"read_cluster = dict()\n",
	"cluster_read = dict()\n",
	"\n",
	"\n",
	"filename = \"/project/owlmayerTemporary/Sid/nanopore-analysis/Results_5_1/Pinfish/cluster_memberships.tsv\"\n",
	"gff = open(filename,\"r\")\n",
	"lines = gff.readlines()\n",
	"gff.close()\n",
	"for line in lines:\n",
	" read = line.strip().split(\"\\t\")[0]\n",
	" cluster = line.strip().split(\"\\t\")[1]\n",
	" \n",
	" read_cluster[read] = cluster\n",
	" \n",
	" if cluster not in cluster_read:\n",
	" cluster_read[cluster] = [read]\n",
	" else:\n",
	" cluster_read[cluster].append(read)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"read_cluster[\"47acab47-d123-4db4-a724-04c6cc495805\"]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"scrolled": false
	},
	"outputs": [],
	"source": [
	"cluster_read[\"8938ec8c-85dc-4bc8-a107-13d5281ba933\"]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"oID_geneID[\"142e462f-f586-42fc-97b9-b2e3bfa1fd0d9\"]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"geneID_oID[\"142e462f-f586-42fc-97b9-b2e3bfa1fd0d\"]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"filename = \"/project/owlmayerTemporary/Sid/nanopore-analysis/Results_5_1/Pinfish/clustered_pol_transcripts.gff\"\n",
	"gff = open(filename,\"r\")\n",
	"lines = gff.readlines()\n",
	"gff.close()\n",
	"count = 0\n",
	"for line in lines:\n",
	" if (not line.startswith(\"##\")):\n",
	" chrm = line.split(\"\\t\")[0]\n",
	" start = line.split(\"\\t\")[3]\n",
	" stop = line.split(\"\\t\")[4]\n",
	" s = line.split(\"\\t\")[-1]\n",
	" g_t = s.split(\";\")\n",
	" if (len(g_t) > 2) and (chrm == \"chr1\") and (int(start) >= 10000) and (int(stop) <= 100000):\n",
	" print(g_t,chrm,start,stop)\n",
	" count += 1\n",
	" if (count > 1000):\n",
	" break\n",
	" #if (len(g_t) > 2):\n",
	" # geneid = g_t[0].split('\"')[1]\n",
	" # transcriptid = g_t[1].split('\"')[1]\n",
	" # transcriptid = transcriptid.split(\"\|\")[0]\n",
	" # if ()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"import misopy"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.7.4"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}