Untitled9.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Jelena's Fake Data\n",
    "# Adds a gene_type column (looked up by gene_name in the GTF) to the counts table and saves it as CSV.\n",
    "\n",
    "filename = \"/project/owlmayerTemporary/Sid/nanopore-analysis/ReferenceData/gencode.v32.primary_assembly.annotation.sorted.gtf\"\n",
    "# Only column 8 (the attribute string) is used below, so the other columns are not parsed.\n",
    "# skiprows = 5 skips the first five lines of the file - presumably its header block; TODO confirm.\n",
    "info = pd.read_csv(filename, skiprows = 5, header = None, sep = \"\\t\", usecols = [8])[8].tolist()\n",
    "\n",
    "# gene_name -> gene_type; a later row overwrites an earlier one with the same gene_name.\n",
    "gene_genetypes = dict()\n",
    "for attributes in info:\n",
    "    # Take the first double-quoted value that follows each key.\n",
    "    gene_type = attributes.split(\"gene_type\")[1].split('\"')[1]\n",
    "    gene_name = attributes.split(\"gene_name\")[1].split('\"')[1]\n",
    "    gene_genetypes[gene_name] = gene_type\n",
    "\n",
    "filename = \"/project/owlmayerTemporary/Sid/nanopore-analysis/Results_5_1/Quantification/all_counts_deseq2norm.txt\"\n",
    "# Rows with any missing value are dropped before the lookup.\n",
    "df_all = pd.read_csv(filename, sep = \",\").dropna()\n",
    "\n",
    "# Report every unknown gene at once instead of stopping with a bare KeyError on the first one.\n",
    "missing_genes = sorted(set(df_all[\"gene_name\"]).difference(gene_genetypes), key = str)\n",
    "if missing_genes:\n",
    "    raise KeyError(f\"{len(missing_genes)} gene_name values not found in the GTF, e.g. {missing_genes[:5]}\")\n",
    "df_all[\"gene_type\"] = df_all[\"gene_name\"].map(gene_genetypes)\n",
    "\n",
    "df_all.to_csv(\"/project/owlmayerTemporary/Sid/nanopore-analysis/Results_5_1/Quantification/all_counts_deseq2norm_jelena.csv\", index = False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read every line of ASAnalysis.csv into a list (line endings are kept).\n",
    "filename = \"/project/owlmayerTemporary/Sid/mass_spec/protein_seq_db/ASAnalysis.csv\"\n",
    "# The with-block closes the handle even if reading fails part-way.\n",
    "with open(filename, \"r\") as as_analysis_file:\n",
    "    as_lines_all = as_analysis_file.readlines()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['id', 'isoform_id', 'ES', 'ES_genomic_start', 'ES_genomic_end', 'MEE', 'MEE_genomic_start', 'MEE_genomic_end', 'MES', 'MES_genomic_start', 'MES_genomic_end', 'IR', 'IR_genomic_start', 'IR_genomic_end', 'A5', 'A5_genomic_start', 'A5_genomic_end', 'A3', 'A3_genomic_start', 'A3_genomic_end', 'ATSS', 'ATSS_genomic_start', 'ATSS_genomic_end', 'ATTS', 'ATTS_genomic_start', 'ATTS_genomic_end']\n"
     ]
    }
   ],
   "source": [
    "# Split the raw lines into a header row and data rows, then collect each row's isoform_id.\n",
    "header = as_lines_all[0].strip().split(\",\")\n",
    "as_lines = as_lines_all[1:]\n",
    "print(header)\n",
    "# A3SS, A5SS, MXE, RI, SE\n",
    "se_lines = []  # never filled in this notebook\n",
    "\n",
    "# Look the column up by name instead of hard-coding position 1.\n",
    "isoform_col = header.index(\"isoform_id\")\n",
    "tcons_list = []\n",
    "for line in as_lines:\n",
    "    # A blank line (e.g. at the end of the file) has no columns and would raise IndexError.\n",
    "    if not line.strip():\n",
    "        continue\n",
    "    line = line.strip().split(\",\")\n",
    "    tcons_list.append(line[isoform_col])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build two lookups from the \"transcript\" rows of the GTF:\n",
    "#   tID_gene: tID -> gene        gene_oID: gene -> list of oID values\n",
    "annotation_filename = \"/project/owlmayerTemporary/Sid/nanopore-analysis/Results_5_1/GffCompare/nanopore.combined_filt.gtf\"\n",
    "annotate_df = pd.read_csv(annotation_filename, sep = \"\\t\", header = None)\n",
    "annotate_df = annotate_df[annotate_df[2] == \"transcript\"]\n",
    "annotate_lines = list(annotate_df[8])\n",
    "\n",
    "tID_gene = dict()\n",
    "gene_oID = dict()\n",
    "\n",
    "for attributes in annotate_lines:\n",
    "    # Rows without a gene_name attribute are left out of both lookups.\n",
    "    if \"gene_name\" not in attributes:\n",
    "        continue\n",
    "    # Values are picked by position in the ';'-separated attribute string.\n",
    "    # NOTE(review): assumes the order transcript_id; gene_id; gene_name; oId - confirm against the GTF.\n",
    "    line = attributes.split(\";\")\n",
    "    # [-1] takes the last space-separated token, [1:-1] drops its first and last character (the quotes).\n",
    "    tID = line[0].split(\" \")[-1][1:-1]\n",
    "    gene = line[2].split(\" \")[-1][1:-1]\n",
    "    oID = line[3].split(\" \")[-1][1:-1]\n",
    "\n",
    "    gene_oID.setdefault(gene, []).append(oID)\n",
    "    tID_gene[tID] = gene"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collect the distinct genes of the isoforms listed in tcons_list.\n",
    "transcript_genes = set()\n",
    "for tcons in tcons_list:\n",
    "    # tID_gene only holds transcripts whose GTF row had a gene_name, so an isoform\n",
    "    # can be absent from it; skip that isoform instead of raising KeyError.\n",
    "    gene = tID_gene.get(tcons)\n",
    "    if gene is None:\n",
    "        continue\n",
    "\n",
    "    # Only genes with at least one recorded oID are kept.\n",
    "    if gene_oID.get(gene):\n",
    "        transcript_genes.add(gene)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the collected gene names, one per line (no trailing newline).\n",
    "filename = \"/project/owlmayerTemporary/Sid/nanopore-analysis/Results_5_1/Lists/splicing_events.txt\"\n",
    "# write() is used because writelines() would iterate a single string character by character.\n",
    "# sorted() gives the same file on every run; set iteration order is arbitrary.\n",
    "with open(filename, \"w\") as splicing_file:\n",
    "    splicing_file.write(\"\\n\".join(sorted(transcript_genes)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1484"
      ]
     },
     "execution_count": 62,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of distinct genes in transcript_genes (a set needs no list copy to be counted).\n",
    "len(transcript_genes)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 26,
	"metadata": {},
	"outputs": [],
	"source": [
	"import pandas as pd"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Jelena's Fake Data\n",
	"# Adds a gene_type column (looked up by gene_name in the GTF) to the counts table and saves it as CSV.\n",
	"\n",
	"filename = \"/project/owlmayerTemporary/Sid/nanopore-analysis/ReferenceData/gencode.v32.primary_assembly.annotation.sorted.gtf\"\n",
	"# Only column 8 (the attribute string) is used below, so the other columns are not parsed.\n",
	"# skiprows = 5 skips the first five lines of the file - presumably its header block; TODO confirm.\n",
	"info = pd.read_csv(filename, skiprows = 5, header = None, sep = \"\\t\", usecols = [8])[8].tolist()\n",
	"\n",
	"# gene_name -> gene_type; a later row overwrites an earlier one with the same gene_name.\n",
	"gene_genetypes = dict()\n",
	"for attributes in info:\n",
	"    # Take the first double-quoted value that follows each key.\n",
	"    gene_type = attributes.split(\"gene_type\")[1].split('\"')[1]\n",
	"    gene_name = attributes.split(\"gene_name\")[1].split('\"')[1]\n",
	"    gene_genetypes[gene_name] = gene_type\n",
	"\n",
	"filename = \"/project/owlmayerTemporary/Sid/nanopore-analysis/Results_5_1/Quantification/all_counts_deseq2norm.txt\"\n",
	"# Rows with any missing value are dropped before the lookup.\n",
	"df_all = pd.read_csv(filename, sep = \",\").dropna()\n",
	"\n",
	"# Report every unknown gene at once instead of stopping with a bare KeyError on the first one.\n",
	"missing_genes = sorted(set(df_all[\"gene_name\"]).difference(gene_genetypes), key = str)\n",
	"if missing_genes:\n",
	"    raise KeyError(f\"{len(missing_genes)} gene_name values not found in the GTF, e.g. {missing_genes[:5]}\")\n",
	"df_all[\"gene_type\"] = df_all[\"gene_name\"].map(gene_genetypes)\n",
	"\n",
	"df_all.to_csv(\"/project/owlmayerTemporary/Sid/nanopore-analysis/Results_5_1/Quantification/all_counts_deseq2norm_jelena.csv\", index = False)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 12,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Read every line of ASAnalysis.csv into a list (line endings are kept).\n",
	"filename = \"/project/owlmayerTemporary/Sid/mass_spec/protein_seq_db/ASAnalysis.csv\"\n",
	"# The with-block closes the handle even if reading fails part-way.\n",
	"with open(filename, \"r\") as as_analysis_file:\n",
	"    as_lines_all = as_analysis_file.readlines()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 36,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"['id', 'isoform_id', 'ES', 'ES_genomic_start', 'ES_genomic_end', 'MEE', 'MEE_genomic_start', 'MEE_genomic_end', 'MES', 'MES_genomic_start', 'MES_genomic_end', 'IR', 'IR_genomic_start', 'IR_genomic_end', 'A5', 'A5_genomic_start', 'A5_genomic_end', 'A3', 'A3_genomic_start', 'A3_genomic_end', 'ATSS', 'ATSS_genomic_start', 'ATSS_genomic_end', 'ATTS', 'ATTS_genomic_start', 'ATTS_genomic_end']\n"
	]
	}
	],
	"source": [
	"# Split the raw lines into a header row and data rows, then collect each row's isoform_id.\n",
	"header = as_lines_all[0].strip().split(\",\")\n",
	"as_lines = as_lines_all[1:]\n",
	"print(header)\n",
	"# A3SS, A5SS, MXE, RI, SE\n",
	"se_lines = []  # never filled in this notebook\n",
	"\n",
	"# Look the column up by name instead of hard-coding position 1.\n",
	"isoform_col = header.index(\"isoform_id\")\n",
	"tcons_list = []\n",
	"for line in as_lines:\n",
	"    # A blank line (e.g. at the end of the file) has no columns and would raise IndexError.\n",
	"    if not line.strip():\n",
	"        continue\n",
	"    line = line.strip().split(\",\")\n",
	"    tcons_list.append(line[isoform_col])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 56,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Build two lookups from the \"transcript\" rows of the GTF:\n",
	"#   tID_gene: tID -> gene        gene_oID: gene -> list of oID values\n",
	"annotation_filename = \"/project/owlmayerTemporary/Sid/nanopore-analysis/Results_5_1/GffCompare/nanopore.combined_filt.gtf\"\n",
	"annotate_df = pd.read_csv(annotation_filename, sep = \"\\t\", header = None)\n",
	"annotate_df = annotate_df[annotate_df[2] == \"transcript\"]\n",
	"annotate_lines = list(annotate_df[8])\n",
	"\n",
	"tID_gene = dict()\n",
	"gene_oID = dict()\n",
	"\n",
	"for attributes in annotate_lines:\n",
	"    # Rows without a gene_name attribute are left out of both lookups.\n",
	"    if \"gene_name\" not in attributes:\n",
	"        continue\n",
	"    # Values are picked by position in the ';'-separated attribute string.\n",
	"    # NOTE(review): assumes the order transcript_id; gene_id; gene_name; oId - confirm against the GTF.\n",
	"    line = attributes.split(\";\")\n",
	"    # [-1] takes the last space-separated token, [1:-1] drops its first and last character (the quotes).\n",
	"    tID = line[0].split(\" \")[-1][1:-1]\n",
	"    gene = line[2].split(\" \")[-1][1:-1]\n",
	"    oID = line[3].split(\" \")[-1][1:-1]\n",
	"\n",
	"    gene_oID.setdefault(gene, []).append(oID)\n",
	"    tID_gene[tID] = gene"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 60,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Collect the distinct genes of the isoforms listed in tcons_list.\n",
	"transcript_genes = set()\n",
	"for tcons in tcons_list:\n",
	"    # tID_gene only holds transcripts whose GTF row had a gene_name, so an isoform\n",
	"    # can be absent from it; skip that isoform instead of raising KeyError.\n",
	"    gene = tID_gene.get(tcons)\n",
	"    if gene is None:\n",
	"        continue\n",
	"\n",
	"    # Only genes with at least one recorded oID are kept.\n",
	"    if gene_oID.get(gene):\n",
	"        transcript_genes.add(gene)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 61,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Save the collected gene names, one per line (no trailing newline).\n",
	"filename = \"/project/owlmayerTemporary/Sid/nanopore-analysis/Results_5_1/Lists/splicing_events.txt\"\n",
	"# write() is used because writelines() would iterate a single string character by character.\n",
	"# sorted() gives the same file on every run; set iteration order is arbitrary.\n",
	"with open(filename, \"w\") as splicing_file:\n",
	"    splicing_file.write(\"\\n\".join(sorted(transcript_genes)))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 62,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"1484"
	]
	},
	"execution_count": 62,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# Number of distinct genes in transcript_genes (a set needs no list copy to be counted).\n",
	"len(transcript_genes)"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.7.4"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}