Skip to content
Permalink
master
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
{
"cells": [
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Jelena's Fake Data\n",
"# Add a gene_type column to the count table using the gene_name -> gene_type\n",
"# pairs found in the annotation GTF, then save the table as CSV.\n",
"\n",
"def _attribute_value(attributes, key):\n",
"    \"\"\"Return the text between the first pair of double quotes after the first `key`.\"\"\"\n",
"    return attributes.split(key)[1].split('\"')[1]\n",
"\n",
"\n",
"# The first five lines of the GTF are skipped; column 8 holds the attribute text.\n",
"filename = \"/project/owlmayerTemporary/Sid/nanopore-analysis/ReferenceData/gencode.v32.primary_assembly.annotation.sorted.gtf\"\n",
"df_all = pd.read_csv(filename, skiprows=5, header=None, sep=\"\\t\")\n",
"info = df_all[8].tolist()\n",
"\n",
"# Every row contributes; when a gene_name occurs on several rows the last row wins.\n",
"gene_genetypes = {\n",
"    _attribute_value(attributes, \"gene_name\"): _attribute_value(attributes, \"gene_type\")\n",
"    for attributes in info\n",
"}\n",
"\n",
"# Rows of the count table that contain any missing value are dropped before the lookup.\n",
"filename = \"/project/owlmayerTemporary/Sid/nanopore-analysis/Results_5_1/Quantification/all_counts_deseq2norm.txt\"\n",
"df_all = pd.read_csv(filename, sep=\",\").dropna()\n",
"# A gene_name that is absent from the annotation raises KeyError here.\n",
"df_all[\"gene_type\"] = df_all[\"gene_name\"].apply(lambda name: gene_genetypes[name])\n",
"\n",
"df_all.to_csv(\"/project/owlmayerTemporary/Sid/nanopore-analysis/Results_5_1/Quantification/all_counts_deseq2norm_jelena.csv\", index=False)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"# Load ASAnalysis.csv as a list of raw lines (line endings are kept by readlines).\n",
"filename = \"/project/owlmayerTemporary/Sid/mass_spec/protein_seq_db/ASAnalysis.csv\"\n",
"# The context manager closes the file even if reading fails part-way through.\n",
"with open(filename, \"r\") as as_analysis_file:\n",
"    as_lines_all = as_analysis_file.readlines()"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['id', 'isoform_id', 'ES', 'ES_genomic_start', 'ES_genomic_end', 'MEE', 'MEE_genomic_start', 'MEE_genomic_end', 'MES', 'MES_genomic_start', 'MES_genomic_end', 'IR', 'IR_genomic_start', 'IR_genomic_end', 'A5', 'A5_genomic_start', 'A5_genomic_end', 'A3', 'A3_genomic_start', 'A3_genomic_end', 'ATSS', 'ATSS_genomic_start', 'ATSS_genomic_end', 'ATTS', 'ATTS_genomic_start', 'ATTS_genomic_end']\n"
]
}
],
"source": [
"# The first raw line gives the column names; the remaining lines are data rows.\n",
"as_lines = as_lines_all\n",
"header = as_lines[0].strip().split(\",\")\n",
"as_lines = as_lines[1:]\n",
"print(header)\n",
"# A3SS, A5SS, MXE, RI, SE \n",
"se_lines = []\n",
"# Second comma-separated field of every data row (isoform_id in the header printed by this cell).\n",
"tcons_list = [row.strip().split(\",\")[1] for row in as_lines]\n",
"\n",
"# Disabled draft for the SE / ES columns, kept for reference:\n",
"# for line in as_lines:\n",
"#     line = line.strip().split(\",\")\n",
"#     for i in range(int(line[header.index(\"ES\")])):\n",
"#         start = line[header.index(\"ES_genomic_start\")].split(\";\")[i]\n",
"#         end = line[header.index(\"ES_genomic_end\")].split(\";\")[i]\n",
"#         print(start)\n",
"#     print(line)\n",
"#     break"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {},
"outputs": [],
"source": [
"# Build two lookups from the \"transcript\" rows of the combined GTF:\n",
"#   tID_gene: value of attribute field 0 -> value of attribute field 2\n",
"#   gene_oID: value of attribute field 2 -> list of values of attribute field 3\n",
"# NOTE(review): the positional parsing presumably relies on the attribute order\n",
"# transcript_id; gene_id; gene_name; oId for rows that carry gene_name - confirm\n",
"# against the GffCompare output.\n",
"annotation_filename = \"/project/owlmayerTemporary/Sid/nanopore-analysis/Results_5_1/GffCompare/nanopore.combined_filt.gtf\"\n",
"annotate_df = pd.read_csv(annotation_filename, sep=\"\\t\", header=None)\n",
"annotate_df = annotate_df[annotate_df[2] == \"transcript\"]\n",
"annotate_lines = list(annotate_df[8])\n",
"chrms = list(annotate_df[0])\n",
"start = list(annotate_df[3])\n",
"stop = list(annotate_df[4])\n",
"\n",
"tID_gene = dict()\n",
"gene_oID = dict()\n",
"\n",
"for annotation in annotate_lines:\n",
"    # Rows whose attribute text does not mention gene_name are skipped entirely.\n",
"    if \"gene_name\" not in annotation:\n",
"        continue\n",
"\n",
"    line = annotation.split(\";\")\n",
"    # For each field: take the last space-separated token and drop its first and\n",
"    # last character (the surrounding quotes).\n",
"    tID = line[0].split(\" \")[-1][1:-1]\n",
"    gene = line[2].split(\" \")[-1][1:-1]\n",
"    oID = line[3].split(\" \")[-1][1:-1]\n",
"\n",
"    gene_oID.setdefault(gene, []).append(oID)\n",
"    tID_gene[tID] = gene"
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {},
"outputs": [],
"source": [
"# Collect the distinct genes of the transcripts listed in tcons_list.\n",
"# A transcript id missing from tID_gene raises KeyError.\n",
"transcript_genes = set()\n",
"for isoform_id in tcons_list:\n",
"    gene = tID_gene[isoform_id]\n",
"    # Truthy only when the gene has a non-empty list in gene_oID.\n",
"    if gene_oID.get(gene):\n",
"        transcript_genes.add(gene)"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {},
"outputs": [],
"source": [
"# Write the collected gene names to a text file, one per line (no trailing newline).\n",
"filename = \"/project/owlmayerTemporary/Sid/nanopore-analysis/Results_5_1/Lists/splicing_events.txt\"\n",
"# Sorting makes the file identical between runs; set iteration order is not stable.\n",
"# The context manager closes the file even if the write fails.\n",
"with open(filename, \"w\") as splicing_file:\n",
"    splicing_file.write(\"\\n\".join(sorted(transcript_genes)))"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1484"
]
},
"execution_count": 62,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Number of distinct gene names in transcript_genes.\n",
"len(transcript_genes)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}