01_scRNA_preprocessing.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "e1b92587-9337-4c8b-ae0f-fc4a50a15a9b",
   "metadata": {},
   "source": [
    "# scRNA tutorial from scanpy\n"
   ]
  },
  {
   "cell_type": "raw",
   "id": "8bd3fb16-cb23-4b78-946e-c96cba29eb9a",
   "metadata": {},
   "source": [
    "!pip3 install scanpy anndata anndata2ri\n",
    "!pip3 install scrublet bbknn scanorama\n",
    "!pip3 install rpy2 logging\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "4335e122-6033-40c3-bf91-fac7f005e810",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "import os \n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import scipy as sp\n",
    "import seaborn as sn\n",
    "import matplotlib.pyplot as plt\n",
    "from matplotlib import rcParams\n",
    "from matplotlib import colors\n",
    "\n",
    "\n",
    "import scanpy as sc\n",
    "import anndata as ann\n",
    "import scrublet as scr #doublet detection\n",
    "import scanorama as scan #batch correction\n",
    "import scanpy.external as sce #external modules\n",
    "\n",
    "#R interface\n",
    "import rpy2.rinterface_lib.callbacks\n",
    "import logging\n",
    "from rpy2.robjects import pandas2ri\n",
    "import anndata2ri\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "7ca0343b-b877-4a11-9d97-fca83f8dd063",
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The rpy2.ipython extension is already loaded. To reload it, use:\n",
      "  %reload_ext rpy2.ipython\n",
      "-----\n",
      "anndata     0.10.5.post1\n",
      "scanpy      1.9.8\n",
      "-----\n",
      "PIL                 10.0.1\n",
      "anndata2ri          1.3.1\n",
      "annoy               NA\n",
      "asttokens           NA\n",
      "backcall            0.2.0\n",
      "cffi                1.15.1\n",
      "comm                0.1.4\n",
      "cycler              0.10.0\n",
      "cython_runtime      NA\n",
      "dateutil            2.8.2\n",
      "debugpy             1.8.0\n",
      "decorator           5.1.1\n",
      "executing           1.2.0\n",
      "fbpca               NA\n",
      "h5py                3.10.0\n",
      "igraph              0.11.3\n",
      "intervaltree        NA\n",
      "ipykernel           6.25.2\n",
      "jedi                0.19.0\n",
      "jinja2              3.1.3\n",
      "joblib              1.3.2\n",
      "kiwisolver          1.4.5\n",
      "leidenalg           0.10.2\n",
      "llvmlite            0.42.0\n",
      "markupsafe          2.1.5\n",
      "matplotlib          3.8.0\n",
      "mpl_toolkits        NA\n",
      "natsort             8.4.0\n",
      "numba               0.59.0\n",
      "numpy               1.26.0\n",
      "packaging           23.1\n",
      "pandas              2.1.1\n",
      "parso               0.8.3\n",
      "patsy               0.5.3\n",
      "pexpect             4.8.0\n",
      "pickleshare         0.7.5\n",
      "platformdirs        3.10.0\n",
      "prompt_toolkit      3.0.39\n",
      "psutil              5.9.5\n",
      "ptyprocess          0.7.0\n",
      "pure_eval           0.2.2\n",
      "pydev_ipython       NA\n",
      "pydevconsole        NA\n",
      "pydevd              2.9.5\n",
      "pydevd_file_utils   NA\n",
      "pydevd_plugins      NA\n",
      "pydevd_tracing      NA\n",
      "pygments            2.16.1\n",
      "pyparsing           3.1.1\n",
      "pytz                2023.3.post1\n",
      "rpy2                3.5.15\n",
      "scanorama           1.7.4\n",
      "scipy               1.11.2\n",
      "scrublet            NA\n",
      "seaborn             0.13.2\n",
      "session_info        1.0.0\n",
      "six                 1.16.0\n",
      "sklearn             1.4.1.post1\n",
      "sortedcontainers    2.4.0\n",
      "stack_data          0.6.2\n",
      "statsmodels         0.14.0\n",
      "texttable           1.7.0\n",
      "threadpoolctl       3.2.0\n",
      "tornado             6.3.3\n",
      "traitlets           5.10.0\n",
      "tzlocal             NA\n",
      "wcwidth             0.2.6\n",
      "zmq                 25.1.1\n",
      "-----\n",
      "IPython             8.15.0\n",
      "jupyter_client      8.3.1\n",
      "jupyter_core        5.3.1\n",
      "-----\n",
      "Python 3.11.0 (main, Jun  4 2023, 22:34:19) [GCC 11.3.0]\n",
      "Linux-5.15.0-76-generic-x86_64-with-glibc2.35\n",
      "-----\n",
      "Session information updated at 2024-03-07 20:31\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipykernel_875756/690074142.py:6: DeprecationWarning: The global conversion available with activate() is deprecated and will be removed in the next major release. Use a local converter.\n",
      "  anndata2ri.activate()\n"
     ]
    }
   ],
   "source": [
    "#Note: this can be commented out to get more verbose R output\n",
    "rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)\n",
    "\n",
    "# Automatically convert rpy2 outputs to pandas dataframes\n",
    "pandas2ri.activate()\n",
    "anndata2ri.activate()\n",
    "%load_ext rpy2.ipython\n",
    "\n",
    "plt.rcParams['figure.figsize']=(8,8) #rescale figures\n",
    "sc.settings.verbosity = 3\n",
    "#sc.set_figure_params(dpi=200, dpi_save=300)\n",
    "sc.logging.print_versions()\n",
    "\n",
    "sn.set_context(context='poster')\n",
    "\n",
    "## Of note, this notebook was created as part of a workshop, \n",
    "## so we use extra large legend texts in all seaborn plots. \n",
    "## You can set the context as well to 'talk' or 'paper'.\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bf9903dc-a88e-47e5-b4cc-a57cbb7a4542",
   "metadata": {},
   "source": [
    "**not able to install packages from R**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2b9a9301-be67-4fa0-93a1-a9f8d8a57c37",
   "metadata": {},
   "outputs": [],
   "source": [
    "## tried to install R package, but was not successful \n",
    "\n",
    "import rpy2.robjects.packages as rpackages\n",
    "\n",
    "# Define a list of R packages to install\n",
    "packnames = ['scran', 'RColorBrewer', 'DropletUtils']\n",
    "\n",
    "# Function to check if package is installed\n",
    "def install_r_packages(packnames):\n",
    "    # Import rpy2's package module\n",
    "    utils = rpackages.importr('utils')\n",
    "    # Select a mirror for R packages\n",
    "    utils.chooseCRANmirror(ind=1)  # select the first mirror in the list\n",
    "    # Convert list to string\n",
    "    packnames_str = ' '.join(packnames)\n",
    "    # Install R packages\n",
    "    utils.install_packages(packnames_str)\n",
    "\n",
    "# Call the function to install packages\n",
    "install_r_packages(packnames)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cf6544f2-defe-453c-884b-ba604c7a6316",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%R\n",
    "# Load libraries from correct lib Paths for my environment - ignore this!\n",
    "#.libPaths(.libPaths()[c(3,2,1)])\n",
    "\n",
    "# Load all the R libraries we will be using in the notebook\n",
    "library(scran)\n",
    "library(RColorBrewer)\n",
    "library(DropletUtils)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8fb6d0d7-c39a-424e-a580-4ada9e1b2296",
   "metadata": {},
   "source": [
    "## Set project file paths"
   ]
  },
  {
   "cell_type": "raw",
   "id": "2c8b2e31-564d-4913-a1bc-3f8243f0371a",
   "metadata": {},
   "source": [
    "File path to the raw data. They are usually stored at a different location than the rest of the project."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "84cfa14f-4b52-4aa0-b221-d0b2fb7f5740",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "file_path = '/nexus/posix0/MAGE-flaski/service/posit/home/sjiang/scRNA_analysis/data/'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "a9f45c6e-ab93-4091-998d-90175b1b5f0a",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "file_path_raw = file_path + '3k_PBMC/' # all processed data and anndata files\n",
    "data_dir = file_path + 'day1_beginner/data/' \n",
    "table_dir = file_path + 'day1_beginner/tables/' # all tabular data output, e.g. in .csv or .xls file format\n",
    "sc.settings.figdir = file_path + 'day1_beginner/figures/' # change figure path from POSIX path"
   ]
  },
  {
   "cell_type": "raw",
   "id": "20a5ac70-d5ef-4872-8b48-7ba90bcf35dc",
   "metadata": {},
   "source": [
    "Comment: When repeat analyses, it is helpful to set a date variable and add it to every figure and table (see datetime Python package).\n",
    "e.g., \n",
    "\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "from datetime import datetime\n",
    "\n",
    "# Generate some example data\n",
    "x = [1, 2, 3, 4, 5]\n",
    "y = [10, 15, 13, 18, 20]\n",
    "\n",
    "# Plot the data\n",
    "plt.plot(x, y)\n",
    "plt.xlabel('X-axis')\n",
    "plt.ylabel('Y-axis')\n",
    "\n",
    "# Get the current date\n",
    "current_date = datetime.now().strftime('%Y-%m-%d')\n",
    "\n",
    "# Add the date to the figure\n",
    "plt.text(0.2, 1.05, current_date, fontsize=13,\n",
    "         transform=plt.gca().transAxes,  ha='center'\n",
    "        )\n",
    "\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "66690d34-7618-47a9-b029-37d54b60feb8",
   "metadata": {},
   "source": [
    "# Read Data"
   ]
  },
  {
   "cell_type": "raw",
   "id": "cd8322d8-e01b-4d49-904c-131827bd7ab9",
   "metadata": {},
   "source": [
    "The dataset consists of 4k PBMCs (Human) provided by 10X Genomics. The data is an mtx directory with an mtx file (i.e. count matrix), two tsv files with barcodes (i.e. cell indices) and features (i.e. gene symbols). Scanpy unpacks the files (if the files are in gz archive format) and creates an anndata object with the read_10x_mtx function.\n",
    "\n",
    "The dataset is not filtered, yet."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "faabd40e-18cb-446f-af80-d7f30a381e67",
   "metadata": {
    "lines_to_next_cell": 2
   },
   "outputs": [],
   "source": [
    "file_path_raw = file_path_raw + 'raw_gene_bc_matrices/'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7517090c-0105-4a12-8643-9fd0cf50a974",
   "metadata": {},
   "outputs": [],
   "source": [
    "adata_raw = sc.read_10x_mtx(path=file_path_raw)\n",
    "adata_raw.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a09a8a27-4604-49f0-95ec-d7139a8028d7",
   "metadata": {},
   "outputs": [],
   "source": [
    "print('Total number of observations: {:d}'.format(adata_raw.n_obs))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "1785bcf8-8f49-483b-b3a4-e7375fda0be4",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'/nexus/posix0/MAGE-flaski/service/hpc/home/sjiang'"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pwd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "70433400-c250-4f2c-9eed-9efdeae81e3b",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "jupytext": {
   "cell_markers": "{{{,}}}"
  },
  "kernelspec": {
   "display_name": "Python 3.11.0",
   "language": "python",
   "name": "py3.11.0"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}