Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
DiffBrainNet/01_DiffExp/01_setup.R
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
179 lines (148 sloc)
6.91 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
################################################## | |
## Project: DexStim Mouse Brain | |
## Date: 22.07.2020 | |
## Author: Nathalie | |
################################################### | |
# RNA-seq Analysis. Part 1. Setting up the environment | |
# This R script "setup" is for: | |
# 1. loading the nessesary packages | |
# 2. loading and cleaning data (metha and readcounts) | |
# 3. pre-filtering | |
# 4. perfom simple descriptive statistics | |
# set working directory to source file location | |
setwd("~/Documents/ownCloud/DexStim_RNAseq_Mouse") | |
# 1. Packages ---- | |
## Library Loadings | |
library(DESeq2) # DE analysis | |
library(limma) # DE analysis (and MAplots for DESeq2) | |
library(sva) # Surrogate variable analysis (SVA) | |
library(Biobase) | |
library(variancePartition) # canonical correlation analysis (CCA) | |
# 2. Loading and cleaning the data ---- | |
# Data pathways | |
db_metha <- "data/Covariates_RNAseq_Mouse_brain.csv" | |
db_data <- "data/06_featureCounts/20190313_Anthi_Mouse_Brain_Dex.fC" | |
# Loading | |
covariates <- read.csv(db_metha) | |
countdata <- read.csv(file = db_data, sep = "\t", header = TRUE, row.names = 1) # removed very first line before actual header | |
countdata <- countdata[,6:ncol(countdata)] #Remove first six columns (geneid, chr, start, end, strand, length) | |
ncol(countdata) #[1] 297 | |
nrow(countdata) #[1] 27998 | |
# Adjust the sample names | |
colnames(countdata) <- gsub("AK_S[0-9]*.cutadapt.extract.Aligned.sortedByCoord.out.dedup.bam", "", colnames(countdata)) | |
colnames(countdata) <- gsub("X05_umi_dedup.mpg_L[0-9]*_", "", colnames(countdata)) | |
colnames(countdata) <- sub("[.]", "_", colnames(countdata)) | |
colnames(countdata) <- sub("[.]", "", colnames(countdata)) | |
# Adjust the data in files (delete unknown or not-matching) | |
all_col_names <- colnames(countdata) | |
cov_row_names <- as.vector(covariates$Sample_ID) | |
outersect <- function(x, y) { #function to identify the non-shared items in two vectors | |
sort(c(setdiff(x, y), setdiff(y, x)))} | |
to_delete <- outersect(all_col_names, cov_row_names) | |
print(to_delete) | |
covariates <- covariates[!covariates$Sample_ID %in% to_delete,] | |
countdata <- countdata[,!colnames(countdata) %in% to_delete] | |
# 3. Pre-filtering ---- | |
# Convert to matrix | |
countdata <- as.matrix(countdata) | |
# 3.1 Gene-based filtering ---- | |
# Keep genes quantified in at least one full treatment group | |
region <- as.character(unique(covariates$Region)) | |
dex <- unique(covariates$Dex) | |
filtered_data <- matrix(nrow = nrow(countdata), ncol = ncol(countdata)) | |
# Go through all combinations of brain region and dex treatment | |
for (r in region){ | |
for (d in dex){ | |
sample_ids <- covariates$Sample_ID[covariates$Region == r & covariates$Dex == d] | |
# Subset counts to samples of this dex/region group | |
sub_counts <- countdata[,colnames(countdata)%in%sample_ids] | |
# Copy gene expression data if a gene is expressed in all samples of the dex/region group | |
for (k in 1:nrow(countdata)) { | |
if (length(which(sub_counts[k,]==0)) == 0){ | |
filtered_data[k,] <- countdata[k,] | |
} | |
} | |
} | |
} | |
colnames(filtered_data) <- colnames(countdata) | |
rownames(filtered_data) <- rownames(countdata) | |
filtered_data <- na.omit(filtered_data) | |
# 12976 genes/rows and 295 samples/columns | |
# 3.2 Sample-based filtering ---- | |
# Check 0 read counts (assess if there are samples with way too low counts --> indicating errors) | |
# Should be more than 60% | |
# 90% of the data present is roughly 11700 => 1300 may be zeros | |
#With this cut-off there are 291 samples left (4 are deleted) | |
# 80% of the data present is roughly 10400 => 2600 may be zeros | |
#With this cut-off there are 295 samples left (none are deleted) | |
# 70% of the data present is roughly 9100 => 3900 may be zeros | |
#With this cut-off there are 295 samples left (none are deleted) | |
delete_ids <- NULL | |
for (i in 1:ncol(filtered_data)){ | |
if (length(which(filtered_data[,i]==0)) > 3900){ | |
delete_ids <- c(delete_ids, colnames(filtered_data)[i]) | |
} | |
} | |
filtered_data <- filtered_data[,!colnames(filtered_data) %in% delete_ids] | |
# 4. Simple descriptive statistics (Iuliia) ---- | |
# Genes per brain region/condition table (data$Region, data$Dex) | |
## To report how many genes are left after pre-filtering | |
genes_per_reg_cond <- matrix(nrow = length(region), ncol = length(dex)) | |
rownames(genes_per_reg_cond) <- region | |
colnames(genes_per_reg_cond) <- dex | |
for (r in region){ | |
for (d in dex){ | |
sample_ids <- covariates$Sample_ID[covariates$Region == r & covariates$Dex == d] | |
sub_counts <- filtered_data[,colnames(filtered_data) %in% sample_ids] | |
sub_counts <- rbind(sub_counts, NA) | |
#keep the present genes number and calculate the median value of those | |
for (k in 1:ncol(sub_counts)) { | |
sub_counts[nrow(sub_counts), k] <- length(which(sub_counts[,k] > 0)) | |
} | |
genes_per_reg_cond[r, toString(d)] <- median(as.numeric(sub_counts[nrow(sub_counts),])) | |
} | |
} | |
## Adjust the tables again | |
#####!!!!!Check why countdata delets here | |
all_col_names <- colnames(countdata) | |
sample_filter_col_names <- colnames(filtered_data) | |
to_delete <- outersect(all_col_names,sample_filter_col_names) | |
covariates <- as.data.frame(covariates[!covariates$Sample_ID %in% to_delete,]) #data.frame for DESeq2 | |
countdata <- (countdata[,!colnames(countdata) %in% to_delete]) | |
data_row_names <- rownames(countdata) | |
sample_filter_row_names <- rownames(filtered_data) | |
to_delete <- outersect(data_row_names, sample_filter_row_names) | |
countdata <- as.matrix(countdata[!rownames(countdata) %in% to_delete,]) | |
backupcov<-covariates | |
#ncol(countdata) #295 | |
#nrow(countdata) #12976 | |
#Build a histogram of mouse weight | |
hist(as.numeric(covariates$mouse_weight.g.[covariates$Dex==0])) | |
hist(as.numeric(covariates$mouse_weight.g.[covariates$Dex==1])) | |
shapiro.test(as.numeric(covariates$mouse_weight.g.[covariates$Dex==0])) | |
shapiro.test(as.numeric(covariates$mouse_weight.g.[covariates$Dex==1])) | |
wilcox.test(as.numeric(covariates$mouse_weight.g.)~covariates$Dex) | |
ggplot(covariates,aes(x=mouse_weight.g., color=Dex, fill=Dex))+ | |
geom_histogram(alpha=0.6, binwidth = 0.5) | |
# Assign Conditions for the Data Analysis ---- | |
str(covariates) | |
covariates$Dex = as.factor(covariates$Dex) | |
covariates$Region = as.factor(covariates$Region) | |
covariates$Region <- relevel(covariates$Region, "CER") | |
covariates$Animal = as.factor(covariates$Animal) | |
covariates$Injection_volume <- as.factor(covariates$Injection_volume) | |
covariates$Researcher = as.factor(covariates$Researcher) | |
covariates$date_of_punching = as.factor(covariates$date_of_punching) | |
covariates$Plate = as.factor(covariates$Plate) | |
covariates$Lane = as.factor(covariates$Lane) | |
covariates$Row = as.factor(covariates$Row) | |
covariates$cryostat = as.factor(covariates$cryostat) | |
covariates$Sample_Well = as.factor(covariates$Sample_Well) | |
covariates$index = as.factor(covariates$index) | |
covariates$index2 = as.factor(covariates$index2) | |
covariates$I5_Index_ID = as.factor(covariates$I5_Index_ID) | |
covariates$I7_Index_ID = as.factor(covariates$I7_Index_ID) | |
# sort covariates table to have same order as countdata | |
rownames(covariates) <- covariates$Sample_ID | |
idx<-match(colnames(countdata),rownames(covariates)) | |
covariates<-covariates[idx,] | |