Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
DiffBrainNet/01_DiffExp/00_functions.R
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
223 lines (194 sloc)
7.03 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
################################################## | |
## Project: DexStim Mouse Brain | |
## Date: 22.07.2020 | |
## Author: Nathalie | |
################################################### | |
# RNA-seq Analysis. Functions that are needed more often | |
# Outlier detection on dex and baseline samples separately | |
outlier_removal <- function(dds) { | |
# Dex samples | |
outlier <- TRUE | |
iter <- 1 | |
while (outlier) { | |
# Run PCA | |
dds_dex <- dds[, colData(dds)$Dex == 1] | |
dds_dex <- estimateSizeFactors(dds_dex) | |
vsd_dex <- vst(dds_dex, blind = TRUE) | |
pc_dex <- | |
as.data.frame(pca(vsd_dex@assays@data[[1]], removeVar = 0.2)$rotated) | |
pc_dex$sample <- rownames(pc_dex) | |
splom(as.data.frame(pc_dex[, 1:10]), | |
cex = 2, pch = '*') | |
# Label any sample which is more than 2.5SD away from the mean in PC1 as outlier | |
outlier_dex <- | |
which(abs(pc_dex[, 1] - mean(pc_dex[, 1])) > (2.5 * sd(pc_dex[, 1]))) | |
pc_dex$outlier <- FALSE | |
pc_dex$outlier[outlier_dex] <- TRUE | |
ggplot(pc_dex, aes( | |
x = PC1, | |
y = PC2, | |
color = outlier, | |
label = sample | |
)) + | |
geom_point(size = 3) + | |
geom_text_repel() + | |
xlab("PC1") + | |
ylab("PC2") + | |
coord_fixed() + | |
ggtitle("PCA: Dex Samples") | |
ggsave(paste0(prefix_plots, "outlier_Dex_iter", iter, ".png")) | |
# Subset deseq object / remove outliers | |
keep_names <- | |
setdiff(colData(dds)$Sample_ID, pc_dex$sample[outlier_dex]) | |
dds <- dds[, keep_names] | |
iter <- iter + 1 | |
if (length(outlier_dex) == 0) { | |
outlier <- FALSE | |
} | |
} | |
# Baseline samples | |
outlier <- TRUE | |
iter <- 1 | |
while (outlier) { | |
# Run PCA | |
dds_base <- dds[, colData(dds)$Dex == 0] | |
dds_base <- estimateSizeFactors(dds_base) | |
vsd_base <- vst(dds_base, blind = TRUE) | |
pc_base <- | |
as.data.frame(pca(vsd_base@assays@data[[1]], removeVar = 0.2)$rotated) | |
pc_base$sample <- rownames(pc_base) | |
splom(as.data.frame(pc_base[, 1:10]), | |
cex = 2, pch = '*') | |
# Label any sample which is more than 2.5SD away from the mean in PC1 as outlier | |
outlier_base <- | |
which(abs(pc_base[, 1] - mean(pc_base[, 1])) > (2.5 * sd(pc_base[, 1]))) | |
pc_base$outlier <- FALSE | |
pc_base$outlier[outlier_base] <- TRUE | |
ggplot(pc_base, aes( | |
x = PC1, | |
y = PC2, | |
color = outlier, | |
label = sample | |
)) + | |
geom_point(size = 3) + | |
geom_text_repel() + | |
xlab("PC1") + | |
ylab("PC2") + | |
coord_fixed() + | |
ggtitle("PCA: Baseline Samples") | |
ggsave(paste0(prefix_plots, "outlier_Baseline_iter", iter, ".png")) | |
# Subset deseq object / remove outliers | |
keep_names <- | |
setdiff(colData(dds)$Sample_ID, pc_base$sample[outlier_base]) | |
dds <- dds[, keep_names] | |
iter <- iter + 1 | |
if (length(outlier_base) == 0) { | |
outlier <- FALSE | |
} | |
} | |
# Rerun normalization etc on combined dataset | |
dds <- estimateSizeFactors(dds) | |
vsd <- vst(dds, blind = TRUE) | |
pc <- | |
as.data.frame(pca(vsd@assays@data[[1]], removeVar = 0.2)$rotated) | |
pc$Dex <- colData(dds)$Dex | |
ggplot(pc, aes(x = PC1, y = PC2, color = Dex)) + | |
geom_point(size = 3) + | |
xlab("PC1") + | |
ylab("PC2") + | |
coord_fixed() + | |
ggtitle("PCA: Outlier Deleted") | |
ggsave(paste0(prefix_plots, "outlierDeleted.png")) | |
png( | |
filename = paste0(prefix_plots, "pairsplot_outlierDeleted.png"), | |
width = 900, | |
height = 900 | |
) | |
print(splom( | |
as.data.frame(pc[, 1:10]), | |
col = colData(dds)$Dex, | |
cex = 2, | |
pch = '*' | |
)) | |
dev.off() | |
# add new PCs to colData | |
colData(dds)[, c(26:35)] <- pc[, c(1:10)] | |
return(dds) | |
} | |
# Batch identification with SVA | |
batch_sva <- function(dds){ | |
#possible batch effects | |
#Explanations: Sample_ID is not a batch, Animal is same as Mouse_ID, mouse_weight.g. is linearly correlated | |
#with Injection_volume, also colinearity between Researcher, date_of_punching and cryostat, | |
#sample wells and indices are covered by plate. lane and row ... | |
# --> maybe remove conc.RNA.ng.ml and vol.for.25.ng.input --> colinearity to Dex? --> should not be removed | |
pos_batch <- c("Dex", "Injection_volume", "date_of_punching", | |
"Plate", "Lane", "Row", "RIN", "RIN2") | |
cov_pc <- colData(dds)[,c(pos_batch,paste0("PC", seq(1:10)))] | |
# 1. Surrogate Variable Analysis (SVA) ---- | |
mod <- model.matrix(~ Dex, colData(dds)) | |
mod0 <- model.matrix(~ 1, colData(dds)) | |
# Calculate SVs (on normalized counts --> according to documentation) | |
# Comment: don't use num.sv function, apparently it's only for microarray data | |
norm <- counts(dds, normalized = TRUE) | |
svobj <- svaseq(norm, mod, mod0) | |
n.sv <- svobj$n.sv #Number of significant surrogate variables is | |
# Add significant SVs to covariates | |
coln <- colnames(cov_pc) | |
cov_pc <- cbind(cov_pc,svobj$sv[,1:n.sv]) | |
colnames(cov_pc) <- c(coln, paste0("SV", seq(1:n.sv))) | |
# 2. Canonical Correlation Analysis (CCA) ---- | |
form <- as.formula(paste0("~ Dex + | |
Injection_volume + | |
date_of_punching + | |
Plate + Row + Lane + | |
RIN + RIN2 + | |
PC1+PC2+PC3+PC4+PC5+PC6+PC7+PC8+PC9+PC10+", | |
paste(paste0("SV", seq(1:n.sv)), collapse = "+"))) | |
# Calculate the correlation coefficients | |
C <- canCorPairs(form, cov_pc) | |
# Plot the results using Canonical correlation | |
png(filename = paste0(prefix_plots, "cca_sorted.png"), width = 800, height = 800) | |
plotCorrMatrix(C) | |
dev.off() | |
png(filename = paste0(prefix_plots, "cca_unsorted.png"), width = 800, height = 800) | |
plotCorrMatrix(C, sort = FALSE) | |
dev.off() | |
# 3. P-values for correlations | |
pval_corr <- matrix(1, nrow = 10 + n.sv, ncol = ncol(cov_pc) - (10 + n.sv)) | |
rownames(pval_corr) <- | |
c(paste0("PC", seq(1:10)), paste0("SV", seq(1:n.sv))) | |
colnames(pval_corr) <- pos_batch | |
# Calc pvalue for factor covariates using ANOVA | |
for (cov in names(Filter(is.factor, cov_pc))){ | |
for (v in c(paste0("PC", seq(1:10)), paste0("SV", seq(1:n.sv)))) { | |
f <- paste0(v, "~", cov) | |
p <- summary(aov(as.formula(f), data = cov_pc))[[1]]$`Pr(>F)`[[1]] | |
pval_corr[v, cov] <- p | |
} | |
} | |
# Calc pvalues for numeric covariates using linear model | |
for (cov in names(Filter(is.numeric, cov_pc[,1:(ncol(cov_pc)-(10+n.sv))]))){ | |
for (v in c(paste0("PC", seq(1:10)), paste0("SV", seq(1:n.sv)))) { | |
f <- paste0(v, "~", cov) | |
p <- summary(lm(as.formula(f), data = cov_pc))$coefficients[2,4] | |
pval_corr[v, cov] <- p | |
} | |
} | |
# Plot p-values | |
pval_corr | |
pheatmap(pval_corr, cluster_rows = FALSE) | |
pheatmap(pval_corr) | |
pheatmap(pval_corr, cluster_rows = FALSE, | |
filename = paste0(prefix_plots, "batchevaluation_pval.png")) | |
dev.off() | |
return(list("cov_pc" = cov_pc, "n.sv" = n.sv)) | |
} | |
# Print data in correct format for kimono (at least for this region) | |
print_kimono <- function(vsd, cov_data){ | |
# Write vst transformed data to file | |
write.table(assay(vsd), file=paste0(prefix_tables, "expression_vsd.txt"),sep="\t", quote = F) | |
# Write biological variables and SVs to file | |
biol <- cov_data[,c("Dex", "Region", grep(colnames(colData(ddssva)), pattern="SV", value=T))] | |
write.table(biol, file=paste0(prefix_tables, "bio_variables.txt"),sep="\t", quote = F) | |
} |