Skip to content

Commit

Permalink
initial commit diffbrainnet
Browse files Browse the repository at this point in the history
  • Loading branch information
ngerstner committed Mar 4, 2022
0 parents commit 8340144
Show file tree
Hide file tree
Showing 137 changed files with 13,000 additions and 0 deletions.
Binary file added 01_DiffExp/.DS_Store
Binary file not shown.
223 changes: 223 additions & 0 deletions 01_DiffExp/00_functions.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,223 @@
##################################################
## Project: DexStim Mouse Brain
## Date: 22.07.2020
## Author: Nathalie
###################################################
# RNA-seq Analysis. Functions that are needed more often

# Outlier detection on dex and baseline samples separately
outlier_removal <- function(dds) {
# Dex samples
outlier <- TRUE
iter <- 1
while (outlier) {
# Run PCA
dds_dex <- dds[, colData(dds)$Dex == 1]
dds_dex <- estimateSizeFactors(dds_dex)
vsd_dex <- vst(dds_dex, blind = TRUE)
pc_dex <-
as.data.frame(pca(vsd_dex@assays@data[[1]], removeVar = 0.2)$rotated)
pc_dex$sample <- rownames(pc_dex)
splom(as.data.frame(pc_dex[, 1:10]),
cex = 2, pch = '*')

# Label any sample which is more than 2.5SD away from the mean in PC1 as outlier
outlier_dex <-
which(abs(pc_dex[, 1] - mean(pc_dex[, 1])) > (2.5 * sd(pc_dex[, 1])))
pc_dex$outlier <- FALSE
pc_dex$outlier[outlier_dex] <- TRUE

ggplot(pc_dex, aes(
x = PC1,
y = PC2,
color = outlier,
label = sample
)) +
geom_point(size = 3) +
geom_text_repel() +
xlab("PC1") +
ylab("PC2") +
coord_fixed() +
ggtitle("PCA: Dex Samples")
ggsave(paste0(prefix_plots, "outlier_Dex_iter", iter, ".png"))

# Subset deseq object / remove outliers
keep_names <-
setdiff(colData(dds)$Sample_ID, pc_dex$sample[outlier_dex])
dds <- dds[, keep_names]
iter <- iter + 1
if (length(outlier_dex) == 0) {
outlier <- FALSE
}
}

# Baseline samples
outlier <- TRUE
iter <- 1
while (outlier) {
# Run PCA
dds_base <- dds[, colData(dds)$Dex == 0]
dds_base <- estimateSizeFactors(dds_base)
vsd_base <- vst(dds_base, blind = TRUE)
pc_base <-
as.data.frame(pca(vsd_base@assays@data[[1]], removeVar = 0.2)$rotated)
pc_base$sample <- rownames(pc_base)
splom(as.data.frame(pc_base[, 1:10]),
cex = 2, pch = '*')

# Label any sample which is more than 2.5SD away from the mean in PC1 as outlier
outlier_base <-
which(abs(pc_base[, 1] - mean(pc_base[, 1])) > (2.5 * sd(pc_base[, 1])))
pc_base$outlier <- FALSE
pc_base$outlier[outlier_base] <- TRUE

ggplot(pc_base, aes(
x = PC1,
y = PC2,
color = outlier,
label = sample
)) +
geom_point(size = 3) +
geom_text_repel() +
xlab("PC1") +
ylab("PC2") +
coord_fixed() +
ggtitle("PCA: Baseline Samples")
ggsave(paste0(prefix_plots, "outlier_Baseline_iter", iter, ".png"))

# Subset deseq object / remove outliers
keep_names <-
setdiff(colData(dds)$Sample_ID, pc_base$sample[outlier_base])
dds <- dds[, keep_names]
iter <- iter + 1
if (length(outlier_base) == 0) {
outlier <- FALSE
}
}

# Rerun normalization etc on combined dataset
dds <- estimateSizeFactors(dds)
vsd <- vst(dds, blind = TRUE)
pc <-
as.data.frame(pca(vsd@assays@data[[1]], removeVar = 0.2)$rotated)
pc$Dex <- colData(dds)$Dex

ggplot(pc, aes(x = PC1, y = PC2, color = Dex)) +
geom_point(size = 3) +
xlab("PC1") +
ylab("PC2") +
coord_fixed() +
ggtitle("PCA: Outlier Deleted")
ggsave(paste0(prefix_plots, "outlierDeleted.png"))

png(
filename = paste0(prefix_plots, "pairsplot_outlierDeleted.png"),
width = 900,
height = 900
)
print(splom(
as.data.frame(pc[, 1:10]),
col = colData(dds)$Dex,
cex = 2,
pch = '*'
))
dev.off()

# add new PCs to colData
colData(dds)[, c(26:35)] <- pc[, c(1:10)]

return(dds)
}



# Batch identification with SVA
batch_sva <- function(dds){
#possible batch effects
#Explanations: Sample_ID is not a batch, Animal is same as Mouse_ID, mouse_weight.g. is linearly correlated
#with Injection_volume, also colinearity between Researcher, date_of_punching and cryostat,
#sample wells and indices are covered by plate. lane and row ...
# --> maybe remove conc.RNA.ng.ml and vol.for.25.ng.input --> colinearity to Dex? --> should not be removed
pos_batch <- c("Dex", "Injection_volume", "date_of_punching",
"Plate", "Lane", "Row", "RIN", "RIN2")
cov_pc <- colData(dds)[,c(pos_batch,paste0("PC", seq(1:10)))]


# 1. Surrogate Variable Analysis (SVA) ----
mod <- model.matrix(~ Dex, colData(dds))
mod0 <- model.matrix(~ 1, colData(dds))
# Calculate SVs (on normalized counts --> according to documentation)
# Comment: don't use num.sv function, apparently it's only for microarray data
norm <- counts(dds, normalized = TRUE)
svobj <- svaseq(norm, mod, mod0)
n.sv <- svobj$n.sv #Number of significant surrogate variables is
# Add significant SVs to covariates
coln <- colnames(cov_pc)
cov_pc <- cbind(cov_pc,svobj$sv[,1:n.sv])
colnames(cov_pc) <- c(coln, paste0("SV", seq(1:n.sv)))


# 2. Canonical Correlation Analysis (CCA) ----
form <- as.formula(paste0("~ Dex +
Injection_volume +
date_of_punching +
Plate + Row + Lane +
RIN + RIN2 +
PC1+PC2+PC3+PC4+PC5+PC6+PC7+PC8+PC9+PC10+",
paste(paste0("SV", seq(1:n.sv)), collapse = "+")))

# Calculate the correlation coefficients
C <- canCorPairs(form, cov_pc)
# Plot the results using Canonical correlation
png(filename = paste0(prefix_plots, "cca_sorted.png"), width = 800, height = 800)
plotCorrMatrix(C)
dev.off()
png(filename = paste0(prefix_plots, "cca_unsorted.png"), width = 800, height = 800)
plotCorrMatrix(C, sort = FALSE)
dev.off()


# 3. P-values for correlations
pval_corr <- matrix(1, nrow = 10 + n.sv, ncol = ncol(cov_pc) - (10 + n.sv))
rownames(pval_corr) <-
c(paste0("PC", seq(1:10)), paste0("SV", seq(1:n.sv)))
colnames(pval_corr) <- pos_batch
# Calc pvalue for factor covariates using ANOVA
for (cov in names(Filter(is.factor, cov_pc))){
for (v in c(paste0("PC", seq(1:10)), paste0("SV", seq(1:n.sv)))) {
f <- paste0(v, "~", cov)
p <- summary(aov(as.formula(f), data = cov_pc))[[1]]$`Pr(>F)`[[1]]
pval_corr[v, cov] <- p
}
}
# Calc pvalues for numeric covariates using linear model
for (cov in names(Filter(is.numeric, cov_pc[,1:(ncol(cov_pc)-(10+n.sv))]))){
for (v in c(paste0("PC", seq(1:10)), paste0("SV", seq(1:n.sv)))) {
f <- paste0(v, "~", cov)
p <- summary(lm(as.formula(f), data = cov_pc))$coefficients[2,4]
pval_corr[v, cov] <- p
}
}

# Plot p-values
pval_corr
pheatmap(pval_corr, cluster_rows = FALSE)
pheatmap(pval_corr)
pheatmap(pval_corr, cluster_rows = FALSE,
filename = paste0(prefix_plots, "batchevaluation_pval.png"))
dev.off()

return(list("cov_pc" = cov_pc, "n.sv" = n.sv))
}


# Print data in correct format for kimono (at least for this region)
print_kimono <- function(vsd, cov_data){

# Write vst transformed data to file
write.table(assay(vsd), file=paste0(prefix_tables, "expression_vsd.txt"),sep="\t", quote = F)

# Write biological variables and SVs to file
biol <- cov_data[,c("Dex", "Region", grep(colnames(colData(ddssva)), pattern="SV", value=T))]
write.table(biol, file=paste0(prefix_tables, "bio_variables.txt"),sep="\t", quote = F)
}
Loading

0 comments on commit 8340144

Please sign in to comment.