diff --git a/01_DiffExp/.DS_Store b/01_DiffExp/.DS_Store new file mode 100644 index 0000000..e017d4d Binary files /dev/null and b/01_DiffExp/.DS_Store differ diff --git a/01_DiffExp/00_functions.R b/01_DiffExp/00_functions.R new file mode 100644 index 0000000..fbfc69a --- /dev/null +++ b/01_DiffExp/00_functions.R @@ -0,0 +1,223 @@ +################################################## +## Project: DexStim Mouse Brain +## Date: 22.07.2020 +## Author: Nathalie +################################################### +# RNA-seq Analysis. Functions that are needed more often + +# Outlier detection on dex and baseline samples separately +outlier_removal <- function(dds) { + # Dex samples + outlier <- TRUE + iter <- 1 + while (outlier) { + # Run PCA + dds_dex <- dds[, colData(dds)$Dex == 1] + dds_dex <- estimateSizeFactors(dds_dex) + vsd_dex <- vst(dds_dex, blind = TRUE) + pc_dex <- + as.data.frame(pca(vsd_dex@assays@data[[1]], removeVar = 0.2)$rotated) + pc_dex$sample <- rownames(pc_dex) + splom(as.data.frame(pc_dex[, 1:10]), + cex = 2, pch = '*') + + # Label any sample which is more than 2.5SD away from the mean in PC1 as outlier + outlier_dex <- + which(abs(pc_dex[, 1] - mean(pc_dex[, 1])) > (2.5 * sd(pc_dex[, 1]))) + pc_dex$outlier <- FALSE + pc_dex$outlier[outlier_dex] <- TRUE + + ggplot(pc_dex, aes( + x = PC1, + y = PC2, + color = outlier, + label = sample + )) + + geom_point(size = 3) + + geom_text_repel() + + xlab("PC1") + + ylab("PC2") + + coord_fixed() + + ggtitle("PCA: Dex Samples") + ggsave(paste0(prefix_plots, "outlier_Dex_iter", iter, ".png")) + + # Subset deseq object / remove outliers + keep_names <- + setdiff(colData(dds)$Sample_ID, pc_dex$sample[outlier_dex]) + dds <- dds[, keep_names] + iter <- iter + 1 + if (length(outlier_dex) == 0) { + outlier <- FALSE + } + } + + # Baseline samples + outlier <- TRUE + iter <- 1 + while (outlier) { + # Run PCA + dds_base <- dds[, colData(dds)$Dex == 0] + dds_base <- estimateSizeFactors(dds_base) + vsd_base <- vst(dds_base, blind = TRUE) + pc_base <- + as.data.frame(pca(vsd_base@assays@data[[1]], removeVar = 0.2)$rotated) + pc_base$sample <- rownames(pc_base) + splom(as.data.frame(pc_base[, 1:10]), + cex = 2, pch = '*') + + # Label any sample which is more than 2.5SD away from the mean in PC1 as outlier + outlier_base <- + which(abs(pc_base[, 1] - mean(pc_base[, 1])) > (2.5 * sd(pc_base[, 1]))) + pc_base$outlier <- FALSE + pc_base$outlier[outlier_base] <- TRUE + + ggplot(pc_base, aes( + x = PC1, + y = PC2, + color = outlier, + label = sample + )) + + geom_point(size = 3) + + geom_text_repel() + + xlab("PC1") + + ylab("PC2") + + coord_fixed() + + ggtitle("PCA: Baseline Samples") + ggsave(paste0(prefix_plots, "outlier_Baseline_iter", iter, ".png")) + + # Subset deseq object / remove outliers + keep_names <- + setdiff(colData(dds)$Sample_ID, pc_base$sample[outlier_base]) + dds <- dds[, keep_names] + iter <- iter + 1 + if (length(outlier_base) == 0) { + outlier <- FALSE + } + } + + # Rerun normalization etc on combined dataset + dds <- estimateSizeFactors(dds) + vsd <- vst(dds, blind = TRUE) + pc <- + as.data.frame(pca(vsd@assays@data[[1]], removeVar = 0.2)$rotated) + pc$Dex <- colData(dds)$Dex + + ggplot(pc, aes(x = PC1, y = PC2, color = Dex)) + + geom_point(size = 3) + + xlab("PC1") + + ylab("PC2") + + coord_fixed() + + ggtitle("PCA: Outlier Deleted") + ggsave(paste0(prefix_plots, "outlierDeleted.png")) + + png( + filename = paste0(prefix_plots, "pairsplot_outlierDeleted.png"), + width = 900, + height = 900 + ) + print(splom( + as.data.frame(pc[, 1:10]), + col = colData(dds)$Dex, + cex = 2, + pch = '*' + )) + dev.off() + + # add new PCs to colData + colData(dds)[, c(26:35)] <- pc[, c(1:10)] + + return(dds) +} + + + +# Batch identification with SVA +batch_sva <- function(dds){ + #possible batch effects + #Explanations: Sample_ID is not a batch, Animal is same as Mouse_ID, mouse_weight.g. is linearly correlated + #with Injection_volume, also colinearity between Researcher, date_of_punching and cryostat, + #sample wells and indices are covered by plate. lane and row ... + # --> maybe remove conc.RNA.ng.ml and vol.for.25.ng.input --> colinearity to Dex? --> should not be removed + pos_batch <- c("Dex", "Injection_volume", "date_of_punching", + "Plate", "Lane", "Row", "RIN", "RIN2") + cov_pc <- colData(dds)[,c(pos_batch,paste0("PC", seq(1:10)))] + + + # 1. Surrogate Variable Analysis (SVA) ---- + mod <- model.matrix(~ Dex, colData(dds)) + mod0 <- model.matrix(~ 1, colData(dds)) + # Calculate SVs (on normalized counts --> according to documentation) + # Comment: don't use num.sv function, apparently it's only for microarray data + norm <- counts(dds, normalized = TRUE) + svobj <- svaseq(norm, mod, mod0) + n.sv <- svobj$n.sv #Number of significant surrogate variables is + # Add significant SVs to covariates + coln <- colnames(cov_pc) + cov_pc <- cbind(cov_pc,svobj$sv[,1:n.sv]) + colnames(cov_pc) <- c(coln, paste0("SV", seq(1:n.sv))) + + + # 2. Canonical Correlation Analysis (CCA) ---- + form <- as.formula(paste0("~ Dex + + Injection_volume + + date_of_punching + + Plate + Row + Lane + + RIN + RIN2 + + PC1+PC2+PC3+PC4+PC5+PC6+PC7+PC8+PC9+PC10+", + paste(paste0("SV", seq(1:n.sv)), collapse = "+"))) + + # Calculate the correlation coefficients + C <- canCorPairs(form, cov_pc) + # Plot the results using Canonical correlation + png(filename = paste0(prefix_plots, "cca_sorted.png"), width = 800, height = 800) + plotCorrMatrix(C) + dev.off() + png(filename = paste0(prefix_plots, "cca_unsorted.png"), width = 800, height = 800) + plotCorrMatrix(C, sort = FALSE) + dev.off() + + + # 3. P-values for correlations + pval_corr <- matrix(1, nrow = 10 + n.sv, ncol = ncol(cov_pc) - (10 + n.sv)) + rownames(pval_corr) <- + c(paste0("PC", seq(1:10)), paste0("SV", seq(1:n.sv))) + colnames(pval_corr) <- pos_batch + # Calc pvalue for factor covariates using ANOVA + for (cov in names(Filter(is.factor, cov_pc))){ + for (v in c(paste0("PC", seq(1:10)), paste0("SV", seq(1:n.sv)))) { + f <- paste0(v, "~", cov) + p <- summary(aov(as.formula(f), data = cov_pc))[[1]]$`Pr(>F)`[[1]] + pval_corr[v, cov] <- p + } + } + # Calc pvalues for numeric covariates using linear model + for (cov in names(Filter(is.numeric, cov_pc[,1:(ncol(cov_pc)-(10+n.sv))]))){ + for (v in c(paste0("PC", seq(1:10)), paste0("SV", seq(1:n.sv)))) { + f <- paste0(v, "~", cov) + p <- summary(lm(as.formula(f), data = cov_pc))$coefficients[2,4] + pval_corr[v, cov] <- p + } + } + + # Plot p-values + pval_corr + pheatmap(pval_corr, cluster_rows = FALSE) + pheatmap(pval_corr) + pheatmap(pval_corr, cluster_rows = FALSE, + filename = paste0(prefix_plots, "batchevaluation_pval.png")) + dev.off() + + return(list("cov_pc" = cov_pc, "n.sv" = n.sv)) +} + + +# Print data in correct format for kimono (at least for this region) +print_kimono <- function(vsd, cov_data){ + + # Write vst transformed data to file + write.table(assay(vsd), file=paste0(prefix_tables, "expression_vsd.txt"),sep="\t", quote = F) + + # Write biological variables and SVs to file + biol <- cov_data[,c("Dex", "Region", grep(colnames(colData(ddssva)), pattern="SV", value=T))] + write.table(biol, file=paste0(prefix_tables, "bio_variables.txt"),sep="\t", quote = F) +} \ No newline at end of file diff --git a/01_DiffExp/01_setup.R b/01_DiffExp/01_setup.R new file mode 100644 index 0000000..cb7442e --- /dev/null +++ b/01_DiffExp/01_setup.R @@ -0,0 +1,179 @@ +################################################## +## Project: DexStim Mouse Brain +## Date: 22.07.2020 +## Author: Nathalie +################################################### +# RNA-seq Analysis. Part 1. Setting up the environment + +# This R script "setup" is for: +# 1. loading the nessesary packages +# 2. loading and cleaning data (metha and readcounts) +# 3. pre-filtering +# 4. perfom simple descriptive statistics + +# set working directory to source file location +setwd("~/Documents/ownCloud/DexStim_RNAseq_Mouse") + +# 1. Packages ---- + +## Library Loadings +library(DESeq2) # DE analysis +library(limma) # DE analysis (and MAplots for DESeq2) +library(sva) # Surrogate variable analysis (SVA) +library(Biobase) +library(variancePartition) # canonical correlation analysis (CCA) + +# 2. Loading and cleaning the data ---- + +# Data pathways +db_metha <- "data/Covariates_RNAseq_Mouse_brain.csv" +db_data <- "data/06_featureCounts/20190313_Anthi_Mouse_Brain_Dex.fC" + +# Loading +covariates <- read.csv(db_metha) +countdata <- read.csv(file = db_data, sep = "\t", header = TRUE, row.names = 1) # removed very first line before actual header +countdata <- countdata[,6:ncol(countdata)] #Remove first six columns (geneid, chr, start, end, strand, length) +ncol(countdata) #[1] 297 +nrow(countdata) #[1] 27998 + +# Adjust the sample names +colnames(countdata) <- gsub("AK_S[0-9]*.cutadapt.extract.Aligned.sortedByCoord.out.dedup.bam", "", colnames(countdata)) +colnames(countdata) <- gsub("X05_umi_dedup.mpg_L[0-9]*_", "", colnames(countdata)) +colnames(countdata) <- sub("[.]", "_", colnames(countdata)) +colnames(countdata) <- sub("[.]", "", colnames(countdata)) + +# Adjust the data in files (delete unknown or not-matching) +all_col_names <- colnames(countdata) +cov_row_names <- as.vector(covariates$Sample_ID) + +outersect <- function(x, y) { #function to identify the non-shared items in two vectors + sort(c(setdiff(x, y), setdiff(y, x)))} +to_delete <- outersect(all_col_names, cov_row_names) +print(to_delete) + +covariates <- covariates[!covariates$Sample_ID %in% to_delete,] +countdata <- countdata[,!colnames(countdata) %in% to_delete] + +# 3. Pre-filtering ---- + +# Convert to matrix +countdata <- as.matrix(countdata) + +# 3.1 Gene-based filtering ---- +# Keep genes quantified in at least one full treatment group +region <- as.character(unique(covariates$Region)) +dex <- unique(covariates$Dex) +filtered_data <- matrix(nrow = nrow(countdata), ncol = ncol(countdata)) + +# Go through all combinations of brain region and dex treatment +for (r in region){ + for (d in dex){ + sample_ids <- covariates$Sample_ID[covariates$Region == r & covariates$Dex == d] + # Subset counts to samples of this dex/region group + sub_counts <- countdata[,colnames(countdata)%in%sample_ids] + # Copy gene expression data if a gene is expressed in all samples of the dex/region group + for (k in 1:nrow(countdata)) { + if (length(which(sub_counts[k,]==0)) == 0){ + filtered_data[k,] <- countdata[k,] + } + } + } +} +colnames(filtered_data) <- colnames(countdata) +rownames(filtered_data) <- rownames(countdata) +filtered_data <- na.omit(filtered_data) +# 12976 genes/rows and 295 samples/columns + + +# 3.2 Sample-based filtering ---- +# Check 0 read counts (assess if there are samples with way too low counts --> indicating errors) +# Should be more than 60% + +# 90% of the data present is roughly 11700 => 1300 may be zeros +#With this cut-off there are 291 samples left (4 are deleted) +# 80% of the data present is roughly 10400 => 2600 may be zeros +#With this cut-off there are 295 samples left (none are deleted) +# 70% of the data present is roughly 9100 => 3900 may be zeros +#With this cut-off there are 295 samples left (none are deleted) + +delete_ids <- NULL + +for (i in 1:ncol(filtered_data)){ + if (length(which(filtered_data[,i]==0)) > 3900){ + delete_ids <- c(delete_ids, colnames(filtered_data)[i]) + } +} + +filtered_data <- filtered_data[,!colnames(filtered_data) %in% delete_ids] + + +# 4. Simple descriptive statistics (Iuliia) ---- + +# Genes per brain region/condition table (data$Region, data$Dex) +## To report how many genes are left after pre-filtering +genes_per_reg_cond <- matrix(nrow = length(region), ncol = length(dex)) +rownames(genes_per_reg_cond) <- region +colnames(genes_per_reg_cond) <- dex +for (r in region){ + for (d in dex){ + sample_ids <- covariates$Sample_ID[covariates$Region == r & covariates$Dex == d] + sub_counts <- filtered_data[,colnames(filtered_data) %in% sample_ids] + sub_counts <- rbind(sub_counts, NA) + #keep the present genes number and calculate the median value of those + for (k in 1:ncol(sub_counts)) { + sub_counts[nrow(sub_counts), k] <- length(which(sub_counts[,k] > 0)) + } + genes_per_reg_cond[r, toString(d)] <- median(as.numeric(sub_counts[nrow(sub_counts),])) + } +} + +## Adjust the tables again +#####!!!!!Check why countdata delets here +all_col_names <- colnames(countdata) +sample_filter_col_names <- colnames(filtered_data) +to_delete <- outersect(all_col_names,sample_filter_col_names) +covariates <- as.data.frame(covariates[!covariates$Sample_ID %in% to_delete,]) #data.frame for DESeq2 +countdata <- (countdata[,!colnames(countdata) %in% to_delete]) + +data_row_names <- rownames(countdata) +sample_filter_row_names <- rownames(filtered_data) +to_delete <- outersect(data_row_names, sample_filter_row_names) +countdata <- as.matrix(countdata[!rownames(countdata) %in% to_delete,]) +backupcov<-covariates +#ncol(countdata) #295 +#nrow(countdata) #12976 + +#Build a histogram of mouse weight +hist(as.numeric(covariates$mouse_weight.g.[covariates$Dex==0])) +hist(as.numeric(covariates$mouse_weight.g.[covariates$Dex==1])) +shapiro.test(as.numeric(covariates$mouse_weight.g.[covariates$Dex==0])) +shapiro.test(as.numeric(covariates$mouse_weight.g.[covariates$Dex==1])) +wilcox.test(as.numeric(covariates$mouse_weight.g.)~covariates$Dex) +ggplot(covariates,aes(x=mouse_weight.g., color=Dex, fill=Dex))+ + geom_histogram(alpha=0.6, binwidth = 0.5) + + +# Assign Conditions for the Data Analysis ---- +str(covariates) +covariates$Dex = as.factor(covariates$Dex) +covariates$Region = as.factor(covariates$Region) +covariates$Region <- relevel(covariates$Region, "CER") +covariates$Animal = as.factor(covariates$Animal) +covariates$Injection_volume <- as.factor(covariates$Injection_volume) +covariates$Researcher = as.factor(covariates$Researcher) +covariates$date_of_punching = as.factor(covariates$date_of_punching) +covariates$Plate = as.factor(covariates$Plate) +covariates$Lane = as.factor(covariates$Lane) +covariates$Row = as.factor(covariates$Row) +covariates$cryostat = as.factor(covariates$cryostat) +covariates$Sample_Well = as.factor(covariates$Sample_Well) +covariates$index = as.factor(covariates$index) +covariates$index2 = as.factor(covariates$index2) +covariates$I5_Index_ID = as.factor(covariates$I5_Index_ID) +covariates$I7_Index_ID = as.factor(covariates$I7_Index_ID) + +# sort covariates table to have same order as countdata +rownames(covariates) <- covariates$Sample_ID +idx<-match(colnames(countdata),rownames(covariates)) +covariates<-covariates[idx,] + diff --git a/01_DiffExp/02_deseq_singleRegion.R b/01_DiffExp/02_deseq_singleRegion.R new file mode 100644 index 0000000..71d9b69 --- /dev/null +++ b/01_DiffExp/02_deseq_singleRegion.R @@ -0,0 +1,512 @@ +################################################## +## Project: DexStim Mouse Brain +## Date: 03.08.2020 +## Author: Nathalie +################################################## +# RNA-seq Analysis. Part 2. Analysis with DESeq2 +# Brain region AMYGDALA +# !!!!This is the right script, use this one!!!! + +setwd("~/Documents/ownCloud/DexStim_RNAseq_Mouse") + +# 1. load data and packages ---- +source("scripts/01_DiffExp/01_setup_NG.R") +source("scripts/01_DiffExp/00_functions.R") +library(MASS) +library(pheatmap) +library(dplyr) +library(tidyverse) +library(RColorBrewer) +library(lattice) +library(EnhancedVolcano) +library(magrittr) +library(PCAtools) +library(org.Mm.eg.db) + +region <- "PFC" +prefix_plots <- paste0("figures/02_", region,"_deseq2_") +prefix_tables <- paste0("tables/02_",region,"_deseq2_") + +# 1. Subset data to region of interest ---- +Rcovariates <- droplevels(subset(covariates, Region==region)) +Rcountdata <- countdata[,rownames(Rcovariates)] + + +# 2. Create DESeq object ---- +all(Rcovariates$Sample_ID %in% colnames(Rcountdata)) +dds <- DESeqDataSetFromMatrix(countData = Rcountdata, + colData = Rcovariates, + design = ~ Dex) + +#Filter out genes that are not expressed in this brain region +keep <- rowSums(counts(dds)) >= 10 +table(keep) +dds <- dds[keep,] + + +# 3. Normalization, box-plots and MA-plots ---- +#Sequencing depth normalization +dds <- estimateSizeFactors(dds) + +# Extract counts and size factors +norm_counts <- counts(dds, normalized = TRUE) +raw_counts <- counts(dds, normalized = FALSE) + + +# Box-plots before and after the normalization +png(filename = paste0(prefix_plots, "readCounts_nonnormalized.png")) +bp_before <- boxplot(log2(counts(dds)+1), notch=TRUE, + main = "Non-normalized read counts", + ylab="log2(read counts)", cex=.6) +dev.off() + +png(filename = paste0(prefix_plots, "readCounts_normalized.png")) +bp_after <- boxplot(log2(counts(dds, normalize = TRUE)+1), notch=TRUE, + main = "Size-factor-normalized read counts", + ylab="log2(read counts)", cex=.6) +dev.off() + + +# 4. Unsupervised Clustering Analysis and Batch Correction ---- +# 4.1 Transformation ---- +# 4.1.1 VST +# Variance Stabilizing Transformation (VST) - log transformation moderates the variance accross the mean +vsd <- vst(dds, blind = TRUE) + +# 4.1.2 compare log2+1 and vst by plotting first sample against second for each method +df <- bind_rows( + as.data.frame(log2(counts(dds, normalized=TRUE)[, 1:2]+1)) %>% + mutate(transformation = "log2(x + 1)"), + as.data.frame(assay(vsd)[, 1:2]) %>% mutate(transformation = "vst")) + +colnames(df)[1:2] <- c("x", "y") + +lvls <- c("log2(x + 1)", "vst") +df$transformation <- factor(df$transformation, levels=lvls) + +ggplot(df, aes(x = x, y = y)) + geom_hex(bins = 80) + + coord_fixed() + facet_grid( . ~ transformation) + + +# 4.2 Sample correlations ------------- +vsd_cor_val <- cor(assay(vsd)) +# Observe the sample distances +vsd_cor_val_group <- dplyr::select(Rcovariates, Dex) +SampleOrder <- order(vsd_cor_val_group$Dex) +pheatmap(vsd_cor_val[,SampleOrder], + annotation_col = vsd_cor_val_group) +pheatmap(vsd_cor_val[,SampleOrder], + annotation_col = vsd_cor_val_group, + filename = paste0(prefix_plots, "samplecorrelations.png")) +dev.off() + + +# 5. PCA analysis and plots ---------------- +# calculate PCs and add to summary table +pc <- pca(vsd@assays@data[[1]], removeVar = 0.2) +colData(dds) <- DataFrame(cbind(colData(dds), pc$rotated[,c(1:10)] )) +head(colData(dds)) + +# PCA plots +percentVar <- round(pc$variance[1:10], digits = 1) +# Dex +ggplot(as.data.frame(colData(dds)), aes(x = PC1, y = PC2, color = Dex)) + + geom_point(size =3) + + xlab(paste0("PC1: ", percentVar[1], "% variance")) + + ylab(paste0("PC2: ", percentVar[2], "% variance")) + + coord_fixed() + + ggtitle("PCA with VST data") +ggsave(paste0(prefix_plots, "pca_Dex.png")) +# possible noise factors +# mouse weight +ggplot(as.data.frame(colData(dds)), aes(x = PC1, y = PC2, color = `mouse_weight.g.`)) + + geom_point(size =3) + + xlab(paste0("PC1: ", percentVar[1], "% variance")) + + ylab(paste0("PC2: ", percentVar[2], "% variance")) + + coord_fixed() + + ggtitle("PCA with VST data") +ggsave(paste0(prefix_plots, "pca_MouseWeight.png")) +# injection volume +ggplot(as.data.frame(colData(dds)), aes(x = PC1, y = PC2, color = `Injection_volume`)) + + geom_point(size =3) + + xlab(paste0("PC1: ", percentVar[1], "% variance")) + + ylab(paste0("PC2: ", percentVar[2], "% variance")) + + coord_fixed() + + ggtitle("PCA with VST data") +ggsave(paste0(prefix_plots, "pca_InjectionVolume.png")) +# cryostat machine +ggplot(as.data.frame(colData(dds)), aes(x = PC1, y = PC2, color = cryostat)) + + geom_point(size =3) + + xlab(paste0("PC1: ", percentVar[1], "% variance")) + + ylab(paste0("PC2: ", percentVar[2], "% variance")) + + coord_fixed() + + ggtitle("PCA with VST data") +ggsave(paste0(prefix_plots, "pca_Cryostat.png")) +# researcher +ggplot(as.data.frame(colData(dds)), aes(x = PC1, y = PC2, color = `Researcher`)) + + geom_point(size =3) + + xlab(paste0("PC1: ", percentVar[1], "% variance")) + + ylab(paste0("PC2: ", percentVar[2], "% variance")) + + coord_fixed() + + ggtitle("PCA with VST data") +ggsave(paste0(prefix_plots, "pca_Researcher.png")) +# date of punching +ggplot(as.data.frame(colData(dds)), aes(x = PC1, y = PC2, color = `date_of_punching`)) + + geom_point(size =3) + + xlab(paste0("PC1: ", percentVar[1], "% variance")) + + ylab(paste0("PC2: ", percentVar[2], "% variance")) + + coord_fixed() + + ggtitle("PCA with VST data") +ggsave(paste0(prefix_plots, "pca_DateOfPunching.png")) +# plate, lane, row +ggplot(as.data.frame(colData(dds)), aes(x = PC1, y = PC2, color = Plate)) + + geom_point(size =3) + + xlab(paste0("PC1: ", percentVar[1], "% variance")) + + ylab(paste0("PC2: ", percentVar[2], "% variance")) + + coord_fixed() + + ggtitle("PCA with VST data") +ggsave(paste0(prefix_plots, "pca_Plate.png")) +ggplot(as.data.frame(colData(dds)), aes(x = PC1, y = PC2, color = Lane, shape = Plate)) + + geom_point(size =3) + + xlab(paste0("PC1: ", percentVar[1], "% variance")) + + ylab(paste0("PC2: ", percentVar[2], "% variance")) + + coord_fixed() + + ggtitle("PCA with VST data") +ggsave(paste0(prefix_plots, "pca_LanePlate.png")) +ggplot(as.data.frame(colData(dds)), aes(x = PC1, y = PC2, color = Row, shape = Plate)) + + geom_point(size =3) + + xlab(paste0("PC1: ", percentVar[1], "% variance")) + + ylab(paste0("PC2: ", percentVar[2], "% variance")) + + coord_fixed() + + ggtitle("PCA with VST data") +ggsave(paste0(prefix_plots, "pca_RowPlate.png")) +# RIN and RIN2 +ggplot(as.data.frame(colData(dds)), aes(x = PC1, y = PC2, color = RIN)) + + geom_point(size =3) + + xlab(paste0("PC1: ", percentVar[1], "% variance")) + + ylab(paste0("PC2: ", percentVar[2], "% variance")) + + coord_fixed() + + ggtitle("PCA with VST data") +ggsave(paste0(prefix_plots, "pca_RIN.png")) +ggplot(as.data.frame(colData(dds)), aes(x = PC1, y = PC2, color = RIN2)) + + geom_point(size =3) + + xlab(paste0("PC1: ", percentVar[1], "% variance")) + + ylab(paste0("PC2: ", percentVar[2], "% variance")) + + coord_fixed() + + ggtitle("PCA with VST data") +ggsave(paste0(prefix_plots, "pca_RIN2.png")) + +#pairs plot +#Color Dex +png(filename = paste0(prefix_plots, "pcaPairsplot_Dex.png"), width = 900, height = 900) +print(splom(as.data.frame(pc$rotated[,1:10]), + col=colData(dds)$Dex, + cex=2,pch='*')) +dev.off() +#Color date +png(filename = paste0(prefix_plots, "pcaPairsplot_DateOfPunching.png"), width = 900, height = 900) +print(splom(as.data.frame(pc$rotated[,1:10]), + col=colData(dds)$date_of_punching, + cex=2,pch='*')) +dev.off() +#Color Researcher +png(filename = paste0(prefix_plots, "pcaPairsplot_Researcher.png"), width = 900, height = 900) +print(splom(as.data.frame(pc$rotated[,1:10]), + col=colData(dds)$Researcher, + cex=2,pch='*')) +dev.off() +#Color Cryostat +png(filename = paste0(prefix_plots, "pcaPairsplot_Cryostat.png"), width = 900, height = 900) +print(splom(as.data.frame(pc$rotated[,1:10]), + col=colData(dds)$cryostat, + cex=2,pch='*')) +dev.off() +#Color Plate +png(filename = paste0(prefix_plots, "pcaPairsplot_Plate.png"), width = 900, height = 900) +print(splom(as.data.frame(pc$rotated[,1:10]), + col=colData(dds)$Plate, + cex=2,pch='*')) +dev.off() +#Color Lane +png(filename = paste0(prefix_plots, "pcaPairsplot_Lane.png"), width = 900, height = 900) +print(splom(as.data.frame(pc$rotated[,1:10]), + col=colData(dds)$Lane, + cex=2,pch='*')) +dev.off() +#Color Row +png(filename = paste0(prefix_plots, "pcaPairsplot_Row.png"), width = 900, height = 900) +print(splom(as.data.frame(pc$rotated[,1:10]), + col=colData(dds)$Row, + cex=2,pch='*')) +dev.off() + + +#MDS plot (not necessary if we have the complete matrix, helpful if only distances are known) +sampleDists <- dist(t(assay(vsd))) +sampleDistMatrix <- as.matrix( sampleDists ) +mds <- as.data.frame(colData(vsd)) %>% + cbind(cmdscale(sampleDistMatrix)) +ggplot(mds, aes(x = `1`, y = `2`, color = Dex, label = Sample_ID)) + + geom_point(size = 3) + + geom_text_repel() + + coord_fixed() + ggtitle("MDS with VST data") +ggsave(paste0(prefix_plots, "mds_Dex.png")) + + +# 6. Outlier detection on dex and baseline samples separately --------------- +dds <- outlier_removal(dds) +dds <- estimateSizeFactors(dds) +vsd <- vst(dds, blind = TRUE) + + +# 7. Batch Identification with SVA (SVA,CCA) ---------------- +batch <- batch_sva(dds) +cov_pc <- batch$cov_pc +n.sv <- batch$n.sv + + +# 8. Corrected model --------------- +# 8.1 Add SVs to model design ----- +ddssva <- dds +colData(ddssva) <- cbind(colData(ddssva), cov_pc[,paste0("SV", seq(1:n.sv))]) +design(ddssva) <- as.formula(paste0("~ Dex +", paste(paste0("SV", seq(1:n.sv)), collapse = "+"))) + + +# 8.2 Variance Partition ---------- +form <- as.formula(paste0("~ ", paste0( grep(colnames(colData(ddssva)), pattern="SV", value=T), collapse=" + " ), + "+ (1|Dex)", collapse="")) +form + +# run variance partition +varPart <- fitExtractVarPartModel(assay(vsd), form, as.data.frame(colData(ddssva))) + +# plot variance partition (Violin plot of variance fraction for each gene and each variable) +png(paste0(prefix_plots, "variancePartition.png")) +plotVarPart(varPart) +dev.off() + +# plot percentage of variance explained by each variable +print("% variance explained by covariates:") +o <- colSums(varPart)*100/sum(varPart) +o +png(paste0(prefix_plots, "varPart_percentage.png")) +barplot(o, las=2) +dev.off() + + + +# 9. Differential Expression Analysis ------------ +# 9.1 Running DE analysis based on the Negative Binomial (aka Gamma-Poisson) distribution +# DESeq method performs three steps: +# a. estimation of size factors (controlling for differences in the sequencing depth) +# b. estimation of dispersion values for each gene +# c. fitting a generalized linear model +# ddssva$Dex <- relevel(ddssva$Dex, ref = "1") +ddssva <- DESeq(ddssva) +plotDispEsts(ddssva) + +# 9.2 Building the results table +# extract estimated log2 fold changes and p values +# without contrast: default is last variable in design +res <- results(ddssva, contrast=c("Dex","1","0")) +# positive logFC: high in dex, low in control using this contrast +# negative logFC: high in control, low in dex treated samples +res +summary(res) +# information on the meaning of the columns in res +mcols(res, use.names = TRUE) +# The first column, baseMean, is a just the average of the normalized count values, +# divided by the size factors, taken over all samples in the DESeqDataSet. +# The remaining four columns refer to a specific contrast, namely the comparison of +# the trt level over the untrt level for the factor variable dex. We will find out +# below how to obtain other contrasts. +# The column log2FoldChange is the effect size estimate. It tells us how much the +# gene’s expression seems to have changed due to treatment with dexamethasone in +# comparison to untreated samples. This value is reported on a logarithmic scale to +# base 2: for example, a log2 fold change of 1.5 means that the gene’s expression is +# increased by a multiplicative factor of 21.5≈2.82. +# Of course, this estimate has an uncertainty associated with it, which is available +# in the column lfcSE, the standard error estimate for the log2 fold change estimate. +# We can also express the uncertainty of a particular effect size estimate as the +# result of a statistical test. The purpose of a test for differential expression +# is to test whether the data provides sufficient evidence to conclude that this +# value is really different from zero. DESeq2 performs for each gene a hypothesis +# test to see whether evidence is sufficient to decide against the null hypothesis +# that there is zero effect of the treatment on the gene and that the observed +# difference between treatment and control was merely caused by experimental variability +# (i.e., the type of variability that you can expect between different samples in the +# same treatment group). As usual in statistics, the result of this test is reported +# as a p value, and it is found in the column pvalue. Remember that a p value indicates +# the probability that a fold change as strong as the observed one, or even stronger, +# would be seen under the situation described by the null hypothesis. + +# lower the FDR in result table +res.05 <- results(ddssva, contrast=c("Dex","1","0"), alpha = 0.05) +table(res.05$padj < 0.05) + +# raise the logFC threshold from 0 +resLFC1 <- results(ddssva, contrast=c("Dex","1","0"), lfcThreshold=1) +table(resLFC1$padj < 0.1) + +# #plot gene with smallest pvalue +# topGene <- rownames(res)[which.min(res$padj)] +# plotCounts(ddssva, gene = topGene, intgroup=c("Dex")) +# library("ggbeeswarm") +# geneCounts <- plotCounts(ddssva, gene = topGene, intgroup = c("Dex","Mouse_ID"), +# returnData = TRUE) +# ggplot(geneCounts, aes(x = Dex, y = count, color = Mouse_ID)) + +# scale_y_log10() + geom_beeswarm(cex = 3) + + +# 9.3 Shrink log2 fold changes +# In statistics, shrinkage is the reduction in the effects of sampling variation. +# In regression analysis, a fitted relationship appears to perform less well on a +# new data set than on the data set used for fitting.[1] In particular the value of +# the coefficient of determination 'shrinks'. This idea is complementary to overfitting +# and, separately, to the standard adjustment made in the coefficient of determination +# to compensate for the subjunctive effects of further sampling, like controlling for +# the potential of new explanatory terms improving the model by chance: that is, the +# adjustment formula itself provides "shrinkage." But the adjustment formula yields +# an artificial shrinkage. +# A shrinkage estimator is an estimator that, either explicitly or implicitly, incorporates +# the effects of shrinkage. In loose terms this means that a naive or raw estimate is +# improved by combining it with other information. The term relates to the notion that +# the improved estimate is made closer to the value supplied by the 'other information' +# than the raw estimate. In this sense, shrinkage is used to regularize ill-posed +# inference problems. (Source: Wikipedia) +resultsNames(ddssva) +res <- lfcShrink(ddssva, coef="Dex_1_vs_0", type = "apeglm") +res <- res[order(res$padj),] +head(res) +res_print <- as.data.table(res, keep.rownames = TRUE) %>% + dplyr::rename("Ensembl_ID" = "rn") +res_print$Gene_Symbol <- mapIds(org.Mm.eg.db, keys = res_print$Ensembl_ID, + keytype = "ENSEMBL", column="SYMBOL") +head(res_print) +write.table(res_print, file=paste0(prefix_tables, "Dex_1_vs_0_lfcShrink.txt"), + sep="\t", quote = F, row.names = FALSE) + +# 9.4 Plots +# MA plot (mean of normalized counts vs LFC) +png(paste0(prefix_plots, "MAplot_lfcShrink.png")) +DESeq2::plotMA(res,ylim = c(-5, 5)) +dev.off() +# MA plot without shrinkage +res.noshr <- results(ddssva, name="Dex_1_vs_0") +DESeq2::plotMA(res.noshr, ylim = c(-5, 5)) + +# volcano plot +png(paste0(prefix_plots, "volcanoplot_lfcShrink.png"), + width = 600, + height = 600) +print(EnhancedVolcano(res, + lab = rownames(res), + x = "log2FoldChange", + y = "pvalue", + title = paste0("DESeq2 results: ", region), + subtitle = "Differential expression")) +dev.off() + +volcano_data <- as.data.frame(res@listData) +volcano_data$sig <- as.factor(volcano_data$padj <= 0.1) +ggplot(volcano_data, aes(x=log2FoldChange, y=-log10(padj), color=sig)) + + geom_point() + + scale_color_manual(values = c("#FF9900", "lightgrey"), + breaks = c("TRUE", "FALSE"), + labels = c("padj <= 0.1", "padj > 0.1"), + name = "") + + xlab("log2(FoldChange)") + + ylab("-log10(padj)") + + theme_bw() + + theme(text = element_text(size= 20)) + + #legend.position = "bottomright") + + labs(title = element_text("Differential expression analysis \nwith DESeq2")) +ggsave(filename = paste0(prefix_plots, "volcanoplot_lfcShrink.svg"), + width = 8, height = 6) + +# # plot gene with smallest p value +# DESeq2::plotMA(res,ylim = c(-5, 5)) +# topGene <- rownames(res)[which.min(res$padj)] +# with(res[topGene, ], { +# points(baseMean, log2FoldChange, col="dodgerblue", cex=2, lwd=2) +# text(baseMean, log2FoldChange, topGene, pos=2, col="dodgerblue") +# }) +# +# # plot histogram of pvalue distribution +# hist(res$pvalue[res$baseMean > 1], breaks = 0:20/20, +# col = "grey50", border = "white") + + +# # plot gene expression of most variable genes as heatmap +# library("genefilter") +# topVarGenes <- head(order(rowVars(assay(vsd)), decreasing = TRUE), 20) +# mat <- assay(vsd)[ topVarGenes, ] +# mat <- mat - rowMeans(mat) +# anno <- as.data.frame(colData(vsd)[, "Dex"]) +# rownames(anno) <- colnames(mat) +# pheatmap(mat, annotation_col = anno) + + + + +# 10. Differential Expression Analysis with lfcThreshold (analog to treat function in limma) ------------ +# 10.1 Running DE analysis based on the Negative Binomial (aka Gamma-Poisson) distribution +#ddssva <- DESeq(ddssva) +#plotDispEsts(ddssva) + +# 10.2 Building the results table +# extract estimated log2 fold changes and p values +res <- results(ddssva, contrast=c("Dex","1","0"), lfcThreshold = 0.5) +# downregulation (negative logFC): high in dex 0, low in dex 1 using this contrast +res <- res[order(res$padj),] +res +summary(res) +# information on the meaning of the columns in res +mcols(res, use.names = TRUE) + + +# 10.3 Shrink log2 fold changes +# svalues: The adjusted p-values and s-values are similar but with a different +# definition of error. One focuses on falsely rejecting what are truly null genes, +# and the other on getting the sign of the LFC wrong. +res_shrink <- lfcShrink(ddssva, coef = "Dex_1_vs_0", type = "apeglm", lfcThreshold = 0.5) +res_shrink <- res_shrink[order(res_shrink$svalue),] +head(res_shrink) +res_shrink_print <- as.data.table(res_shrink, keep.rownames = TRUE) %>% + dplyr::rename("Ensembl_ID" = "rn") +res_shrink_print$Gene_Symbol <- mapIds(org.Mm.eg.db, keys = res_shrink_print$Ensembl_ID, + keytype = "ENSEMBL", column="SYMBOL") +head(res_shrink_print) +write.table(res_shrink_print, file=paste0(prefix_tables, "Dex_1_vs_0_lfcShrink_lfc0.5.txt"), + sep="\t", quote = F, row.names = FALSE) + +# 10.4 Plots +# MA plot (mean of normalized counts vs LFC) +png(paste0(prefix_plots, "MAplot_lfcShrink_lfc0.5.png")) +DESeq2::plotMA(res_shrink,ylim = c(-5, 5)) +dev.off() +# MA plot without shrinkage +DESeq2::plotMA(res, ylim = c(-5, 5)) + +# volcano plot +png(paste0(prefix_plots, "volcanoplot_lfcShrink_lfc0.5.png"), + width = 600, + height = 600) +print(EnhancedVolcano(res_shrink, + lab = rownames(res_shrink), + x = "log2FoldChange", + y = "svalue", + title = paste0("DESeq2 results: ", region), + subtitle = "Differential expression")) +dev.off() +# volcano plot without shrinkage +EnhancedVolcano(res, + lab = rownames(res), + x = "log2FoldChange", + y = "pvalue", + title = paste0("DESeq2 results: ", region), + subtitle = "Differential expression") + + +# 11. Print data in correct format for kimono ------------ +print_kimono(vsd, colData(ddssva)) diff --git a/01_DiffExp/06_comparison_deseq_regions.R b/01_DiffExp/06_comparison_deseq_regions.R new file mode 100644 index 0000000..122f429 --- /dev/null +++ b/01_DiffExp/06_comparison_deseq_regions.R @@ -0,0 +1,258 @@ +################################################## +## Project: DexStim Mouse Brain +## Date: 08.09.2020 +## Author: Nathalie +################################################## +# Compare deseq SV DE genes between regions + +setwd("~/Documents/ownCloud/DexStim_RNAseq_Mouse") + +library(rlist) +library(VennDiagram) +library(UpSetR) +library(RColorBrewer) +library(org.Mm.eg.db) + +# regions <- c("AMY", "PFC", "PVN", "CER", "vDG", "dDG", "vCA1", "dCA1", "vCA3", "dCA3") +# vCA3 and dCA3 were excluded +regions <- c("AMY", "PFC", "PVN", "CER", "vDG", "dDG", "vCA1", "dCA1") + +folder_plots <- paste0("figures") +folder_tables <- paste0("tables") + + +# 0. functions ------------------------------- +write_genelist <- function(genelist, filename){ + # write list with ENSEMBL IDs + write.table(genelist, file = paste0(folder_tables, "/06_",filename,".txt"), + quote = FALSE, row.names = FALSE, col.names = FALSE) + # write list with ENTREZ IDs + entrez <- mapIds(org.Mm.eg.db, keys = genelist, keytype = "ENSEMBL", column="ENTREZID") + write.table(entrez, file = paste0(folder_tables, "/06_",filename,"_entrezID.txt"), + quote = FALSE, row.names = FALSE, col.names = FALSE) + # write list with GENE SYMBOLS + symbol <- mapIds(org.Mm.eg.db, keys = genelist, keytype = "ENSEMBL", column="SYMBOL") + write.table(symbol, file = paste0(folder_tables, "/06_",filename,"_symbolID.txt"), + quote = FALSE, row.names = FALSE, col.names = FALSE) +} + + +# 1. read DE tables from all regions ---------- + +list_reg <- list() +list_reg_sig <- list() +list_genes_sig <- list() +list_genes_sig_up <- list() +list_genes_sig_down <- list() +background_genes <- list() + +for (reg in regions){ + res <- read.table(file=paste0(folder_tables, "/02_", reg, "_deseq2_Dex_1_vs_0_lfcShrink.txt"),sep="\t", + header = TRUE) + na_indices <- which(is.na(res$padj)) + res$padj[na_indices] <- 1 + list_reg[[reg]] <- res + res_sig <- res[res$padj <= 0.1,] + list_reg_sig[[reg]] <- res_sig + list_genes_sig[[reg]] <- res_sig$Ensembl_ID + list_genes_sig_up[[reg]] <- res_sig$Ensembl_ID[res_sig$log2FoldChange > 0] + list_genes_sig_down[[reg]] <- res_sig$Ensembl_ID[res_sig$log2FoldChange < 0] + background_genes[[reg]] <- res$Ensembl_ID +} + +background <- Reduce(intersect, background_genes) +write_genelist(background,"background") + +union_back <- Reduce(union, background_genes) + + +# 2. Overlap between different results -------------- + +# png(filename = paste0(folder_plots, "/06_comparison_deseq_upsetPlot_withCA3.png"), height = 800, width = 1000) +png(filename = paste0(folder_plots, "/06_comparison_deseq_upsetPlot.png"), height = 600, width = 1000) +print(upset(fromList(list_genes_sig), nsets = 8, order.by = "freq", + text.scale = c(1.8, 1.9, 1.8, 1.9, 1.9, 1.9), + sets.x.label = "#DE genes in brain region", + mainbar.y.label = "#DE genes in intersection")) +dev.off() + +pdf(file = paste0(folder_plots, "/06_comparison_deseq_upsetPlot.pdf"), height = 9, width = 14) +print(upset(fromList(list_genes_sig), nsets = 8, order.by = "freq", + text.scale = c(1.8, 1.9, 1.8, 1.9, 1.9, 1.9), + sets.x.label = "#DE genes in brain region", + mainbar.y.label = "#DE genes in intersection")) +dev.off() + +# # 3. Overlap between regions -------------------- +# # Regions AMY, CER, PFC and PVN +# venn.diagram(list(list_genes_sig[["AMY"]], list_genes_sig[["CER"]], list_genes_sig[["PFC"]], list_genes_sig[["PVN"]]), +# category.names = c("AMY", "CER", "PFC", "PVN"), +# filename = paste0(folder_plots, "/06_comparison_deseq_AMY_CER_PFC_PVN.png"), +# output = TRUE, +# imagetype="png" , +# height = 800, +# width = 800, +# lwd = 1, +# col=c("#284E5C","#288577","#73BA70","#EAE362"), +# fill = c(alpha("#284E5C",0.3), alpha("#288577",0.3), alpha("#73BA70",0.3), alpha("#EAE362", 0.3)), +# cex = 0.5, +# fontfamily = "sans", +# cat.cex = 0.3, +# cat.default.pos = "outer", +# cat.fontfamily = "sans", +# cat.col = c("#284E5C","#288577","#73BA70","#EAE362"), +# margin = 0.05) +# +# # Hippocampus ventral +# venn.diagram(list(list_genes_sig[["vDG"]], list_genes_sig[["vCA1"]], +# list_genes_sig[["vCA3"]]), +# category.names = c("vDG", "vCA1", "vCA3"), +# filename = paste0(folder_plots, "/06_comparison_deseq_HIPventral.png"), +# output = TRUE, +# imagetype="png" , +# height = 800, +# width = 800, +# lwd = 1, +# col=c("#284E5C","#288577","#73BA70"), +# fill = c(alpha("#284E5C",0.3), alpha("#288577",0.3), alpha("#73BA70",0.3)), +# cex = 0.5, +# fontfamily = "sans", +# cat.cex = 0.3, +# cat.default.pos = "outer", +# cat.fontfamily = "sans", +# cat.col = c("#284E5C","#288577","#73BA70"), +# margin = 0.05) +# +# # Hippocampus dorsal +# venn.diagram(list(list_genes_sig[["dDG"]], list_genes_sig[["dCA1"]], +# list_genes_sig[["dCA3"]]), +# category.names = c("dDG", "dCA1", "dCA3"), +# filename = paste0(folder_plots, "/06_comparison_deseq_HIPdorsal.png"), +# output = TRUE, +# imagetype="png" , +# height = 800, +# width = 800, +# lwd = 1, +# col=c("#284E5C","#288577","#73BA70"), +# fill = c(alpha("#284E5C",0.3), alpha("#288577",0.3), alpha("#73BA70",0.3)), +# cex = 0.5, +# fontfamily = "sans", +# cat.cex = 0.3, +# cat.default.pos = "outer", +# cat.fontfamily = "sans", +# cat.col = c("#284E5C","#288577","#73BA70"), +# margin = 0.05) + +# PFC, PVN and dCA1 for progress report on 26.11.2020 +library(eulerr) +png(filename = paste0(folder_plots, "/06_comparison_deseq_dCA1_PFC_PVN.png"), + height = 600, width = 600) +plot(euler(list("dCA1" = list_genes_sig[["dCA1"]], "PFC" = list_genes_sig[["PFC"]], + "PVN" = list_genes_sig[["PVN"]]), shape = "ellipse"), + labels = list(cex = 1.5), quantities = list(cex = 1.5)) +dev.off() + + +# 4. Gene lists ------------------------------------- + +# 4.1 Overlap all regions +overlap <- Reduce(intersect, list_genes_sig) +write_genelist(overlap, "overlap_AMY-CER-PFC-PVN-dDG-vDG-dCA1-vCA1") +overlap_up <- Reduce(intersect, list_genes_sig_up) +write_genelist(overlap_up, "overlap_AMY-CER-PFC-PVN-dDG-vDG-dCA1-vCA1_up") +overlap_down <- Reduce(intersect, list_genes_sig_down) +write_genelist(overlap_down, "overlap_AMY-CER-PFC-PVN-dDG-vDG-dCA1-vCA1_down") + +union_sig <- Reduce(union, list_genes_sig) +write_genelist(union_sig, "union_AMY-CER-PFC-PVN-dDG-vDG-dCA1-vCA1") + +# 4.2 Overlap all regions except PFC +overlap1 <- Reduce(intersect, list_genes_sig[c(1,3:8)]) +write_genelist(overlap1, "overlap_AMY-CER-PVN-dDG-vDG-dCA1-vCA1") + +# difference between all regions and all regions without PFC +d <- setdiff(overlap1, overlap) +d <- list_reg[["PFC"]][d,] +d$ENTREZID <- mapIds(org.Mm.eg.db, keys = rownames(d), keytype = "ENSEMBL", column = "ENTREZID") +d$SYMBOL <- mapIds(org.Mm.eg.db, keys = rownames(d), keytype = "ENSEMBL", column = "SYMBOL") +write.table(d, file = paste0(folder_tables, "/06_diff_allRegionsExceptPFC.txt"), + quote = FALSE) + +# 4.3 Overlap of PFC and CER +overlap_pfc_cer <- Reduce(intersect, list_genes_sig[c(2,4)]) +union_other <- Reduce(union, list_genes_sig[c(1,3,5:8)]) +write_genelist(overlap_pfc_cer, "overlap_CER-PFC") +d <- setdiff(overlap_pfc_cer, union_other) +write_genelist(d, "overlap_CER-PFC_only") + + +# 4.3 Overlap of dorsal hippocampus +overlap_dhip <- Reduce(intersect, list_genes_sig[c(6,8)]) +overlap_dhip_up <- Reduce(intersect, list_genes_sig_up[c(6,8)]) +overlap_dhip_down <- Reduce(intersect, list_genes_sig_down[c(6,8)]) +write_genelist(overlap_dhip, "overlap_dDG-dCA1") + + +# 4.4 Overlap of ventral hippocampus +overlap_vhip <- Reduce(intersect, list_genes_sig[c(5,7)]) +overlap_vhip_up <- Reduce(intersect, list_genes_sig_up[c(5,7)]) +overlap_vhip_down <- Reduce(intersect, list_genes_sig_down[c(5,7)]) +write_genelist(overlap_vhip, "overlap_vDG-vCA1") + + +# 4.5 overlap ventral and dorsal HIP ------------- +venn.diagram( + list(overlap_dhip, overlap_vhip), + category.names = c("dorsal HIP", "ventral HIP"), + filename = paste0(folder_plots, "/06_comparison_deseq_HIP.png"), + output = TRUE, + imagetype = "png" , + height = 800, + width = 800, + lwd = 1, + col = c("#284E5C", "#288577"), + fill = c("#284E5C", "#288577"), + alpha = 0.3, + cex = 0.5, + fontfamily = "sans", + cat.cex = 0.3, + cat.default.pos = "outer", + cat.dist = 0.05, + cat.fontfamily = "sans", + cat.col = c("#284E5C", "#288577"), + margin = 0.05 +) + +venn.diagram( + list(overlap_dhip_up, overlap_vhip_up, overlap_dhip_down, overlap_vhip_down), + category.names = c("dorsal HIP up", "ventral HIP up", "dorsal HIP down", "ventral HIP down"), + filename = paste0(folder_plots, "/06_comparison_deseq_HIP_upDown.png"), + output = TRUE, + imagetype = "png" , + height = 800, + width = 800, + lwd = 1, + col=c("#284E5C","#288577","#73BA70","#EAE362"), + fill = c("#284E5C","#288577","#73BA70","#EAE362"), + alpha = 0.3, + cex = 0.5, + fontfamily = "sans", + cat.cex = 0.3, + # cat.default.pos = "text", + cat.dist = 0.1, + cat.fontfamily = "sans", + cat.col = c("#284E5C", "#288577", "#73BA70", "#EAE362"), + margin = 0.05) + +# intersection dorsal ventral +int <- intersect(overlap_dhip, overlap_vhip) +write_genelist(int, "HIP_intersectDorsalVentral") +# DE in dorsal but not ventral +diff_dv <- setdiff(overlap_dhip, overlap_vhip) +write_genelist(diff_dv, "HIP_diffDorsalVentral") +# DE in ventral but not dorsal +diff_vd <- setdiff(overlap_vhip, overlap_dhip) +write_genelist(diff_vd, "HIP_diffVentralDorsal") +# union dorsal ventral +un_dv <- union(overlap_dhip, overlap_vhip) +write_genelist(un_dv, "HIP_unionDorsalVentral") diff --git a/01_DiffExp/06a_comparison_deseq_regions_plots.R b/01_DiffExp/06a_comparison_deseq_regions_plots.R new file mode 100644 index 0000000..cac15ff --- /dev/null +++ b/01_DiffExp/06a_comparison_deseq_regions_plots.R @@ -0,0 +1,267 @@ +################################################## +## Project: DexStim Mouse Brain +## Date: 21.01.2021 +## Author: Nathalie +################################################## +# Compare deseq SV DE genes between regions +# Plots for SAB meeting + +setwd("~/Documents/ownCloud/DexStim_RNAseq_Mouse") + +# library(rlist) +library(RColorBrewer) +# library(org.Mm.eg.db) +library(data.table) +library(ggplot2) +library(dplyr) +library(tidyverse) +library(tidyr) +library(gridExtra) +library(ggraph) +library(igraph) + + +regions <- + c("AMY", "PFC", "PVN", "CER", "vDG", "dDG", "vCA1", "dCA1") + +folder_plots <- paste0("figures") +folder_tables <- paste0("tables") + + +# 1. Read DE tables from all regions ---------- + +list_reg_sig <- list() +list_genes_sig <- list() + +for (reg in regions) { + res <- + fread( + file = paste0( + folder_tables, + "/02_", + reg, + "_deseq2_Dex_1_vs_0_lfcShrink.txt" + ), + sep = "\t" + ) + na_indices <- which(is.na(res$padj)) + res$padj[na_indices] <- 1 + res_sig <- res[res$padj <= 0.1, ] + # res_sig <- res[res$log2FoldChange >= 1] + list_reg_sig[[reg]] <- res_sig + list_genes_sig[[reg]] <- rownames(res_sig) +} + + + +# 2. Concatenate DE tables ----------------- + +data <- bind_rows(list_reg_sig, .id = "region") %>% + group_by(Ensembl_ID) %>% + summarise(region = list(region)) + +data_unique <- data %>% + mutate(nr_regions = lengths(region)) %>% + mutate(unique = (nr_regions == 1)) %>% + unnest(cols = c(region)) %>% + mutate("combined_id" = paste0(region, "-", Ensembl_ID)) + +data_barplot <- data_unique %>% + group_by(region, unique) %>% + count() %>% + group_by(region) %>% + mutate(sum = sum(n)) + +matrix_heatmap <- matrix( + data = 0, + nrow = length(regions), + ncol = length(regions), + dimnames = list(regions, regions) +) +for (reg1 in regions) { + for (reg2 in regions) { + if (reg1 != reg2) { + nr_genes <- + length(which( + sapply(data$region, function(x) + reg1 %in% x) & + sapply(data$region, function(x) + reg2 %in% x) + )) + matrix_heatmap[reg1, reg2] <- nr_genes + } else{ + nr_genes <- + length(which(sapply(data$region, function(x) { + (reg1 %in% x) & (length(x) == 1) + }))) + matrix_heatmap[reg1, reg2] <- nr_genes + } + } +} +matrix_heatmap <- data.frame(matrix_heatmap) +matrix_heatmap$x.values <- rownames(matrix_heatmap) +matrix_heatmap.melted <- + melt(matrix_heatmap, id.vars = c("x.values")) +matrix_heatmap.melted <- + pivot_longer( + matrix_heatmap, + cols = AMY:dCA1, + names_to = c("variable"), + names_transform = list(variable = as.factor) + ) +matrix_heatmap.melted$x.values <- + factor(x = matrix_heatmap.melted$x.values, + levels = levels(matrix_heatmap.melted$variable)) + + + +# 3. Stacked barplot ------------------------- + +bp <- ggplot(data_barplot, aes(fill = unique, y = n, x = region)) + + geom_bar(position = "stack", stat = "identity") + + scale_fill_manual( + name = "", + labels = c("DE in multiple regions", "DE unique"), + values = c("tan1", "royalblue") + ) + + xlab("brain region") + + ylab("# diff. exp. genes") + + theme_light() + + theme( + axis.title.x = element_text(size = 15), + axis.title.y = element_text(size = 15), + axis.text.x = element_text(size = 12), + axis.text.y = element_text(size = 12), + legend.text = element_text(size = 12), + # legend.title = element_blank(), + legend.position = "top" + ) + + geom_text(aes(label = paste0(round((n / sum) * 100, digits = 1 + ), "%")), + position = position_stack(vjust = 0.5), + size = 4) +ggsave( + bp, + filename = paste0( + folder_plots, + "/06_comparison_deseq_barplot_vertical_percentage.png" + ), + width = 8, + height = 6 +) + +bp_horizontal <- bp + + # scale_x_reverse() + + coord_flip() + +ggsave( + bp_horizontal, + filename = paste0( + folder_plots, + "/06_comparison_deseq_barplot_horizontal_percentage.png" + ), + width = 6, + height = 8 +) + + + +# 4. Heatmap --------------------------------- + +# Prepare heatmap +# Would it make sense to have total nr of DE genes in diagonal? +gm <- + ggplot(data = matrix_heatmap.melted, aes( + x = factor(x.values), + y = variable, + fill = value + )) + + geom_tile() + + scale_fill_distiller( + name = "Nr. of DE genes", + palette = "Reds", + direction = 1, + na.value = "transparent" + ) + + theme_light() + + theme( + axis.title.y = element_blank(), + axis.title.x = element_blank(), + axis.text.x = element_text(size = 12), + axis.text.y = element_text(size = 12), + legend.position = "bottom", + legend.title = element_text(size = 12), + legend.text = element_text(size = 10) + ) + +# Prepare x axis barplot +bp.x <- + ggplot(data = data_barplot, aes( + x = factor(region, levels = levels(matrix_heatmap.melted$x.values)), + y = n, + fill = unique + )) + + geom_bar(stat = "identity", position = "stack") + + scale_fill_manual( + name = "", + labels = c("DE in multiple regions", "DE unique"), + values = c("red3", "navy") + ) + + ylab("") + + theme_light() + + theme( + axis.title.y = element_text(size = 8), + axis.text.x = element_text(size = 12), + axis.title.x = element_blank(), + axis.text.y = element_text(size = 12), + legend.position = "top", + legend.text = element_text(size = 10) + ) + + scale_x_discrete(position = "top") + +# Put plots together +hm_comb <- grid.arrange(bp.x, gm, nrow = 2, ncol = 1) + +ggsave( + hm_comb, + filename = paste0(folder_plots, "/06_comparison_deseq_heatmap.png"), + height = 10, + width = 7 +) + +# # 5. Circular plot ------------------------- +# +# # create data frame giving hierarchical structure of regions and DE genes +# d1 <- data.frame(from="origin", to=regions) +# d2 <- data.frame(from=data_unique$region, to=data_unique$combined_id) +# edges <- rbind(d1, d2) +# +# # create a datafram with connection between genes +# all_leaves <- data_unique$combined_id +# comb_genes <- inner_join(x = data_unique, y = data_unique, by = "ensembl_id") +# connect <- data.frame(from = comb_genes$combined_id.x, to = comb_genes$combined_id.y) +# connect$value <- 1 +# +# # create a vertices data-frame +# vertices <- data.frame( +# name = unique(c(as.character(edges$from), as.character(edges$to))) , +# value = 1 +# ) +# vertices$group <- edges$from[ match( vertices$name, edges$to ) ] +# +# +# # Create a graph object +# mygraph <- igraph::graph_from_data_frame( edges, vertices=vertices ) +# +# # The connection object must refer to the ids of the leaves: +# from <- match( connect$from, vertices$name) +# to <- match( connect$to, vertices$name) +# +# # Basic usual argument +# ggraph(mygraph, layout = 'dendrogram', circular = TRUE) + +# geom_conn_bundle(data = get_con(from = from, to = to), alpha=0.2, colour="skyblue", tension = .5) + +# geom_node_point(aes(filter = leaf, x = x*1.05, y=y*1.05, colour=group)) + +# scale_colour_manual(values= rep( brewer.pal(9,"Paired") , 30)) + +# theme_void() +# +# # ggsave(filename = paste0(folder_plots, "/06_comparison_deseq_circular.png")) diff --git a/01_DiffExp/06b_comparison_HIP_deseq_regions_plots.R b/01_DiffExp/06b_comparison_HIP_deseq_regions_plots.R new file mode 100644 index 0000000..b98f52f --- /dev/null +++ b/01_DiffExp/06b_comparison_HIP_deseq_regions_plots.R @@ -0,0 +1,244 @@ +################################################## +## Project: DexStim Mouse Brain +## Date: 19.04.2021 +## Author: Nathalie +################################################## +# Compare deseq SV DE genes between HIP regions +# Plots for manuscript + +library(RColorBrewer) +library(data.table) +library(ggplot2) +library(dplyr) +library(tidyverse) +library(tidyr) +# library(gridExtra) +library(pheatmap) + +basepath <- "~/Documents/ownCloud/DexStim_RNAseq_Mouse/" + +regions <- + c("vDG", "dDG", "vCA1", "dCA1") + + + +# 1. Read DE tables from HIP regions ---------- + +list_reg <- list() +list_reg_sig <- list() +list_genes_top10 <- list() + +for (reg in regions) { + res <- + fread( + file = paste0( + basepath, + "tables/02_", + reg, + "_deseq2_Dex_1_vs_0_lfcShrink.txt" + ), + sep = "\t" + ) + na_indices <- which(is.na(res$padj)) + res$padj[na_indices] <- 1 + list_reg[[reg]] <- res + res_sig <- res[res$padj <= 0.1, ] + # res_sig <- res[res$log2FoldChange >= 1] + list_reg_sig[[reg]] <- res_sig + list_genes_top10[[reg]] <- res_sig$Ensembl_ID[1:25] +} + + +# 2. check uniqueness of DE genes --------------- + +for (reg in regions){ + index_reg <- which(regions == reg) + # df <- bind_rows(list_reg[-index_reg], .id="region") %>% + # filter(DE0.1) + df <- bind_rows(list_reg_sig[-index_reg], .id="region") + list_reg_sig[[reg]]$regions_DE <- sapply(list_reg_sig[[reg]]$Ensembl_ID, + function(x) paste(df[df$Ensembl_ID == x,]$region, collapse = " ")) + list_reg_sig[[reg]]$unique_DE <- sapply(list_reg_sig[[reg]]$regions_DE, + function(x) x == "") +} + + +# 3. plot number of unique/not unique genes within HIP ------------------------- + +# make dataframe to plot barplot +df_uni <- data.frame(region = character(), + uniqueness = character(), + genes = numeric()) +for (reg in regions){ + u <- table(list_reg_sig[[reg]]$unique_DE) + df_uni <- rbind(df_uni, list("region" = reg, "uniqueness" = "unique", + "genes" = u[2])) + df_uni <- rbind(df_uni, list("region" = reg, "uniqueness" = "not_unique", + "genes" = u[1])) +} +df_uni$region <- as.factor(df_uni$region) +df_uni$uniqueness <- as.factor(df_uni$uniqueness) + +# sum DE genes per region for percentage label +df_uni <- df_uni %>% + group_by(region) %>% + mutate(sum = sum(genes)) + +# stacked barplot with unique DE genes in HIP +ggplot(df_uni, aes(x = region, + y = genes, + fill = uniqueness)) + + geom_bar(stat = "identity", position = "stack") + + scale_fill_manual( + name = "", + labels = c("DE in multiple HIP regions", "DE unique in HIP"), + values = c("tan1", "royalblue") + ) + + xlab("brain region") + + ylab("# diff. exp. genes") + + theme_light() + + theme( + axis.title.x = element_text(size = 15), + axis.title.y = element_text(size = 15), + axis.text.x = element_text(size = 12), + axis.text.y = element_text(size = 12), + legend.text = element_text(size = 12), + # legend.title = element_blank(), + legend.position = "top" + ) + + geom_text(aes(label = paste0(round((genes / sum) * 100, digits = 1 + ), "%")), + position = position_stack(vjust = 0.5), + size = 4) +ggsave(filename = paste0(basepath, "figures/06b_comparison_HIP_deseq_barplot_vertical_percentage.png"), + width = 8, + height = 6) + + + +# 4. Heatmap of unique genes with highest p-value --------------------------- + +# Function to get gene IDs of unique DE with lowest p-values +top_unique <- function(df){ + + df <- df[df$unique_DE,] + genes <- df$Ensembl_ID[1:10] + + return(genes) +} + +# Concatenate DE tables ----------------- +# TODO: try also p-value + +data <- bind_rows(list_reg, .id = "region") +genes <- unlist(lapply(list_reg_sig, top_unique)) +data <- data[data$Ensembl_ID %in% genes,] %>% + pivot_wider(id_cols = c(Gene_Symbol), + names_from = region, + #values_from = log2FoldChange) + values_from = padj) + +data_mat <- as.matrix(data[,2:5]) +rownames(data_mat) <- data$Gene_Symbol + + +# Heatmap + +# Complex heatmap +library(ComplexHeatmap) +library(circlize) +Heatmap(data_mat, + name = "log2FoldChange", #title of legend + column_title = "Hippocampal region", row_title = "Genes", + row_names_gp = gpar(fontsize = 7), # Text size for row names + cluster_columns = FALSE, + #col = colorRamp2(c(-4, 0, 4), c("blue", "#EEEEEE", "red")) + col = colorRamp2(c(0,1), c("red", "#EEEEEE")) +) + +pheatmap(data_mat, + cutree_rows = 2, + cluster_cols = FALSE) + + + + +# 3. GO enrichment for the genes of each region ------------------ + +go_enrichment_all <- function(df_reg, unique){ + if (unique){ + genes <- df_reg$Ensembl_ID[df_reg$unique_DE] + } else { + genes <- df_reg$Ensembl_ID + } + genes <- mapIds(org.Mm.eg.db, keys = genes, keytype = "ENSEMBL", + column = "ENTREZID") + background <- read.table(file = paste0(basepath, "tables/06_background_entrezID.txt"), + header = FALSE)$V1 + + # enrichment + ego <- enrichGO(gene = as.character(genes), + universe = as.character(background), + OrgDb = org.Mm.eg.db, + ont = "BP", + pAdjustMethod = "BH", + #pvalueCutoff = 0.01, + #qvalueCutoff = 0.05, + minGSSize = 10, # min number of genes associated with GO term + maxGSSize = 10000, # max number of genes associated with GO term + readable = TRUE)@result + # ego_simple <- clusterProfiler::simplify( + # ego, + # cutoff = 0.7, + # by = "p.adjust", + # select_fun = min, + # measure = "Wang", + # semData = NULL + # )@result + + return(ego) +} + +list_GO <- list() +# GO.BPcollection = subsetCollection(GOcollection, tags = "GO.BP") +for (reg in regions){ + + go_enr_unique <- go_enrichment_all(list_reg_sig[[reg]], TRUE) + # go_enr_all <- go_enrichment_all(list_reg[[reg]], GOcollection, FALSE) + list_GO[[reg]] <- go_enr_unique + +} + + +# 4. Plot GO terms +df_all <- bind_rows(list_GO, .id="region") +for (reg in regions){ + + df_reg <- list_GO[[reg]] %>% + # group_by(inGroups) %>% + # slice_min(order_by = p.adjust, n = 10) + slice_head(n = 10) + + df <- df_all[df_all$ID %in% df_reg$ID,] + # df$dataSetName <- sapply(df$dataSetName, function(x) str_trunc(x, 45, "right")) + df$Description <- factor(df$Description, + levels = rev(reorder(df$Description[df$region==reg], df$p.adjust[df$region==reg]))) + df$region <- factor(df$region, levels = c("vDG", "dDG", "vCA1", "dCA1")) + #df$inGroups <- factor(df$inGroups) + #levels(df$inGroups) <- c("Biological Process", "Cellular Components", "Molecular Function") + + # Plot results (plotted pvalues are not adjusted for multiple testing) + df %>% + arrange(desc(Description)) %>% + ggplot(aes(x=Description, y = -log10(p.adjust), fill = region)) + + geom_bar(position = position_dodge2(reverse=TRUE), stat="identity") + + geom_hline(yintercept = -log10(0.1),linetype="dashed", color = "red") + + coord_flip() + + xlab("GOterm") + + ggtitle(paste0("GO terms enriched for DE genes only in ",reg, " (Top 10 each)")) + + # facet_wrap(~inGroups, scales="free") + + theme(axis.text.y = element_text(size = 10)) + ggsave(filename = paste0(basepath, "figures", "/06b_HIP_", reg, "_GOterms_unique.png"), width = 13, height = 7) +} + + diff --git a/01_DiffExp/07_anRichmentAnalysis.R b/01_DiffExp/07_anRichmentAnalysis.R new file mode 100644 index 0000000..2d2f73f --- /dev/null +++ b/01_DiffExp/07_anRichmentAnalysis.R @@ -0,0 +1,448 @@ +################################################## +## Project: DexStim Mouse Brain +## Date: 08.09.2020 +## Author: Nathalie +################################################## +# Functional annotation with anRichment + +library(anRichment) +library(anRichmentMethods) +library(org.Mm.eg.db) +library(ggplot2) +library(dplyr) + +basepath <- "~/Documents/ownCloud/DexStim_RNAseq_Mouse/" +folder_plots <- paste0("figures") +folder_tables <- paste0("tables") + +GOcollection <- buildGOcollection(organism = "mouse") + +# 1.1 GO enrichment all regions --------------------- +genes <- read.table(file = paste0(basepath, folder_tables, "/06_overlap_AMY-CER-PFC-PVN-dDG-vDG-dCA1-vCA1_entrezID.txt"), + header = FALSE) +background <- read.table(file = paste0(basepath, folder_tables, "/06_background_entrezID.txt"), + header = FALSE) +modules <- rep("not_significant", nrow(background)) +modules[which(background$V1 %in% genes$V1)] <- "significant" + +# enrichment +GOenrichment <- enrichmentAnalysis( + classLabels = modules, + identifiers = background$V1, + refCollection = GOcollection, + useBackground = "given", + # threshold = 0.1, + # thresholdType = "Bonferroni", + getOverlapEntrez = TRUE, + getOverlapSymbols = TRUE, + ignoreLabels = "not_significant" +) + +# Filter out terms with less than 10 genes overlap and FDR above 0.1 +table.display <- GOenrichment$enrichmentTable %>% + filter(nCommonGenes >= 10 & FDR <= 0.1) +write.csv(table.display, file = paste0(folder_tables,"/07_GOenrichment_AMY-CER-PFC-PVN-dDG-vDG-dCA1-vCA1.csv"), + row.names = FALSE) + +# Plot results (Top 20 terms) +table.display %>% + head(20) %>% + arrange(desc(FDR)) %>% + mutate(dataSetName=factor(dataSetName,levels=dataSetName)) %>% +ggplot(aes(x=dataSetName, y = -log10(FDR), fill = inGroups)) + + geom_bar(stat="identity") + + geom_hline(yintercept = -log10(0.1),linetype="dashed", color = "red") + + scale_fill_discrete(name = "Ontology", + labels = c("BP", "MF", "CC")) + + coord_flip() + + ylab("-log10(FDR)") + + xlab("GOterm") + + ggtitle("GO terms enriched for DE genes across all brain regions (Top 20)") + + theme(text = element_text(size=15), + plot.title = element_text(size = 15, hjust=1)) +ggsave(filename = paste0(folder_plots, "/07_GOenrichment_AMY-CER-PFC-PVN-dDG-vDG-dCA1-vCA1.png")) + + +# 1.2 NCBI BioSystems collection ----------------- +biosysCollection <- BioSystemsCollection("mouse") +knownGroups(biosysCollection) + +# enrichment +KEGGenrichment <- enrichmentAnalysis( + classLabels = modules, + identifiers = background$V1, + refCollection = biosysCollection, + useBackground = "given", + threshold = 0.1, + thresholdType = "Bonferroni", + getOverlapEntrez = TRUE, + getOverlapSymbols = TRUE, + ignoreLabels = "not_significant" +) +collectGarbage() + +names(KEGGenrichment) +names(KEGGenrichment$enrichmentTable) +table.display <- KEGGenrichment$enrichmentTable +table.display$overlapGenes <- shortenStrings(table.display$overlapGenes, maxLength = 70, + split = "|") +write.csv(GOenrichment$enrichmentTable, file = paste0(folder_tables,"/07_PathwayEnrichment_AMY-CER-PFC-PVN-dDG-vDG-dCA1-vCA1.csv"), + row.names = FALSE) + +# plot results (plotted pvalues are not adjusted for multiple testing) +table.display %>% + arrange(desc(pValue)) %>% + mutate(dataSetName=factor(dataSetName,levels=dataSetName)) %>% +ggplot(aes(x=dataSetName, y = -log10(pValue), fill=inGroups)) + + geom_bar(stat="identity") + + geom_hline(yintercept = -log10(0.1),linetype="dashed", color = "red") + + coord_flip() + + ggtitle("Pathways enriched for DE genes across all brain regions") +ggsave(filename = paste0(folder_plots, "/07_PathwayEnrichment_AMY-CER-PFC-PVN-dDG-vDG-dCA1-vCA1.png")) + + +# 1.3 calculate enrichment of disease associated genes --------- +library(disgenet2r) +# Schizophrenia C0036341 +# Major depressive disorder C1269683 +# Bipolar disorder C0005586 +genes_SCZ <- disease2gene( disease = c("C0036341"), + database = "CURATED", verbose = TRUE )@qresult +genes_MDD <- disease2gene( disease = c("C1269683"), + database = "CURATED", verbose = TRUE )@qresult +genes_BIP <- disease2gene( disease = c("C0005586"), + database = "CURATED", verbose = TRUE )@qresult + +genesymbols <- read.table(file = paste0(folder_tables, "/06_overlap_AMY-CER-PFC-PVN-dDG-vDG-dCA1-vCA1_symbolID.txt"), + header = FALSE) +genesymbols <- sapply(genesymbols$V1, toupper) + +bgsymbols <- read.table(file = paste0(folder_tables, "/06_background_symbolID.txt"), + header = FALSE) +bgsymbols <- sapply(bgsymbols$V1, toupper) + +pval <- data.frame(disease = c("SCZ", "MDD", "BIP"), pvalue = c(1,1,1)) +diseaseGenes <- data.frame(disease = character(), genes = character()) +# hypergeometric test SCZ +tmp <- intersect(genes_SCZ$gene_symbol, genesymbols) +diseaseGenes <- rbind(diseaseGenes, cbind(rep("SCZ", times = length(tmp)), tmp)) +o <- length(intersect(genes_SCZ$gene_symbol, genesymbols)) +o_b <- length(intersect(genes_SCZ$gene_symbol, bgsymbols)) +test <- fisher.test(matrix(c(o, o_b-o, length(genesymbols)-o, length(bgsymbols)-length(genesymbols)-o_b+o), 2, 2), + alternative='greater')$p.value +pval[1,2] <- test + +# hypergeometric test MDD +tmp <- intersect(genes_MDD$gene_symbol, genesymbols) +diseaseGenes <- rbind(diseaseGenes, cbind(rep("MDD", times = length(tmp)), tmp)) +o <- length(intersect(genes_MDD$gene_symbol, genesymbols)) +o_b <- length(intersect(genes_MDD$gene_symbol, bgsymbols)) +test <- fisher.test(matrix(c(o, o_b-o, length(genesymbols)-o, length(bgsymbols)-length(genesymbols)-o_b+o), 2, 2), + alternative='greater')$p.value +pval[2,2] <- test + +# hypergeometric test BIP +tmp <- intersect(genes_BIP$gene_symbol, genesymbols) +diseaseGenes <- rbind(diseaseGenes, cbind(rep("BIP", times = length(tmp)), tmp)) +o <- length(intersect(genes_BIP$gene_symbol, genesymbols)) +o_b <- length(intersect(genes_BIP$gene_symbol, bgsymbols)) +test <- fisher.test(matrix(c(o, o_b-o, length(genesymbols)-o, length(bgsymbols)-length(genesymbols)-o_b+o), 2, 2), + alternative='greater')$p.value +pval[3,2] <- test + +pval[,2] <- p.adjust(pval[,2], method = "fdr") + +# plot disease gene enrichment +ggplot(pval, aes(x=disease, y=-log10(pvalue))) + + geom_bar(stat="identity") + + geom_hline(yintercept = -log10(0.1),linetype="dashed", color = "red") + + coord_flip() + + xlab("disease") + + ylab("-log10(FDR)") + + ggtitle("Enrichment of known disease genes among DE genes shared between all regions") + + theme(text = element_text(size=15), + plot.title = element_text(size = 15)) +ggsave(filename = paste0(folder_plots, "/07_diseaseGeneEnrichment_AMY-CER-PFC-PVN-dDG-vDG-dCA1-vCA1.png")) + +write.table(diseaseGenes, file = paste0(folder_tables, "/07_diseaseGenes_AMY-CER-PFC-PVN-dDG-vDG-dCA1-vCA1.txt"), + quote = FALSE, row.names = FALSE, col.names = FALSE) +write.table(pval, file = paste0(folder_tables, "/07_diseaseGenes_pvalues_AMY-CER-PFC-PVN-dDG-vDG-dCA1-vCA1.txt"), + quote = FALSE, row.names = FALSE) + + +# 2. Hippocampus: dorsal vs ventral ------------ + +intersect_hip <- read.table(file = paste0(folder_tables, "/06_HIP_intersectDorsalVentral_entrezID.txt"), + header = FALSE) +diff_dorsal <- read.table(file = paste0(folder_tables, "/06_HIP_diffDorsalVentral_entrezID.txt"), + header = FALSE) +diff_ventral <- read.table(file = paste0(folder_tables, "/06_HIP_diffVentralDorsal_entrezID.txt"), + header = FALSE) +union_hip <- read.table(file = paste0(folder_tables, "/06_HIP_unionDorsalVentral_entrezID.txt"), + header = FALSE) +# !!!! BACKGROUND are only diff exp genes in any HIP area here !!!! +# union_hip <- background +modules <- rep("XXX", nrow(union_hip)) +modules[which(union_hip$V1 %in% intersect_hip$V1)] <- "intersect" +modules[which(union_hip$V1 %in% diff_dorsal$V1)] <- "dorsal" +modules[which(union_hip$V1 %in% diff_ventral$V1)] <- "ventral" + +# enrichment +GOenrichment <- enrichmentAnalysis( + classLabels = modules, + identifiers = union_hip$V1, + refCollection = GOcollection, + useBackground = "given", + nBestDataSets = length(GOcollection$dataSets), + # threshold = 0.1, + # thresholdType = "Bonferroni", + getOverlapEntrez = TRUE, + getOverlapSymbols = TRUE +) + +# Filter to the BP terms and exclude terms with less 10 genes overlap +# Exclude terms with nominal pvalue above 0.1 +table.display <- GOenrichment$enrichmentTable %>% + filter(inGroups == "GO|GO.BP|GO" & class != "XXX") %>% + filter(nCommonGenes >= 10 & pValue <= 0.1) %>% + group_by(class) %>% slice_min(order_by = FDR, n = 10) +write.csv(GOenrichment$enrichmentTable, file = paste0(folder_tables,"/07_GOenrichment_HIP.csv"), + row.names = FALSE) + +# Plot results (plotted pvalues are not adjusted for multiple testing) +table.display %>% + # arrange(desc(class)) %>% + mutate(dataSetName=factor(dataSetName,levels=rev(dataSetName))) %>% +ggplot(aes(x=dataSetName, y = -log10(FDR), fill = class)) + + geom_bar(stat="identity") + + geom_hline(yintercept = -log10(0.1),linetype="dashed", color = "red") + + coord_flip() + + xlab("GOterm") + + ggtitle("GO terms enriched for DE genes in Hippocampus") +ggsave(filename = paste0(folder_plots, "/07_GOenrichment_HIP.png")) + + +# 2.2 NCBI BioSystems collection ----------------- +biosysCollection <- BioSystemsCollection("mouse") +knownGroups(biosysCollection) + +# enrichment +KEGGenrichment <- enrichmentAnalysis( + classLabels = modules, + identifiers = union_hip$V1, + refCollection = biosysCollection, + useBackground = "given", + threshold = 0.1, + thresholdType = "Bonferroni", + getOverlapEntrez = TRUE, + getOverlapSymbols = TRUE +) +collectGarbage() + +names(KEGGenrichment) +names(KEGGenrichment$enrichmentTable) +table.display <- KEGGenrichment$enrichmentTable +table.display$overlapGenes <- shortenStrings(table.display$overlapGenes, maxLength = 70, + split = "|") +write.csv(GOenrichment$enrichmentTable, file = paste0(folder_tables,"/07_PathwayEnrichment_HIP.csv"), + row.names = FALSE) + +# plot results (plotted pvalues are not adjusted for multiple testing) +table.display %>% + arrange(desc(class)) %>% + mutate(dataSetName=factor(dataSetName,levels=dataSetName)) %>% +ggplot(aes(x=dataSetName, y = -log10(pValue), fill=class)) + + geom_bar(stat="identity") + + geom_hline(yintercept = -log10(0.1),linetype="dashed", color = "red") + + coord_flip() + + ggtitle("Pathways enriched for DE genes in Hippocampus") +ggsave(filename = paste0(folder_plots, "/07_PathwayEnrichment_HIP.png")) + + + + +# 3.1 GO enrichment all regions without PFC --------------------- +# !!! not really interesting --> 9 genes from which 5 are nominally sig also in PFC !!!! +GOcollection <- buildGOcollection(organism = "mouse") +genes <- read.table(file = paste0(folder_tables, "/06_diff_allRegionsExceptPFC.txt"), + header = TRUE) +background <- read.table(file = paste0(folder_tables, "/06_background_entrezID.txt"), + header = FALSE) +modules <- rep("not_significant", nrow(background)) +modules[which(background$V1 %in% genes$ENTREZID)] <- "significant" + +# enrichment +GOenrichment <- enrichmentAnalysis( + classLabels = modules, + identifiers = background$V1, + refCollection = GOcollection, + useBackground = "given", + # threshold = 0.1, + # thresholdType = "Bonferroni", + getOverlapEntrez = TRUE, + getOverlapSymbols = TRUE, + ignoreLabels = "not_significant" +) + +table.display <- GOenrichment$enrichmentTable +write.csv(table.display, file = paste0(folder_tables,"/07_GOenrichment_AMY-CER-PVN-dDG-vDG-dCA1-vCA1.csv"), + row.names = FALSE) + +# plot results (plotted pvalues are not adjusted for multiple testing) +table.display %>% + arrange(desc(pValue)) %>% + mutate(dataSetName=factor(dataSetName,levels=dataSetName)) %>% + ggplot(aes(x=dataSetName, y = -log10(pValue), fill = inGroups)) + + geom_bar(stat="identity") + + geom_hline(yintercept = -log10(0.1),linetype="dashed", color = "red") + + scale_fill_discrete(name = "Ontology", + labels = c("BP", "MF", "CC")) + + coord_flip() + + ylab("-log10(pValue)") + + xlab("GOterm") + + ggtitle("GO terms enriched for DE genes across all brain regions except PFC") + + theme(text = element_text(size=15), + plot.title = element_text(size = 15, hjust=1)) +ggsave(filename = paste0(folder_plots, "/07_GOenrichment_AMY-CER-PVN-dDG-vDG-dCA1-vCA1.png")) + + + + +# 4.1 GO enrichment only PFC and CER --------------------- +genes <- read.table(file = paste0(folder_tables, "/06_overlap_CER-PFC_only_entrezID.txt"), + header = FALSE) +background <- read.table(file = paste0(folder_tables, "/06_background_entrezID.txt"), + header = FALSE) +modules <- rep("not_significant", nrow(background)) +modules[which(background$V1 %in% genes$V1)] <- "significant" + +# enrichment +GOenrichment <- enrichmentAnalysis( + classLabels = modules, + identifiers = background$V1, + refCollection = GOcollection, + useBackground = "given", + # threshold = 0.1, + # thresholdType = "Bonferroni", + getOverlapEntrez = TRUE, + getOverlapSymbols = TRUE, + ignoreLabels = "not_significant" +) + +table.display <- GOenrichment$enrichmentTable +write.csv(table.display, file = paste0(folder_tables,"/07_GOenrichment_CER-PFC.csv"), + row.names = FALSE) + +# plot results (plotted pvalues are not adjusted for multiple testing) +table.display %>% + arrange(desc(pValue)) %>% + mutate(dataSetName=factor(dataSetName,levels=dataSetName)) %>% + ggplot(aes(x=dataSetName, y = -log10(pValue), fill = inGroups)) + + geom_bar(stat="identity") + + geom_hline(yintercept = -log10(0.1),linetype="dashed", color = "red") + + scale_fill_discrete(name = "Ontology", + labels = c("BP", "MF", "CC")) + + coord_flip() + + ylab("-log10(pValue)") + + xlab("GOterm") + + ggtitle("GO terms enriched for DE genes across all brain regions except PFC") + + theme(text = element_text(size=15), + plot.title = element_text(size = 15, hjust=1)) +ggsave(filename = paste0(folder_plots, "/07_GOenrichment_AMY-CER-PVN-dDG-vDG-dCA1-vCA1.png")) + + + + +# 5.1 GO enrichment all regions upregulated--------------------- +genes <- read.table(file = paste0(folder_tables, "/06_overlap_AMY-CER-PFC-PVN-dDG-vDG-dCA1-vCA1_up_entrezID.txt"), + header = FALSE) +background <- read.table(file = paste0(folder_tables, "/06_background_entrezID.txt"), + header = FALSE) +modules <- rep("not_significant", nrow(background)) +modules[which(background$V1 %in% genes$V1)] <- "significant" + +# GO enrichment +GOenrichment <- enrichmentAnalysis( + classLabels = modules, + identifiers = background$V1, + refCollection = GOcollection, + useBackground = "given", + nBestDataSets = length(GOcollection$dataSets), + # threshold = 0.1, + # thresholdType = "Bonferroni", + getOverlapEntrez = TRUE, + getOverlapSymbols = TRUE, + ignoreLabels = "not_significant" +) + +# Exclude terms with less than 10 genes overlap or FDR above 0.1 +table.display <- GOenrichment$enrichmentTable %>% + filter(nCommonGenes >= 10 & FDR <= 0.1) +write.csv(table.display, file = paste0(folder_tables,"/07_GOenrichment_AMY-CER-PFC-PVN-dDG-vDG-dCA1-vCA1_up.csv"), + row.names = FALSE) + +# Plot results (Top 20) +table.display %>% + head(20) %>% + arrange(desc(FDR)) %>% + mutate(dataSetName=factor(dataSetName,levels=dataSetName)) %>% + ggplot(aes(x=dataSetName, y = -log10(FDR), fill = inGroups)) + + geom_bar(stat="identity") + + geom_hline(yintercept = -log10(0.1),linetype="dashed", color = "red") + + scale_fill_discrete(name = "Ontology", + labels = c("BP", "MF", "CC")) + + coord_flip() + + ylab("-log10(FDR)") + + xlab("GOterm") + + ggtitle("GO terms enriched for upregulated DE genes across all brain regions (Top 20)") + + theme(text = element_text(size=15), + plot.title = element_text(size = 15, hjust=1)) +ggsave(filename = paste0(folder_plots, "/07_GOenrichment_AMY-CER-PFC-PVN-dDG-vDG-dCA1-vCA1_up.png")) + + + +# 5.2 GO enrichment all regions downregulated--------------------- +genes <- read.table(file = paste0(folder_tables, "/06_overlap_AMY-CER-PFC-PVN-dDG-vDG-dCA1-vCA1_down_entrezID.txt"), + header = FALSE) +background <- read.table(file = paste0(folder_tables, "/06_background_entrezID.txt"), + header = FALSE) +modules <- rep("not_significant", nrow(background)) +modules[which(background$V1 %in% genes$V1)] <- "significant" + +# GO enrichment +GOenrichment <- enrichmentAnalysis( + classLabels = modules, + identifiers = background$V1, + refCollection = GOcollection, + useBackground = "given", + nBestDataSets = length(GOcollection$dataSets), + # threshold = 0.1, + # thresholdType = "Bonferroni", + getOverlapEntrez = TRUE, + getOverlapSymbols = TRUE, + ignoreLabels = "not_significant" +) + +# Exclude terms with less than 10 genes overlap or FDR above 0.1 +table.display <- GOenrichment$enrichmentTable %>% + filter(nCommonGenes >= 10 & FDR <= 0.1) +write.csv(table.display, file = paste0(folder_tables,"/07_GOenrichment_AMY-CER-PFC-PVN-dDG-vDG-dCA1-vCA1_down.csv"), + row.names = FALSE) + +# Plot results (Top 20) +table.display %>% + head(20) %>% + arrange(desc(FDR)) %>% + mutate(dataSetName=factor(dataSetName,levels=dataSetName)) %>% + ggplot(aes(x=dataSetName, y = -log10(FDR), fill = inGroups)) + + geom_bar(stat="identity") + + geom_hline(yintercept = -log10(0.1),linetype="dashed", color = "red") + + scale_fill_discrete(name = "Ontology", + labels = c("BP", "MF", "CC")) + + coord_flip() + + ylab("-log10(FDR)") + + xlab("GOterm") + + ggtitle("GO terms enriched for downregulated DE genes across all brain regions (Top 20)") + + theme(text = element_text(size=15), + plot.title = element_text(size = 15, hjust=1)) +ggsave(filename = paste0(folder_plots, "/07_GOenrichment_AMY-CER-PFC-PVN-dDG-vDG-dCA1-vCA1_down.png")) \ No newline at end of file diff --git a/01_DiffExp/07a_clusterProfilerAnalysis.R b/01_DiffExp/07a_clusterProfilerAnalysis.R new file mode 100644 index 0000000..7eaa060 --- /dev/null +++ b/01_DiffExp/07a_clusterProfilerAnalysis.R @@ -0,0 +1,336 @@ +################################################## +## Project: DexStim Mouse Brain +## Date: 13.04.2021 +## Author: Nathalie +################################################## +# Functional annotation with clusterProfiler +# make figure for manuscript + +library(clusterProfiler) +library(DOSE) +library(org.Mm.eg.db) +library(biomaRt) +library(ggplot2) +library(dplyr) +library(enrichplot) +library(gridExtra) +library(stringr) + + +basepath <- "~/Documents/ownCloud/DexStim_RNAseq_Mouse/" + + + +# 0. Read genes DE in all regions and background ----------------- +genes <- read.table(file = paste0(basepath, "tables/06_overlap_AMY-CER-PFC-PVN-dDG-vDG-dCA1-vCA1_entrezID.txt"), + header = FALSE)[,1] + +# background are all genes in out dataset +background <- read.table(file = paste0(basepath, "tables/06_background_entrezID.txt"), + header = FALSE)[,1] + + + +# 1.1 GO enrichment for genes DE in all regions --------------------- + +# GO enrichment +# TODO: decide on maxGSSize --> with 10000 very similar results to anRichment +# --> Anthi and me decided that it makes sense to leave the cutoff very high +# (no point of restricting the terms here) +ego <- enrichGO(gene = as.character(genes), + universe = as.character(background), + OrgDb = org.Mm.eg.db, + ont = "BP", + pAdjustMethod = "BH", + pvalueCutoff = 0.01, + qvalueCutoff = 0.05, + minGSSize = 10, # min number of genes associated with GO term + maxGSSize = 10000, # max number of genes associated with GO term + readable = TRUE) +head(ego, n = 20) + +barplot(ego, showCategory=20) +dotplot(ego, showCategory=30) + ggtitle("dotplot for DE genes in all regions") + +# SIMPLIFY enriched GO terms (remove very similar terms) +ego_simple <- clusterProfiler::simplify( + ego, + cutoff = 0.7, + by = "p.adjust", + select_fun = min, + measure = "Wang", + semData = NULL +)@result +head(ego_simple, n = 20) + +#barplot(ego_simple, showCategory=20) +#dotplot(ego_simple, showCategory=30) + ggtitle("dotplot for DE genes in all regions") + + + +# 1.2 GO enrichment for upregulated genes DE in all regions --------------------- +genes_up <- read.table(file = paste0(basepath, "tables/06_overlap_AMY-CER-PFC-PVN-dDG-vDG-dCA1-vCA1_up_entrezID.txt"), + header = FALSE)[,1] + +# GO enrichment +# TODO: decide on maxGSSize --> with 10000 very similar results to anRichment +ego_up <- enrichGO(gene = as.character(genes_up), + universe = as.character(background), + OrgDb = org.Mm.eg.db, + ont = "BP", + pAdjustMethod = "BH", + pvalueCutoff = 0.01, + qvalueCutoff = 0.05, + minGSSize = 10, # min number of genes associated with GO term + maxGSSize = 10000, # max number of genes associated with GO term + readable = TRUE)@result +head(ego_up, n = 20) + + + + +# 1.3 GO enrichment for downregulated genes DE in all regions --------------------- +genes_down <- read.table( + file = paste0(basepath, + "tables/06_overlap_AMY-CER-PFC-PVN-dDG-vDG-dCA1-vCA1_down_entrezID.txt"), + header = FALSE)[,1] + +# GO enrichment +# TODO: decide on maxGSSize --> with 10000 very similar results to anRichment +ego_down <- enrichGO(gene = as.character(genes_down), + universe = as.character(background), + OrgDb = org.Mm.eg.db, + ont = "BP", + pAdjustMethod = "BH", + pvalueCutoff = 0.01, + qvalueCutoff = 0.05, + minGSSize = 10, # min number of genes associated with GO term + maxGSSize = 10000, # max number of genes associated with GO term + readable = TRUE)@result +head(ego_down, n = 20) + + +# 1.4 Merge dataframes from all, up and downregulated genes -------------------- + +data_go <- left_join(ego_simple, ego_down, by = c("ID", "Description"), + suffix = c(".all", ".down")) +data_go <- left_join(data_go, ego_up, by = c("ID", "Description"), + suffix = c("", ".up")) + +data_heat <- data_go[1:30,c("Description", "p.adjust.down", "p.adjust")] %>% + tidyr::pivot_longer(cols = p.adjust.down:p.adjust) + + +# 1.5 Barplot ------------------------------- + +bp.1 <- + ggplot(data = data_go[1:25,], aes( + x = factor(Description, levels = rev(data_go$Description[1:25])), + y = Count.all/165 + )) + + geom_bar(stat = "identity", position = "stack", + fill = "#226666") + + # scale_fill_manual( + # name = "", + # labels = c("DE in multiple regions", "DE unique"), + # values = c("red3", "navy") + # ) + + scale_y_continuous(trans="reverse") + + ylab("Gene ration") + + xlab("GO terms - biological process") + + theme_light() + + theme( + axis.title.y = element_text(size = 14), + axis.text.x = element_text(size = 12), + axis.title.x = element_text(size =14), + axis.text.y = element_text(size = 12), + legend.position = "top", + legend.text = element_text(size = 10) + ) + + coord_flip() + +bp.2 <- + ggplot(data = data_go[1:25,], aes( + x = factor(Description, levels = rev(data_go$Description[1:25])), + y = -log10(p.adjust.all) + )) + + geom_bar(stat = "identity", position = "stack", + fill = "#AA3939") + + # scale_fill_manual( + # name = "", + # labels = c("DE in multiple regions", "DE unique"), + # values = c("red3", "navy") + # ) + + ylab("-log10(adj. p-value)") + + theme_light() + + theme( + axis.title.x = element_text(size = 14), + axis.text.y = element_blank(), + axis.title.y = element_blank(), + axis.text.x = element_text(size = 12), + legend.position = "top", + legend.text = element_text(size = 10) + ) + + coord_flip() + +hm.1 <- + ggplot(data = data_heat, aes( + x = factor(Description, levels = rev(data_go$Description[1:25])), + y = name, + fill = value <= 0.01 + # fill = p.adjust + )) + + geom_tile() + + scale_y_discrete(name ="sig. GO term", + limits=c("p.adjust","p.adjust.down"), + labels=c("upreg.", "downreg.")) + + scale_fill_manual( + name = "GO term significant", + values = c("darkgrey", "#FFB620") + ) + + ylab("sig. GO term") + + theme_light() + + theme( + axis.title.y = element_blank(), + axis.title.x = element_text(size = 14), + axis.text.x = element_text(size = 12), + axis.text.y = element_blank(), + # legend.position = "none" + legend.position = "right", + legend.title = element_text(size = 12), + legend.text = element_text(size = 10) + ) + + coord_flip() + +plot_comb <- grid.arrange(bp.1, bp.2, hm.1, nrow = 1, + widths = c(3, 1, 1.5)) + +# Save plot +ggsave( + plot_comb, + filename = paste0(basepath, "figures/07a_goEnrichment_allRegions.png"), + height = 10, + width = 12 +) + +# 2.1 Disease gene enrichment for genes DE in all regions --------------------- + +# Map ENTREZ IDs from mouse to human +human <- useMart("ensembl", dataset = "hsapiens_gene_ensembl") +mouse <- useMart("ensembl", dataset = "mmusculus_gene_ensembl") + +genesV2 <- getLDS(attributes = c("entrezgene_id"), filters = "entrezgene_id", + values = genes , mart = mouse, attributesL = c("entrezgene_id"), + martL = human, uniqueRows=T) +humanx <- unique(genesV2[, 2]) + +backgroundV2 <- getLDS(attributes = c("entrezgene_id"), filters = "entrezgene_id", + values = background , mart = mouse, attributesL = c("entrezgene_id"), + martL = human, uniqueRows=T) +humanb <- unique(backgroundV2[,2]) + +# Disease enrichment +# TODO: decide on maxGSSize +dgn <- enrichDGN(gene = as.character(humanx), + universe = as.character(humanb), + pAdjustMethod = "BH", + pvalueCutoff = 0.05, + qvalueCutoff = 0.2, + minGSSize = 500, # min number of genes associated with GO term + maxGSSize = 5000, # max number of genes associated with GO term + readable = TRUE) +head(dgn@result$Description, n = 100) + +dgn_x <- pairwise_termsim(dgn) +enrichplot::emapplot(dgn_x, showCategory = 50) + +dgn@result[dgn@result$Description == "Schizophrenia",] + + +# x <- enrichDO(gene = as.character(humanx), +# ont = "DO", +# pvalueCutoff = 0.05, +# pAdjustMethod = "BH", +# universe = as.character(humanb), +# minGSSize = 5, +# maxGSSize = 500, +# qvalueCutoff = 0.05, +# readable = FALSE) +# head(x) +# +# x2 <- pairwise_termsim(dgn) +# enrichplot::emapplot(x2, showCategory = 50) + + +library(disgenet2r) +library(psygenet2r) + +data2 <- gene2disease( + gene = humanx, + vocabulary = "ENTREZ", + # database = "PSYGENET", + score =c(0.2, 1), + verbose = TRUE +) + +data2_table <- data2@qresult +data2_table <- data2_table[(data2_table$disease_class_name == " Mental Disorders" | + data2_table$disease_class_name == " Nervous System Diseases"),] +data2_table <- data2_table[(str_detect(data2_table$disease_class_name, "Mental Disorders") | + str_detect(data2_table$disease_class_name, "Nervous System Diseases")),] + +plot( data2, + class = "Network", + prop = 10) + +plot( data2, + class ="Heatmap") + +plot( data2, + class="DiseaseClass") + +# disease enrichment using disgenet2r +# does not work for PSYGENET as database (bug in code) +enr <- disease_enrichment( + genes = humanx, + universe = humanb, + vocabulary = "ENTREZ", + verbose = TRUE, + database = "CURATED", + warnings = TRUE +)@qresult + +# gene-disease associations (GDA) using psygenet2r +m1 <- psygenetGene( + gene = humanx, + database = "ALL", + verbose = TRUE +) +plot( m1, type = "GDCA network" ) +plot( m1 ) +plot( m1, type="GDCA heatmap" ) +# geneAttrPlot( m1, type = "disease category", class = "Lollipop" ) + +png(filename = paste0(basepath, "figures/07a_diseaseAssociations_allRegions.png"), + width = 600, height = 600) +plot( m1 ) +dev.off() + +# disease enrichment (per disease class) using psygenet2r +enr_psy <- enrichedPD( + gene = humanx, + verbose = TRUE, + warnings = TRUE +) + +ggplot(enr_psy, aes(x = MPD, + y = -log10(p.value))) + + geom_bar(stat = "identity", + position = "stack", + fill = "#1E88E5") + + coord_flip() +ggsave( + filename = paste0(basepath, "figures/07a_diseaseEnrichment_allRegions.png"), + width = 8, + height = 8 +) diff --git a/01_DiffExp/07b_clusterProfilerAnalysis_HIP.R b/01_DiffExp/07b_clusterProfilerAnalysis_HIP.R new file mode 100644 index 0000000..b39b450 --- /dev/null +++ b/01_DiffExp/07b_clusterProfilerAnalysis_HIP.R @@ -0,0 +1,80 @@ +################################################## +## Project: DexStim Mouse Brain +## Date: 19.04.2021 +## Author: Nathalie +################################################## +# Functional annotation for HIP with clusterProfiler +# make figure for manuscript + +library(clusterProfiler) +library(DOSE) +library(org.Mm.eg.db) +library(biomaRt) +library(ggplot2) +library(dplyr) +library(data.table) + + +basepath <- "~/Documents/ownCloud/DexStim_RNAseq_Mouse/" + +regions <- + c("vDG", "dDG", "vCA1", "dCA1") + + + +# 1. Read DE tables from HIP regions ---------- + +list_reg_sig <- list() +background <- NULL + +for (reg in regions) { + res <- + fread( + file = paste0( + basepath, + "tables/02_", + reg, + "_deseq2_Dex_1_vs_0_lfcShrink.txt" + ), + sep = "\t" + ) + na_indices <- which(is.na(res$padj)) + res$padj[na_indices] <- 1 + res_sig <- res[res$padj <= 0.1, ] + # res_sig <- res[res$log2FoldChange >= 1] + list_reg_sig[[reg]] <- res_sig + background <- res$Ensembl_ID +} + + + +# 2. Concatenate DE tables ----------------- + +data <- bind_rows(list_reg_sig, .id = "region") + + + +# 3. GO enrichment ------------------------- + +# IMPORTANT: which background? + +for (reg in regions){ + + genes <- list_reg_sig[[reg]]$Ensembl_ID + # background <- unique(data$Ensembl_ID) + + # TODO: decide on maxGSSize --> with 10000 very similar results to anRichment + ego <- enrichGO(gene = genes, + universe = background, + OrgDb = org.Mm.eg.db, + keyType = "ENSEMBL", + ont = "BP", + pAdjustMethod = "BH", + pvalueCutoff = 0.01, + qvalueCutoff = 0.05, + minGSSize = 10, # min number of genes associated with GO term + maxGSSize = 10000, # max number of genes associated with GO term + readable = TRUE) + print(head(ego, n = 20)) + +} diff --git a/01_DiffExp/08_gsea_deseq2.R b/01_DiffExp/08_gsea_deseq2.R new file mode 100644 index 0000000..aabf5f8 --- /dev/null +++ b/01_DiffExp/08_gsea_deseq2.R @@ -0,0 +1,45 @@ +################################################## +## Project: DexStim Mouse Brain +## Date: 08.09.2020 +## Author: Nathalie +################################################## +# Gene Set Enrichment Analysis of genes with high fold change +# !!! too many gene sets tested --> nothing remains sig after multiple testing !!! + +library(fgsea) +library(org.Mm.eg.db) + +region <- "PFC" + +folder_plots <- paste0("figures") +folder_tables <- paste0("tables") + +res <- read.table(file=paste0(folder_tables, "/02_", region, "_deseq2_Dex_0_vs_1_lfcShrink_lfc0.5.txt"),sep="\t") +res <- res[order(res$log2FoldChange),] +ranks <- res$log2FoldChange +entrez <- mapIds(org.Mm.eg.db, keys = rownames(res), keytype = "ENSEMBL", column="ENTREZID") +names(ranks) <- entrez +head(ranks) + + +barplot(sort(ranks, decreasing = T)) +data(examplePathways) + +fgseaRes <- fgsea(examplePathways, ranks) + +head(fgseaRes[order(padj, -abs(NES)), ], n=20) +fgseaRes <- fgseaRes[order(padj, -abs(NES)),] + +topUp <- fgseaRes %>% + filter(ES > 0) %>% + top_n(10, wt=-pval) +topDown <- fgseaRes %>% + filter(ES < 0) %>% + top_n(10, wt=-pval) +topPathways <- bind_rows(topUp, topDown) %>% + arrange(-ES) +plotGseaTable(examplePathways[topPathways$pathway], + ranks, + fgseaRes, + gseaParam = 0.5) + diff --git a/01_DiffExp/09_tablesAnthi.R b/01_DiffExp/09_tablesAnthi.R new file mode 100644 index 0000000..efffc85 --- /dev/null +++ b/01_DiffExp/09_tablesAnthi.R @@ -0,0 +1,166 @@ +################################################## +## Project: DexStim Mouse Brain +## Date: 19.09.2020 +## Author: Nathalie +################################################## +# Make tables for Anthi + +setwd("~/Documents/ownCloud/DexStim_RNAseq_Mouse") + +library(org.Mm.eg.db) +library(dplyr) +library(anRichment) +library(anRichmentMethods) + +regions <- c("AMY", "PFC", "PVN", "CER", "vDG", "dDG", "vCA1", "dCA1") + +folder_plots <- paste0("figures") +folder_tables <- paste0("tables") + + +# 1. read DE tables from all regions ---------- + +list_reg <- list() +for (reg in regions){ + res <- read.table(file=paste0(folder_tables, "/02_", reg, "_deseq2_Dex_1_vs_0_lfcShrink.txt"),sep="\t") + res <- res[res$padj <= 0.1,] + res$ensembl_id <- rownames(res) + # res$padj[which(is.na(res$padj))] <- 1 + res$gene_symbol <- mapIds(org.Mm.eg.db, keys = res$ensembl_id, + keytype = "ENSEMBL", column="SYMBOL") + res$entrez <- mapIds(org.Mm.eg.db, keys = res$ensembl_id, + keytype = "ENSEMBL", column="ENTREZID") + list_reg[[reg]] <- res +} + + +# 2. check uniqueness of DE genes --------------- + +for (reg in regions){ + index_reg <- which(regions == reg) + # df <- bind_rows(list_reg[-index_reg], .id="region") %>% + # filter(DE0.1) + df <- bind_rows(list_reg[-index_reg], .id="region") + # find regions where gene is also differentially expressed + list_reg[[reg]]$regions_DE <- sapply(list_reg[[reg]]$ensembl_id, + function(x) paste(df[df$ensembl_id == x,]$region, collapse = " ")) + # boolean if gene is DE uniquely in this region + list_reg[[reg]]$unique_DE <- sapply(list_reg[[reg]]$regions_DE, + function(x) x == "") +} + + +# 3. GO enrichment for the genes of each region ------------------ + +go_enrichment_all <- function(df_reg, GOcoll, unique){ + if (unique){ + genes <- df_reg$entrez[df_reg$unique_DE] + } else { + genes <- df_reg$entrez + } + background <- read.table(file = paste0(folder_tables, "/06_background_entrezID.txt"), + header = FALSE) + modules <- rep("not_significant", nrow(background)) + modules[which(background$V1 %in% genes)] <- "significant" + + # enrichment + GOenrichment <- enrichmentAnalysis( + classLabels = modules, + identifiers = background$V1, + refCollection = GOcoll, + useBackground = "given", + nBestDataSets = length(GOcoll$dataSets), + # threshold = 0.1, + # thresholdType = "Bonferroni", + getOverlapEntrez = TRUE, + getOverlapSymbols = TRUE, + ignoreLabels = "not_significant", + maxReportedOverlapGenes = 500 + ) + + enrichmentTable <- GOenrichment$enrichmentTable %>% + filter(nCommonGenes > 10, pValue <= 0.1) + + return(enrichmentTable) +} + +GOcollection <- buildGOcollection(organism = "mouse") +# GO.BPcollection = subsetCollection(GOcollection, tags = "GO.BP") +for (reg in regions){ + + go_enr_unique <- go_enrichment_all(list_reg[[reg]], GOcollection, TRUE) + write.table(go_enr_unique, file = paste0(folder_tables, "/09_", reg, "_GOterms_unique.txt"), + quote = FALSE, sep = "\t") + go_enr_all <- go_enrichment_all(list_reg[[reg]], GOcollection, FALSE) + write.table(go_enr_all, file = paste0(folder_tables, "/09_", reg, "_GOterms_all.txt"), + quote = FALSE, sep = "\t") + + list_reg[[reg]]$GOterms_unique <- sapply(list_reg[[reg]]$entrez, + function(x) paste(go_enr_unique$dataSetName[which(str_detect(go_enr_unique$overlapGenes, x))], collapse="|")) + list_reg[[reg]]$GOterms_all <- sapply(list_reg[[reg]]$entrez, + function(x) paste(go_enr_all$dataSetName[which(str_detect(go_enr_all$overlapGenes, x))], collapse="|")) +} + + +# 4. Print df of each brain region to file ------------------- + +for (reg in regions){ + list_reg[[reg]]$ensembl_id <- NULL + write.csv(list_reg[[reg]], file = paste0(folder_tables, "/09_", reg, "_DEgenes_unique_GOterms.csv"), + quote = FALSE) +} + + +# 5. Print logfoldchange in each region (examples for slides) ----------------------- + +# read all regions with all genes (no pval filtering) +list_reg <- list() +for (reg in regions){ + res <- read.table(file=paste0(folder_tables, "/02_", reg, "_deseq2_Dex_1_vs_0_lfcShrink.txt"),sep="\t") + res$ensembl_id <- rownames(res) + res$gene_symbol <- mapIds(org.Mm.eg.db, keys = res$ensembl_id, + keytype = "ENSEMBL", column="SYMBOL") + res$entrez <- mapIds(org.Mm.eg.db, keys = res$ensembl_id, + keytype = "ENSEMBL", column="ENTREZID") + list_reg[[reg]] <- res +} + +# combine data of all regions (append rows) +df <- bind_rows(list_reg, .id="region") +head(df) +# df <- df %>% +# dplyr::select(region, log2FoldChange, padj, ensembl_id, gene_symbol, entrez) + +# all regions +genes_all <- read.table(file=paste0(folder_tables,"/06_overlap_AMY-CER-PFC-PVN-dDG-vDG-dCA1-vCA1_symbolID.txt")) +for (i in 1:10){ + print(df %>% + filter(gene_symbol == genes_all$V1[i])) +} +fkbp5 <- df %>% + filter(gene_symbol == "Fkbp5") +ggplot(fkbp5, aes(x = region, y = log2FoldChange, fill = padj < 0.1)) + + geom_bar(stat="identity") + + xlab("brain region") + + scale_fill_manual("FDR < 0.1", values = c("TRUE" = "yellowgreen", "FALSE" = "orange")) + + ggtitle("FKBP5: differentially expressed in all brain regions") + + theme_bw() + + theme(text = element_text(size=12)) +ggsave(filename = paste0(folder_plots,"/09_FKBP5_foldchanges.png"), width = 6, height = 4) + +# only CER +genes_cer <- read.table(file=paste0(folder_tables,"/06_unique_CER_symbolID.txt")) +for (i in 1:10){ + print(df %>% + filter(gene_symbol == genes_cer$V1[i])) +} +tgfb3 <- df %>% + filter(gene_symbol == "Tgfb3") +ggplot(tgfb3, aes(x = region, y = log2FoldChange, fill = padj < 0.1)) + + geom_bar(stat="identity") + + xlab("brain region") + + scale_fill_manual("FDR < 0.1", values = c("TRUE" = "yellowgreen", "FALSE" = "orange")) + + ggtitle("TGFB3: differentially expressed only in the Cerebellum") + + theme_bw() + + theme(text = element_text(size=12)) +ggsave(filename = paste0(folder_plots,"/09_TGFB3_foldchanges.png"), width = 6, height = 4) diff --git a/01_DiffExp/10_plotGOsingle.R b/01_DiffExp/10_plotGOsingle.R new file mode 100644 index 0000000..29c9914 --- /dev/null +++ b/01_DiffExp/10_plotGOsingle.R @@ -0,0 +1,126 @@ +################################################## +## Project: DexStim Mouse Brain +## Date: 21.09.2020 +## Author: Nathalie +################################################## +# GO plots single regions + +setwd("~/Documents/ownCloud/DexStim_RNAseq_Mouse") + +library(org.Mm.eg.db) +library(dplyr) +library(stringr) +library(anRichment) +library(anRichmentMethods) + +regions <- c("AMY", "PFC", "PVN", "CER", "vDG", "dDG", "vCA1", "dCA1") + +folder_plots <- paste0("figures") +folder_tables <- paste0("tables") + + +# 1. read DE tables from all regions ---------- + +list_reg <- list() +for (reg in regions){ + res <- read.table(file=paste0(folder_tables, "/02_", reg, "_deseq2_Dex_1_vs_0_lfcShrink.txt"),sep="\t") + res <- res[res$padj <= 0.1,] + res$ensembl_id <- rownames(res) + # res$padj[which(is.na(res$padj))] <- 1 + res$gene_symbol <- mapIds(org.Mm.eg.db, keys = res$ensembl_id, + keytype = "ENSEMBL", column="SYMBOL") + res$entrez <- mapIds(org.Mm.eg.db, keys = res$ensembl_id, + keytype = "ENSEMBL", column="ENTREZID") + list_reg[[reg]] <- res +} + + +# 2. check uniqueness of DE genes --------------- + +for (reg in regions){ + index_reg <- which(regions == reg) + # df <- bind_rows(list_reg[-index_reg], .id="region") %>% + # filter(DE0.1) + df <- bind_rows(list_reg[-index_reg], .id="region") + list_reg[[reg]]$regions_DE <- sapply(list_reg[[reg]]$ensembl_id, + function(x) paste(df[df$ensembl_id == x,]$region, collapse = " ")) + list_reg[[reg]]$unique_DE <- sapply(list_reg[[reg]]$regions_DE, + function(x) x == "") +} + + +# 3. GO enrichment for the genes of each region ------------------ + +go_enrichment_all <- function(df_reg, GOcoll, unique){ + if (unique){ + genes <- df_reg$entrez[df_reg$unique_DE] + } else { + genes <- df_reg$entrez + } + background <- read.table(file = paste0(folder_tables, "/06_background_entrezID.txt"), + header = FALSE) + modules <- rep("not_significant", nrow(background)) + modules[which(background$V1 %in% genes)] <- "significant" + + # enrichment + GOenrichment <- enrichmentAnalysis( + classLabels = modules, + identifiers = background$V1, + refCollection = GOcoll, + useBackground = "given", + nBestDataSets = length(GOcoll$dataSets), + # threshold = 0.1, + # thresholdType = "Bonferroni", + getOverlapEntrez = TRUE, + getOverlapSymbols = TRUE, + ignoreLabels = "not_significant", + maxReportedOverlapGenes = 500 + ) + + enrichmentTable <- GOenrichment$enrichmentTable + + return(enrichmentTable) +} + +GOcollection <- buildGOcollection(organism = "mouse") +list_GO <- list() +# GO.BPcollection = subsetCollection(GOcollection, tags = "GO.BP") +for (reg in regions){ + + go_enr_unique <- go_enrichment_all(list_reg[[reg]], GOcollection, TRUE) + # go_enr_all <- go_enrichment_all(list_reg[[reg]], GOcollection, FALSE) + list_GO[[reg]] <- go_enr_unique + +} + + +# 4. Plot GO terms +df_all <- bind_rows(list_GO, .id="region") +for (reg in regions){ + + df_reg <- list_GO[[reg]] %>% + filter(nCommonGenes >= 10, pValue <= 0.1) %>% + group_by(inGroups) %>% slice_min(order_by = pValue, n = 10) + + df <- df_all[df_all$dataSetName %in% df_reg$dataSetName,] + df$dataSetName <- sapply(df$dataSetName, function(x) str_trunc(x, 45, "right")) + df$dataSetName <- factor(df$dataSetName, levels = rev(reorder(df$dataSetName[df$region==reg], df$pValue[df$region==reg]))) + df$region <- factor(df$region, levels = c("AMY", "CER", "PFC", "PVN", "vDG", "dDG", "vCA1", "dCA1")) + df$inGroups <- factor(df$inGroups) + levels(df$inGroups) <- c("Biological Process", "Cellular Components", "Molecular Function") + + # Plot results (plotted pvalues are not adjusted for multiple testing) + df %>% + arrange(desc(dataSetName)) %>% + ggplot(aes(x=dataSetName, y = -log10(pValue), fill = region)) + + geom_bar(position = position_dodge2(reverse=TRUE), stat="identity") + + geom_hline(yintercept = -log10(0.1),linetype="dashed", color = "red") + + coord_flip() + + xlab("GOterm") + + ggtitle(paste0("GO terms enriched for DE genes only in ",reg, " (Top 10 each)")) + + facet_wrap(~inGroups, scales="free") + + theme(axis.text.y = element_text(size = 10)) + ggsave(filename = paste0(folder_plots, "/10_", reg, "_GOterms_unique.png"), width = 13, height = 7) +} + + diff --git a/01_DiffExp/12_meanExp_regionDex.R b/01_DiffExp/12_meanExp_regionDex.R new file mode 100644 index 0000000..0854855 --- /dev/null +++ b/01_DiffExp/12_meanExp_regionDex.R @@ -0,0 +1,42 @@ +################################################## +## Project: DexStim Mouse Brain +## Date: 09.10.2020 +## Author: Nathalie +################################################## +# Parse expression data to mean value per Region/Dex status + +library(data.table) +library(tidyr) +library(dplyr) +library(stringr) +library(org.Mm.eg.db) + +folder_table <- "/Users/nathalie_gerstner/Documents/ownCloud/DexStim_RNAseq_Mouse/tables" +output_file <- file.path(folder_table, "12_meanExp_regionDex.csv") +files <- list.files(path = folder_table, pattern = "deseq2_expression_vsd.txt$", full.names = TRUE) + +# 1. Read expression table for each region +expr_list <- lapply(files, function(x) as.data.frame(t(as.matrix(fread(x),rownames=1)))) + +# 2. Merge expression tables and remove columns with NAs +expr_all <- bind_rows(expr_list) %>% + dplyr::select(where(~!any(is.na(.)))) %>% # maybe change this and set NAs to 0 + tibble::rownames_to_column("sample") %>% # copy rownames to column + separate(sample, c("region", "dex"), "_") # separate former row name into region and dex status +expr_all$dex <- str_remove(expr_all$dex, "\\d+") # remove mouse number of dex status + +# 3. Get mean expression per region/dex group +expr_mean <- expr_all %>% + group_by(region, dex) %>% + summarise(across(everything(), mean)) %>% # mean per group for all genes + mutate(x = paste(region, dex, sep="_")) %>% # concatenate region and dex + tibble::column_to_rownames("x") %>% # and use as rownames + dplyr::select(-region,-dex) # remove region and dex column +expr_mean <- as.data.frame(t(expr_mean)) + +# 4. Add gene symols +expr_mean$gene_symbol <- mapIds(org.Mm.eg.db, keys = rownames(expr_mean), + keytype = "ENSEMBL", column="SYMBOL") + +# 4. Write mean values to file +fwrite(expr_mean, file = output_file, row.names = TRUE) diff --git a/02_CoExp_Kimono/.DS_Store b/02_CoExp_Kimono/.DS_Store new file mode 100644 index 0000000..0392bc9 Binary files /dev/null and b/02_CoExp_Kimono/.DS_Store differ diff --git a/02_CoExp_Kimono/02d_kimono_singleRegion_diffGRN/00_parseFunCoup.R b/02_CoExp_Kimono/02d_kimono_singleRegion_diffGRN/00_parseFunCoup.R new file mode 100644 index 0000000..a4a8b43 --- /dev/null +++ b/02_CoExp_Kimono/02d_kimono_singleRegion_diffGRN/00_parseFunCoup.R @@ -0,0 +1,61 @@ +################################################## +## Project: DexStim Mouse Brain +## Date: 29.09.2020 +## Author: Nathalie +################################################## +# Parse FunCoup mouse data as input for kimono + +library(data.table) +library(dplyr) +library(org.Mm.eg.db) +library(igraph) + +basepath <- "/Users/nathalie_gerstner/Documents/ownCloud/DexStim_RNAseq_Mouse/" +funcoup_file <- file.path(basepath, "data/kimono_input/FC5.0_M.musculus_compact.gz") +output_file <- file.path(basepath,"data/kimono_input/prior_expr_funcoup_mm.csv") + +# 1. Read file +funcoup_mus <- fread(funcoup_file, col.names = c("PFC", "FBS", "Gene_A", "Gene_B")) #%>% + #filter(PFC >= 0.4) + + +# 2. Write interactions with ENSEMBL IDs to file +funcoup_ens <- funcoup_mus %>% + dplyr::select('Gene_A', 'Gene_B') +fwrite(funcoup_ens, file = output_file) + + +# 3. Plot some statistics +funcoup_app <- c(funcoup_ens$Gene_A, funcoup_ens$Gene_B) +funcoup_unique <- unique(funcoup_app) +hist(table(funcoup_app)) +max(table(funcoup_app)) + + +# 4. Read 63 genes from Zimmermann/Arloth Paper +# And check if they interact in GeneMANIA +GR_genes <- "/Users/nathalie_gerstner/Documents/ownCloud/DexStim_RNAseq_Mouse/data/kimono_input/63genes_ZimmermannPaper.csv" +GRgenes <- fread(GR_genes) +funcoup_GR <- funcoup_mus %>% filter(Gene_A %in% GRgenes$Ensembl & Gene_B %in% GRgenes$Ensembl) + + +# Function to change all ensembl ids in network to gene symbols +ensemblToSymbol <- function(net){ + net$Gene_A <- mapIds(org.Mm.eg.db, keys = net$Gene_A, + keytype = "ENSEMBL", column="SYMBOL") + net$Gene_B <- mapIds(org.Mm.eg.db, keys = net$Gene_B, + keytype = "ENSEMBL", column="SYMBOL") + return(net) +} + +funcoup_GR <- ensemblToSymbol(funcoup_GR) +funcoup_GR <- funcoup_GR[,c("Gene_A", "Gene_B")] +df.g <- graph.data.frame(d = funcoup_GR, directed = FALSE) + +png(filename = paste0(basepath,"figures/02_CoExp_Kimono/05_funcoup_63genes.png"), width = 900, height = 900) +plot(df.g, vertex.label = V(df.g)$name, + layout=layout.fruchterman.reingold(df.g), + vertex.label.color= adjustcolor("black", .8), + vertex.label.cex = 0.7, + vertex.size=10) +dev.off() diff --git a/02_CoExp_Kimono/02d_kimono_singleRegion_diffGRN/01_parsePhenotypes.R b/02_CoExp_Kimono/02d_kimono_singleRegion_diffGRN/01_parsePhenotypes.R new file mode 100644 index 0000000..0b16557 --- /dev/null +++ b/02_CoExp_Kimono/02d_kimono_singleRegion_diffGRN/01_parsePhenotypes.R @@ -0,0 +1,38 @@ +################################################## +## Project: DexStim Mouse Brain +## Date: 16.10.2020 +## Author: Nathalie +################################################## +# Parse phenotype data as input for kimono + +library(data.table) +library(dplyr) +library(tidyr) + +basepath <- "/Users/nathalie_gerstner/Documents/ownCloud/DexStim_RNAseq_Mouse/" +folder_table <- paste0(basepath,"tables/") +output_file <- paste0(basepath,"data/kimono_input/phenotypes_mm_") +files <- list.files(path = folder_table, pattern = "_deseq2_bio_variables.txt$", full.names = TRUE) + + +# 1. Read covariable table for each region +biol_list <- lapply(files, fread) +biol_all <- bind_rows(biol_list) + + +# 2. Fill empty SV cells with 0 +biol_all <- biol_all %>% mutate_all(~replace(., is.na(.), 0)) + + +# 3. Single regions and dex status +regions <- unique(biol_all$Region) +dex <- c(0,1) + +# 4. Print table of each region to file +for (reg in regions){ + for (d in dex){ + phen <- biol_all %>% filter(Region == reg, Dex == d) %>% + dplyr::select(-Region, -Dex) + fwrite(phen, file = paste0(output_file, reg, "_dex", d, ".csv")) + } +} diff --git a/02_CoExp_Kimono/02d_kimono_singleRegion_diffGRN/02_parseExpression.R b/02_CoExp_Kimono/02d_kimono_singleRegion_diffGRN/02_parseExpression.R new file mode 100644 index 0000000..58a94f9 --- /dev/null +++ b/02_CoExp_Kimono/02d_kimono_singleRegion_diffGRN/02_parseExpression.R @@ -0,0 +1,47 @@ +################################################## +## Project: DexStim Mouse Brain +## Date: 16.10.2020 +## Author: Nathalie +################################################## +# Parse expression data as input for kimono + +library(data.table) +library(dplyr) + +regions <- c("AMY", "CER", "PFC", "PVN", "vDG", "dDG", "vCA1", "dCA1") +dex <- c(0,1) + +basepath <- "/Users/nathalie_gerstner/Documents/ownCloud/DexStim_RNAseq_Mouse/" +folder_table <- paste0(basepath,"tables/") +output_file <- paste0(basepath,"data/kimono_input/expression_mm_") +files <- list.files(path = folder_table, pattern = "deseq2_expression_vsd.txt$", full.names = TRUE) + +# 1. Read expression table for each region +expr_list <- lapply(files, function(x) as.data.frame(t(as.matrix(fread(x),rownames=1)))) + +# 2. Merge expression tables and remove columns with NAs +expr_all <- bind_rows(expr_list) %>% + dplyr::select(where(~!any(is.na(.)))) # maybe change this and set NAs to 0 + +# 3a. Write expression table to file +for (reg in regions){ + for (d in dex){ + dex_str <- ifelse(d == 0, "CNTRL", "DEX") + phen <- expr_all[startsWith(row.names(expr_all),reg),] + phen <- phen[grepl(dex_str, row.names(phen)),] + fwrite(phen, file = paste0(output_file, reg, "_dex", d, ".csv"), row.names = TRUE) + } +} + +# 3b. Scale all expression values together and write to file +expr_scaled <- as.data.frame(scale(expr_all)) +# expr_scaled[,1:10] +# expr_all[,1:10] +for (reg in regions){ + for (d in dex){ + dex_str <- ifelse(d == 0, "CNTRL", "DEX") + phen <- expr_scaled[startsWith(row.names(expr_scaled),reg),] + phen <- phen[grepl(dex_str, row.names(phen)),] + fwrite(phen, file = paste0(output_file, reg, "_dex", d, "_scaled.csv"), row.names = TRUE) + } +} diff --git a/02_CoExp_Kimono/02d_kimono_singleRegion_diffGRN/03_parseMapGeneBio.R b/02_CoExp_Kimono/02d_kimono_singleRegion_diffGRN/03_parseMapGeneBio.R new file mode 100644 index 0000000..4145e38 --- /dev/null +++ b/02_CoExp_Kimono/02d_kimono_singleRegion_diffGRN/03_parseMapGeneBio.R @@ -0,0 +1,30 @@ +################################################## +## Project: DexStim Mouse Brain +## Date: 16.10.2020 +## Author: Nathalie +################################################## +# Mapping between genes and phenotypes as input for kimono +# Sufficient to do this once, not for each region and dex status + +library(data.table) +library(dplyr) +library(tidyr) + +basepath <- "/Users/nathalie_gerstner/Documents/ownCloud/DexStim_RNAseq_Mouse/" +folder_table <- paste0(basepath,"tables/") +output_file <- paste0(basepath,"data/kimono_input/prior_expr_bio_mm_regDex.csv") +pheno_file <- paste0(basepath,"data/kimono_input/phenotypes_mm_AMY_dex0.csv") +expr_file <- paste0(basepath,"data/kimono_input/expression_mm_AMY_dex0.csv") + + +# 1. Read phenotype and expression file +pheno <- fread(pheno_file) +expr <- fread(expr_file) + + +# 2. Create all pairs of pheno and gene +map <- tidyr::crossing("gene" = colnames(expr)[-1], "bio" = colnames(pheno)[-1]) + + +# 3. Write mapping to file +fwrite(map, output_file) diff --git a/02_CoExp_Kimono/02d_kimono_singleRegion_diffGRN/04_runKimono.R b/02_CoExp_Kimono/02d_kimono_singleRegion_diffGRN/04_runKimono.R new file mode 100644 index 0000000..c8cec9a --- /dev/null +++ b/02_CoExp_Kimono/02d_kimono_singleRegion_diffGRN/04_runKimono.R @@ -0,0 +1,130 @@ +#!/usr/bin/env Rscript + +############################################### + +args = commandArgs(trailingOnly=TRUE) + +# test if there is at least one argument: if not, return an error +# if (length(args)!=5) { +# stop("Please supply expression.csv, phenotypes.csv, prior_expr_bio.csv and output file", call.=FALSE) +# } +reg <- "PVN" +d <- 0 + +############################################## +# manual(if not snakemake): data files +args[[1]]=paste0("/Users/nathalie_gerstner/Documents/ownCloud/DexStim_RNAseq_Mouse/data/kimono_input/expression_mm_",reg,"_dex",d,".csv") +args[[2]]=paste0("/Users/nathalie_gerstner/Documents/ownCloud/DexStim_RNAseq_Mouse/data/kimono_input/phenotypes_mm_",reg,"_dex",d,".csv") +args[[3]]="/Users/nathalie_gerstner/Documents/ownCloud/DexStim_RNAseq_Mouse/data/kimono_input/prior_expr_bio_mm_regDex.csv" +args[[4]]="/Users/nathalie_gerstner/Documents/ownCloud/DexStim_RNAseq_Mouse/data/kimono_input/prior_expr_funcoup_mm.csv" +args[[5]]=paste0("/Users/nathalie_gerstner/Documents/ownCloud/DexStim_RNAseq_Mouse/tables/coExpression_kimono/04_singleRegion_",reg,"_dex",d,"_funcoup.csv") + +############################################### +### 1 libraries & code +############################################### + +libraries <- c("data.table", "tidyr", "ggplot2", "reshape2", "oem", "ramify", "stringr", + "magrittr", "foreach", "doParallel", "dplyr", "furrr", "purrr") +lapply(libraries, require, character.only = TRUE) + +source("/Users/nathalie_gerstner/Documents/ownCloud/DexStim_RNAseq_Mouse/scripts/02_CoExp_Kimono/kimono_stability/infer_sgl_model.R") +source('/Users/nathalie_gerstner/Documents/ownCloud/DexStim_RNAseq_Mouse/scripts/02_CoExp_Kimono/kimono_stability/utility_functions.R') +source('/Users/nathalie_gerstner/Documents/ownCloud/DexStim_RNAseq_Mouse/scripts/02_CoExp_Kimono/kimono_stability/kimono.R') + +print("libraries & moni main done") + + + +############################################### +### 2 Data +############################################### + +mrna <- as.data.frame(as.matrix(fread(args[1]),rownames=1)) +# mrna <- mrna[1:5,] + + +############################################### + +biological <- as.data.frame(as.matrix(fread(args[2]),rownames=1)) + +# (make sure the rows are in the same order) +idorder <- as.character(rownames(biological)) +mrna <- mrna[match(idorder, rownames(mrna)),] + + +print("read input") +############################################### +############################################### + +# mapping + +prior_mrna_bio <- fread(args[[3]]);prior_mrna_bio[1:5,] +prior_mrna <- fread(args[[4]]); prior_mrna[1:5,] + +print("read mapping") + + + + +############################################### +# 3 Assemble into lists +############################################### + +#set input parameters +input_list <- list( + as.data.table(mrna), + # as.data.table(mrna), + as.data.table(biological) +) +names(input_list) <- c('mrna', + 'biological') + +######################### +mapping_list <- list( + as.data.table(prior_mrna_bio), + as.data.table(prior_mrna) +) +######################### +metainfo <-data.frame('ID' = c('mrna_bio', 'prior_mrna'), + 'main_to' = c(2,1) +) +print("data created") + + +############################################### +# 4 Run MONI +############################################### + +# parallel +options(future.globals.maxSize = 30000 * 1024^2) # more memory for each thread +plan(multisession, workers = 12) # 12 parallel + + + +# start +start_time <- Sys.time();start_time + +node_list <- colnames(input_list$mrna)[1:10] +results <- future_map(node_list, run_kimono_para, stab_sel = TRUE, .progress = TRUE, .options = future_options(seed = TRUE)) + +# falls nicht parallelisiert: +# results <- kimono(input_list, mapping_list, metainfo, main_layer = 1, min_features = 5, sel_iterations = 10, core = 2) + + +Sys.time()- start_time +results <- do.call(rbind, results) # make one big data table + +############################################### + + +print("kimono done"); end_time <- Sys.time();end_time - start_time + + + +#end + +fwrite(results,file=args[5], + row.names = FALSE,quote = FALSE) +print("table written") + + diff --git a/02_CoExp_Kimono/02d_kimono_singleRegion_diffGRN/04_runKimono_funcoup.R b/02_CoExp_Kimono/02d_kimono_singleRegion_diffGRN/04_runKimono_funcoup.R new file mode 100644 index 0000000..badee18 --- /dev/null +++ b/02_CoExp_Kimono/02d_kimono_singleRegion_diffGRN/04_runKimono_funcoup.R @@ -0,0 +1,130 @@ +#!/usr/bin/env Rscript + +############################################### + +args = commandArgs(trailingOnly=TRUE) + +# read region, dex status and startnode +reg <- args[[1]] +#reg <- "dCA1" +d <- args[[2]] +#d <- 0 +startnode <- as.numeric(args[[3]]) + +############################################## +# manual(if not snakemake): data files +basepath <- "/binder/mgp/workspace/DexStim_RNAseq_Mouse/" +input_expr <- paste0(basepath,"data/kimono_input/expression_mm_",reg,"_dex",d,".csv") +input_pheno <- paste0(basepath,"data/kimono_input/phenotypes_mm_",reg,"_dex",d,".csv") +input_prior_bio <- paste0(basepath,"data/kimono_input/prior_expr_bio_mm_regDex.csv") +input_prior <- paste0(basepath,"data/kimono_input/prior_expr_funcoup_mm.csv") +output <- paste0(basepath,"tables/coExpression_kimono/04_singleRegion_",reg,"_dex",d,"_funcoup_parallel_",startnode,".csv") + +############################################### +### 1 libraries & code +############################################### + +#.libPaths( c( .libPaths(), "/binder/mgp/workspace/DexStim_RNAseq_Mouse/Rpackages/") ) +#Sys.setenv(R_LIBS = paste("/binder/mgp/workspace/DexStim_RNAseq_Mouse/Rpackages/", Sys.getenv("R_LIBS"), sep=.Platform$path.sep)) +libraries <- c("data.table", "tidyr", "ggplot2", "reshape2", "oem", "ramify", "stringr", + "magrittr", "foreach", "doParallel", "dplyr", "furrr", "purrr") +lapply(libraries, require, character.only = TRUE) + +source(paste0(basepath,"scripts/02_CoExp_Kimono/kimono_stability/infer_sgl_model.R")) +source(paste0(basepath,'scripts/02_CoExp_Kimono/kimono_stability/utility_functions.R')) +source(paste0(basepath,'scripts/02_CoExp_Kimono/kimono_stability/kimono.R')) + +print("libraries & moni main done") + + + +############################################### +### 2 Data +############################################### + +mrna <- as.data.frame(as.matrix(fread(input_expr),rownames=1)) +# mrna <- mrna[1:5,] + + +############################################### + +biological <- as.data.frame(as.matrix(fread(input_pheno),rownames=1)) + +# (make sure the rows are in the same order) +idorder <- as.character(rownames(biological)) +mrna <- mrna[match(idorder, rownames(mrna)),] + + +print("read input") +############################################### +############################################### + +# mapping + +prior_mrna_bio <- fread(input_prior_bio);prior_mrna_bio[1:5,] +prior_mrna <- fread(input_prior); prior_mrna[1:5,] + +print("read mapping") + + + + +############################################### +# 3 Assemble into lists +############################################### + +#set input parameters +input_list <- list( + as.data.table(mrna), + # as.data.table(mrna), + as.data.table(biological) +) +names(input_list) <- c('mrna', + 'biological') + +######################### +mapping_list <- list( + as.data.table(prior_mrna_bio), + as.data.table(prior_mrna) +) +######################### +metainfo <-data.frame('ID' = c('mrna_bio', 'prior_mrna'), + 'main_to' = c(2,1) +) +print("data created") + + +############################################### +# 4 Run MONI +############################################### + +# parallel +options(future.globals.maxSize = 30000 * 1024^2) # more memory for each thread +plan(multisession, workers = 12) + +# start +start_time <- Sys.time();start_time + + +endnode <- min(startnode+1000, length(colnames(input_list$mrna))) +node_list <- colnames(input_list$mrna)[startnode:endnode] +results <- future_map(node_list, run_kimono_para, stab_sel = TRUE, niterations = 20, .options = furrr_options(seed = TRUE)) #add one more layer of parallelization in kimono.R (seeds) + + +Sys.time()- start_time +results <- do.call(rbind, results) # make one big data table + +############################################### + + +print("kimono done"); end_time <- Sys.time();end_time - start_time + + + +#end + +fwrite(results,file=output, + row.names = FALSE,quote = FALSE) +print("table written") + + diff --git a/02_CoExp_Kimono/02d_kimono_singleRegion_diffGRN/05_network.R b/02_CoExp_Kimono/02d_kimono_singleRegion_diffGRN/05_network.R new file mode 100644 index 0000000..76a5e95 --- /dev/null +++ b/02_CoExp_Kimono/02d_kimono_singleRegion_diffGRN/05_network.R @@ -0,0 +1,225 @@ +################################################## +## Project: DexStim Mouse Brain +## Date: 29.09.2020 +## Author: Nathalie +################################################## +# Analyze kimono output and create network +# Separate on dex and baseline + +library(data.table) +library(dplyr) +library(ggplot2) +library(org.Mm.eg.db) + +reg <- "AMY" +d <- 0 + +basepath <- "/Users/nathalie_gerstner/Documents/ownCloud/DexStim_RNAseq_Mouse/" +kimono_results <- paste0(basepath,"tables/coExpression_kimono/04_singleRegion_",reg,"_dex",d,"_funcoup.csv") +GR_genes <- paste0(basepath,"data/kimono_input/63genes_ZimmermannPaper.csv") + +# 1. Load Kimono results and filter them +d <- fread(kimono_results) +#dtmoni <- d[value!=0,]; nrow(dtmoni) +dtmoni <- d +network <- dtmoni %>% #filter((value > 0.001) | (value < (-0.001))) %>% + #filter(performance > 0.01) %>% + filter(predictor != '(Intercept)') %>% setDT + +png(filename = paste0(basepath, "figures/02_CoExp_Kimono/05_singleRegion_",reg,"_funcoup_allbetas.png")) +hist(abs(network$value), breaks = 500, + main = paste("Distribution of all beta values in", reg), + xlab = "Beta",xlim = c(0,0.1)) +abline(v = mean(abs(network$value)), lty = 3) +dev.off() + +hist(table(network$target)) + + +# 2. Inspect associations +dtmoni$target %>% unique %>% length +dtmoni[relation=="mrna_bio", predictor] %>% unique + +ggplot(network, aes(relation)) + geom_bar(fill="lightblue") + +network[,.N, by=relation] +network[, .N, by=predictor][order(-N)][1:100] + +nrow(network)/nrow(dtmoni) # retained fraction after filtering + +paste("Number of unique genes:", + length(unique(network$target))) +paste("Number of unique predictors:", + length(unique(network$predictor))) + +# unique predictors +unique(network, by=c("relation", "predictor")) %>% .[,.N, by=relation] + + +# 3. Create network +library(purrr) +library(igraph) +unique(network$relation) + +actors<-unique(c(network$predictor,network$target)) + +relations <- data.frame(from=network$predictor, + to=network$target, + value=network$value, + performance=network$performance) + +# network +g <- graph_from_data_frame(relations, directed=FALSE, vertices=actors) +print(g) + + +# 4. Plot network +plot_network <- function(subnet){ + subnet <- data.frame(lapply(subnet, as.character), stringsAsFactors=FALSE) + subnet$target <- gsub("_", ".", subnet$target) + subnet$predictor <- gsub("_", ".", subnet$predictor) + subnet$relation <- gsub("_",".", subnet$relation) + + lable <- unique(c(paste0("mrna_",subnet[,1]),paste0(subnet$relation,"_",subnet[,2]))) + + actors <- data.frame(name=do.call(rbind, strsplit(lable,"_") )[,2], + omic= do.call(rbind, strsplit(lable,"_") )[,1] + ) + + + relations <- data.frame(from=subnet$predictor, + to=subnet$target, + value=subnet$value, + performance=subnet$performance) + + actors <- unique(actors) %>% setDT + actors[omic=="prior.mrna", omic:="mrna"] + actors <- unique(actors) + g <- graph_from_data_frame(relations, directed=FALSE, vertices=actors) + + deg2 <- igraph::degree(g, mode="all") + + # plot(g, vertex.label.color="black", vertex.size=10, vertex.label=NA , vertex.label.dist=1.5) + # library(qgraph) + + e <- get.edgelist(g,names=FALSE) + # l <- qgraph.layout.fruchtermanreingold(e,vcount=vcount(g), area=1000*(vcount(g)^2),repulse.rad=100+(vcount(g)^30)) + + library(RColorBrewer) + + actors[, edges:=igraph::degree(g)] + actors[, mynames:=name] + + + actors$id <- 1:nrow(actors) + actors <-actors[order(id)] + summary(actors$edges) + + + # edge_threshold <- (as.numeric(sub('.*:', '', summary(actors$edges)[5])) +as.numeric(sub('.*:', '', summary(actors$edges)[6])))/2 + + #edge_threshold<-500 + #actors[edges>edge_threshold,] + #actors[,.N, by=edges,][order(edges)] + #actors[edges < edge_threshold , mynames:=NA] + + colrs <-c(brewer.pal(4, "Set2")[c(1:4)]) + mycol=colrs[2:3] + actors[omic=="mrna", mycolor:=colrs[2]] + actors[omic=="mrna.bio", mycolor:=colrs[3]] + plot(g, + layout=layout.fruchterman.reingold(g), + vertex.frame.color= adjustcolor("black", .4) , + vertex.size=1+(log(deg2)*2), + vertex.color=actors$mycolor , + edge.color = adjustcolor("grey", .8), + edge.curved=.1, + vertex.label = actors$mynames, + vertex.label.color= adjustcolor("black", .8), + vertex.label.cex = 0.7, asp = 1 , + # vertex.label.family = "Times", + edge.width=E(g)$weight*400, + main = paste("Number of edges:", nrow(subnet))) + legend(x=-1, y=-0.5,c("gene", "covariable"), + pch=21, col="#777777", pt.bg=mycol, pt.cex=2, cex=.8, bty="n", ncol=1) +} + +# Function to change all ensembl ids in network to gene symbols +ensemblToSymbol <- function(net){ + net$target <- mapIds(org.Mm.eg.db, keys = net$target, + keytype = "ENSEMBL", column="SYMBOL") + net$predictor[grepl("ENSM", net$predictor)] <- mapIds(org.Mm.eg.db, keys = net$predictor[grepl("ENSM", net$predictor)], + keytype = "ENSEMBL", column="SYMBOL") + return(net) +} + +# whole network +#plot_network(network) + +# dex network +dex <-network[predictor=="Dex",] +#plot_network(dex) + +# Read 63 genes from Zimmermann/Arloth Paper +GRgenes <- fread(GR_genes) +# Plot the 63 genes and all their connections +net_GR <- network[predictor %in% GRgenes$Ensembl | target %in% GRgenes$Ensembl,] +#png(filename = paste0(basepath,"figures/02_CoExp_Kimono/05_singleRegion_",reg,"_funcoup_63withpredictors.png")) +plot_network(net_GR) +#dev.off() + +# Plot the 63 genes, regions and dex, and the connections between +net_GR <- network[target %in% GRgenes$Ensembl,] +# net_GR <- net_GR[predictor %in% GRgenes$Ensembl, ] +net_GR <- net_GR[predictor %in% GRgenes$Ensembl , ] +plot_network(net_GR) + +png(filename = paste0(basepath, "figures/02_CoExp_Kimono/05_singleRegion_",reg,"_funcoup_63betas.png")) +hist(abs(net_GR$value), breaks = 20, + main = paste("Distribution of beta values in", reg), + xlab = "Beta") +abline(v = mean(abs(net_GR$value)), lty = 3) +dev.off() + +png(filename = paste0(basepath, "figures/02_CoExp_Kimono/05_singleRegion_",reg,"_funcoup_63performance.png")) +hist(net_GR$performance, + main = paste("Distribution of performance values in", reg), + xlab = "Performance") +dev.off() + +net_GR <- ensemblToSymbol(net_GR) +png(filename = paste0(basepath,"figures/02_CoExp_Kimono/05_singleRegion_",reg,"_funcoup_63wofilter.png"), + width = 900, height = 900) +plot_network(net_GR) +dev.off() + +net_GR <- net_GR[abs(value) >= 0.001,] +png(filename = paste0(basepath,"figures/02_CoExp_Kimono/05_singleRegion_",reg,"_funcoup_63filter0.001.png"), + width = 900, height = 900) +plot_network(net_GR) +dev.off() + +# Plot with node degress +degrees <- count(network, target) %>% + mutate(network = "complete") +boxplot(degrees$n) +degrees_prior <- count(network[network$relation == "prior_mrna",], target) %>% + mutate(network = "prior") +boxplot(degrees_prior$n) +degrees_bio <- count(network[network$relation == "mrna_bio",], target) %>% + mutate(network = "biological") +boxplot(degrees_bio$n) + +degrees_comb <- rbind(degrees, degrees_prior, degrees_bio) +degrees_comb$network <- factor(degrees_comb$network, levels = c("complete", "prior", "biological")) + +ggplot(degrees_comb, aes(x = network, y = n, fill = network)) + + geom_boxplot() + + ylab("node degree") +ggsave(filename = paste0(basepath,"figures/02_CoExp_Kimono/05_singleRegion_",reg,"_funcoup_nodeDegrees_wofilter.png")) + + + +# Compare number of edges between 63 genes with number of edges between randomly selected 63 genes +mrna <- d[d$relation == "prior_mrna",] +genes <- unique(d$predictor) diff --git a/02_CoExp_Kimono/02d_kimono_singleRegion_diffGRN/05_network_kimono.R b/02_CoExp_Kimono/02d_kimono_singleRegion_diffGRN/05_network_kimono.R new file mode 100644 index 0000000..902674f --- /dev/null +++ b/02_CoExp_Kimono/02d_kimono_singleRegion_diffGRN/05_network_kimono.R @@ -0,0 +1,170 @@ +################################################## +## Project: DexStim Mouse Brain +## Date: 26.10.2020 +## Author: Nathalie +################################################## +# Analyze kimono output and create network +# Separate on dex and baseline + +library(data.table) +library(dplyr) +library(ggplot2) +library(org.Mm.eg.db) +library(igraph) +library(splineTimeR) + +reg <- "PVN" +d <- 0 + +basepath <- "/Users/nathalie_gerstner/Documents/ownCloud/DexStim_RNAseq_Mouse/" +kimono <- paste0(basepath,"tables/coExpression_kimono/04_singleRegion_",reg,"_dex",d,"_funcoup.csv") +figure_path <- paste0(basepath, "figures/02_CoExp_Kimono/01_CutoffSelection/") +GR_genes <- paste0(basepath,"data/kimono_input/63genes_ZimmermannPaper.csv") +#biogrid_file <- paste0(basepath, "data/kimono_input/prior_expr_biogrid_mm.csv") + +# 1. Read data +data <- fread(kimono) +hist(data$value, breaks = 40000, + xlim = c(-0.001,0.001)) +max(abs(data$value)) +hist(data$performance) +data <- data %>% + filter(performance >= 0.1) + +# 2. Make network statistics for different correlation cutoffs +cutoff_vec <- c(0,0.000001,0.00001,0.0001,0.0005,0.001,0.002,0.005,0.01,0.1) +nr_edges <- rep(x = 0, length(cutoff_vec)) +nr_nodes <- rep(x = 0, length(cutoff_vec)) +degrees <- data.frame(cutoff = numeric(), + node = character(), + degree = numeric()) +link_density <- rep(x = 0, length(cutoff_vec)) +for(i in 1:length(cutoff_vec)){ + + subset <- data[abs(data$value) >= cutoff_vec[i],] + # create an igraph network from dataframe + actors<-unique(c(subset$target,subset$predictor)) + relations <- data.frame(from=subset$target, + to=subset$predictor, + value=subset$value, + performance=subset$performance) + g_subset <- graph_from_data_frame(relations, directed=FALSE, vertices=actors) + + nr_edges[i] <- gsize(g_subset) # number of edges in network + nr_nodes[i] <- gorder(g_subset) # number of nodes in network + link_density[i] <- edge_density(g_subset) # ratio of the number of edges and the number of possible edges + nodedegree <- igraph::degree(g_subset) # node degree for each node in network + degrees <- rbind(degrees, data.frame(cutoff = rep(cutoff_vec[i], length(nodedegree)), node = names(nodedegree), degree = nodedegree)) + +} + +names(nr_edges) <- cutoff_vec +png(filename = paste0(figure_path,"kimono_nr_edges_",reg,".png"), width = 800, height = 600) +barplot(nr_edges, + xlab = "cutoff", + ylab = "number of edges", + main = paste0("Number of edges in kimono network: ", reg)) +dev.off() +names(nr_nodes) <- cutoff_vec +png(filename = paste0(figure_path,"kimono_nr_nodes_",reg,".png"), width = 800, height = 600) +barplot(nr_nodes, + xlab = "cutoff", + ylab = "number of nodes", + main = paste0("Number of nodes in kimono network: ", reg)) +dev.off() +names(link_density) <- cutoff_vec +png(filename = paste0(figure_path,"kimono_link_density_",reg,".png"), width = 800, height = 600) +barplot(link_density, + xlab = "cutoff", + ylab = "link density", + main = paste0("Link density in kimono network: ", reg)) +dev.off() + +ggplot(degrees, aes(x = degree)) + + geom_histogram(position="identity", colour="grey40", alpha=0.2, bins = 100) + + facet_wrap(. ~ cutoff, ncol = 5) + + ggtitle(paste0("Distribution of node degrees for different cutoffs in kimono network: ", reg)) +ggsave(filename = paste0(figure_path,"kimono_node_degrees_",reg,".png"), width = 10, height = 7) + + +# 3. Calculate betweenness and fit powerlaw for different correlation cutoffs +cutoff_vec <- c(0.0001,0.0005,0.001,0.002,0.005) +between <- data.frame(cutoff = numeric(), + node = character(), + betweenness = numeric()) +powerlaw_exponent <- rep(x = 0, length(cutoff_vec)) +for(i in 1:length(cutoff_vec)){ + + subset <- data[abs(data$value) >= cutoff_vec[i],] + # create an igraph network from dataframe + actors<-unique(c(subset$target,subset$predictor)) + relations <- data.frame(from=subset$target, + to=subset$predictor, + value=subset$value, + performance=subset$performance) + g_subset <- graph_from_data_frame(relations, directed=FALSE, vertices=actors) + + nodebetweenness <- betweenness(g_subset, directed = FALSE) # node betweenness: number of shortest paths going through a node + between <- rbind(between, data.frame(cutoff = rep(cutoff_vec[i], length(nodebetweenness)), node = names(nodebetweenness), betweenness = nodebetweenness)) + + # fit a scale free network to data + scaleFreeProp <- networkProperties(g_subset) + powerlaw_exponent[i] <- scaleFreeProp[1,3] + file.rename(from="scale_free_properties.pdf", to = paste0(figure_path,"kimono_scale_free_properties_",reg,"_",cutoff_vec[i],".pdf")) +} + +ggplot(between, aes(x = betweenness)) + + geom_histogram(position="identity", colour="grey40", alpha=0.2, bins = 100) + + facet_wrap(. ~ cutoff, ncol = 5) + + ggtitle(paste0("Distribution of node betweenness for different cutoffs in kimono network: ", reg)) +ggsave(filename = paste0(figure_path,"kimono_node_betweenness_",reg,".png"), width = 10, height = 7) + +names(powerlaw_exponent) <- cutoff_vec +png(filename = paste0(figure_path,"kimono_powerlaw_exponent_",reg,".png"), width = 800, height = 600) +barplot(powerlaw_exponent, + xlab = "cutoff", + ylab = "powerlaw_exponent", + main = paste0("Power-law exponent in kimono network: ", reg)) +dev.off() + + + +# Read 63 genes from Zimmermann/Arloth Paper +GRgenes <- fread(GR_genes) + +cutoff_vec <- c(0,0.000001,0.00001,0.0001,0.0005,0.001,0.002,0.005,0.01,0.1) +nr_edges <- rep(x = 0, length(cutoff_vec)) +nr_nodes <- rep(x = 0, length(cutoff_vec)) +for(i in 1:length(cutoff_vec)){ + + subset <- data[abs(data$value) >= cutoff_vec[i],] + subset <- subset[target %in% GRgenes$Ensembl,] + subset <- subset[predictor %in% GRgenes$Ensembl , ] + # create an igraph network from dataframe + actors<-unique(c(subset$target,subset$predictor)) + relations <- data.frame(from=subset$target, + to=subset$predictor, + value=subset$value, + performance=subset$performance) + g_subset <- graph_from_data_frame(relations, directed=FALSE, vertices=actors) + + nr_edges[i] <- gsize(g_subset) # number of edges in network + nr_nodes[i] <- gorder(g_subset) # number of nodes in network + +} + +names(nr_edges) <- cutoff_vec +png(filename = paste0(figure_path,"kimono_63genes_nr_edges_",reg,".png"), width = 800, height = 600) +barplot(nr_edges, + xlab = "cutoff", + ylab = "number of edges", + main = paste0("Number of edges in kimono network (63 genes): ", reg)) +dev.off() +names(nr_nodes) <- cutoff_vec +png(filename = paste0(figure_path,"kimono_63genes_nr_nodes_",reg,".png"), width = 800, height = 600) +barplot(nr_nodes, + xlab = "cutoff", + ylab = "number of nodes", + main = paste0("Number of nodes in kimono network (63 genes): ", reg)) +dev.off() + diff --git a/02_CoExp_Kimono/02d_kimono_singleRegion_diffGRN/05_network_parCorr.R b/02_CoExp_Kimono/02d_kimono_singleRegion_diffGRN/05_network_parCorr.R new file mode 100644 index 0000000..1c658ba --- /dev/null +++ b/02_CoExp_Kimono/02d_kimono_singleRegion_diffGRN/05_network_parCorr.R @@ -0,0 +1,206 @@ +################################################## +## Project: DexStim Mouse Brain +## Date: 26.10.2020 +## Author: Nathalie +################################################## +# Analyze partial correlations and create network +# Separate on dex and baseline + +library(data.table) +library(dplyr) +library(ggplot2) +library(org.Mm.eg.db) +library(igraph) +library(splineTimeR) + +reg <- "PVN" +d <- 0 + +basepath <- "/Users/nathalie_gerstner/Documents/ownCloud/DexStim_RNAseq_Mouse/" +parcor <- paste0(basepath,"tables/coExpression_kimono/04_parCorr_singleRegion_",reg,"_dex",d,".csv") +figure_path <- paste0(basepath, "figures/02_CoExp_Kimono/01_CutoffSelection/") +GR_genes <- paste0(basepath,"data/kimono_input/63genes_ZimmermannPaper.csv") +biogrid_file <- paste0(basepath, "data/kimono_input/prior_expr_biogrid_mm.csv") + +# 1. Read data +data <- fread(parcor) +hist(data$pcor) +min(abs(data$pcor)) + + +# 2. Make network statistics for different correlation cutoffs +cutoff_vec <- seq(from = 0, to = 0.0015, by = 0.0001) +nr_edges <- rep(x = 0, length(cutoff_vec)) +nr_nodes <- rep(x = 0, length(cutoff_vec)) +degrees <- data.frame(cutoff = numeric(), + node = character(), + degree = numeric()) +link_density <- rep(x = 0, length(cutoff_vec)) +for(i in 1:length(cutoff_vec)){ + + subset <- data[abs(data$pcor) >= cutoff_vec[i],] + # create an igraph network from dataframe + actors<-unique(c(subset$node1_name,subset$node2_name)) + relations <- data.frame(from=subset$node1_name, + to=subset$node2_name, + value=subset$pcor, + performance=subset$pval) + g_subset <- graph_from_data_frame(relations, directed=FALSE, vertices=actors) + + nr_edges[i] <- gsize(g_subset) # number of edges in network + nr_nodes[i] <- gorder(g_subset) # number of nodes in network + link_density[i] <- edge_density(g_subset) # ratio of the number of edges and the number of possible edges + nodedegree <- igraph::degree(g_subset) # node degree for each node in network + degrees <- rbind(degrees, data.frame(cutoff = rep(cutoff_vec[i], length(nodedegree)), node = names(nodedegree), degree = nodedegree)) + +} + +names(nr_edges) <- cutoff_vec +png(filename = paste0(figure_path,"parCorr_nr_edges_",reg,".png"), width = 800, height = 600) +barplot(nr_edges, + xlab = "cutoff", + ylab = "number of edges", + main = paste0("Number of edges in partial correlation network: ", reg)) +dev.off() +names(nr_nodes) <- cutoff_vec +png(filename = paste0(figure_path,"parCorr_nr_nodes_",reg,".png"), width = 800, height = 600) +barplot(nr_nodes, + xlab = "cutoff", + ylab = "number of nodes", + main = paste0("Number of nodes in partial correlation network: ", reg)) +dev.off() +names(link_density) <- cutoff_vec +png(filename = paste0(figure_path,"parCorr_link_density_",reg,".png"), width = 800, height = 600) +barplot(link_density, + xlab = "cutoff", + ylab = "link density", + main = paste0("Link density in partial correlation network: ", reg)) +dev.off() + +ggplot(degrees, aes(x = degree)) + + geom_histogram(position="identity", colour="grey40", alpha=0.2, bins = 100) + + facet_wrap(. ~ cutoff, ncol = 5) + + ggtitle(paste0("Distribution of node degrees for different cutoffs in partial correlation network: ", reg)) +ggsave(filename = paste0(figure_path,"parCorr_node_degrees_",reg,".png"), width = 10, height = 7) + + +# 3. Calculate betweenness and fit powerlaw for different correlation cutoffs +cutoff_vec <- seq(from = 0.0005, to = 0.0008, by = 0.0001) +between <- data.frame(cutoff = numeric(), + node = character(), + betweenness = numeric()) +powerlaw_exponent <- rep(x = 0, length(cutoff_vec)) +for(i in 1:length(cutoff_vec)){ + + subset <- data[abs(data$pcor) >= cutoff_vec[i],] + # create an igraph network from dataframe + actors<-unique(c(subset$node1_name,subset$node2_name)) + relations <- data.frame(from=subset$node1_name, + to=subset$node2_name, + value=subset$pcor, + performance=subset$pval) + g_subset <- graph_from_data_frame(relations, directed=FALSE, vertices=actors) + + nodebetweenness <- betweenness(g_subset, directed = FALSE) # node betweenness: number of shortest paths going through a node + between <- rbind(between, data.frame(cutoff = rep(cutoff_vec[i], length(nodebetweenness)), node = names(nodebetweenness), betweenness = nodebetweenness)) + + # fit a scale free network to data + scaleFreeProp <- networkProperties(g_subset) + powerlaw_exponent[i] <- scaleFreeProp[1,3] + file.rename(from="scale_free_properties.pdf", to = paste0(figure_path,"parCorr_scale_free_properties_",reg,"_",cutoff_vec[i],".pdf")) +} + +ggplot(between, aes(x = betweenness)) + + geom_histogram(position="identity", colour="grey40", alpha=0.2, bins = 100) + + facet_wrap(. ~ cutoff, ncol = 5) + + ggtitle(paste0("Distribution of node betweenness for different cutoffs in partial correlation network: ", reg)) +ggsave(filename = paste0(figure_path,"parCorr_node_betweenness_",reg,".png"), width = 10, height = 7) + +names(powerlaw_exponent) <- cutoff_vec +png(filename = paste0(figure_path,"parCorr_powerlaw_exponent_",reg,".png"), width = 800, height = 600) +barplot(powerlaw_exponent, + xlab = "cutoff", + ylab = "powerlaw_exponent", + main = paste0("Power-law exponent in partial correlation network: ", reg)) +dev.off() + + + +# # 4. Read BioGrid as reference +# biogrid <- fread(biogrid_file) +# +# cutoff_vec <- seq(from = 0, to = 0.0015, by = 0.0001) +# fisher_pval <- rep(0, length(cutoff_vec)) +# for(i in 1:length(cutoff_vec)){ +# subset <- data[abs(data$pcor) >= cutoff_vec[i],] +# +# # Identify nodes present in Biogrid and parCorr subnet +# nodes_subset <- unique(c(subset$node1_name,subset$node2_name)) +# nodes_biogrid <- unique(c(biogrid$ensembl_A, biogrid$ensembl_B)) +# inters_biosub <- intersect(nodes_subset, nodes_biogrid) +# +# # Subset biogrid edges and subset edges to these nodes +# subset_biogrid <- biogrid %>% +# filter(ensembl_A %in% inters_biosub & ensembl_B %in% inters_biosub) +# subset_parCorr <- subset %>% +# filter(node1_name %in% inters_biosub & node2_name %in% inters_biosub) %>% +# mutate(from=node1_name, to=node2_name) +# +# # Create adjacency matrix for biogrid subnet und subset subnet +# g_biograph <- graph_from_data_frame(subset_biogrid %>% mutate(from=ensembl_A, to=ensembl_B), +# directed=FALSE, vertices = inters_biosub) +# a_biograph <- as_adjacency_matrix(g_biograph,names=TRUE,sparse=FALSE,type='lower') +# +# g_subset <- graph_from_data_frame(data.frame(subset_parCorr$from, subset_parCorr$to), +# directed = FALSE, vertices = inters_biosub) +# a_subset <- as_adjacency_matrix(g_subset, names = TRUE, sparse = FALSE, type = 'lower') +# +# biogrid_vector <- a_biograph[lower.tri(a_biograph)] +# subset_vector <- a_subset[lower.tri(a_subset)] +# +# contingency <- table(biogrid_vector, subset_vector) +# f <- fisher.test(contingency) +# fisher_pval[i] <- f$p.value +# +# } + + + +# Read 63 genes from Zimmermann/Arloth Paper +GRgenes <- fread(GR_genes) + +cutoff_vec <- seq(from = 0, to = 0.0015, by = 0.0001) +nr_edges <- rep(x = 0, length(cutoff_vec)) +nr_nodes <- rep(x = 0, length(cutoff_vec)) +for(i in 1:length(cutoff_vec)){ + + subset <- data[abs(data$pcor) >= cutoff_vec[i],] + subset <- subset[node1_name %in% GRgenes$Ensembl,] + subset <- subset[node2_name %in% GRgenes$Ensembl , ] + # create an igraph network from dataframe + actors<-unique(c(subset$node1_name,subset$node2_name)) + relations <- data.frame(from=subset$node1_name, + to=subset$node2_name, + value=subset$pcor, + performance=subset$pval) + g_subset <- graph_from_data_frame(relations, directed=FALSE, vertices=actors) + + nr_edges[i] <- gsize(g_subset) # number of edges in network + nr_nodes[i] <- gorder(g_subset) # number of nodes in network + +} + +names(nr_edges) <- cutoff_vec +png(filename = paste0(figure_path,"parCorr_63genes_nr_edges_",reg,".png"), width = 800, height = 600) +barplot(nr_edges, + xlab = "cutoff", + ylab = "number of edges", + main = paste0("Number of edges in partial correlation network (63 genes): ", reg)) +dev.off() +names(nr_nodes) <- cutoff_vec +png(filename = paste0(figure_path,"parCorr_63genes_nr_nodes_",reg,".png"), width = 800, height = 600) +barplot(nr_nodes, + xlab = "cutoff", + ylab = "number of nodes", + main = paste0("Number of nodes in partial correlation network (63 genes): ", reg)) +dev.off() diff --git a/02_CoExp_Kimono/02d_kimono_singleRegion_diffGRN/comparison_prior_DEbackground.R b/02_CoExp_Kimono/02d_kimono_singleRegion_diffGRN/comparison_prior_DEbackground.R new file mode 100644 index 0000000..8c234b0 --- /dev/null +++ b/02_CoExp_Kimono/02d_kimono_singleRegion_diffGRN/comparison_prior_DEbackground.R @@ -0,0 +1,30 @@ +prior <- read.table( + file = "~/Documents/ownCloud/DexStim_RNAseq_Mouse/data/kimono_input/prior_expr_funcoup_mm.csv", + sep = ",", + header = TRUE) + +background <- read.table( + file = "~/Documents/ownCloud/DexStim_RNAseq_Mouse/tables/06_background.txt" +) + +de_PFC <- read.table( + file = "~/Documents/ownCloud/DexStim_RNAseq_Mouse/tables/02_AMY_deseq2_Dex_1_vs_0_lfcShrink.txt", + header = TRUE +) %>% + filter(padj <= 0.1) + + +prior_genes <- unique(c(prior$V1, prior$V2)) + +intersect(prior_genes, background$V1) + +intersect(prior_genes, de_PFC$Ensembl_ID) + +prior <- prior %>% + filter(Gene_A %in% background$V1 & Gene_B %in% background$V1) + + +funcoup <- read.table( + file = "~/Documents/ownCloud/DexStim_RNAseq_Mouse/data/kimono_input/FC5.0_M.musculus_compact", + sep = "\t" +) diff --git a/02_CoExp_Kimono/02d_kimono_singleRegion_diffGRN/pipeline_kimono_regions_dex_funcoup.sh b/02_CoExp_Kimono/02d_kimono_singleRegion_diffGRN/pipeline_kimono_regions_dex_funcoup.sh new file mode 100644 index 0000000..bcf5b78 --- /dev/null +++ b/02_CoExp_Kimono/02d_kimono_singleRegion_diffGRN/pipeline_kimono_regions_dex_funcoup.sh @@ -0,0 +1,30 @@ +#!/bin/bash +# +#SBATCH --job-name=kimono_region +#SBATCH --output=err_out/kimono_region_%A_%a.out +#SBATCH --error=err_out/kimono_region_%A_%a.err +#SBATCH --array=0-25 +#SBATCH --mem=5000 +#SBATCH --cpus-per-task=12 +#SBATCH --partition=hp +#SBATCH --exclude=hp11 + +region="dCA1" +dex=("0" "1") +startnodes=("1" "1001" "2001" "3001" "4001" "5001" "6001" "7001" "8001" "9001" "10001" "11001" "12001") + +nstartnodes=${#startnodes[@]} +ndex=${#dex[@]} + +#get region and dex index for each job id +istartnode=$((SLURM_ARRAY_TASK_ID / ndex)) #divide task id by number of dex status +istartnode=$iregion|cut -f1 -d"." #take floor of the index +idex=$(($SLURM_ARRAY_TASK_ID%$ndex)) + +echo "My SLURM_ARRAY_TASK_ID: " $SLURM_ARRAY_TASK_ID +echo "${startnodes[$istartnode]}" +echo "${dex[$idex]}" +echo $istartnode +echo $idex + +Rscript --vanilla 04_runKimono_funcoup.R $region ${dex[$idex]} ${startnodes[$istartnode]} diff --git a/02_CoExp_Kimono/kimono_stability/.DS_Store b/02_CoExp_Kimono/kimono_stability/.DS_Store new file mode 100644 index 0000000..5008ddf Binary files /dev/null and b/02_CoExp_Kimono/kimono_stability/.DS_Store differ diff --git a/02_CoExp_Kimono/kimono_stability/infer_sgl_model.R b/02_CoExp_Kimono/kimono_stability/infer_sgl_model.R new file mode 100644 index 0000000..38a30b5 --- /dev/null +++ b/02_CoExp_Kimono/kimono_stability/infer_sgl_model.R @@ -0,0 +1,243 @@ +#' Calculate tau based on frobenius norm +#' +#' @parameter x matrix/dataframe. must have at least rows & cols > 2 +#' @return Frobenius norm of x +#' @examples +#' x <- matrix(1:20,5,4) +#' calc_tau(x) +calc_tau <- function(x){ + fr_norm <- calc_frobenius_norm(x) + 10^(-fr_norm) +} + +#' Calculate alpha based on frobenius norm and group information +#' +#' @parameter x matrix/dataframe. must have at least rows & cols > 2 +#' @parameter groups, vector of group numbers +#' @return Frobenius norm of x +#' @examples +#' x <- matrix(1:20,5,4) +#' group <- c(1,1,2,2) +#' calc_alpha(x, group) +calc_alpha <- function(x, group){ + + fr_norm <- c() + for (g in unique(group)) { + tmp <- calc_frobenius_norm(as.matrix(x[, which(group == g)])) + fr_norm <- c( fr_norm, tmp ) + } + 10^mean( -fr_norm, na.rm = T) + +} + +#' Calculate lambda1.se model for crossvalidation +#' +#' @parameter lambdas vector of lambdas +#' @parameter cv_result, matrix with nrow == folds and ncol == length(lamdas) +#' @return lambda1.se +calc_lambda1.se <- function(lambdas,error_cv){ + + cv <- colMeans(error_cv) + std <- sd(cv) / sqrt(sum(!is.na(cv))) + lambdas[ min(which(cv <= (min(cv) + std) )) ] +} + +#' Extracts groups numbers +#' +#' @parameter col names vector like +#' @return vector of numeric groups +#' @examples +#' names <- c("methylation___cg0123123","rppa___MTOR") +#' infer_prior_groups(names) +parse_prior_groups <- function(names, sep="\\___"){ + as.numeric( as.factor( do.call( rbind, strsplit( names , split = sep ))[,1])) +} + +#' detects non informative features +#' +#' @parameter matrix x +#' @return vector of bool +#' @export +is_underpowered <- function(x){ + apply( x , 2, var) == 0 +} + +#' detects non informative features +#' +#' @parameter Y_hat predicted matrix with one y_hat per column +#' @return vector of mse +calc_cv_error <- function(Y_hat,y){ + + apply( Y_hat , 2, calc_mse, y ) +} + +#' detects non informative features +#' +#' @parameter Y_hat predicted matrix with one y_hat per column +#' @return vector of mse +parsing_name <- function(string,sep="___"){ + + idx <- grepl("___",string) #might be variables like intercept which do not have any prefix + + result <- data.frame('prefix'= as.character(string), "id"= as.character(string), stringsAsFactors = FALSE) + split_string <- do.call(rbind,strsplit(as.character(string[idx]),'___')) + result[-idx,1] <- split_string[,1] + result[-idx,2] <- split_string[,2] + + result +} + +#' Calculate crossvalidation to estimate lambda1.se & identify potential underpowered features +#' +#' @parameter y data.table - feature to predict +#' @parameter X data.table - input features with prior names attached to features +#' @parameter model - input features with prior names attached to features +#' @parameter intercept boolean +#' @parameter seed_cv int - remove randomness for crossvalidation +#' @parameter folds_cv int - defining the amount of folds +#' @return list containing lambda1.se and feature names excluding underpowered ones +calc_cv_sgl <- function(y, x, model = "sparse.grp.lasso", intercept = TRUE, seed_cv = 1234, folds_cv = 5){ + + error_cv <- matrix(NA, ncol=100, nrow = folds_cv) # matrix storing cv errors. oem tests for 100 lambda values + repeat_cv <- TRUE + + while (repeat_cv) { + + + if(ncol(x) < 2) # in case we exclude all features return empty list + return(list()) + + set.seed(seed_cv) # set seed to reproduce cv folds + fold_idx <- sample( rep( 1:folds_cv, length.out = nrow(x) ) ) + + for (fold in 1:folds_cv) { + + #define test and training sets + test_x <- x[which(fold_idx == fold), ] + test_y <- y[which(fold_idx == fold)] + + train_x <- x[which(fold_idx != fold), ] + train_y <- y[which(fold_idx != fold)] + + #remove underpowered features by testing the variance in training and test set + underpowered <- is_underpowered( test_x) | is_underpowered( train_x) + + x <- x[ , !underpowered, with=FALSE ] + + #restart cv if underpowered features were detected + if( any( underpowered )) + break + + #transform data table to matrix for calculations + train_x <- as.matrix(train_x) + train_y <- as.matrix(train_y) + test_x <- as.matrix(test_x) + test_y <- as.matrix(test_y) + + #prepare oem parameters + group <- parse_prior_groups(colnames(x)) + tau <- calc_tau(train_x ) + alpha <- calc_alpha(train_x, group) + + # supressing warnings since oem warns for all "p > n -> it might be slow for small sample sizes". + # however still the best package for sparse group lasso + fit_cv <- suppressWarnings( + oem(x = train_x , + y = train_y , + penalty = model, + alpha = alpha, + tau = tau , + groups = group, + intercept = intercept) + ) + + Y_hat <- predict( fit_cv, test_x, type = "response" ) # predict test set + error_cv[fold, ] <- calc_cv_error(Y_hat,test_y ) # evaluate test error + } + + #stop loop if no features got removed without error + repeat_cv <- ifelse(ncol(x) == ncol(train_x), FALSE, TRUE) + } + + lambda1.se <- calc_lambda1.se(fit_cv$lambda[[1]], error_cv) + features <- colnames(x) + + list("lambda1.se" = lambda1.se, + "features" = features ) +} + +#' Calculate crossvalidation to estimate lambda1.se & identify potential underpowered features +#' +#' @parameter y data.table - feature to predict +#' @parameter X data.table - input features with prior names attached to features +#' @parameter model string - which model to train. currently only sparse group lasso tested +#' @return edge list for a given input y and x +train_kimono_sgl <- function(y, x, model = "sparse.grp.lasso", intercept = TRUE, ..., seed_cv = 1234){ + + y <- data.table(scale(y)) + x <- data.table(scale(x)) + + # estimate best lambda and identify underpowered features + cv_result <- calc_cv_sgl(y, x , seed_cv = seed_cv) + + if(length(cv_result)==0) return(c()) #exit function hyere if all features got excluded + + # parse cv results + feature_set <- cv_result$features + lambda1.se <- cv_result$lambda + + # exclude underpowered features and convert input to matrix + x <- x[,colnames(x) %in% feature_set, with = FALSE] + x <- as.matrix(x) + y <- as.matrix(y) + + # get sgl input parameters + group <- parse_prior_groups(colnames(x)) + tau <- calc_tau(x) + alpha <- calc_alpha(x, group) + + #fit model with supressed warnings on whole dataset. + # supressing warnings since oem warns for all "p > n -> it might be slow for small sample sizes". + # however still the best package for sparse group lasso + fit <-suppressWarnings( + oem(x = x , + y = y , + penalty = model, + lambda = lambda1.se , + alpha = alpha, + tau = tau , + groups = group, + intercept = intercept #oem performs better with intercept even though + # we scaled the input data. Inferred intercepts + # can be ignored since they are almost 0. + ) + ) + + y_hat <- predict(fit, x, type = "response") + + covariates <- rownames(fit$beta$sparse.grp.lasso) + beta <- as.vector(fit$beta$sparse.grp.lasso) + performance <- calc_r_square(y, y_hat ) + mse <- calc_mse(y,y_hat) + + #return c() if fit is an intercept only models + if(!any(beta[covariates != "(Intercept)"] != 0)) return(c()) + + + prefix_covariates <- parsing_name(covariates) + + data.frame("predictor"=prefix_covariates$id, + "value"=beta, + "performance"= performance, + "mse"=mse, + "relation"=prefix_covariates$prefix + ) +} + +#for debugging purpose only +#DEBUG <- train_kimono_sgl(y,x) +#cat("performance:",DEBUG[1,3],"\n features:",length(which(DEBUG[,2]!= 0)),"of:",length(DEBUG[,2]),"\n","mse:",DEBUG[1,4]) +#model = "sparse.grp.lasso" +#intercept = TRUE +#seed_cv = 1234 +#folds_cv = 5 \ No newline at end of file diff --git a/02_CoExp_Kimono/kimono_stability/kimono.R b/02_CoExp_Kimono/kimono_stability/kimono.R new file mode 100644 index 0000000..50c1c75 --- /dev/null +++ b/02_CoExp_Kimono/kimono_stability/kimono.R @@ -0,0 +1,353 @@ +stderr <- function(x, na.rm=FALSE) { + if (na.rm) x <- na.omit(x) + sqrt(var(x)/length(x)) +} + + +stability_select <- function(x,y, target, nseeds){ + + #Initialize the Seeds and empty Matrix and Vectors for mse and rsquared of length 100 + seeds <- 1:nseeds + + mse_values <- rep(NA, nseeds) + rsquared_values <- rep(NA, nseeds) + beta_values <- matrix(data = NA, nseeds, ncol(x)+1) + + # Groupsparse does not always return all features - therefore we need a vector with all the names + # and all the relations to calculate the stability selection + + colnames <- colnames(x) + names <- sapply(colnames, FUN =function(x) { + split <- unlist(str_split(string = x, pattern = "___")[[1]][2]) + return(split) + }) + names <- c("(Intercept)", unname(names)) + art <- sapply(colnames, FUN =function(x) { + split <- unlist(str_split(string = x, pattern = "___")[[1]][1]) + return(split) + }) + art <- c("(Intercept)", unname(art)) + + + + coef_matrix <- c() + + # Iterate over each of the seeds and set the seed + for(i in 1:nseeds){ + + + # Based on the Seed calculate the Cross Validation to determine the best lambda + # and calculate the best final fit + + + fit <- train_kimono_sgl(y,x, seed_cv = seeds[i]) + fit + #fit <- train_kimono_sgl(y,x, seed_cv = seeds[i]) + #print(fit) + + # For some seeds groupsparse does not return a model these have to be skipped + if(is.null(fit)){ + mse_values[i] <- NA + rsquared_values[i]<- NA + beta_values[i] <- NA + next + } + fit <- fit %>% dplyr::rename(r_squared = performance ) + + # generate a boolean vector with true values for the features that have an influence + imp_features <- fit %>% filter(value != 0) %>% pull(predictor) + fit_features <- names %in% imp_features + + + # Calculate the logical vector which features are not 0 in the model + # bind this vector to the coefficient matrix + + #coef_matrix <- base::cbind(coef_matrix, unname(as.matrix(fit[,"value"]!=0)) ) + coef_matrix <- base::cbind(coef_matrix, fit_features) + + # Calculate R-Squared and MSE by comparing real values to the values predicted by the model + #y_hat <- as.vector(predict(fit, s =best_lambda, newx=x)) + mse_values[i] <- fit$mse[1] + rsquared_values[i]<- fit$r_squared[1] + indices_features <- names %in% fit$predictor + beta_values[i,indices_features] <- fit$value + } + + rownames(coef_matrix) <- names + + #print(coef_matrix) + + + # Dataframe with columns of the target and predictor + # The value is the frequency of a feature being included in the different seed models + # Overall R-Squared and MSE are the averages of all R-Squared and MSEs of the different seed models + # Selected R-Squared and MSE are the averages of the seed models where at least one feature was selected + + fit_df <- tibble( + target = rep(target, nrow(coef_matrix)), + predictor = rownames(coef_matrix), + value = rowSums(coef_matrix)/ncol(coef_matrix), + beta_mean = apply(beta_values, 2, mean, na.rm = TRUE), + beta_stderr = apply(beta_values, 2, stderr, na.rm = TRUE), + nr_of_col = ncol(coef_matrix), + overall_rsq = mean(rsquared_values, na.rm = T), + selected_rsq = mean(rsquared_values[colSums(coef_matrix) != 1], na.rm=T), # this is going to be the same value as overall_rsq + overall_mse = mean(mse_values, na.rm= T), + selected_mse = mean(mse_values[colSums(coef_matrix) != 1], na.rm =T), #same here? + ) + + fit_df$relation = art + + + return(fit_df) + +} + + + + + + +#' extracting the relevant mapping information +#' +#' @param node , String +#' @param mapping , data.table with ids in thecolumns 1:2 +#' @return vector of feature names +fetch_mappings <- function(node, mapping){ + + colnames(mapping) <- c('V1','V2') + #extract mapped features + features <- rbind( subset( mapping, V1 == node)[, 2], + subset( mapping, V2 == node)[, 1, with=FALSE], + use.names=FALSE) + unique( as.vector( as.matrix( features ) ) ) +} + +#' using the prior information to fetsh the right data for X +#' +#' @param node , +#' @param input_list +#' @param mapping_list , +#' @param main_layer - default = 1 +#' @return X sample x feature matrix +fetch_var <- function(node ,input_list, mapping_list, metainfo, main_layer = 1, sep = "___"){ + + x <- c() + #print(paste("Node", node, sep = " ")) + #print(paste("Main Layer ",main_layer)) + + #iterate over all available mappings to fetch features across all levels + for (mapping_idx in 1:length(mapping_list) ) { + #print(mapping_idx) + + mapping_to <- metainfo$main_to[mapping_idx] # mapping main layer to X + mapping_name <- metainfo$ID[mapping_idx] # mapping name + + #print(mapping_to) + #print(mapping_name) + + #print(input_list[[mapping_to]]) + + all_features <- colnames(input_list[[ mapping_to ]]) # all possible input features + #print(all_features) + + #if a mapping_list is NULL we map all features (i.e. for clinical data) + if( ncol( mapping_list[[mapping_idx]] ) != 0 ){ + features <- fetch_mappings(node , mapping_list[[mapping_idx]] ) + }else{ + features <- all_features + } + + # if main layer is having a prior to itself it might happen that y is in x + if( mapping_to == main_layer ) + features <- features[features != node] + + #check if there are actually features left + if(length(features) == 0) + next + + #print(features) + #extract relevant data + data <- input_list[[ mapping_to ]][, features[features %in% all_features] , with = FALSE ] + + + + #if we have data to add + if(ncol(data) != 0){ + colnames(data) <- paste0(mapping_name,sep,colnames(data) ) + x <- cbind(x, data) + } + } + + #print(x) + + y <- input_list[[main_layer]][ , node, with=FALSE] + + list("y"=y, + "x"=x) +} + +#' remove na, scale +#' +#' @param y , vector of doubles +#' @param x , matrix features in columns and samples in rows +#' @return x, y without na's +preprocess_data <- function(y, x){ + + y <- scale(y) + x <- scale(x) + + tmp_length <- length(y) + + x <- x[!is.na(y), , drop = FALSE] + y <- y[!is.na(y), drop = FALSE] + + if(!is.null( dim(x) ) ) + x <- x[ ,!is.na(colSums(x)),drop = FALSE] + + list("y"=as.data.table(y), + "x"=as.data.table(x)) +} + +#' check if data is valid +#' +#' @param min_features , default 5 +#' @param x , matrix features in columns and samples in rows +#' @return TRUE / FALSE +is_valid <- function( x, min_features ){ + + if( ncol(x) < min_features ) + return(FALSE) + + if( sum(is.na(colSums( as.matrix(x) ))) > ncol(x)-min_features) + return(FALSE) + + TRUE +} + +#' Infers a model for each node in the main layer +#' +#' +#' @param input_list - list of omics data. First list element will be used as predictor +#' @param mapping_list - list of mappings between each data type one +#' @param metainfo - table of relation between mappings and input list +#' @param main_layer - which input data set represents the main layer (default = 1) +#' @param min_features - autoexclude models with less than 2 features (default = 2) +#' @return a network in form of an edge table +infer_network <- function(input_list, mapping_list, metainfo, main_layer = 1, min_features = 2, stab_sel = FALSE) { + + node_list <- colnames(input_list[[main_layer]])[1:5] #iterating ofer node_list of main layer + + foreach(node = node_list, .combine = 'rbind') %do% { + + #get y and x for a given node + var_list <- fetch_var(node, + input_list, + mapping_list, + metainfo) + + #remove na and scale data + var_list <- preprocess_data(var_list$y,var_list$x) + + #if not enough features stop here + if(!is_valid(var_list$x,min_features)) + return() + + #run model in case the model bugs out catch it + possible_error <- tryCatch( + { + if(stab_sel == FALSE){ + subnet <- train_kimono_sgl(var_list$y,var_list$x ) + } + else{ + subnet <- stability_select(x = var_list$x, y = var_list$y, target = node ) + } + FALSE + }, + error=function(cond) {TRUE}, + warning=function(cond) {TRUE} + ) + + if(possible_error) + return( ) + + if(is.null(subnet)) + return( ) + + + if(stab_sel==F){ + data.table('target'=node, subnet) + } + else{ + subnet + } + } + +} + +#' Run Kimono - Knowledge-guIded Multi-Omic Netowrk inference +#' +#' @importFrom data.table as.data.table +#' @importFrom data.table data.table +#' @import dplyr +#' @import foreach +#' @import oem +#' @param input_list - list of omics data. First list element will be used as predictor +#' @param mapping_list - list of mappings between each data type one +#' @param main_layer - which input data set represents the main layer (default = 1) +#' @param min_features - autoexclude models with less than 2 features (default = 2) +#' @param core - if core != 1 kimono will perform an parallell computation +#' @return a network in form of an edge table +#' @export +kimono <- function(input_list, mapping_list, metainfo, main_layer = 1, min_features = 2,stab_sel = FALSE ,...){ + + result <- infer_network(input_list, mapping_list, metainfo, main_layer = 1, min_features = 2, stab_sel = stab_sel) + + if(nrow(result) == 0) + warning('model was not able to infer any associations') + + result +} + + + +#### own function for parallel + +run_kimono_para <- function(node, myinput_list=input_list, mymapping_list=mapping_list, mymetainfo=metainfo, main_layer = 1, min_features = 5, stab_sel = FALSE, niterations = 100){ + #get y and x for a given node + var_list <- fetch_var(node, + myinput_list, + mymapping_list, + mymetainfo) + + #remove na and scale data + var_list <- preprocess_data(var_list$y,var_list$x) + + #if not enough features stop here + if(!is_valid(var_list$x,min_features)) + return() + + #run model in case the model bugs out catch it + possible_error <- tryCatch( + { + if(stab_sel == FALSE){ + subnet <- train_kimono_sgl(var_list$y,var_list$x ) + } + else{ + subnet <- stability_select(x = var_list$x, y = var_list$y, target = node, nseeds = niterations ) + } + FALSE + }, + error=function(cond) {TRUE}, + warning=function(cond) {TRUE} + ) + + if(possible_error) + return( ) + + if(is.null(subnet)) + return( ) + + #return(data.table('target'=node, subnet)) + return(subnet) +} diff --git a/02_CoExp_Kimono/kimono_stability/utility_functions.R b/02_CoExp_Kimono/kimono_stability/utility_functions.R new file mode 100644 index 0000000..cfd97e6 --- /dev/null +++ b/02_CoExp_Kimono/kimono_stability/utility_functions.R @@ -0,0 +1,27 @@ +#' estimate R squared based on ESS/TSS +#' +#' @parameter y vector of double, assumed to be TRUE +#' @parameter y_hat vector of double, predicted +#' @return R2 value - double +calc_r_square <- function(y, y_hat){ + sum( (y_hat - mean(y))^2 ) / sum( (y - mean(y) )^2 ) +} + +#' estimate Mean Squared Error +#' +#' @parameter y vector of double, assumed to be TRUE +#' @parameter y_hat vector of double, predicted +#' @return mse double +calc_mse <- function(y,y_hat){ + mean((y-y_hat)^2) +} + +#' estimate frobenius norm of a matrix +#' +#' @parameter y vector of double, assumed to be TRUE +#' @parameter y_hat vector of double, predicted +#' @return mse double +calc_frobenius_norm <- function(x){ + m <- cor(as.matrix(x)) + sqrt( sum( m[upper.tri(m)]^2) ) / sqrt( (nrow(m)^2 - nrow(m)) /2 ) +} \ No newline at end of file diff --git a/03_CoExp_Analysis/.DS_Store b/03_CoExp_Analysis/.DS_Store new file mode 100644 index 0000000..c636c3e Binary files /dev/null and b/03_CoExp_Analysis/.DS_Store differ diff --git a/03_CoExp_Analysis/01_prior-baseline_comparison.R b/03_CoExp_Analysis/01_prior-baseline_comparison.R new file mode 100644 index 0000000..abc1678 --- /dev/null +++ b/03_CoExp_Analysis/01_prior-baseline_comparison.R @@ -0,0 +1,158 @@ +################################################## +## Project: DexStim Mouse Brain +## Date: 12.12.2020 +## Author: Nathalie +################################################## +# Compare prior and baseline network (nodedegree and nodebetweenness) + +library(data.table) +library(dplyr) +library(ggplot2) +library(igraph) +library(WGCNA) +library(eulerr) + +basepath <- "~/Documents/ownCloud/DexStim_RNAseq_Mouse/" +region <- "PFC" +beta_cutoff <- 0.01 +padj_cutoff <- 0.01 +rsquared_cutoff <- 0.1 + +### FUNCTIONS ------------------------------------- + +# function to read all files from list +readFiles_concat <- function(file_list){ + + # initialize empty data frame + dataset <- data.frame() + + # read each file from list and append to data frame + for (i in 1:length(file_list)){ + temp_data <- fread(file_list[i]) + dataset <- rbindlist(list(dataset, temp_data), use.names = T) + } + + return(dataset) +} + +# Z-score (z_ij) for the differential analysis between gene i and j +z_score <- function(beta_t, beta_c, se_t, se_c){ + z <- (beta_t - beta_c)/ + sqrt((se_t)^2 + (se_c)^2) +} + +### ANALYSIS --------------------------------------- + +# 1. Read data +# 1a. Read co expression networks +dex0_files <- list.files(path = file.path(basepath, "tables/coExpression_kimono"), + pattern = paste0("04\\_singleRegion\\_",region,"\\_dex0\\_funcoup\\_SE\\_.*\\.csv"), + full.names = TRUE) +# dex1_files <- list.files(path = file.path(basepath, "tables/coExpression_kimono"), +# pattern = paste0("04\\_singleRegion\\_",region,"\\_dex1\\_funcoup\\_SE\\_.*\\.csv"), +# full.names = TRUE) + +data_dex0 <- readFiles_concat(dex0_files) +# data_dex1 <- readFiles_concat(dex1_files) + +# 1b. Read prior and make network +funcoup_prior <- fread(file = paste0(basepath, "data/kimono_input/prior_expr_funcoup_mm.csv")) +nodes_prior <- unique(c(funcoup_prior$Gene_A, funcoup_prior$Gene_B)) +relations_prior <- data.frame(from = funcoup_prior$Gene_A, + to = funcoup_prior$Gene_B) +g_prior <- graph_from_data_frame(relations_prior, directed=FALSE, vertices=nodes_prior) +# Calculate nodedegrees of prior +nodedegree_prior <- igraph::degree(g_prior) +nodedegree_prior <- sort(nodedegree_prior, decreasing = TRUE) +# Calculate nodebetweenness of prior +# nodebetweenness_prior <- betweenness(g_prior, directed = FALSE) # node betweenness: number of shortest paths going through a node +# saveRDS(nodebetweenness_prior, file = paste0(basepath, "data/workspaces/nodebetweenness_funcoup.rds")) +nodebetweenness_prior <- readRDS(paste0(basepath, "data/workspaces/nodebetweenness_funcoup.rds")) +nodebetweenness_prior <- sort(nodebetweenness_prior, decreasing = TRUE) + +# 2. Remove interactions with very low r squared values & intercept & SVs +data <- data_dex0 %>% + filter(overall_rsq >= rsquared_cutoff) %>% + filter(predictor != '(Intercept)') +data <- data[!startsWith(data$predictor, "SV"),] +# remove duplicated interactions (mistake made when separating nodes into chunks) +data <- data %>% + distinct(target, predictor, .keep_all = TRUE) + +# 3. Keep only interactions that a beta value > cutoff +data <- data %>% + mutate(high_beta = (abs(beta_mean) > beta_cutoff)) +data_cut <- data %>% + filter(high_beta) + +# 4. Create network +# Nodes in network +node_vec <- unique(c(data_cut$target, data_cut$predictor)) + +# Find modules +relations <- data.frame(from=data_cut$target, + to=data_cut$predictor, + value=data_cut$beta_mean) +# relations <- data.frame(from=data_diff$target, +# to=data_diff$predictor) +g <- graph_from_data_frame(relations, directed=FALSE, vertices=node_vec) +# does graph contain multiple edges with same start and endpoint or loop edges +is_simple(g) +g <- simplify(g) # check the edge attribute parameter + +# Calculate nodedegree +nodedegree <- igraph::degree(g) +nodedegree <- sort(nodedegree, decreasing = TRUE) +nodedegree_rank <- rank(-nodedegree) + +# Calculate nodebetweenness +nodebetweenness <- betweenness(g, directed = FALSE) # node betweenness: number of shortest paths going through a node +nodebetweenness <- sort(nodebetweenness, decreasing = TRUE) # same when values are included in g_diff or not +nodebetweenness_rank <- rank(-nodebetweenness) + + +# 5. Compare estimated baseline network to prior +# rank of genes in prior +nodedegree_prior_rank <- rank(-nodedegree_prior[names(nodedegree)]) +nodebetweenness_prior_rank <- rank(-nodebetweenness_prior[names(nodebetweenness)]) + +# Spearman's rank correlation +cor_nodedegree <- cor.test(nodedegree_prior_rank, nodedegree_rank, + method = "spearman") +cor_nodebetweenness <- cor.test(nodebetweenness_prior_rank, nodebetweenness_rank, + method = "spearman") + +# Scatterplot between prior and base network with correlation in label +data.frame("prior" = nodedegree_prior[names(nodedegree)], + "baseline" = nodedegree) %>% +ggplot(aes(x=prior, y=baseline)) + + # geom_point(size=1,alpha = 0.1) + geom_hex() + + # geom_bin2d() + xlab("nodedegree prior network") + + ylab("nodedegree baseline network") + + ggtitle(paste0("Nodedegree in prior and baseline network (Correlation between ranks: ", + round(cor_nodedegree$estimate, digits = 2),")")) +ggsave(filename = paste0(basepath, "figures/02_CoExp_Kimono/03_AnalysisFuncoup/singleRegions/", + region,"_prior_baseline_funcoup_correlationNodedegree_betacutoff",beta_cutoff,".png"), + width = 9, height = 8) + +data.frame("prior" = nodebetweenness_prior[names(nodebetweenness)], + "baseline" = nodebetweenness) %>% + ggplot(aes(x=prior, y=baseline)) + + # geom_point(size=1,alpha = 0.1) + geom_hex() + + # geom_bin2d() + xlab("nodebetweenness prior network") + + ylab("nodebetweenness baseline network") + + ggtitle(paste0("Nodebetweenness in prior and baseline network (Correlation between ranks: ", + round(cor_nodebetweenness$estimate, digits = 2),")")) +ggsave(filename = paste0(basepath, "figures/02_CoExp_Kimono/03_AnalysisFuncoup/singleRegions/", + region,"_prior_baseline_funcoup_correlationNodebetweenness_betacutoff",beta_cutoff,".png"), + width = 9, height = 8) + + + + + + diff --git a/03_CoExp_Analysis/02_singleRegion_funcoup_cutoff.R b/03_CoExp_Analysis/02_singleRegion_funcoup_cutoff.R new file mode 100644 index 0000000..f5dbf6e --- /dev/null +++ b/03_CoExp_Analysis/02_singleRegion_funcoup_cutoff.R @@ -0,0 +1,223 @@ +################################################## +## Project: DexStim Mouse Brain +## Date: 03.12.2020 +## Author: Nathalie +################################################## +# Decide on a beta cutoff for single region funcoup networks + +library(data.table) +library(dplyr) +library(ggplot2) +library(igraph) +library(eulerr) + +basepath <- "~/Documents/ownCloud/DexStim_RNAseq_Mouse/" +region <- "PFC" +padj_cutoff <- 0.01 +rsquared_cutoff <- 0.1 + +### FUNCTIONS ------------------------------------- + +# function to read all files from list +readFiles_concat <- function(file_list){ + + # initialize empty data frame + dataset <- data.frame() + + # read each file from list and append to data frame + for (i in 1:length(file_list)){ + temp_data <- fread(file_list[i]) + dataset <- rbindlist(list(dataset, temp_data), use.names = T) + } + + return(dataset) +} + +# Z-score (z_ij) for the differential analysis between gene i and j +z_score <- function(beta_t, beta_c, se_t, se_c){ + z <- (beta_t - beta_c)/ + sqrt((se_t)^2 + (se_c)^2) +} + + +# Plot changes in ranks of nodes with highest betweenness +plotRanks <- function(a, b, labels = TRUE, labels.offset=0.1, arrow.len=0.1) +{ + old.par <- par(mar=c(1,1,1,1)) + + # Find the length of the vectors + len.1 <- length(a) + len.2 <- length(b) + + # Plot two columns of equidistant points + plot(rep(1, len.1), 1:len.1, pch=20, cex=0.8, + xlim=c(0, 3), ylim=c(0, max(len.1, len.2)), + axes=F, xlab="", ylab="") # Remove axes and labels + points(rep(2, len.2), 1:len.2, pch=20, cex=0.8) + + # Put labels next to each observation + if (labels){ + text(rep(1-labels.offset, len.1), 1:len.1, a) + text(rep(2+labels.offset, len.2), 1:len.2, b) + } + + # Now we need to map where the elements of a are in b + # We use the match function for this job + a.to.b <- match(a, b) + + # Now we can draw arrows from the first column to the second + arrows(rep(1.02, len.1), 1:len.1, rep(1.98, len.2), a.to.b, + length=arrow.len, angle=20) + par(old.par) +} + +### ANALYSIS --------------------------------------- + +# 1. Read data +# 1a. Read co expression networks +dex0_files <- list.files(path = file.path(basepath, "tables/coExpression_kimono"), + pattern = paste0("04\\_singleRegion\\_",region,"\\_dex0\\_funcoup\\_parallel\\_.*\\.csv"), + full.names = TRUE) +dex1_files <- list.files(path = file.path(basepath, "tables/coExpression_kimono"), + pattern = paste0("04\\_singleRegion\\_",region,"\\_dex1\\_funcoup\\_parallel\\_.*\\.csv"), + full.names = TRUE) + +data_dex0 <- readFiles_concat(dex0_files) +data_dex1 <- readFiles_concat(dex1_files) + + +# 2. Join Base and Dex data frame +data <- inner_join(data_dex0, data_dex1, + by = c("target", "predictor"), + suffix = c(".base", ".dex")) +dex_notBase <- anti_join(data_dex1, data_dex0, by = c("target", "predictor")) +head(data) + +# Plot beta values +beta_data <- as.data.frame(rbind(cbind(rep("base", times = nrow(data)), data$beta_mean.base, data$relation.base), + cbind(rep("dex", times = nrow(data)), data$beta_mean.dex, data$relation.dex))) +# ggplot(data = beta_data, aes(x = V1, y = V2, col = V3)) + +# geom_boxplot() + +# xlab("network") + +# ylab("beta") + +# theme(text = element_text(size=20)) +# hist(data$beta_mean.base, breaks = 500) # histogram of beta values in base network +# png(filename = paste0(basepath, "figures/02_CoExp_Kimono/03_AnalysisFuncoup/singleRegions/", +# region,"_funcoup_differential_betahistogramZoom.png"), +# height = 400, width = 600) +# hist(data$beta_mean.base, breaks = 5000, xlim = c(-0.05,0.05), +# xlab = "Beta", main = "", cex.lab=1.5, cex.axis = 1.2) +# dev.off() +# hist(data$beta_mean.dex, breaks = 500) # histogram of beta values in dex network +# hist(data$beta_mean.dex, breaks = 5000, xlim = c(-0.05,0.05)) + + +# 3. Remove interactions with very low r squared values & intercept & SVs +data <- data %>% + filter(overall_rsq.base >= rsquared_cutoff, overall_rsq.dex >= rsquared_cutoff) %>% + filter(predictor != '(Intercept)') +data <- data[!startsWith(data$predictor, "SV"),] + +# 4. Calculate z scores for interactions that are left +data <- mutate(data, z = z_score(beta_mean.dex,beta_mean.base, beta_stderr.dex, beta_stderr.base)) +hist(data$beta_mean.base) + +# 5. Cutoff optimization +poss_cutoff <- c(0.000001, 0.00001, 0.0001, 0.001, 0.005, 0.01, 0.1) +nodebetweenness_list <- list() +for(beta_cutoff in poss_cutoff) { + + print(beta_cutoff) + + # 5a. Keep only interactions that have at least in one network a beta value > 0.05 + data <- data %>% + mutate(diff = (abs(beta_mean.base) > beta_cutoff | abs(beta_mean.dex) > beta_cutoff)) + data_diff1 <- data %>% + filter(diff) + data_diff1$p_diff <- 2*pnorm(-abs(data_diff1$z)) + data_diff1$p_adj <- p.adjust(data_diff1$p_diff, method = "fdr") + + # 5b. Create network corresponding to beta cutoff + data_diff <- data_diff1 %>% + filter(p_adj <= padj_cutoff) + head(data_diff[,c("beta_mean.base", "beta_stderr.base", "beta_mean.dex", "beta_stderr.dex", "z", "p_adj")], 20) + + # Nodes in network + node_vec <- unique(c(data_diff$target, data_diff$predictor)) + + # Find modules + relations <- data.frame(from=data_diff$target, + to=data_diff$predictor, + value=data_diff$z, + performance=data_diff$p_adj) + g_diff <- graph_from_data_frame(relations, directed=FALSE, vertices=node_vec) + + # Calculate nodedegree + nodedegree <- igraph::degree(g_diff) + nodedegree <- sort(nodedegree, decreasing = TRUE) + + # Calculate nodebetweenness + nodebetweenness <- betweenness(g_diff, directed = FALSE) # node betweenness: number of shortest paths going through a node + nodebetweenness <- sort(nodebetweenness, decreasing = TRUE) + nodebetweenness_list[[as.character(beta_cutoff)]] <- nodebetweenness + # hist(nodebetweenness) +} + + +# Compare top genes between different cutoffs +for (i in 1:(length(nodebetweenness_list)-1)){ + + intersect_top <- intersect(names(nodebetweenness_list[[i]][1:100]), names(nodebetweenness_list[[i+1]][1:100])) + print(length(intersect_top)) + match_pos <- match(names(nodebetweenness_list[[i]])[names(nodebetweenness_list[[i]]) %in% intersect_top], + names(nodebetweenness_list[[i+1]])[names(nodebetweenness_list[[i+1]]) %in% intersect_top]) + print(match_pos) + + # Rank plot to compare ranks of top genes + # Comparison between i and i+1 + png(filename = paste0(basepath, "figures/02_CoExp_Kimono/03_AnalysisFuncoup/singleRegions/", + region,"_funcoup_cutoffSelection_rankPlot_betacutoffs",poss_cutoff[i],"-",poss_cutoff[i+1],".png"), + width = 700, height = 700) + plotRanks(names(nodebetweenness_list[[i]][1:100]), names(nodebetweenness_list[[i+1]][1:100]), + labels = FALSE) + dev.off() + # Comparison between 1 and i + if (i > 1){ + png(filename = paste0(basepath, "figures/02_CoExp_Kimono/03_AnalysisFuncoup/singleRegions/", + region,"_funcoup_cutoffSelection_rankPlot_betacutoffs",poss_cutoff[1],"-",poss_cutoff[i],".png"), + width = 700, height = 700) + plotRanks(names(nodebetweenness_list[[1]][1:100]), names(nodebetweenness_list[[i]][1:100]), + labels = FALSE) + dev.off() + } + + # Venn diagram (euler plot) + # Comparison between i and i+1 + list1 <- list(names(nodebetweenness_list[[i]][1:100]), + names(nodebetweenness_list[[i+1]][1:100])) + names(list1) <- c(paste("Beta cutoff", poss_cutoff[i]), + paste("Beta cutoff", poss_cutoff[i+1])) + png(filename = paste0(basepath, "figures/02_CoExp_Kimono/03_AnalysisFuncoup/singleRegions/", + region,"_funcoup_cutoffSelection_vennEuler_betacutoffs",poss_cutoff[i],"-",poss_cutoff[i+1],".png"), + width = 700, height = 700) + print(plot(euler(list1, shape = "ellipse"), + labels = list(cex = 1.5), quantities = list(cex = 1.5))) + dev.off() + + # Comparison between i and 1 + if(i > 1){ + list1 <- list(names(nodebetweenness_list[[1]][1:100]), + names(nodebetweenness_list[[i]][1:100])) + names(list1) <- c(paste("Beta cutoff", poss_cutoff[1]), + paste("Beta cutoff", poss_cutoff[i])) + png(filename = paste0(basepath, "figures/02_CoExp_Kimono/03_AnalysisFuncoup/singleRegions/", + region,"_funcoup_cutoffSelection_vennEuler_betacutoffs",poss_cutoff[1],"-",poss_cutoff[i],".png"), + width = 700, height = 700) + print(plot(euler(list1, shape = "ellipse"), + labels = list(cex = 1.5), quantities = list(cex = 1.5))) + dev.off() + } +} + +# I WOULD DECIDE ON 0.001 from plots, but top genes have too many connections with 0.001 +# --> probably 0.01 \ No newline at end of file diff --git a/03_CoExp_Analysis/03_singleRegion_funcoup_baseline_focusHubGenes.R b/03_CoExp_Analysis/03_singleRegion_funcoup_baseline_focusHubGenes.R new file mode 100644 index 0000000..fc7d4c0 --- /dev/null +++ b/03_CoExp_Analysis/03_singleRegion_funcoup_baseline_focusHubGenes.R @@ -0,0 +1,355 @@ +################################################## +## Project: DexStim Mouse Brain +## Date: 04.01.2020 +## Author: Nathalie +################################################## +# Analyze top genes in baseline network +# Use beta cutoff and analyze top genes (nodebetweenness) + +library(data.table) +library(dplyr) +library(ggplot2) +library(igraph) +library(eulerr) +library(org.Mm.eg.db) + +basepath <- "~/Documents/ownCloud/DexStim_RNAseq_Mouse/" +region <- "PFC" +beta_cutoff <- 0.01 +padj_cutoff <- 0.01 +rsquared_cutoff <- 0.1 + +### FUNCTIONS ------------------------------------- + +# function to read all files from list +readFiles_concat <- function(file_list){ + + # initialize empty data frame + dataset <- data.frame() + + # read each file from list and append to data frame + for (i in 1:length(file_list)){ + temp_data <- fread(file_list[i]) + dataset <- rbindlist(list(dataset, temp_data), use.names = T) + } + + return(dataset) +} + +# Z-score (z_ij) for the differential analysis between gene i and j +z_score <- function(beta_t, beta_c, se_t, se_c){ + z <- (beta_t - beta_c)/ + sqrt((se_t)^2 + (se_c)^2) +} + +# Plot changes in ranks of nodes with highest betweenness +plotRanks <- function(a, b, labels = TRUE, labels.offset=0.1, arrow.len=0.1) +{ + old.par <- par(mar=c(1,1,1,1)) + + # Find the length of the vectors + len.1 <- length(a) + len.2 <- length(b) + + # Plot two columns of equidistant points + plot(rep(1, len.1), 1:len.1, pch=20, cex=0.8, + xlim=c(0, 3), ylim=c(0, max(len.1, len.2)), + axes=F, xlab="", ylab="") # Remove axes and labels + points(rep(2, len.2), 1:len.2, pch=20, cex=0.8) + + # Put labels next to each observation + if (labels){ + text(rep(1-labels.offset, len.1), 1:len.1, a) + text(rep(2+labels.offset, len.2), 1:len.2, b) + } + + # Now we need to map where the elements of a are in b + # We use the match function for this job + a.to.b <- match(a, b) + + # Now we can draw arrows from the first column to the second + arrows(rep(1.02, len.1), 1:len.1, rep(1.98, len.2), a.to.b, + length=arrow.len, angle=20) + par(old.par) +} + + +### ANALYSIS --------------------------------------- + +# 1. Prior and network +funcoup_prior <- fread(file = paste0(basepath, "data/kimono_input/prior_expr_funcoup_mm.csv")) +nodes_prior <- unique(c(funcoup_prior$Gene_A, funcoup_prior$Gene_B)) +relations_prior <- data.frame(from = funcoup_prior$Gene_A, + to = funcoup_prior$Gene_B) +g_prior <- graph_from_data_frame(relations_prior, directed=FALSE, vertices=nodes_prior) +# Calculate nodedegrees of prior +nodedegree_prior <- sort(igraph::degree(g_prior), decreasing = TRUE) +# Calculate nodebetweenness of prior +# nodebetweenness_prior <- betweenness(g_prior, directed = FALSE) # node betweenness: number of shortest paths going through a node +# saveRDS(nodebetweenness_prior, file = paste0(basepath, "data/workspaces/nodebetweenness_funcoup.rds")) +nodebetweenness_prior <- sort(readRDS(paste0(basepath, "data/workspaces/nodebetweenness_funcoup.rds")), decreasing = TRUE) +top100genes_prior <- names(nodebetweenness_prior[1:100]) + + +# 2. Read kimono baseline expression networks +dex0_files <- list.files(path = file.path(basepath, "tables/coExpression_kimono"), + pattern = paste0("04\\_singleRegion\\_",region,"\\_dex0\\_funcoup\\_SE\\_.*\\.csv"), + full.names = TRUE) + +data_dex0 <- readFiles_concat(dex0_files) + + +# 3. Remove interactions with very low r squared values & intercept & SVs +data <- data_dex0 %>% + filter(overall_rsq >= rsquared_cutoff) %>% + filter(predictor != '(Intercept)') +data <- data[!startsWith(data$predictor, "SV"),] +# remove duplicated interactions (mistake made when separating nodes into chunks) +data <- data %>% + distinct(target, predictor, .keep_all = TRUE) + + +# 4. Keep only interactions that have a beta value > cutoff +data <- data %>% + mutate(betacut = (abs(beta_mean) > beta_cutoff)) +data_cut <- data %>% + filter(betacut) + + +# 5. Create baseline network corresponding to beta cutoff +head(data_cut[,c("target", "predictor", "beta_mean", "beta_stderr")], 20) +# Save filtered network +fwrite(data_cut, file = paste0(basepath, "/tables/coExpression_kimono/03_AnalysisFuncoup/", + "04_singleRegion_",region,"_filtered_baselineNetwork.csv")) + +# Nodes in network +node_vec <- unique(c(data_cut$target, data_cut$predictor)) + +# Find modules +# relations <- data.frame(from=data_cut$target, +# to=data_cut$predictor, +# value=data_cut$z, +# performance=data_cut$p_adj) +relations <- data.frame(from=data_cut$target, + to=data_cut$predictor) +g_base <- graph_from_data_frame(relations, directed=FALSE, vertices=node_vec) +# does graph contain multiple edges with same start and endpoint or loop edges +is_simple(g_base) +g_base <- simplify(g_base) # check the edge attribute parameter + +# Calculate nodedegree +nodedegree <- igraph::degree(g_base) +nodedegree <- sort(nodedegree, decreasing = TRUE) + +# Calculate nodebetweenness +nodebetweenness <- betweenness(g_base, directed = FALSE) # node betweenness: number of shortest paths going through a node +nodebetweenness <- sort(nodebetweenness, decreasing = TRUE) # same when values are included in g_diff or not +top100genes <- names(nodebetweenness[1:100]) + +# Plot network properties in one plot +data.frame("nodedegree" = nodedegree, + "nodebetweenness" = nodebetweenness) %>% + tidyr::gather(key = "property", value = "value", nodedegree:nodebetweenness) %>% + ggplot(aes(value)) + + geom_histogram() + + facet_wrap(~property, scales = "free") + + xlab("") + + ggtitle(paste0("Baseline expression network in " ,region, " (", + gorder(g_base), " nodes, ", gsize(g_base), " edges)")) +ggsave(filename = paste0(basepath, "figures/02_CoExp_Kimono/03_AnalysisFuncoup/singleRegions/", + region,"_funcoup_focusHubGenes_baselineNetwork_betacutoff",beta_cutoff,".png"), + width = 8, height = 6) + + +# # 6. Compare top genes in our network to top genes in prior +# # 6a. Rank plot to compare ranks of top genes according to nodebetweenness +# png(filename = paste0(basepath, "figures/02_CoExp_Kimono/03_AnalysisFuncoup/singleRegions/", +# region,"_funcoup_focusHubGenes_rankPlot_nodebetweenness_prior-betacutoff",beta_cutoff,".png"), +# width = 700, height = 700) +# plotRanks(top100genes_prior, top100genes, +# labels = FALSE) +# dev.off() +# # Rank plot to compare ranks of top genes according to nodebetweenness +# png(filename = paste0(basepath, "figures/02_CoExp_Kimono/03_AnalysisFuncoup/singleRegions/", +# region,"_funcoup_focusHubGenes_rankPlot_nodedegree_prior-betacutoff",beta_cutoff,".png"), +# width = 700, height = 700) +# plotRanks(names(nodedegree_prior)[1:100], names(nodedegree)[1:100], +# labels = FALSE) +# dev.off() +# +# # 6b. Venn diagram (euler plot) to compare overlap of top genes according to nodebetweenness +# list1 <- list(top100genes_prior, +# top100genes) +# names(list1) <- c("Funcoup prior", +# paste("Beta cutoff", beta_cutoff)) +# png(filename = paste0(basepath, "figures/02_CoExp_Kimono/03_AnalysisFuncoup/singleRegions/", +# region,"_funcoup_focusHubGenes_vennEuler_nodebetweenness_prior-betacutoff",beta_cutoff,".png"), +# width = 700, height = 700) +# print(plot(euler(list1, shape = "ellipse"), +# labels = list(cex = 1.5), quantities = list(cex = 1.5))) +# dev.off() +# # Venn diagram (euler plot) to compare overlap of top genes according to nodedegree +# list1 <- list(names(nodedegree_prior)[1:100], +# names(nodedegree)[1:100]) +# names(list1) <- c("Funcoup prior", +# paste("Beta cutoff", beta_cutoff)) +# png(filename = paste0(basepath, "figures/02_CoExp_Kimono/03_AnalysisFuncoup/singleRegions/", +# region,"_funcoup_focusHubGenes_vennEuler_nodedegree_prior-betacutoff",beta_cutoff,".png"), +# width = 700, height = 700) +# print(plot(euler(list1, shape = "ellipse"), +# labels = list(cex = 1.5), quantities = list(cex = 1.5))) +# dev.off() + + +# 6a. Compare estimated baseline network to prior (ranks of nodedegree/betweenness) +# rank of genes in prior +nodedegree_prior_rank <- rank(-nodedegree_prior[names(nodedegree)]) +nodebetweenness_prior_rank <- rank(-nodebetweenness_prior[names(nodebetweenness)]) +# rank of genes in diff network +nodedegree_rank <- rank(-nodedegree) +nodebetweenness_rank <- rank(-nodebetweenness) + +# Spearman's rank correlation +cor_nodedegree <- cor.test(nodedegree_prior_rank, nodedegree_rank, + method = "spearman") +cor_nodebetweenness <- cor.test(nodebetweenness_prior_rank, nodebetweenness_rank, + method = "spearman") + +# Scatterplot between prior and base network with correlation in label +# data.frame("prior" = nodedegree_prior[names(nodedegree)], +# "differential" = nodedegree) %>% +# ggplot(aes(x=prior, y=differential)) + +# # geom_point(size=1,alpha = 0.1) +# geom_hex() + +# # geom_bin2d() +# xlab("nodedegree prior network") + +# ylab("nodedegree differential network") + +# ggtitle(paste0("Nodedegree in prior and differential network (Correlation between ranks: ", +# round(cor_nodedegree$estimate, digits = 2),")")) +# ggsave(filename = paste0(basepath, "figures/02_CoExp_Kimono/03_AnalysisFuncoup/singleRegions/", +# region,"_prior_differential_funcoup_correlationNodedegree_betacutoff",beta_cutoff,".png"), +# width = 9, height = 8) +# +# data.frame("prior" = nodebetweenness_prior[names(nodebetweenness)], +# "differential" = nodebetweenness) %>% +# ggplot(aes(x=prior, y=differential)) + +# # geom_point(size=1,alpha = 0.1) +# geom_hex() + +# # geom_bin2d() +# xlab("nodebetweenness prior network") + +# ylab("nodebetweenness differential network") + +# ggtitle(paste0("Nodebetweenness in prior and differential network (Correlation between ranks: ", +# round(cor_nodebetweenness$estimate, digits = 2),")")) +# ggsave(filename = paste0(basepath, "figures/02_CoExp_Kimono/03_AnalysisFuncoup/singleRegions/", +# region,"_prior_differential_funcoup_correlationNodebetweenness_betacutoff",beta_cutoff,".png"), +# width = 9, height = 8) + + + +# tmp <- relations_prior[relations_prior$from == "ENSMUSG00000021660" | relations_prior$to == "ENSMUSG00000021660",] +# tmp1 <- relations[relations$from == "ENSMUSG00000021660" | relations$to == "ENSMUSG00000021660",] +# tmp_join <- inner_join(tmp, tmp1, +# by = c("from", "to"), +# suffix = c(".prior", ".diff")) +# dex_notBase <- anti_join(data_dex1, data_dex0, by = c("target", "predictor")) + + +# # 8. Subset igraph object to top genes with their neighbours +# g1 <- induced.subgraph(graph=g_diff,vids=unlist(neighborhood(graph=g_diff,order=1,nodes=c(top100genes[1])))) +# png(filename = paste0(basepath, "figures/02_CoExp_Kimono/03_AnalysisFuncoup/singleRegions/", +# region,"_funcoup_focusHubGenes_networkTopGene1_betacutoff",beta_cutoff,".png"), +# width = 1000, height = 1000) +# plot(g1) +# dev.off() +# library(RCy3) +# cytoscapePing() +# createNetworkFromIgraph(g1,"myIgraph") + + +# 9. "Normalize" nodebetweenness by nodebetweenness in prior +# check if nodebetweenness can not be compared like this because here we use z scores as value +# and in prior no values are included +nodebetweenness_norm <- nodebetweenness/nodebetweenness_prior[names(nodebetweenness)] +nodebetweenness_mat <- data.frame("nodebetweenness" = nodebetweenness, + "nodebetweenness_prior" = nodebetweenness_prior[names(nodebetweenness)], + "nodebetweenness_norm" = nodebetweenness_norm) +# set norm nodebetweenness to NA if nodebetweenness in prior < 10000 +nodebetweenness_mat$nodebetweenness_norm[nodebetweenness_mat$nodebetweenness_prior < 10000] <- NA +nodebetweenness_mat <- arrange(nodebetweenness_mat, desc(nodebetweenness_norm)) +# add column with gene symbol +nodebetweenness_mat$gene_symbol <- mapIds(org.Mm.eg.db, keys = rownames(nodebetweenness_mat), + keytype = "ENSEMBL", column="SYMBOL") +# rank normalized nodebetweenness +nodebetweenness_norm_rank <- rank(-nodebetweenness_norm[names(nodebetweenness)]) + +# correlation between ranks of prior nodebetweenness and norm betweenness +cor_nodebetweenness_norm <- cor.test(nodebetweenness_prior_rank, nodebetweenness_norm_rank, + method = "spearman") + +data.frame("prior" = nodebetweenness_prior[names(nodebetweenness)], + "baseline" = nodebetweenness_norm[(names(nodebetweenness))]) %>% + ggplot(aes(x=prior, y=baseline)) + + # geom_point(size=1,alpha = 0.1) + # geom_hex() + + geom_bin2d() + + xlab("nodebetweenness prior network") + + ylab("norm. nodebetweenness baseline network") + + ggtitle(paste0("Nodebetweenness in prior and baseline network (Correlation between ranks: ", + round(cor_nodebetweenness_norm$estimate, digits = 2),")")) +ggsave(filename = paste0(basepath, "figures/02_CoExp_Kimono/03_AnalysisFuncoup/singleRegions/", + region,"_prior_baseline_funcoup_correlationNodebetweennessNorm_betacutoff",beta_cutoff,".png"), + width = 9, height = 8) + +# nodebetweenness_norm <- sort(nodebetweenness_norm, decreasing = TRUE) +# top100genes_between_norm <- names(nodebetweenness_norm)[1:100] + +# write nodebetweenness table to file +nodebetweenness_mat <- tibble::rownames_to_column(nodebetweenness_mat, "ensembl_id") +fwrite(nodebetweenness_mat, file = paste0(basepath, "/tables/coExpression_kimono/03_AnalysisFuncoup/", + "03_",region,"_funcoup_baseline_nodebetweennessNorm_betacutoff",beta_cutoff,".csv"), + quote = FALSE) + + +# 10. "Normalize" nodedegree by nodedegree in prior +nodedegree_norm <- nodedegree/nodedegree_prior[names(nodedegree)] +nodedegree_mat <- data.frame("nodedegree" = nodedegree, "nodedegree_prior" = nodedegree_prior[names(nodedegree)], + "nodedegree_norm" = nodedegree_norm) %>% + arrange(desc(nodedegree_norm)) +# set norm nodebetweenness to NA if nodedegree in prior < 50 +nodedegree_mat$nodedegree_norm[nodedegree_mat$nodedegree_prior < 50] <- NA +nodedegree_mat <- arrange(nodedegree_mat, desc(nodedegree_norm)) +# add column with gene symbol +nodedegree_mat$gene_symbol <- mapIds(org.Mm.eg.db, keys = rownames(nodedegree_mat), + keytype = "ENSEMBL", column="SYMBOL") + +# rank normalized nodedegree +nodedegree_norm_rank <- rank(-nodedegree_norm[names(nodedegree)]) + +# correlation between ranks of prior nodebetweenness and norm betweenness +cor_nodedegree_norm <- cor.test(nodedegree_prior_rank, nodedegree_norm_rank, + method = "spearman") + +data.frame("prior" = nodedegree_prior[names(nodedegree)], + "baseline" = nodedegree_norm[(names(nodedegree))]) %>% + ggplot(aes(x=prior, y=baseline)) + + # geom_point(size=1,alpha = 0.1) + # geom_hex() + + geom_bin2d() + + xlab("nodedegree prior network") + + ylab("norm. nodedegree baseline network") + + ggtitle(paste0("Nodedegree in prior and baseline network (Correlation between ranks: ", + round(cor_nodedegree_norm$estimate, digits = 2),")")) +ggsave(filename = paste0(basepath, "figures/02_CoExp_Kimono/03_AnalysisFuncoup/singleRegions/", + region,"_prior_baseline_funcoup_correlationNodedegreeNorm_betacutoff",beta_cutoff,".png"), + width = 9, height = 8) + +# write nodedegree table to file +nodedegree_mat <- tibble::rownames_to_column(nodedegree_mat, "ensembl_id") +fwrite(nodedegree_mat, file = paste0(basepath, "/tables/coExpression_kimono/03_AnalysisFuncoup/", + "03_",region,"_funcoup_baseline_nodedegreesNorm_betacutoff",beta_cutoff,".csv"), + quote = FALSE) + + +# # checkout what happens to FKBP5 (ENSMUSG00000024222) in network +# g2 <- induced.subgraph(graph=g_diff,vids=unlist(neighborhood(graph=g_diff,order=1,nodes=c("ENSMUSG00000024222")))) + diff --git a/03_CoExp_Analysis/03a_singleRegion_funcoup_treatment.R b/03_CoExp_Analysis/03a_singleRegion_funcoup_treatment.R new file mode 100644 index 0000000..9f9e675 --- /dev/null +++ b/03_CoExp_Analysis/03a_singleRegion_funcoup_treatment.R @@ -0,0 +1,145 @@ +################################################## +## Project: DexStim Mouse Brain +## Date: 04.10.2021 +## Author: Nathalie +################################################## +# Save filtered treatment network +# Use beta cutoff and rsquared cutoff as for baseline and differential network + +library(data.table) +library(dplyr) +library(ggplot2) +library(igraph) +library(eulerr) +library(org.Mm.eg.db) + +basepath <- "~/Documents/ownCloud/DexStim_RNAseq_Mouse/" +region <- "AMY" +beta_cutoff <- 0.01 +padj_cutoff <- 0.01 +rsquared_cutoff <- 0.1 + +### FUNCTIONS ------------------------------------- + +# function to read all files from list +readFiles_concat <- function(file_list){ + + # initialize empty data frame + dataset <- data.frame() + + # read each file from list and append to data frame + for (i in 1:length(file_list)){ + temp_data <- fread(file_list[i]) + dataset <- rbindlist(list(dataset, temp_data), use.names = T) + } + + return(dataset) +} + + +### ANALYSIS --------------------------------------- + +# 1. Read kimono treatment expression networks +dex1_files <- list.files(path = file.path(basepath, "tables/coExpression_kimono"), + pattern = paste0("04\\_singleRegion\\_",region,"\\_dex1\\_funcoup\\_SE\\_.*\\.csv"), + full.names = TRUE) + +data_dex1 <- readFiles_concat(dex1_files) + + +# 2. Remove interactions with very low r squared values & intercept & SVs +data <- data_dex1 %>% + filter(overall_rsq >= rsquared_cutoff) %>% + filter(predictor != '(Intercept)') +data <- data[!startsWith(data$predictor, "SV"),] +# remove duplicated interactions (mistake made when separating nodes into chunks) +data <- data %>% + distinct(target, predictor, .keep_all = TRUE) + + +# 3. Keep only interactions that have a beta value > cutoff +data <- data %>% + mutate(betacut = (abs(beta_mean) > beta_cutoff)) +data_cut <- data %>% + filter(betacut) + + +# 4. Create treatment network corresponding to beta cutoff +head(data_cut[,c("target", "predictor", "beta_mean", "beta_stderr")], 20) +# Save filtered network +fwrite(data_cut, file = paste0(basepath, "/tables/coExpression_kimono/03_AnalysisFuncoup/", + "04_singleRegion_",region,"_filtered_treatmentNetwork.csv")) + + +# Nodes in network +node_vec <- unique(c(data_cut$target, data_cut$predictor)) + +# Network properties +relations <- data.frame(from=data_cut$target, + to=data_cut$predictor) +g_treat <- graph_from_data_frame(relations, directed=FALSE, vertices=node_vec) +# does graph contain multiple edges with same start and endpoint or loop edges +is_simple(g_treat) +g_treat <- simplify(g_treat) # check the edge attribute parameter + +# Calculate nodedegree +nodedegree <- igraph::degree(g_treat) +nodedegree <- sort(nodedegree, decreasing = TRUE) + +# Calculate nodebetweenness +nodebetweenness <- betweenness(g_treat, directed = FALSE) # node betweenness: number of shortest paths going through a node +nodebetweenness <- sort(nodebetweenness, decreasing = TRUE) # same when values are included in g_diff or not + + +# 1. Prior network properties for normalization +funcoup_prior <- fread(file = paste0(basepath, "data/kimono_input/prior_expr_funcoup_mm.csv")) +nodes_prior <- unique(c(funcoup_prior$Gene_A, funcoup_prior$Gene_B)) +relations_prior <- data.frame(from = funcoup_prior$Gene_A, + to = funcoup_prior$Gene_B) +g_prior <- graph_from_data_frame(relations_prior, directed=FALSE, vertices=nodes_prior) +# Calculate nodedegrees of prior +nodedegree_prior <- sort(igraph::degree(g_prior), decreasing = TRUE) +# Calculate nodebetweenness of prior +# nodebetweenness_prior <- betweenness(g_prior, directed = FALSE) # node betweenness: number of shortest paths going through a node +# saveRDS(nodebetweenness_prior, file = paste0(basepath, "data/workspaces/nodebetweenness_funcoup.rds")) +nodebetweenness_prior <- sort(readRDS(paste0(basepath, "data/workspaces/nodebetweenness_funcoup.rds")), decreasing = TRUE) + + +# 9. "Normalize" nodebetweenness by nodebetweenness in prior +nodebetweenness_norm <- nodebetweenness/nodebetweenness_prior[names(nodebetweenness)] +nodebetweenness_mat <- data.frame("nodebetweenness" = nodebetweenness, + "nodebetweenness_prior" = nodebetweenness_prior[names(nodebetweenness)], + "nodebetweenness_norm" = nodebetweenness_norm) +# set norm nodebetweenness to NA if nodebetweenness in prior < 10000 +nodebetweenness_mat$nodebetweenness_norm[nodebetweenness_mat$nodebetweenness_prior < 10000] <- NA +nodebetweenness_mat <- arrange(nodebetweenness_mat, desc(nodebetweenness_norm)) +# add column with gene symbol +nodebetweenness_mat$gene_symbol <- mapIds(org.Mm.eg.db, keys = rownames(nodebetweenness_mat), + keytype = "ENSEMBL", column="SYMBOL") + +# write nodebetweenness table to file +nodebetweenness_mat <- tibble::rownames_to_column(nodebetweenness_mat, "ensembl_id") +fwrite(nodebetweenness_mat, file = paste0(basepath, "/tables/coExpression_kimono/03_AnalysisFuncoup/", + "03a_",region,"_funcoup_treatment_nodebetweennessNorm_betacutoff",beta_cutoff,".csv"), + quote = FALSE) + + +# 10. "Normalize" nodedegree by nodedegree in prior +nodedegree_norm <- nodedegree/nodedegree_prior[names(nodedegree)] +nodedegree_mat <- data.frame("nodedegree" = nodedegree, "nodedegree_prior" = nodedegree_prior[names(nodedegree)], + "nodedegree_norm" = nodedegree_norm) %>% + arrange(desc(nodedegree_norm)) +# set norm nodebetweenness to NA if nodedegree in prior < 50 +nodedegree_mat$nodedegree_norm[nodedegree_mat$nodedegree_prior < 50] <- NA +nodedegree_mat <- arrange(nodedegree_mat, desc(nodedegree_norm)) +# add column with gene symbol +nodedegree_mat$gene_symbol <- mapIds(org.Mm.eg.db, keys = rownames(nodedegree_mat), + keytype = "ENSEMBL", column="SYMBOL") + +# write nodedegree table to file +nodedegree_mat <- tibble::rownames_to_column(nodedegree_mat, "ensembl_id") +fwrite(nodedegree_mat, file = paste0(basepath, "/tables/coExpression_kimono/03_AnalysisFuncoup/", + "03a_",region,"_funcoup_treatment_nodedegreesNorm_betacutoff",beta_cutoff,".csv"), + quote = FALSE) + + diff --git a/03_CoExp_Analysis/04_singleRegion_funcoup_focusHubGenes.R b/03_CoExp_Analysis/04_singleRegion_funcoup_focusHubGenes.R new file mode 100644 index 0000000..06ec013 --- /dev/null +++ b/03_CoExp_Analysis/04_singleRegion_funcoup_focusHubGenes.R @@ -0,0 +1,419 @@ +################################################## +## Project: DexStim Mouse Brain +## Date: 04.12.2020 +## Author: Nathalie +################################################## +# Use beta cutoff and analyze top genes (nodebetweenness) + +library(data.table) +library(dplyr) +library(ggplot2) +library(igraph) +library(eulerr) +library(org.Mm.eg.db) + +basepath <- "~/Documents/ownCloud/DexStim_RNAseq_Mouse/" +region <- "vDG" +beta_cutoff <- 0.01 +padj_cutoff <- 0.01 +rsquared_cutoff <- 0.1 + +### FUNCTIONS ------------------------------------- + +# function to read all files from list +readFiles_concat <- function(file_list){ + + # initialize empty data frame + dataset <- data.frame() + + # read each file from list and append to data frame + for (i in 1:length(file_list)){ + temp_data <- fread(file_list[i]) + dataset <- rbindlist(list(dataset, temp_data), use.names = T) + } + + return(dataset) +} + +# Z-score (z_ij) for the differential analysis between gene i and j +z_score <- function(beta_t, beta_c, se_t, se_c){ + z <- (beta_t - beta_c)/ + sqrt((se_t)^2 + (se_c)^2) +} + +# Plot changes in ranks of nodes with highest betweenness +plotRanks <- function(a, b, labels = TRUE, labels.offset=0.1, arrow.len=0.1) +{ + old.par <- par(mar=c(1,1,1,1)) + + # Find the length of the vectors + len.1 <- length(a) + len.2 <- length(b) + + # Plot two columns of equidistant points + plot(rep(1, len.1), 1:len.1, pch=20, cex=0.8, + xlim=c(0, 3), ylim=c(0, max(len.1, len.2)), + axes=F, xlab="", ylab="") # Remove axes and labels + points(rep(2, len.2), 1:len.2, pch=20, cex=0.8) + + # Put labels next to each observation + if (labels){ + text(rep(1-labels.offset, len.1), 1:len.1, a) + text(rep(2+labels.offset, len.2), 1:len.2, b) + } + + # Now we need to map where the elements of a are in b + # We use the match function for this job + a.to.b <- match(a, b) + + # Now we can draw arrows from the first column to the second + arrows(rep(1.02, len.1), 1:len.1, rep(1.98, len.2), a.to.b, + length=arrow.len, angle=20) + par(old.par) +} + + +### ANALYSIS --------------------------------------- + +# 1. Prior and network +funcoup_prior <- + fread(file = paste0(basepath, "data/kimono_input/prior_expr_funcoup_mm.csv")) +nodes_prior <- unique(c(funcoup_prior$Gene_A, funcoup_prior$Gene_B)) +relations_prior <- data.frame(from = funcoup_prior$Gene_A, + to = funcoup_prior$Gene_B) +g_prior <- + graph_from_data_frame(relations_prior, directed = FALSE, vertices = nodes_prior) +# Calculate nodedegrees of prior +nodedegree_prior <- sort(igraph::degree(g_prior), decreasing = TRUE) +# Calculate nodebetweenness of prior +# nodebetweenness_prior <- +# betweenness(g_prior, directed = FALSE) # node betweenness: number of shortest paths going through a node +# saveRDS( +# nodebetweenness_prior, +# file = paste0(basepath, "data/workspaces/nodebetweenness_funcoup.rds") +# ) +nodebetweenness_prior <- + sort(readRDS( + paste0(basepath, "data/workspaces/nodebetweenness_funcoup.rds") + ), decreasing = TRUE) +top100genes_prior <- names(nodebetweenness_prior[1:100]) + + +# 2a. Read kimono expression networks +dex0_files <- + list.files( + path = file.path(basepath, "tables/coExpression_kimono"), + pattern = paste0( + "04\\_singleRegion\\_", + region, + "\\_dex0\\_funcoup\\_SE\\_.*\\.csv" + ), + full.names = TRUE + ) +dex1_files <- + list.files( + path = file.path(basepath, "tables/coExpression_kimono"), + pattern = paste0( + "04\\_singleRegion\\_", + region, + "\\_dex1\\_funcoup\\_SE\\_.*\\.csv" + ), + full.names = TRUE + ) + +data_dex0 <- readFiles_concat(dex0_files) +data_dex1 <- readFiles_concat(dex1_files) + +# 2b. Join Base and Dex data frame +data <- inner_join(data_dex0, data_dex1, + by = c("target", "predictor"), + suffix = c(".base", ".dex")) +dex_notBase <- anti_join(data_dex1, data_dex0, by = c("target", "predictor")) +head(data) +rm(data_dex0) +rm(data_dex1) + + +# 3. Remove interactions with very low r squared values & intercept & SVs +data <- data %>% + filter(overall_rsq.base >= rsquared_cutoff, + overall_rsq.dex >= rsquared_cutoff) %>% + filter(predictor != '(Intercept)') +data <- data[!startsWith(data$predictor, "SV"),] +# remove duplicated interactions (mistake made when separating nodes into chunks) +data <- data %>% + distinct(target, predictor, .keep_all = TRUE) + + +# 4a. Calculate z scores for interactions that are left +data <- mutate(data, + z = z_score( + beta_mean.dex, + beta_mean.base, + beta_stderr.dex, + beta_stderr.base + )) +hist(data$beta_mean.base) + +ggplot(data, aes(x = relation.base, y = beta_mean.base)) + + geom_violin() + +# 4b. Keep only interactions that have at least in one network a beta value > cutoff +data <- data %>% + mutate(diff = (abs(beta_mean.base) > beta_cutoff | abs(beta_mean.dex) > beta_cutoff)) +data_diff1 <- data %>% + filter(diff) +# calculate p-value for z-score +data_diff1$p_diff <- 2*pnorm(-abs(data_diff1$z)) +data_diff1$p_adj <- p.adjust(data_diff1$p_diff, method = "fdr") + + +# 5. Create diff network corresponding to beta cutoff +data_diff <- data_diff1 %>% + filter(p_adj <= padj_cutoff) +head(data_diff[,c("beta_mean.base", "beta_stderr.base", "beta_mean.dex", "beta_stderr.dex", "z", "p_adj")], 20) + +ggplot(data_diff1, aes(x = relation.base, y = z)) + + geom_violin() +hist(data_diff$z, breaks = 5000, xlim = c(-100,100)) + +# Save filtered network +fwrite(data_diff, file = paste0(basepath, "/tables/coExpression_kimono/03_AnalysisFuncoup/", + "04_singleRegion_",region,"_filtered_diffNetwork.csv")) + +# Nodes in network +node_vec <- unique(c(data_diff$target, data_diff$predictor)) + +# Edges in network +relations <- data.frame(from=data_diff$target, + to=data_diff$predictor, + value=data_diff$z, + performance=data_diff$p_adj) +# relations <- data.frame(from=data_diff$target, +# to=data_diff$predictor) +g_diff <- graph_from_data_frame(relations, directed=FALSE, vertices=node_vec) +# does graph contain multiple edges with same start and endpoint or loop edges +is_simple(g_diff) +g_diff <- simplify(g_diff) # check the edge attribute parameter + +# Calculate nodedegree +nodedegree <- igraph::degree(g_diff) +nodedegree <- sort(nodedegree, decreasing = TRUE) + +# Calculate nodebetweenness +nodebetweenness <- betweenness(g_diff, directed = FALSE) # node betweenness: number of shortest paths going through a node +nodebetweenness <- sort(nodebetweenness, decreasing = TRUE) # same when values are included in g_diff or not +top100genes <- names(nodebetweenness[1:100]) + +# Plot network properties in one plot +data.frame("nodedegree" = nodedegree, + "nodebetweenness" = nodebetweenness) %>% + tidyr::gather(key = "property", value = "value", nodedegree:nodebetweenness) %>% + ggplot(aes(value)) + + geom_histogram() + + facet_wrap(~property, scales = "free") + + xlab("") + + ggtitle(paste0("Differential expression network in " ,region, " (", + gorder(g_diff), " nodes, ", gsize(g_diff), " edges)")) +ggsave(filename = paste0(basepath, "figures/02_CoExp_Kimono/03_AnalysisFuncoup/singleRegions/", + region,"_funcoup_focusHubGenes_diffNetwork_betacutoff",beta_cutoff,".png"), + width = 8, height = 6) + + +# 6. Compare top genes in our network to top genes in prior +# # 6a. Rank plot to compare ranks of top genes according to nodebetweenness +# png(filename = paste0(basepath, "figures/02_CoExp_Kimono/03_AnalysisFuncoup/singleRegions/", +# region,"_funcoup_focusHubGenes_rankPlot_nodebetweenness_prior-betacutoff",beta_cutoff,".png"), +# width = 700, height = 700) +# plotRanks(top100genes_prior, top100genes, +# labels = FALSE) +# dev.off() +# # Rank plot to compare ranks of top genes according to nodebetweenness +# png(filename = paste0(basepath, "figures/02_CoExp_Kimono/03_AnalysisFuncoup/singleRegions/", +# region,"_funcoup_focusHubGenes_rankPlot_nodedegree_prior-betacutoff",beta_cutoff,".png"), +# width = 700, height = 700) +# plotRanks(names(nodedegree_prior)[1:100], names(nodedegree)[1:100], +# labels = FALSE) +# dev.off() + +# 6b. Venn diagram (euler plot) to compare overlap of top genes according to nodebetweenness +list1 <- list(top100genes_prior, + top100genes) +names(list1) <- c("Funcoup prior", + paste("Beta cutoff", beta_cutoff)) +png(filename = paste0(basepath, "figures/02_CoExp_Kimono/03_AnalysisFuncoup/singleRegions/", + region,"_funcoup_focusHubGenes_vennEuler_nodebetweenness_prior-betacutoff",beta_cutoff,".png"), + width = 700, height = 700) +print(plot(euler(list1, shape = "ellipse"), + labels = list(cex = 1.5), quantities = list(cex = 1.5))) +dev.off() +# Venn diagram (euler plot) to compare overlap of top genes according to nodedegree +list1 <- list(names(nodedegree_prior)[1:100], + names(nodedegree)[1:100]) +names(list1) <- c("Funcoup prior", + paste("Beta cutoff", beta_cutoff)) +png(filename = paste0(basepath, "figures/02_CoExp_Kimono/03_AnalysisFuncoup/singleRegions/", + region,"_funcoup_focusHubGenes_vennEuler_nodedegree_prior-betacutoff",beta_cutoff,".png"), + width = 700, height = 700) +print(plot(euler(list1, shape = "ellipse"), + labels = list(cex = 1.5), quantities = list(cex = 1.5))) +dev.off() + + +# 6a. Compare estimated differential network to prior (ranks of nodedegree/betweenness) +# rank of genes in prior +nodedegree_prior_rank <- rank(-nodedegree_prior[names(nodedegree)]) +nodebetweenness_prior_rank <- rank(-nodebetweenness_prior[names(nodebetweenness)]) +# rank of genes in diff network +nodedegree_rank <- rank(-nodedegree) +nodebetweenness_rank <- rank(-nodebetweenness) + +# Spearman's rank correlation +cor_nodedegree <- cor.test(nodedegree_prior_rank, nodedegree_rank, + method = "spearman") +cor_nodebetweenness <- cor.test(nodebetweenness_prior_rank, nodebetweenness_rank, + method = "spearman") + +# Scatterplot between prior and base network with correlation in label +data.frame("prior" = nodedegree_prior[names(nodedegree)], + "differential" = nodedegree) %>% + ggplot(aes(x=prior, y=differential)) + + # geom_point(size=1,alpha = 0.1) + geom_hex() + + # geom_bin2d() + xlab("nodedegree prior network") + + ylab("nodedegree differential network") + + ggtitle(paste0("Nodedegree in prior and differential network (Correlation between ranks: ", + round(cor_nodedegree$estimate, digits = 2),")")) +ggsave(filename = paste0(basepath, "figures/02_CoExp_Kimono/03_AnalysisFuncoup/singleRegions/", + region,"_prior_differential_funcoup_correlationNodedegree_betacutoff",beta_cutoff,".png"), + width = 9, height = 8) + +data.frame("prior" = nodebetweenness_prior[names(nodebetweenness)], + "differential" = nodebetweenness) %>% + ggplot(aes(x=prior, y=differential)) + + # geom_point(size=1,alpha = 0.1) + geom_hex() + + # geom_bin2d() + xlab("nodebetweenness prior network") + + ylab("nodebetweenness differential network") + + ggtitle(paste0("Nodebetweenness in prior and differential network (Correlation between ranks: ", + round(cor_nodebetweenness$estimate, digits = 2),")")) +ggsave(filename = paste0(basepath, "figures/02_CoExp_Kimono/03_AnalysisFuncoup/singleRegions/", + region,"_prior_differential_funcoup_correlationNodebetweenness_betacutoff",beta_cutoff,".png"), + width = 9, height = 8) + + + +# tmp <- relations_prior[relations_prior$from == "ENSMUSG00000021660" | relations_prior$to == "ENSMUSG00000021660",] +# tmp1 <- relations[relations$from == "ENSMUSG00000021660" | relations$to == "ENSMUSG00000021660",] +# tmp_join <- inner_join(tmp, tmp1, +# by = c("from", "to"), +# suffix = c(".prior", ".diff")) +# dex_notBase <- anti_join(data_dex1, data_dex0, by = c("target", "predictor")) + + +# # 8. Subset igraph object to top genes with their neighbours +# g1 <- induced.subgraph(graph=g_diff,vids=unlist(neighborhood(graph=g_diff,order=1,nodes=c(top100genes[1])))) +# png(filename = paste0(basepath, "figures/02_CoExp_Kimono/03_AnalysisFuncoup/singleRegions/", +# region,"_funcoup_focusHubGenes_networkTopGene1_betacutoff",beta_cutoff,".png"), +# width = 1000, height = 1000) +# plot(g1) +# dev.off() +# library(RCy3) +# cytoscapePing() +# createNetworkFromIgraph(g1,"myIgraph") + + +# 9. "Normalize" nodebetweenness by nodebetweenness in prior +# check if nodebetweenness can not be compared like this because here we use z scores as value +# and in prior no values are included +nodebetweenness_norm <- nodebetweenness/nodebetweenness_prior[names(nodebetweenness)] +nodebetweenness_mat <- data.frame("nodebetweenness" = nodebetweenness, + "nodebetweenness_prior" = nodebetweenness_prior[names(nodebetweenness)], + "nodebetweenness_norm" = nodebetweenness_norm) %>% + dplyr::arrange(desc(nodebetweenness_norm)) +# set norm nodebetweenness to NA if nodebetweenness in prior < 10000 +nodebetweenness_mat$nodebetweenness_norm[nodebetweenness_mat$nodebetweenness_prior < 10000] <- NA +nodebetweenness_mat <- arrange(nodebetweenness_mat, desc(nodebetweenness_norm)) +# add column with gene symbol +nodebetweenness_mat$gene_symbol <- mapIds(org.Mm.eg.db, keys = rownames(nodebetweenness_mat), + keytype = "ENSEMBL", column="SYMBOL") +# rank normalized nodebetweenness +nodebetweenness_norm_rank <- rank(-nodebetweenness_norm[names(nodebetweenness)]) + +# correlation between ranks of prior nodebetweenness and norm betweenness +cor_nodebetweenness_norm <- cor.test(nodebetweenness_prior_rank, nodebetweenness_norm_rank, + method = "spearman") + +data.frame("prior" = nodebetweenness_prior[names(nodebetweenness)], + "differential" = nodebetweenness_norm[(names(nodebetweenness))]) %>% + ggplot(aes(x=prior, y=differential)) + + # geom_point(size=1,alpha = 0.1) + geom_hex() + + # geom_bin2d() + + xlab("nodebetweenness prior network") + + ylab("norm. nodebetweenness differential network") + + ggtitle(paste0("Nodebetweenness in prior and differential network (Correlation between ranks: ", + round(cor_nodebetweenness_norm$estimate, digits = 2),")")) +ggsave(filename = paste0(basepath, "figures/02_CoExp_Kimono/03_AnalysisFuncoup/singleRegions/", + region,"_prior_differential_funcoup_correlationNodebetweennessNorm_betacutoff",beta_cutoff,".png"), + width = 9, height = 8) + +# nodebetweenness_norm <- sort(nodebetweenness_norm, decreasing = TRUE) +# top100genes_between_norm <- names(nodebetweenness_norm)[1:100] + +# write nodebetweenness table to file +nodebetweenness_mat <- tibble::rownames_to_column(nodebetweenness_mat, "ensembl_id") +fwrite(nodebetweenness_mat, file = paste0(basepath, "/tables/coExpression_kimono/03_AnalysisFuncoup/", + "04_",region,"_funcoup_differential_nodebetweennessNorm_betacutoff",beta_cutoff,".csv"), + quote = FALSE) + + +# 10. "Normalize" nodedegree by nodedegree in prior +nodedegree_norm <- nodedegree/nodedegree_prior[names(nodedegree)] +nodedegree_mat <- data.frame("nodedegree" = nodedegree, "nodedegree_prior" = nodedegree_prior[names(nodedegree)], + "nodedegree_norm" = nodedegree_norm) %>% + arrange(desc(nodedegree_norm)) +# set norm nodebetweenness to NA if nodedegree in prior < 50 +nodedegree_mat$nodedegree_norm[nodedegree_mat$nodedegree_prior < 50] <- NA +nodedegree_mat <- arrange(nodedegree_mat, desc(nodedegree_norm)) +# add column with gene symbol +nodedegree_mat$gene_symbol <- mapIds(org.Mm.eg.db, keys = rownames(nodedegree_mat), + keytype = "ENSEMBL", column="SYMBOL") + +# rank normalized nodedegree +nodedegree_norm_rank <- rank(-nodedegree_norm[names(nodedegree)]) + +# correlation between ranks of prior nodebetweenness and norm betweenness +cor_nodedegree_norm <- cor.test(nodedegree_prior_rank, nodedegree_norm_rank, + method = "spearman") + +data.frame("prior" = nodedegree_prior[names(nodedegree)], + "differential" = nodedegree_norm[(names(nodedegree))]) %>% + ggplot(aes(x=prior, y=differential)) + + # geom_point(size=1,alpha = 0.1) + geom_hex() + + # geom_bin2d() + + xlab("nodedegree prior network") + + ylab("norm. nodedegree differential network") + + ggtitle(paste0("Nodedegree in prior and differential network (Correlation between ranks: ", + round(cor_nodedegree_norm$estimate, digits = 2),")")) +ggsave(filename = paste0(basepath, "figures/02_CoExp_Kimono/03_AnalysisFuncoup/singleRegions/", + region,"_prior_differential_funcoup_correlationNodedegreeNorm_betacutoff",beta_cutoff,".png"), + width = 9, height = 8) + +# write nodedegree table to file +nodedegree_mat <- tibble::rownames_to_column(nodedegree_mat, "ensembl_id") +fwrite(nodedegree_mat, file = paste0(basepath, "/tables/coExpression_kimono/03_AnalysisFuncoup/", + "04_",region,"_funcoup_differential_nodedegreesNorm_betacutoff",beta_cutoff,".csv"), + quote = FALSE) + + +# # checkout what happens to FKBP5 (ENSMUSG00000024222) in network +# g2 <- induced.subgraph(graph = g_diff, +# vids = unlist(neighborhood( +# graph = g_diff, +# order = 1, +# nodes = c("ENSMUSG00000024222") +# ))) diff --git a/03_CoExp_Analysis/05_singleRegion_comparisonRegions.R b/03_CoExp_Analysis/05_singleRegion_comparisonRegions.R new file mode 100644 index 0000000..5a7add8 --- /dev/null +++ b/03_CoExp_Analysis/05_singleRegion_comparisonRegions.R @@ -0,0 +1,141 @@ +################################################## +## Project: DexStim Mouse Brain +## Date: 15.12.2020 +## Author: Nathalie +################################################## +# Use beta cutoff and analyze top genes (nodebetweenness) + +library(data.table) +library(dplyr) +library(ggplot2) +library(igraph) +library(eulerr) +library(UpSetR) +library(org.Mm.eg.db) + +basepath <- "~/Documents/ownCloud/DexStim_RNAseq_Mouse/" +regions <- c("AMY", "CER", "dCA1", "dDG", "PFC", "PVN", "vCA1", "vDG") +beta_cutoff <- 0.01 + + +# 0. functions ------------------------------- +write_genelist <- function(genelist, filename){ + # write list with ENSEMBL IDs + write.table(genelist, file = paste0(basepath, "tables/coExpression_kimono/03_AnalysisFuncoup/", + "/05_",filename,"_ensemblID.txt"), + quote = FALSE, row.names = FALSE, col.names = FALSE) + # write list with ENTREZ IDs + entrez <- mapIds(org.Mm.eg.db, keys = genelist, keytype = "ENSEMBL", column="ENTREZID") + write.table(entrez, file = paste0(basepath, "tables/coExpression_kimono/03_AnalysisFuncoup/", + "/05_",filename,"_entrezID.txt"), + quote = FALSE, row.names = FALSE, col.names = FALSE) + # write list with GENE SYMBOLS + symbol <- mapIds(org.Mm.eg.db, keys = genelist, keytype = "ENSEMBL", column="SYMBOL") + write.table(symbol, file = paste0(basepath, "tables/coExpression_kimono/03_AnalysisFuncoup/", + "/05_",filename,"_geneSymbol.txt"), + quote = FALSE, row.names = FALSE, col.names = FALSE) +} + + +# 1. Read data -------------------------------- +nodedegrees_list <- list() +nodedegrees_0.5 <- list() +for (reg in regions){ + nodedegrees_list[[reg]] <- fread(paste0(basepath, "/tables/coExpression_kimono/03_AnalysisFuncoup/", + "04_",reg,"_funcoup_differential_nodedegreesNorm_betacutoff",beta_cutoff,".csv")) + nodedegrees_0.5[[reg]] <- nodedegrees_list[[reg]]$ensembl_id[nodedegrees_list[[reg]]$nodedegree_norm>=0.5 & + ! is.na(nodedegrees_list[[reg]]$nodedegree_norm)] + +} + +nodebetweenness_list <- list() +nodebetweenness_1 <- list() +for (reg in regions){ + nodebetweenness_list[[reg]] <- fread(paste0(basepath, "/tables/coExpression_kimono/03_AnalysisFuncoup/", + "04_",reg,"_funcoup_differential_nodebetweennessNorm_betacutoff",beta_cutoff,".csv")) + nodebetweenness_1[[reg]] <- nodebetweenness_list[[reg]]$ensembl_id[nodebetweenness_list[[reg]]$nodebetweenness_norm>=1 & + ! is.na(nodebetweenness_list[[reg]]$nodebetweenness_norm)] +} + + +# 2. Compare top genes between regions using Upset Plot + +# 2.1 Nodedegree +png(filename = paste0(basepath, "/figures/02_CoExp_Kimono/03_AnalysisFuncoup/comparisonRegions/", + "upsetPlot_normNodedegree0.5.png"), + height = 700, width = 1000) +print(upset(fromList(nodedegrees_0.5), nsets = 8, nintersects = 50, order.by = "freq", + text.scale = c(1.8, 1.8, 1.8, 1.8, 1.8, 1.8))) +dev.off() + +# 2.2 Nodebetweenness +png(filename = paste0(basepath, "/figures/02_CoExp_Kimono/03_AnalysisFuncoup/comparisonRegions/", + "upsetPlot_normNodebetweenness1.0.png"), + height = 700, width = 1000) +print(upset(fromList(nodebetweenness_1), nsets = 8, nintersects = 50, order.by = "freq", + text.scale = c(1.8, 1.8, 1.8, 1.8, 1.8, 1.8))) +dev.off() + + + +# 3. Plot correlation between 2 different brain regions + +reg_comb <- combn(regions, 2) +# reg1 <- "PFC" +# reg2 <- "dDG" + +for (i in 1:ncol(reg_comb)){ + + reg1 <- reg_comb[1,i] + reg2 <- reg_comb[2,i] + + # 3.1 Nodedegree + degree_reg <- inner_join(nodedegrees_list[[reg1]], nodedegrees_list[[reg2]], by = "ensembl_id", + suffix = c(".reg1", ".reg2")) + ggplot(degree_reg, aes(x=nodedegree_norm.reg1, y=nodedegree_norm.reg2)) + + # geom_point(size=1,alpha = 0.1) + geom_hex() + + # geom_bin2d() + + xlab(paste("norm. nodedegree", reg1)) + + ylab(paste("norm. nodedegree", reg2)) + + ggtitle(paste("Nodedegree in", reg1, "and", reg2, "differential network")) + ggsave(filename = paste0(basepath, "figures/02_CoExp_Kimono/03_AnalysisFuncoup/comparisonRegions/", + "comparison_",reg1,"-",reg2,"_correlationNodedegreeNorm.png"), + width = 9, height = 8) + + # 3.2 Nodebetweenness + between_reg <- inner_join(nodebetweenness_list[[reg1]], nodebetweenness_list[[reg2]], by = "ensembl_id", + suffix = c(".reg1", ".reg2")) + ggplot(between_reg, aes(x=nodebetweenness_norm.reg1, y=nodebetweenness_norm.reg2)) + + # geom_point(size=1,alpha = 0.1) + geom_hex() + + # geom_bin2d() + + xlab(paste("norm. nodebetweenness", reg1)) + + ylab(paste("norm. nodebetweenness", reg2)) + + ggtitle(paste("Nodebetweenness in", reg1, "and", reg2, "differential network")) + ggsave(filename = paste0(basepath, "figures/02_CoExp_Kimono/03_AnalysisFuncoup/comparisonRegions/", + "comparison_",reg1,"-",reg2,"_correlationNodebetweennessNorm.png"), + width = 9, height = 8) + +} + + +# 4. Gene lists ------------------------------------- + +# 4.1 Overlap all regions +# nodedegree +overlap_degree <- Reduce(intersect, nodedegrees_0.5) +write_genelist(overlap_degree, "topgenesNodedegree0.5_AMY-CER-PFC-PVN-dDG-vDG-dCA1-vCA1") +# nodebetweenness +overlap_between <- Reduce(intersect, nodebetweenness_1) +write_genelist(overlap_degree, "topgenesNodebetweenness1_AMY-CER-PFC-PVN-dDG-vDG-dCA1-vCA1") + +# union of all nodebetweenness hub genes +union_between <- Reduce(union, nodebetweenness_1) + +# comparison with union of all DE genes +de_genes <- fread(paste0(basepath, "tables/06_union_AMY-CER-PFC-PVN-dDG-vDG-dCA1-vCA1.txt"), + header = FALSE) + +# overlap of hub genes and de genes +common_hub_de <- intersect(union_between, de_genes$V1) diff --git a/03_CoExp_Analysis/05a_singleRegion_comparsionRegions_DEgenes.R b/03_CoExp_Analysis/05a_singleRegion_comparsionRegions_DEgenes.R new file mode 100644 index 0000000..87d02c9 --- /dev/null +++ b/03_CoExp_Analysis/05a_singleRegion_comparsionRegions_DEgenes.R @@ -0,0 +1,163 @@ +################################################## +## Project: DexStim Mouse Brain +## Date: 15.12.2020 +## Author: Nathalie +################################################## +# Use beta cutoff and analyze top genes (nodebetweenness) +# UpSet Plot of DE genes with nodebetweenness >/< 1 + +library(data.table) +library(dplyr) +library(ggplot2) +library(igraph) +library(eulerr) +library(UpSetR) +library(org.Mm.eg.db) + +basepath <- "~/Documents/ownCloud/DexStim_RNAseq_Mouse/" +regions <- c("AMY", "CER", "dCA1", "dDG", "PFC", "PVN", "vCA1", "vDG") +beta_cutoff <- 0.01 + + +# 1. Read data -------------------------------- +# nodedegrees_list <- list() +# nodedegrees_0.5 <- list() +# for (reg in regions){ +# nodedegrees_list[[reg]] <- fread(paste0(basepath, "/tables/coExpression_kimono/03_AnalysisFuncoup/", +# "04_",reg,"_funcoup_differential_nodedegreesNorm_betacutoff",beta_cutoff,".csv")) +# nodedegrees_0.5[[reg]] <- nodedegrees_list[[reg]]$ensembl_id[nodedegrees_list[[reg]]$nodedegree_norm>=0.2 & +# ! is.na(nodedegrees_list[[reg]]$nodedegree_norm)] +# +# } + +de_nodebetween <- list() +for (reg in regions){ + nodebetweenness <- fread(paste0(basepath, "/tables/coExpression_kimono/03_AnalysisFuncoup/", + "04_",reg,"_funcoup_differential_nodebetweennessNorm_betacutoff",beta_cutoff,".csv")) + nodebetweenness_1 <- nodebetweenness$ensembl_id[nodebetweenness$nodebetweenness_norm>= 1.0& + ! is.na(nodebetweenness$nodebetweenness_norm)] + + de_genes <- fread(paste0(basepath, "tables/02_",reg,"_deseq2_Dex_1_vs_0_lfcShrink.txt")) %>% + filter(padj <= 0.1) + + de_nodebetween[[reg]] <- intersect(nodebetweenness_1, de_genes$Ensembl_ID) +} + + +# 2 Upset Plot +png(filename = paste0(basepath, "/figures/02_CoExp_Kimono/03_AnalysisFuncoup/comparisonRegions/", + "upsetPlot_DE_normNodebetweennessAbove1.0.png"), + height = 700, width = 1000) +print(upset(fromList(de_nodebetween), nsets = 8, nintersects = 50, order.by = "freq", + text.scale = c(1.8, 1.8, 1.8, 1.8, 1.8, 1.8))) +dev.off() + + + + +# 1b. Read data -------------------------------- + +de_nodebetween <- list() +for (reg in regions){ + nodebetweenness <- fread(paste0(basepath, "/tables/coExpression_kimono/03_AnalysisFuncoup/", + "04_",reg,"_funcoup_differential_nodebetweennessNorm_betacutoff",beta_cutoff,".csv")) + nodebetweenness_1 <- nodebetweenness$ensembl_id[nodebetweenness$nodebetweenness_norm>= 1.0& + ! is.na(nodebetweenness$nodebetweenness_norm)] + + de_genes <- fread(paste0(basepath, "tables/02_",reg,"_deseq2_Dex_1_vs_0_lfcShrink.txt")) %>% + filter(padj <= 0.1) + + de_nodebetween[[reg]] <- nodebetweenness_1 + de_nodebetween[[paste0(reg,"_de")]] <- de_genes$Ensembl_ID +} + +df <- fromList(de_nodebetween) +# x <- which(df$PFC == 1 & rowSums(df[,seq(1,15,by=2)]) == 1) +# df$de <- rowSums(df[,seq(2,16,by=2)]) +# y <- which(df$de >= 1) + +# function to count genes that are also DE gene in at least one +# of the intersect regions +de_region <- function(x){ + index_de <- which(x[seq(1,15,by=2)] == 1)*2 + s <- sum(x[index_de]) + return(s) +} +df$de_region <- apply(df, 1, de_region) + +# 2b Upset Plot +pdf(file = paste0(basepath, "/figures/02_CoExp_Kimono/03_AnalysisFuncoup/comparisonRegions/", + "upsetPlot_DEcolor_normNodebetweennessAbove1.0.pdf"), + height = 10, width = 14) +print(upset(df, nsets = 16, nintersects = 50, order.by = "freq", + sets = c("AMY", "CER", "dCA1", + "dDG", "PFC", "PVN", + "vCA1", "vDG"), + text.scale = c(1.8, 1.9, 1.8, 1.9, 1.9, 1.9), + sets.x.label = "#Hub genes in brain region", + mainbar.y.label = "#Hub genes in intersection", + queries = list( + list( + query = elements, + params = list("de_region",1), + color = "#FFA500", + active = T, + query.name = "DE gene in at least one of the intersect regions" + ) + ), + query.legend = "bottom")) +dev.off() + + + +# 2. Read nodedegree data -------------------------------- + +de_nodedegree <- list() +for (reg in regions){ + nodedegree <- fread(paste0(basepath, "/tables/coExpression_kimono/03_AnalysisFuncoup/", + "04_",reg,"_funcoup_differential_nodedegreesNorm_betacutoff",beta_cutoff,".csv")) + nodedegree_1 <- nodedegree$ensembl_id[nodedegree$nodedegree_norm>= 0.5& + ! is.na(nodedegree$nodedegree_norm)] + + de_genes <- fread(paste0(basepath, "tables/02_",reg,"_deseq2_Dex_1_vs_0_lfcShrink.txt")) %>% + filter(padj <= 0.1) + + de_nodedegree[[reg]] <- nodedegree_1 + de_nodedegree[[paste0(reg,"_de")]] <- de_genes$Ensembl_ID +} + +df <- fromList(de_nodedegree) +# x <- which(df$PFC == 1 & rowSums(df[,seq(1,15,by=2)]) == 1) +# df$de <- rowSums(df[,seq(2,16,by=2)]) +# y <- which(df$de >= 1) + +# function to count genes that are also DE gene in at least one +# of the intersect regions +de_region <- function(x){ + index_de <- which(x[seq(1,15,by=2)] == 1)*2 + s <- sum(x[index_de]) + return(s) +} +df$de_region <- apply(df, 1, de_region) + +# 2b Upset Plot +png(filename = paste0(basepath, "/figures/02_CoExp_Kimono/03_AnalysisFuncoup/comparisonRegions/", + "upsetPlot_DEcolor_normNodedegreeAbove0.5.png"), + height = 700, width = 1000) +print(upset(df, nsets = 8, nintersects = 50, order.by = "freq", + sets = c("AMY", "CER", "dCA1", + "dDG", "PFC", "PVN", + "vCA1", "vDG"), + text.scale = c(1.8, 1.8, 1.8, 1.8, 1.8, 1.8), + queries = list( + list( + query = elements, + params = list("de_region",1), + color = "#Df5286", + active = T, + query.name = "DE gene in at least one of the intersect regions" + ) + ), + query.legend = "bottom")) +dev.off() + diff --git a/03_CoExp_Analysis/06_singleRegion_GOterms_nodebetweenness.R b/03_CoExp_Analysis/06_singleRegion_GOterms_nodebetweenness.R new file mode 100644 index 0000000..3328e00 --- /dev/null +++ b/03_CoExp_Analysis/06_singleRegion_GOterms_nodebetweenness.R @@ -0,0 +1,125 @@ +################################################## +## Project: DexStim Mouse Brain +## Date: 07.01.2020 +## Author: Nathalie +################################################## +# GO plots single regions (Network analysis - nodebetweenness) + +library(org.Mm.eg.db) +library(data.table) +library(ggplot2) +library(dplyr) +library(stringr) +library(anRichment) +library(anRichmentMethods) + +basepath <- "~/Documents/ownCloud/DexStim_RNAseq_Mouse/" +regions <- c("AMY", "CER", "dCA1", "dDG", "PFC", "PVN", "vCA1", "vDG") +beta_cutoff <- 0.01 + +folder_plots <- paste0("figures") +folder_tables <- paste0("tables") + + +# 1. Read data from all regions ---------- + +list_reg <- list() +for (reg in regions){ + res <- fread(paste0(basepath, "tables/coExpression_kimono/03_AnalysisFuncoup/04_", + reg,"_funcoup_differential_nodeBetweennessNorm_betacutoff",beta_cutoff,".csv")) + res <- res[res$nodebetweenness_norm>=0.5 & ! is.na(res$nodebetweenness_norm)] + res$entrez <- mapIds(org.Mm.eg.db, keys = res$ensembl_id, + keytype = "ENSEMBL", column="ENTREZID") + list_reg[[reg]] <- res +} + + +# 2. check uniqueness of DE genes --------------- + +for (reg in regions){ + index_reg <- which(regions == reg) + df <- bind_rows(list_reg[-index_reg], .id="region") + list_reg[[reg]]$regions_top <- sapply(list_reg[[reg]]$ensembl_id, + function(x) paste(df[df$ensembl_id == x,]$region, collapse = " ")) + list_reg[[reg]]$unique_top <- sapply(list_reg[[reg]]$regions_top, + function(x) x == "") +} + + +# 3. GO enrichment for the genes of each region ------------------ + +go_enrichment_all <- function(df_reg, GOcoll, unique){ + if (unique){ + genes <- df_reg$entrez[df_reg$unique_top] + } else { + genes <- df_reg$entrez + } + background <- read.table(file = paste0(basepath, folder_tables, "/06_background_entrezID.txt"), + header = FALSE) + modules <- rep("not_significant", nrow(background)) + modules[which(background$V1 %in% genes)] <- "significant" + + # enrichment + GOenrichment <- enrichmentAnalysis( + classLabels = modules, + identifiers = background$V1, + refCollection = GOcoll, + useBackground = "given", + nBestDataSets = length(GOcoll$dataSets), + # threshold = 0.1, + # thresholdType = "Bonferroni", + getOverlapEntrez = TRUE, + getOverlapSymbols = TRUE, + ignoreLabels = "not_significant", + maxReportedOverlapGenes = 500 + ) + + enrichmentTable <- GOenrichment$enrichmentTable + + return(enrichmentTable) +} + +GOcollection <- buildGOcollection(organism = "mouse") +list_GO <- list() +# GO.BPcollection = subsetCollection(GOcollection, tags = "GO.BP") +for (reg in regions){ + + go_enr_unique <- go_enrichment_all(list_reg[[reg]], GOcollection, TRUE) + # go_enr_all <- go_enrichment_all(list_reg[[reg]], GOcollection, FALSE) + list_GO[[reg]] <- go_enr_unique + +} + + +# 4. Plot GO terms +df_all <- bind_rows(list_GO, .id="region") +for (reg in regions){ + + df_reg <- list_GO[[reg]] %>% + filter(nCommonGenes >= 10, pValue <= 0.1) %>% + group_by(inGroups) %>% slice_min(order_by = pValue, n = 10) + + df <- df_all[df_all$dataSetName %in% df_reg$dataSetName,] + df$dataSetName <- sapply(df$dataSetName, function(x) str_trunc(x, 45, "right")) + df$dataSetName <- factor(df$dataSetName, levels = rev(reorder(df$dataSetName[df$region==reg], df$pValue[df$region==reg]))) + df$region <- factor(df$region, levels = c("AMY", "CER", "PFC", "PVN", "vDG", "dDG", "vCA1", "dCA1")) + df$inGroups <- factor(df$inGroups) + levels(df$inGroups) <- c("Biological Process", "Cellular Components", "Molecular Function") + + # Plot results (plotted pvalues are not adjusted for multiple testing) + df <- df %>% + arrange(desc(dataSetName)) + print(ggplot(df, aes(x=dataSetName, y = -log10(pValue), fill = region)) + + geom_bar(position = position_dodge2(reverse=TRUE), stat="identity") + + geom_hline(yintercept = -log10(0.1),linetype="dashed", color = "red") + + coord_flip() + + xlab("GOterm") + + ggtitle(paste0("GO terms enriched for diff. co-expressed genes only in ",reg, " (Top 10 each)")) + + facet_wrap(~inGroups, scales="free") + + theme(axis.text.y = element_text(size = 10))) + ggsave(filename = paste0(basepath, folder_plots, "/02_CoExp_Kimono/03_AnalysisFuncoup/comparisonRegions/", + "06_",reg,"_GOterms_unique_nodebetweenness1.png"), + width = 13, height = 7) +} + + diff --git a/03_CoExp_Analysis/06_singleRegion_GOterms_nodedegree.R b/03_CoExp_Analysis/06_singleRegion_GOterms_nodedegree.R new file mode 100644 index 0000000..8610d69 --- /dev/null +++ b/03_CoExp_Analysis/06_singleRegion_GOterms_nodedegree.R @@ -0,0 +1,125 @@ +################################################## +## Project: DexStim Mouse Brain +## Date: 07.01.2020 +## Author: Nathalie +################################################## +# GO plots single regions (Network analysis - nodedegree) + +library(org.Mm.eg.db) +library(dplyr) +library(data.table) +library(ggplot2) +library(stringr) +library(anRichment) +library(anRichmentMethods) + +basepath <- "~/Documents/ownCloud/DexStim_RNAseq_Mouse/" +regions <- c("AMY", "CER", "dCA1", "dDG", "PFC", "PVN", "vCA1", "vDG") +beta_cutoff <- 0.01 + +folder_plots <- paste0("figures") +folder_tables <- paste0("tables") + + +# 1. Read data from all regions ---------- + +list_reg <- list() +for (reg in regions){ + res <- fread(paste0(basepath, "tables/coExpression_kimono/03_AnalysisFuncoup/", + reg,"_funcoup_differential_nodedegreesNorm_betacutoff",beta_cutoff,".csv")) + res <- res[res$nodedegree_norm>=0.5 & ! is.na(res$nodedegree_norm)] + res$entrez <- mapIds(org.Mm.eg.db, keys = res$ensembl_id, + keytype = "ENSEMBL", column="ENTREZID") + list_reg[[reg]] <- res +} + + +# 2. check uniqueness of DE genes --------------- + +for (reg in regions){ + index_reg <- which(regions == reg) + df <- bind_rows(list_reg[-index_reg], .id="region") + list_reg[[reg]]$regions_top <- sapply(list_reg[[reg]]$ensembl_id, + function(x) paste(df[df$ensembl_id == x,]$region, collapse = " ")) + list_reg[[reg]]$unique_top <- sapply(list_reg[[reg]]$regions_top, + function(x) x == "") +} + + +# 3. GO enrichment for the genes of each region ------------------ + +go_enrichment_all <- function(df_reg, GOcoll, unique){ + if (unique){ + genes <- df_reg$entrez[df_reg$unique_top] + } else { + genes <- df_reg$entrez + } + background <- read.table(file = paste0(basepath, folder_tables, "/06_background_entrezID.txt"), + header = FALSE) + modules <- rep("not_significant", nrow(background)) + modules[which(background$V1 %in% genes)] <- "significant" + + # enrichment + GOenrichment <- enrichmentAnalysis( + classLabels = modules, + identifiers = background$V1, + refCollection = GOcoll, + useBackground = "given", + nBestDataSets = length(GOcoll$dataSets), + # threshold = 0.1, + # thresholdType = "Bonferroni", + getOverlapEntrez = TRUE, + getOverlapSymbols = TRUE, + ignoreLabels = "not_significant", + maxReportedOverlapGenes = 500 + ) + + enrichmentTable <- GOenrichment$enrichmentTable + + return(enrichmentTable) +} + +GOcollection <- buildGOcollection(organism = "mouse") +list_GO <- list() +# GO.BPcollection = subsetCollection(GOcollection, tags = "GO.BP") +for (reg in regions){ + + go_enr_unique <- go_enrichment_all(list_reg[[reg]], GOcollection, TRUE) + # go_enr_all <- go_enrichment_all(list_reg[[reg]], GOcollection, FALSE) + list_GO[[reg]] <- go_enr_unique + +} + + +# 4. Plot GO terms +df_all <- bind_rows(list_GO, .id="region") +for (reg in regions){ + + df_reg <- list_GO[[reg]] %>% + filter(nCommonGenes >= 10, pValue <= 0.1) %>% + group_by(inGroups) %>% slice_min(order_by = pValue, n = 10) + + df <- df_all[df_all$dataSetName %in% df_reg$dataSetName,] + df$dataSetName <- sapply(df$dataSetName, function(x) str_trunc(x, 45, "right")) + df$dataSetName <- factor(df$dataSetName, levels = rev(reorder(df$dataSetName[df$region==reg], df$pValue[df$region==reg]))) + df$region <- factor(df$region, levels = c("AMY", "CER", "PFC", "PVN", "vDG", "dDG", "vCA1", "dCA1")) + df$inGroups <- factor(df$inGroups) + levels(df$inGroups) <- c("Biological Process", "Cellular Components", "Molecular Function") + + # Plot results (plotted pvalues are not adjusted for multiple testing) + df <- df %>% + arrange(desc(dataSetName)) + print(ggplot(df, aes(x=dataSetName, y = -log10(pValue), fill = region)) + + geom_bar(position = position_dodge2(reverse=TRUE), stat="identity") + + geom_hline(yintercept = -log10(0.1),linetype="dashed", color = "red") + + coord_flip() + + xlab("GOterm") + + ggtitle(paste0("GO terms enriched for diff. co-expressed genes only in ",reg, " (Top 10 each)")) + + facet_wrap(~inGroups, scales="free") + + theme(axis.text.y = element_text(size = 10))) + ggsave(filename = paste0(basepath, folder_plots, "/02_CoExp_Kimono/03_AnalysisFuncoup/comparisonRegions/", + "06_",reg,"_GOterms_unique_nodedegree0.5.png"), + width = 13, height = 7) +} + + diff --git a/03_CoExp_Analysis/07_singleRegion_comparisonNetworkDE.R b/03_CoExp_Analysis/07_singleRegion_comparisonNetworkDE.R new file mode 100644 index 0000000..877a765 --- /dev/null +++ b/03_CoExp_Analysis/07_singleRegion_comparisonNetworkDE.R @@ -0,0 +1,168 @@ +################################################## +## Project: DexStim Mouse Brain +## Date: 08.01.2020 +## Author: Nathalie +################################################## +# Comparison of important and region specific genes +# according to DE and network analysis (nodebetweenness) + +library(org.Mm.eg.db) +library(data.table) +library(ggplot2) +library(dplyr) +library(eulerr) +library(gridExtra) +library(grid) +library(igraph) +library(RCy3) + +basepath <- "~/Documents/ownCloud/DexStim_RNAseq_Mouse/" +regions <- c("AMY", "CER", "dCA1", "dDG", "PFC", "PVN", "vCA1", "vDG") +beta_cutoff <- 0.01 + + +### ANALYSIS ----------------------------- + +# 1. Read data from all regions ---------- +# 1a. DE tables + +list_de <- list() +for (reg in regions){ + res <- read.table(file=paste0(basepath, "tables/02_", reg, + "_deseq2_Dex_1_vs_0_lfcShrink.txt"),sep="\t") + res <- res[res$padj <= 0.1,] + res$ensembl_id <- rownames(res) + # res$padj[which(is.na(res$padj))] <- 1 + res$gene_symbol <- mapIds(org.Mm.eg.db, keys = res$ensembl_id, + keytype = "ENSEMBL", column="SYMBOL") + list_de[[reg]] <- res +} + +# # 1b. Network tables +# list_net <- list() +# for (reg in regions){ +# res <- fread(paste0(basepath, "tables/coExpression_kimono/03_AnalysisFuncoup/", +# "04_", reg, "_funcoup_differential_nodebetweennessNorm_betacutoff", +# beta_cutoff, ".csv")) +# res <- res[res$nodebetweenness_norm>=1.0 & ! is.na(res$nodebetweenness_norm)] +# list_net[[reg]] <- res +# } +# +# +# +# # 2. Venn/Euler Plot per region --------------- +# list_euler <- list() +# +# for (reg in regions){ +# +# list1 <- list(list_de[[reg]]$ensembl_id, +# list_net[[reg]]$ensembl_id) +# names(list1) <- c("Diff. exp. genes", +# "Diff. co-exp. genes") +# list_euler[[reg]] <- plot(euler(list1, shape = "ellipse"), +# labels = list(cex = 1.0), quantities = list(cex = 1.0), +# main = paste0(reg)) +# +# } +# +# grid.arrange(grobs = list_euler, ncol = 4, +# top = "Comparison of DE genes and top network genes") + + + + + +### ANALYZE NEIGHBOURS OF DE GENES IN NETWORK ####################### + +# 1b. Network tables +list_diff <- list() +for (reg in regions){ + res <- fread(paste0(basepath, "tables/coExpression_kimono/03_AnalysisFuncoup/", + "04_singleRegion_", reg, "_filtered_diffNetwork.csv")) + # res <- res[res$nodebetweenness_norm>=1.0 & ! is.na(res$nodebetweenness_norm)] + # res$entrez <- mapIds(org.Mm.eg.db, keys = res$ensembl_id, + # keytype = "ENSEMBL", column="ENTREZID") + list_diff[[reg]] <- res +} + +list_base <- list() +for (reg in regions){ + res <- fread(paste0(basepath, "tables/coExpression_kimono/03_AnalysisFuncoup/", + "04_singleRegion_", reg, "_filtered_baselineNetwork.csv")) + # res <- res[res$nodebetweenness_norm>=1.0 & ! is.na(res$nodebetweenness_norm)] + # res$entrez <- mapIds(org.Mm.eg.db, keys = res$ensembl_id, + # keytype = "ENSEMBL", column="ENTREZID") + list_base[[reg]] <- res +} + +# 1c. Network nodebetweeness tables +list_net_base <- list() +list_net_diff <- list() +for (reg in regions){ + res <- fread(paste0(basepath, "tables/coExpression_kimono/03_AnalysisFuncoup/", + "04_", reg, "_funcoup_differential_nodebetweennessNorm_betacutoff", beta_cutoff, ".csv")) + list_net_diff[[reg]] <- res + res <- fread(paste0(basepath, "tables/coExpression_kimono/03_AnalysisFuncoup/", + "03_", reg, "_funcoup_baseline_nodebetweennessNorm_betacutoff", beta_cutoff, ".csv")) + list_net_base[[reg]] <- res +} + + + +# 2. Subset networks to DE genes +for (reg in regions){ + # norm_nodebetweenness is NA whenever prior nodebetweenness < 10000 + # (our definition) + de_genes <- data.frame("ensembl_id" = list_de[[reg]]$ensembl_id) %>% + left_join(list_net_diff[[reg]], by = "ensembl_id", ) %>% + left_join(list_net_base[[reg]], by = c("ensembl_id", "gene_symbol"), + suffix = c(".diff", ".base")) + de_genes <- as_tibble(de_genes) + + # identify baseline neighbours of de_genes using igraph + actors <- data.frame(name=unique(c(list_base[[reg]]$target, + list_base[[reg]]$predictor, + de_genes$ensembl_id))) + relations <- data.frame(from=list_base[[reg]]$target, + to=list_base[[reg]]$predictor) + ig <- graph_from_data_frame(relations, directed=FALSE, vertices=actors) + ig <- simplify(ig) + # add baseline neighbors in column + de_genes$neighbors.base <- sapply(de_genes$ensembl_id, + function(x) names(unlist(neighbors(ig, x))) ) + de_genes$nr_neigh_de.base <- sapply(de_genes$neighbors.base, + function(x) length(intersect(x, de_genes$ensembl_id))) + de_genes$nr_neigh_notde.base <- sapply(de_genes$neighbors.base, + function(x) length(setdiff(x, de_genes$ensembl_id))) + de_genes$neighbors.base <- sapply(de_genes$neighbors.base, function(x) if(!length(x) == 0) + mapIds(org.Mm.eg.db, keys = x, keytype = "ENSEMBL", column = "SYMBOL")) + de_genes$neighbors.base <- sapply(de_genes$neighbors.base, function(x) toString(x)) + + + # identify differential neighbours of de_genes using igraph + actors <- data.frame(name=unique(c(list_diff[[reg]]$target, + list_diff[[reg]]$predictor, + de_genes$ensembl_id))) + relations <- data.frame(from=list_diff[[reg]]$target, + to=list_diff[[reg]]$predictor) + ig <- graph_from_data_frame(relations, directed=FALSE, vertices=actors) + ig <- simplify(ig) + # add differential neighbors in column + de_genes$neighbors.diff <- sapply(de_genes$ensembl_id, + function(x) names(unlist(neighbors(ig, x))) ) + de_genes$nr_neigh_de.diff <- sapply(de_genes$neighbors.diff, + function(x) length(intersect(x, de_genes$ensembl_id))) + de_genes$nr_neigh_notde.diff <- sapply(de_genes$neighbors.diff, + function(x) length(setdiff(x, de_genes$ensembl_id))) + de_genes$neighbors.diff <- sapply(de_genes$neighbors.diff, function(x) if(!length(x) == 0) + mapIds(org.Mm.eg.db, keys = x, keytype = "ENSEMBL", column = "SYMBOL")) + de_genes$neighbors.diff <- sapply(de_genes$neighbors.diff, function(x) toString(x)) + + # move gene symbols to second column + de_genes <- de_genes %>% + select(ensembl_id, gene_symbol, everything()) + + # write to file + fwrite(de_genes, file = paste0(basepath, "tables/coExpression_kimono/03_AnalysisFuncoup/", + "07_", reg, "_neighbors_DEgenes.csv")) +} \ No newline at end of file diff --git a/03_CoExp_Analysis/08_multipleRegions_funcoup_focusHubGnes.R b/03_CoExp_Analysis/08_multipleRegions_funcoup_focusHubGnes.R new file mode 100644 index 0000000..e1bfe07 --- /dev/null +++ b/03_CoExp_Analysis/08_multipleRegions_funcoup_focusHubGnes.R @@ -0,0 +1,293 @@ +################################################## +## Project: DexStim Mouse Brain +## Date: 21.01.2020 +## Author: Nathalie +################################################## +# Analyze multitissue network +# for comparison with single tissue networks + +library(data.table) +library(dplyr) +library(ggplot2) +library(igraph) +library(eulerr) +library(org.Mm.eg.db) + +basepath <- "~/Documents/ownCloud/DexStim_RNAseq_Mouse/" +regions <- c("AMY", "CER", "dCA1", "dDG", "PFC", "PVN", "vCA1", "vDG") +beta_cutoff <- 0.01 +padj_cutoff <- 0.01 +rsquared_cutoff <- 0.1 + +### FUNCTIONS ------------------------------------- + +# function to read all files from list +readFiles_concat <- function(file_list){ + + # initialize empty data frame + dataset <- data.frame() + + # read each file from list and append to data frame + for (i in 1:length(file_list)){ + temp_data <- fread(file_list[i]) + dataset <- rbindlist(list(dataset, temp_data), use.names = T) + } + + return(dataset) +} + +# Z-score (z_ij) for the differential analysis between gene i and j +z_score <- function(beta_t, beta_c, se_t, se_c){ + z <- (beta_t - beta_c)/ + sqrt((se_t)^2 + (se_c)^2) +} + +# Plot changes in ranks of nodes with highest betweenness +plotRanks <- function(a, b, labels = TRUE, labels.offset=0.1, arrow.len=0.1) +{ + old.par <- par(mar=c(1,1,1,1)) + + # Find the length of the vectors + len.1 <- length(a) + len.2 <- length(b) + + # Plot two columns of equidistant points + plot(rep(1, len.1), 1:len.1, pch=20, cex=0.8, + xlim=c(0, 3), ylim=c(0, max(len.1, len.2)), + axes=F, xlab="", ylab="") # Remove axes and labels + points(rep(2, len.2), 1:len.2, pch=20, cex=0.8) + + # Put labels next to each observation + if (labels){ + text(rep(1-labels.offset, len.1), 1:len.1, a) + text(rep(2+labels.offset, len.2), 1:len.2, b) + } + + # Now we need to map where the elements of a are in b + # We use the match function for this job + a.to.b <- match(a, b) + + # Now we can draw arrows from the first column to the second + arrows(rep(1.02, len.1), 1:len.1, rep(1.98, len.2), a.to.b, + length=arrow.len, angle=20) + par(old.par) +} + +# subset network to only one brain region +subset_network <- function(network, region){ + + # find regions that have connections to brain region + select_genes <- network$target[network$predictor == region] + + # subset network data to those targets with connection to region + network <- network %>% + filter(target %in% select_genes) %>% + filter(startsWith(predictor, "ENSMUS")) +} + + +### ANALYSIS --------------------------------------- + +# 1. Prior and network +funcoup_prior <- fread(file = paste0(basepath, "data/kimono_input/prior_expr_funcoup_mm.csv")) +nodes_prior <- unique(c(funcoup_prior$Gene_A, funcoup_prior$Gene_B)) +relations_prior <- data.frame(from = funcoup_prior$Gene_A, + to = funcoup_prior$Gene_B) +g_prior <- graph_from_data_frame(relations_prior, directed=FALSE, vertices=nodes_prior) +# Calculate nodedegrees of prior +nodedegree_prior <- sort(igraph::degree(g_prior), decreasing = TRUE) +# Calculate nodebetweenness of prior +# nodebetweenness_prior <- betweenness(g_prior, directed = FALSE) # node betweenness: number of shortest paths going through a node +# saveRDS(nodebetweenness_prior, file = paste0(basepath, "data/workspaces/nodebetweenness_funcoup.rds")) +nodebetweenness_prior <- sort(readRDS(paste0(basepath, "data/workspaces/nodebetweenness_funcoup.rds")), decreasing = TRUE) +top100genes_prior <- names(nodebetweenness_prior[1:100]) + + +# 2a. Read kimono expression networks +dex0_files <- list.files(path = file.path(basepath, "tables/coExpression_kimono"), + pattern = paste0("04\\_multipleRegions\\_funcoup\\_dex0\\_SE\\_.*\\.csv"), + full.names = TRUE) +dex1_files <- list.files(path = file.path(basepath, "tables/coExpression_kimono"), + pattern = paste0("04\\_multipleRegions\\_funcoup\\_dex1\\_SE\\_.*\\.csv"), + full.names = TRUE) + +data_dex0 <- readFiles_concat(dex0_files) +data_dex1 <- readFiles_concat(dex1_files) + +# 2b. Join Base and Dex data frame +data <- inner_join(data_dex0, data_dex1, + by = c("target", "predictor"), + suffix = c(".base", ".dex")) +dex_notBase <- anti_join(data_dex1, data_dex0, by = c("target", "predictor")) +head(data) +rm(data_dex0) +rm(data_dex1) + + +# 3. Remove interactions with very low r squared values & intercept & SVs +data <- data %>% + filter(overall_rsq.base >= rsquared_cutoff, overall_rsq.dex >= rsquared_cutoff) %>% + filter(predictor != '(Intercept)') +data <- data[!startsWith(data$predictor, "SV"),] +# remove duplicated interactions (mistake made when separating nodes into chunks) +data <- data %>% + distinct(target, predictor, .keep_all = TRUE) + + +# 4a. Calculate z scores for interactions that are left +data <- mutate(data, z = z_score(beta_mean.dex,beta_mean.base, beta_stderr.dex, beta_stderr.base)) +hist(data$beta_mean.base) + +# 4b. Keep only interactions that have at least in one network a beta value > cutoff +data <- data %>% + mutate(diff = (abs(beta_mean.base) > beta_cutoff | abs(beta_mean.dex) > beta_cutoff)) +data_diff1 <- data %>% + filter(diff) +# calculate p-value for z-score +data_diff1$p_diff <- 2*pnorm(-abs(data_diff1$z)) +data_diff1$p_adj <- p.adjust(data_diff1$p_diff, method = "fdr") + + +# 5. Create diff network corresponding to beta cutoff +data_diff <- data_diff1 %>% + filter(p_adj <= padj_cutoff) +head(data_diff[,c("beta_mean.base", "beta_stderr.base", "beta_mean.dex", "beta_stderr.dex", "z", "p_adj")], 20) + +# Nodes in network +node_vec <- unique(c(data_diff$target, data_diff$predictor)) +node_type <- startsWith(node_vec, "ENSMUS") +names(node_type) <- node_vec + +# Edges in network +relations <- data.frame(from=data_diff$target, + to=data_diff$predictor, + value=data_diff$z, + performance=data_diff$p_adj) +# relations <- data.frame(from=data_diff$target, +# to=data_diff$predictor) +g_diff <- graph_from_data_frame(relations, directed=FALSE, vertices=node_vec) +# does graph contain multiple edges with same start and endpoint or loop edges +is_simple(g_diff) +g_diff <- simplify(g_diff) # check the edge attribute parameter + + +# Calculate nodedegree +nodedegree <- igraph::degree(g_diff) +nodedegree <- sort(nodedegree, decreasing = TRUE) + +# Calculate nodebetweenness +nodebetweenness <- betweenness(g_diff, directed = FALSE) # node betweenness: number of shortest paths going through a node +nodebetweenness <- sort(nodebetweenness, decreasing = TRUE) # same when values are included in g_diff or not +top100genes <- names(nodebetweenness[1:100]) + +# Plot network properties in one plot +data.frame("nodedegree" = nodedegree, + "nodebetweenness" = nodebetweenness, + "gene" = node_type[names(nodedegree)]) %>% + tidyr::gather(key = "property", value = "value", nodedegree:nodebetweenness) %>% + ggplot(aes(y = value, fill = gene)) + + geom_boxplot() + + facet_wrap(~property, scales = "free") + + theme_light() + + theme(legend.title = element_blank(), legend.text = element_text(size = 12), + axis.title.y = element_text(size = 12), axis.text.y = element_text(size = 10), + axis.text.x = element_text(size = 10), strip.text.x = element_text(size = 12)) + + scale_fill_discrete(labels = c("tissues", "genes")) + + ggtitle(paste0("Differential expression network for all brain regions (", + gorder(g_diff), " nodes, ", gsize(g_diff), " edges)")) +ggsave(filename = paste0(basepath, "figures/02_CoExp_Kimono/03_AnalysisFuncoup/multipleRegions/", + "08_funcoup_focusHubGenes_diffNetwork_betacutoff",beta_cutoff,".png"), + width = 8, height = 6) + + +# ANALYZE EACH BRAIN REGION ------------------------ + +for (region in regions){ + + # A1. Get single region network with top genes + # Subset network + net_region <- subset_network(data_diff, region) + + # Nodes in network + node_vec <- unique(c(net_region$target, net_region$predictor)) + + # Edges in network + relations <- data.frame(from=net_region$target, + to=net_region$predictor, + value=net_region$z, + performance=net_region$p_adj) + g_diff_reg <- graph_from_data_frame(relations, directed=FALSE, vertices=node_vec) + # does graph contain multiple edges with same start and endpoint or loop edges + is_simple(g_diff_reg) + g_diff <- simplify(g_diff_reg) # check the edge attribute parameter + + # Calculate nodedegree + nodedegree_reg <- igraph::degree(g_diff_reg) + nodedegree_reg <- sort(nodedegree_reg, decreasing = TRUE) + + # Calculate nodebetweenness + nodebetweenness_reg <- betweenness(g_diff_reg, directed = FALSE) # node betweenness: number of shortest paths going through a node + nodebetweenness_reg <- sort(nodebetweenness_reg, decreasing = TRUE) # same when values are included in g_diff or not + top100genes_reg <- names(nodebetweenness_reg[1:100]) + + + # A2. Compare extracted single region network with kimono single region network + # Read nodedegree and nodebetweenness + nodedegree_single <- fread(file = paste0(basepath, "/tables/coExpression_kimono/03_AnalysisFuncoup/", + "04_",region,"_funcoup_differential_nodedegreesNorm_betacutoff",beta_cutoff,".csv"), + quote = FALSE) + nodebetweenness_single <- fread(file = paste0(basepath, "/tables/coExpression_kimono/03_AnalysisFuncoup/", + "04_",region,"_funcoup_differential_nodebetweennessNorm_betacutoff",beta_cutoff,".csv"), + quote = FALSE) + + # A3. Comparison nodedegree + # Rank nodedegree + nodedegree_multi_rank <- rank(-nodedegree_reg[nodedegree_single$ensembl_id]) + nodedegree_single_rank <- rank(-nodedegree_single$nodedegree) + names(nodedegree_single_rank) <- nodedegree_single$ensembl_id + + # Correlation between ranks of nodedegrees + cor_nodedegree <- cor.test(nodedegree_single_rank, nodedegree_multi_rank, + method = "spearman") + + data.frame("single" = nodedegree_single$nodedegree, + "multi" = nodedegree_reg[nodedegree_single$ensembl_id]) %>% + ggplot(aes(x=single, y=multi)) + + # geom_point(size=1,alpha = 0.1) + # geom_hex() + + geom_bin2d() + + xlab("nodedegree single tissue network") + + ylab("nodedegree multi tissue network") + + ggtitle(paste0("Nodedegree in single and multi tissue network (Correlation between ranks: ", + round(cor_nodedegree$estimate, digits = 2),")")) + ggsave(filename = paste0(basepath, "figures/02_CoExp_Kimono/03_AnalysisFuncoup/multipleRegions/", + "08_",region,"_single_multi_funcoup_correlationNodedegree_betacutoff",beta_cutoff,".png"), + width = 9, height = 8) + + + # A4. Comparison nodebetweenness + # Rank nodebetweenness + nodebetween_multi_rank <- rank(-nodebetweenness_reg[nodebetweenness_single$ensembl_id]) + nodebetween_single_rank <- rank(-nodebetweenness_single$nodebetweenness) + names(nodebetween_single_rank) <- nodebetweenness_single$ensembl_id + + # Correlation between ranks of nodedegrees + cor_nodebetween <- cor.test(nodebetween_single_rank, nodebetween_multi_rank, + method = "spearman") + + data.frame("single" = nodebetweenness_single$nodebetweenness, + "multi" = nodebetweenness_reg[nodebetweenness_single$ensembl_id]) %>% + ggplot(aes(x=single, y=multi)) + + # geom_point(size=1,alpha = 0.1) + # geom_hex() + + geom_bin2d() + + xlab("nodebetweenness single tissue network") + + ylab("nodebetweenness multi tissue network") + + ggtitle(paste0("Nodebetweenness in single and multi tissue network (Correlation between ranks: ", + round(cor_nodedegree$estimate, digits = 2),")")) + ggsave(filename = paste0(basepath, "figures/02_CoExp_Kimono/03_AnalysisFuncoup/multipleRegions/", + "08_",region,"_single_multi_funcoup_correlationNodebetweenness_betacutoff",beta_cutoff,".png"), + width = 9, height = 8) + +} + diff --git a/03_CoExp_Analysis/09_tablesAnthi_kimono.R b/03_CoExp_Analysis/09_tablesAnthi_kimono.R new file mode 100644 index 0000000..b21e873 --- /dev/null +++ b/03_CoExp_Analysis/09_tablesAnthi_kimono.R @@ -0,0 +1,113 @@ +################################################## +## Project: DexStim Mouse Brain +## Date: 15.02.2020 +## Author: Nathalie +################################################## +# Make tables for Anthi with kimono results + +setwd("~/Documents/ownCloud/DexStim_RNAseq_Mouse") + +library(org.Mm.eg.db) +library(dplyr) +library(data.table) +library(anRichment) +library(anRichmentMethods) + +regions <- c("AMY", "PFC", "PVN", "CER", "vDG", "dDG", "vCA1", "dCA1") +mode <- "differential" + +# 1. read hub gene tables from all regions ---------- + +list_reg <- list() +for (reg in regions){ + if (mode == "differential"){ + res <- fread(file=paste0("tables/coExpression_kimono/03_AnalysisFuncoup/", + "04_", reg, "_funcoup_", mode, "_nodebetweennessNorm_betacutoff0.01.csv")) + } else { + res <- fread(file=paste0("tables/coExpression_kimono/03_AnalysisFuncoup/", + "03_", reg, "_funcoup_", mode, "_nodebetweennessNorm_betacutoff0.01.csv")) + } + res <- res %>% + filter(nodebetweenness_norm >= 1) + res$entrez <- mapIds(org.Mm.eg.db, keys = res$ensembl_id, + keytype = "ENSEMBL", column="ENTREZID") + list_reg[[reg]] <- res +} + + +# 2. check uniqueness of hub genes --------------- + +for (reg in regions){ + index_reg <- which(regions == reg) + df <- bind_rows(list_reg[-index_reg], .id="region") + # find regions where gene is also hub + list_reg[[reg]]$regions_hub <- sapply(list_reg[[reg]]$ensembl_id, + function(x) paste(df[df$ensembl_id == x,]$region, collapse = " ")) + # boolean if gene is hub uniquely in this region + list_reg[[reg]]$unique_hub <- sapply(list_reg[[reg]]$regions_hub, + function(x) x == "") +} + + +# 3. GO enrichment for the genes of each region ------------------ + +go_enrichment_all <- function(df_reg, GOcoll, unique, background){ + if (unique){ + genes <- df_reg$entrez[df_reg$unique_hub] + } else { + genes <- df_reg$entrez + } + modules <- rep("not_significant", length(background)) + modules[which(background %in% genes)] <- "significant" + + # enrichment + GOenrichment <- enrichmentAnalysis( + classLabels = modules, + identifiers = background, + refCollection = GOcoll, + useBackground = "given", + nBestDataSets = length(GOcoll$dataSets), + # threshold = 0.1, + # thresholdType = "Bonferroni", + getOverlapEntrez = TRUE, + getOverlapSymbols = TRUE, + ignoreLabels = "not_significant", + maxReportedOverlapGenes = 500 + ) + + enrichmentTable <- GOenrichment$enrichmentTable %>% + filter(nCommonGenes > 10, pValue <= 0.1) + + return(enrichmentTable) +} + +GOcollection <- buildGOcollection(organism = "mouse") +# GO.BPcollection = subsetCollection(GOcollection, tags = "GO.BP") +background <- fread(file = "data/kimono_input/prior_expr_funcoup_mm.csv") +background <- unique(c(background$Gene_A, background$Gene_B)) +background <- mapIds(org.Mm.eg.db, keys = background, + keytype = "ENSEMBL", column="ENTREZID") +for (reg in regions){ + + go_enr_unique <- go_enrichment_all(list_reg[[reg]], GOcollection, TRUE, background) + fwrite(go_enr_unique, file = paste0("tables/coExpression_kimono/03_AnalysisFuncoup/", + "/09_", reg, "_GOterms_unique_", mode, ".csv")) + go_enr_all <- go_enrichment_all(list_reg[[reg]], GOcollection, FALSE, background) + fwrite(go_enr_all, file = paste0("tables/coExpression_kimono/03_AnalysisFuncoup/", + "/09_", reg, "_GOterms_all_", mode, ".csv")) + + list_reg[[reg]]$GOterms_unique <- sapply(list_reg[[reg]]$entrez, + function(x) paste(go_enr_unique$dataSetName[which(str_detect(go_enr_unique$overlapGenes, x))], collapse="|")) + list_reg[[reg]]$GOterms_all <- sapply(list_reg[[reg]]$entrez, + function(x) paste(go_enr_all$dataSetName[which(str_detect(go_enr_all$overlapGenes, x))], collapse="|")) +} + + +# 4. Print df of each brain region to file ------------------- + +for (reg in regions){ + # list_reg[[reg]]$ensembl_id <- NULL + fwrite(list_reg[[reg]], file = paste0("tables/coExpression_kimono/03_AnalysisFuncoup/", + "/09_", reg, "_hubGenes_unique_GOterms_", mode, ".csv")) +} + diff --git a/03_CoExp_Analysis/10_GOenrichment_comparisonHubAndDE.R b/03_CoExp_Analysis/10_GOenrichment_comparisonHubAndDE.R new file mode 100644 index 0000000..400a7a7 --- /dev/null +++ b/03_CoExp_Analysis/10_GOenrichment_comparisonHubAndDE.R @@ -0,0 +1,186 @@ +################################################## +## Project: DexStim Mouse Brain +## Date: 05.10.2021 +## Author: Nathalie +################################################## +# Compare GO enrichment of unique DE and hub genes + +library(data.table) +library(dplyr) +library(ggplot2) +library(org.Mm.eg.db) +library(clusterProfiler) + +basepath <- "~/Documents/ownCloud/DexStim_RNAseq_Mouse/" +regions <- c("AMY", "CER", "dCA1", "dDG", "PFC", "PVN", "vCA1", "vDG") +beta_cutoff <- 0.01 + +folder_plots <- paste0("figures") +folder_tables <- paste0("tables") + + +# 1. Read DE and hub genes from all regions and background ---------- + +# DE genes +list_reg_de <- list() +for (reg in regions){ + res <- read.table(file=paste0(basepath, folder_tables, "/02_", reg, "_deseq2_Dex_1_vs_0_lfcShrink.txt"), + sep="\t", header = TRUE) + res <- res[res$padj <= 0.1,] + res$gene_symbol <- mapIds(org.Mm.eg.db, keys = res$Ensembl_ID, + keytype = "ENSEMBL", column="SYMBOL") + res$entrez <- mapIds(org.Mm.eg.db, keys = res$Ensembl_ID, + keytype = "ENSEMBL", column="ENTREZID") + list_reg_de[[reg]] <- res +} + +# hub genes +list_reg_hub <- list() +for (reg in regions){ + res <- fread(paste0(basepath, "tables/coExpression_kimono/03_AnalysisFuncoup/04_", + reg,"_funcoup_differential_nodeBetweennessNorm_betacutoff",beta_cutoff,".csv")) + res <- res[res$nodebetweenness_norm>=1.0 & ! is.na(res$nodebetweenness_norm)] + res$entrez <- mapIds(org.Mm.eg.db, keys = res$ensembl_id, + keytype = "ENSEMBL", column="ENTREZID") + list_reg_hub[[reg]] <- res +} + +# background are all genes in out dataset +background <- read.table(file = paste0(basepath, "tables/06_background_entrezID.txt"), + header = FALSE)[,1] + + +# 2. check uniqueness of DE and hub genes --------------- + +# DE genes +for (reg in regions){ + index_reg <- which(regions == reg) + df <- bind_rows(list_reg_de[-index_reg], .id="region") + list_reg_de[[reg]]$regions_DE <- sapply(list_reg_de[[reg]]$Ensembl_ID, + function(x) paste(df[df$Ensembl_ID == x,]$region, collapse = " ")) + list_reg_de[[reg]]$unique_DE <- sapply(list_reg_de[[reg]]$regions_DE, + function(x) x == "") +} + +# hub genes +for (reg in regions){ + index_reg <- which(regions == reg) + df <- bind_rows(list_reg_hub[-index_reg], .id="region") + list_reg_hub[[reg]]$regions_hub <- sapply(list_reg_hub[[reg]]$ensembl_id, + function(x) paste(df[df$ensembl_id == x,]$region, collapse = " ")) + list_reg_hub[[reg]]$unique_hub <- sapply(list_reg_hub[[reg]]$regions_hub, + function(x) x == "") +} + + +# 2. Plot top GO enrichment of DE and hub genes per brain regions + +#minCount <- 0 +#minCount <- 5 +# minCount <- 10 + +# --> changed to minCount with regard to number of genes in geneset + +for (reg in regions){ + + # Unique DE genes + genes <- list_reg_de[[reg]]$entrez[list_reg_de[[reg]]$unique_DE] + + # GO enrichment for unique DE genes + ego_de <- enrichGO(gene = as.character(genes), + universe = as.character(background), + OrgDb = org.Mm.eg.db, + ont = "BP", + pAdjustMethod = "BH", + pvalueCutoff = 0.01, + qvalueCutoff = 0.05, + # pvalueCutoff = 1, + # qvalueCutoff = 1, + minGSSize = 5, # min number of genes associated with GO term + maxGSSize = 10000, # max number of genes associated with GO term + readable = TRUE)@result + # min number of genes overlapping + min_count <- ceiling(length(genes)*0.15) + ego_de_filt <- ego_de[ego_de$Count >= min_count,] + + + # Unique hub genes + genes <- list_reg_hub[[reg]]$entrez[list_reg_hub[[reg]]$unique_hub] + + # GO enrichment for unique hub genes + ego_hub <- enrichGO(gene = as.character(genes), + universe = as.character(background), + OrgDb = org.Mm.eg.db, + ont = "BP", + pAdjustMethod = "BH", + pvalueCutoff = 0.01, + qvalueCutoff = 0.05, + # pvalueCutoff = 1, + # qvalueCutoff = 1, + minGSSize = 5, # min number of genes associated with GO term + maxGSSize = 10000, # max number of genes associated with GO term + readable = TRUE)@result + # min number of genes overlapping + min_count <- ceiling(length(genes)*0.15) + ego_hub_filt <- ego_hub[ego_hub$Count >= min_count,] + + + # Plot top 10 genes of DE and hub with comparison to each other + + # Top 10 terms for DE genes + ego_de_top10 <- head(ego_de_filt, n = 10) %>% + dplyr::mutate("main" = TRUE) + ego_de_top10_hub <- ego_hub[ego_hub$ID %in% ego_de_top10$ID,] %>% + dplyr::mutate("main" = FALSE) + + ego_de_plot <- bind_rows("de" = ego_de_top10, "hub" = ego_de_top10_hub, + .id = "groups") + + # Top 10 terms for hub genes + ego_hub_top10 <- head(ego_hub_filt, n = 10) %>% + dplyr::mutate("main" = TRUE) + ego_hub_top10_de <- ego_de[ego_de$ID %in% ego_hub_top10$ID,] %>% + dplyr::mutate("main" = FALSE) + + ego_hub_plot <- bind_rows("hub" = ego_hub_top10, "de" = ego_hub_top10_de, + .id = "groups") + + # Combine them into one dataframe + ego_plot <- bind_rows("de" = ego_de_plot, "hub" = ego_hub_plot, + .id = "subplot") + ego_plot$Description <- factor(ego_plot$Description, + levels = unique(ego_plot$Description)) + + # labeller function for facet titles + facet_names <- c( + 'de'="Top 10 GO terms for DE genes", + 'hub'="Top 10 GO terms for hub genes" + ) + + # Plot them all together + print(ggplot(ego_plot, aes(x=Description, y = -log10(p.adjust), + fill = groups, colour = main)) + + geom_bar(position = position_dodge2(reverse=TRUE), stat="identity") + + geom_hline(yintercept = -log10(0.1),linetype="dashed", color = "red") + + scale_colour_manual(values = c("grey", "black"), guide = FALSE) + + scale_fill_manual(name = "Geneset", + values = c("orange", "darkred"), + labels = c("DE genes", "hub genes")) + + coord_flip() + + scale_x_discrete(limits = rev) + + xlab("GOterm") + + ggtitle(paste0("GO terms enriched for DE and hub genes only in ",reg)) + + facet_wrap(~subplot, scales="free", labeller = as_labeller(facet_names)) + + theme(axis.text.y = element_text(size = 10))) + # ggsave(filename = paste0(basepath, folder_plots, "/02_CoExp_Kimono/03_AnalysisFuncoup/comparisonRegions/", + # "10_",reg,"_GOterms_DEandHubGenes_min", minCount,".png"), + # width = 13, height = 7) + ggsave(filename = paste0(basepath, folder_plots, "/02_CoExp_Kimono/03_AnalysisFuncoup/comparisonRegions/", + "10_",reg,"_GOterms_DEandHubGenes.pdf"), + width = 11, height = 7) + + + +} + + diff --git a/06_Shiny/.DS_Store b/06_Shiny/.DS_Store new file mode 100644 index 0000000..b455ec8 Binary files /dev/null and b/06_Shiny/.DS_Store differ diff --git a/06_Shiny/.Rproj.user/.DS_Store b/06_Shiny/.Rproj.user/.DS_Store new file mode 100644 index 0000000..7651473 Binary files /dev/null and b/06_Shiny/.Rproj.user/.DS_Store differ diff --git a/06_Shiny/.Rproj.user/94131CCE/pcs/files-pane.pper b/06_Shiny/.Rproj.user/94131CCE/pcs/files-pane.pper new file mode 100644 index 0000000..7935ff1 --- /dev/null +++ b/06_Shiny/.Rproj.user/94131CCE/pcs/files-pane.pper @@ -0,0 +1,9 @@ +{ + "sortOrder": [ + { + "columnIndex": 2, + "ascending": true + } + ], + "path": "~/Documents/ownCloud/DexStim_RNAseq_Mouse/scripts/06_Shiny" +} \ No newline at end of file diff --git a/06_Shiny/.Rproj.user/94131CCE/pcs/source-pane.pper b/06_Shiny/.Rproj.user/94131CCE/pcs/source-pane.pper new file mode 100644 index 0000000..bc7dc99 --- /dev/null +++ b/06_Shiny/.Rproj.user/94131CCE/pcs/source-pane.pper @@ -0,0 +1,3 @@ +{ + "activeTab": 4 +} \ No newline at end of file diff --git a/06_Shiny/.Rproj.user/94131CCE/pcs/windowlayoutstate.pper b/06_Shiny/.Rproj.user/94131CCE/pcs/windowlayoutstate.pper new file mode 100644 index 0000000..104979b --- /dev/null +++ b/06_Shiny/.Rproj.user/94131CCE/pcs/windowlayoutstate.pper @@ -0,0 +1,14 @@ +{ + "left": { + "splitterpos": 312, + "topwindowstate": "NORMAL", + "panelheight": 743, + "windowheight": 781 + }, + "right": { + "splitterpos": 468, + "topwindowstate": "NORMAL", + "panelheight": 743, + "windowheight": 781 + } +} \ No newline at end of file diff --git a/06_Shiny/.Rproj.user/94131CCE/pcs/workbench-pane.pper b/06_Shiny/.Rproj.user/94131CCE/pcs/workbench-pane.pper new file mode 100644 index 0000000..07157f3 --- /dev/null +++ b/06_Shiny/.Rproj.user/94131CCE/pcs/workbench-pane.pper @@ -0,0 +1,5 @@ +{ + "TabSet1": 0, + "TabSet2": 4, + "TabZoom": {} +} \ No newline at end of file diff --git a/06_Shiny/.Rproj.user/94131CCE/rmd-outputs b/06_Shiny/.Rproj.user/94131CCE/rmd-outputs new file mode 100644 index 0000000..3f2ff2d --- /dev/null +++ b/06_Shiny/.Rproj.user/94131CCE/rmd-outputs @@ -0,0 +1,5 @@ + + + + + diff --git a/06_Shiny/.Rproj.user/94131CCE/saved_source_markers b/06_Shiny/.Rproj.user/94131CCE/saved_source_markers new file mode 100644 index 0000000..2b1bef1 --- /dev/null +++ b/06_Shiny/.Rproj.user/94131CCE/saved_source_markers @@ -0,0 +1 @@ +{"active_set":"","sets":[]} \ No newline at end of file diff --git a/06_Shiny/.Rproj.user/94131CCE/sources/per/t/1532F1BD b/06_Shiny/.Rproj.user/94131CCE/sources/per/t/1532F1BD new file mode 100644 index 0000000..0a412e4 --- /dev/null +++ b/06_Shiny/.Rproj.user/94131CCE/sources/per/t/1532F1BD @@ -0,0 +1,24 @@ +{ + "id": "1532F1BD", + "path": "~/Documents/ownCloud/DexStim_RNAseq_Mouse/scripts/06_Shiny/R/network_multiRegion.R", + "project_path": "R/network_multiRegion.R", + "type": "r_source", + "hash": "4109658605", + "contents": "", + "dirty": false, + "created": 1636637444100.0, + "source_on_save": false, + "relative_order": 5, + "properties": { + "cursorPosition": "219,31", + "scrollLine": "211" + }, + "folds": "", + "lastKnownWriteTime": 1636637543, + "encoding": "UTF-8", + "collab_server": "", + "source_window": "", + "last_content_update": 1636637543028, + "read_only": false, + "read_only_alternatives": [] +} \ No newline at end of file diff --git a/06_Shiny/.Rproj.user/94131CCE/sources/per/t/1532F1BD-contents b/06_Shiny/.Rproj.user/94131CCE/sources/per/t/1532F1BD-contents new file mode 100644 index 0000000..402cda3 --- /dev/null +++ b/06_Shiny/.Rproj.user/94131CCE/sources/per/t/1532F1BD-contents @@ -0,0 +1,494 @@ +# module UI function +networkMultiUI <- function(id, label = "Network Multiple Regions"){ + + ns <- NS(id) + + tagList( + + sidebarPanel( + pickerInput( + inputId = ns("network_multireg"), + label = "Select brain regions", + choices = c("Amygdala" = "AMY", + "Cerebellum" = "CER", + "Dorsal DG of Hippocampus" = "dDG", + "Dorsal CA1 of Hippocampus" = "dCA1", + "Prefrontal Cortex" = "PFC", + "PVN of Hypothalamus" = "PVN", + "Ventral DG of Hippocampus" = "vDG", + "Ventral CA1 of Hippocampus" = "vCA1"), + options = list( + `actions-box` = TRUE, + size = 8, + `selected-text-format` = "count > 3" + ), + multiple = TRUE + ), + searchInput( + inputId = ns("network_gene"), + label = "Enter comma-separated list of genes: ", + placeholder = "e.g. Nr3c1,Fkbp5", + btnSearch = icon("search", class = "fas fa-search", lib = "font-awesome"), + btnReset = icon("remove", class = "fas fa-remove", lib = "font-awesome"), + width = "100%" + ), + fileInput( + inputId = ns("file1"), + label = "Upload file with list of genes", + accept = "text/plain", + buttonLabel = "Upload", + placeholder = "Upload file with list of genes"), + radioButtons( + ns("network_type"), + label = "Select type of network to display:", + choiceNames = list("Baseline", "Differential", "Treatment"), + choiceValues = list("baseline", "diff", "treatment"), + selected = "diff" + ), + radioButtons( + ns("vis_neighbours"), + label = "Include gene neighbourhood", + choiceNames = c("Yes", "No"), + choiceValues = c(TRUE, FALSE), + selected = FALSE + ), + radioButtons( + ns("id_type"), + label = "Select type of gene ID:", + choices = list("Ensembl", "Gene Symbol"), + selected = "Gene Symbol" + ), + checkboxGroupInput( + ns("tableContent"), + label = "Table content:", + choices = list( + "z-scores" = "z", + "p-values" = "p", + "beta values" = "beta" + ), + selected = "z" + ), + downloadButton(ns("download2"),"Download (filtered) table as csv"), + width = 3 + ), + mainPanel( + # visNetwork + fluidPage( + br(), + tags$style(".fa-project-diagram {color:#2980B9}"), + h3(p( + em("Network analysis of gene expression "), + icon("project-diagram", lib = "font-awesome"), + style = "color:black;text-align:center" + )), + hr(), + h4(p("Differential network", style = "color:black;text-align:center")), + visNetworkOutput(ns("network_diff_multi"), height = "600px") + %>% withSpinner(color="#0dc5c1"), + DT::dataTableOutput(ns("network_table")) + ) + ) + + + ) +} + + + +# module server function +networkMultiServer <- function(id) { + moduleServer( + id, + + function(input, output, session) { + + ### MULTI REGION NETWORK ------------------------------- + + # Load data from multiple brain region + network_data <- reactive({ + + req(input$network_multireg) + + # Read data tables from all required regions + list_network <- list() + + for (reg in input$network_multireg){ + file_path <- paste0( + "tables/network/", + "04_singleRegion_", + reg, + "_filtered_", + input$network_type, + "Network.csv" + ) + # Read data + if (input$network_type == "diff") { + data <- + fread(file = file_path) %>% + dplyr::select(target, predictor, beta_mean.base, beta_mean.dex, z, p_adj) + #dplyr::select(target, predictor, z) %>% + #dplyr::rename_with(.fn = ~ reg, .cols = z) + + cols <- colnames(data)[3:6] + data <- data %>% + dplyr::rename_with(.fn = ~paste0(., ".", reg), .cols = all_of(cols) ) + } else { + data <- + fread(file = file_path) %>% + dplyr::select(target, predictor, beta_mean) + # cols <- colnames(data)[3] + data <- data %>% + dplyr::rename_with(.fn = ~ reg, .cols = beta_mean) + # dplyr::rename_with(.fn = ~paste0(., ".", reg), .cols = all_of(cols) ) + } + list_network[[reg]] <- data + } + + data_joined <- list_network %>% + purrr::reduce(full_join, by = c("target","predictor")) + + return(data_joined) + }) + + + # #### TEST + # + # list_network <- list() + # + # for (reg in c("AMY", "CER", "PFC")){ + # file_path <- paste0( + # basepath, + # "tables/coExpression_kimono/03_AnalysisFuncoup/", + # "04_singleRegion_", + # reg, + # "_filtered_", + # "baseline", + # "Network.csv" + # ) + # # Read data + # data <- + # fread(file = file_path) %>% + # #dplyr::select(target, predictor, beta_mean.base, beta_mean.dex, z, p_adj) + # dplyr::select(target, predictor, beta_mean, beta_mean.dex, z, p_adj) + # + # cols <- colnames(data)[3:6] + # #cols <- colnames(data)[3] + # data <- data %>% + # dplyr::rename_with(.fn = ~paste0(., ".", reg), .cols = all_of(cols) ) + # list_network[[reg]] <- data + # } + # + # data_joined <- list_network %>% + # purrr::reduce(full_join, by = c("target","predictor")) + + + # DE genes of required brain regions + de_genes <- reactive({ + + list_de <- list() + # Read and subset DE genes for coloring + for (reg in input$network_multireg){ + df_de <- fread(paste0("tables/de/02_", + reg,"_deseq2_Dex_1_vs_0_lfcShrink.txt")) + na_indices <- which(is.na(df_de$padj)) + df_de$padj[na_indices] <- 1 + df_de <- df_de[df_de$padj <= 0.1,] + + df_de <- df_de %>% + dplyr::select(Ensembl_ID, padj) %>% + dplyr::rename_with(.fn = ~ reg, .cols = padj) + + list_de[[reg]] <- df_de + } + + de_joined <- list_de %>% + purrr::reduce(full_join, by = c("Ensembl_ID")) + + return(de_joined) + }) + + + # hub genes of required brain regions + hub_genes <- reactive({ + + list_hub <- list() + # Read and subset hub genes for coloring + for (reg in input$network_multireg) { + df_hub <- + fread( + paste0( + "tables/network/04_", + reg, + "_funcoup_differential", + "_nodebetweennessNorm_betacutoff0.01.csv" + ) + ) %>% + filter(nodebetweenness_norm >= 1) %>% + dplyr::select(ensembl_id, nodebetweenness_norm) %>% + dplyr::rename_with(.fn = ~ reg, .cols = nodebetweenness_norm) + + list_hub[[reg]] <- df_hub + } + + hub_joined <- list_hub %>% + purrr::reduce(full_join, by = c("ensembl_id")) + + return(hub_joined) + }) + + + # input genes + input_genes <- reactive({ + + if (isTruthy(input$network_gene)){ + genes <- input$network_gene + genes <- stringr::str_split(genes, ",\\s?")[[1]] + genes <- reformat_genes(genes) + } else if (isTruthy(input$file1)){ + genes <- read.csv(input$file1$datapath, header = FALSE)$V1 + genes <- reformat_genes(genes) + } + }) + + + # bring input genes to correct format + reformat_genes <- function(list_genes){ + if (!startsWith(list_genes[1], "ENSMUSG")){ + format_gene <- sapply(list_genes, stringr::str_to_title) + format_gene <- mapIds(org.Mm.eg.db, keys = format_gene, column = "ENSEMBL", keytype = "SYMBOL") + ids_na <- names(format_gene)[which(is.na(format_gene))] + #print(ids_na) + if (length(ids_na) > 0) { + showNotification(paste("No Ensembl ID found for following genes: ", + ids_na), type = "message", duration = 5) + } + format_gene <- format_gene[which(!is.na(format_gene))] + } else { + format_gene <- list_genes + } + return(format_gene) + } + + + # Network visualization + output$network_diff_multi <- renderVisNetwork({ + + req(input_genes()) + req(network_data()) + req(de_genes()) + req(hub_genes()) + + # Get Nodes and Edges + network <- network(network_data(), + de_genes(), + hub_genes(), + input_genes(), + input$network_type, + input$id_type, + input$vis_neighbours) + + visNetwork(network$nodes, network$edges) %>% + visEdges(arrows = "to") %>% + visOptions(highlightNearest = list(enabled = T, degree = 1, hover = T)) %>% + visLegend(addEdges = network$ledges, #addNodes = network$lnodes, + useGroups = FALSE) %>% + visIgraphLayout() + }) + + + # Data table with network results + output$network_table <- DT::renderDataTable({ + + network_table() %>% + datatable(filter = list(position = 'top')) + }, server = TRUE) # FALSE to enable download of all pages with button + + + # Download of (filtered) network table + output$download2 <- downloadHandler( + filename = function() { + paste0("network_dexStimMouse_multiRegion.csv") + }, + content = function(file) { + indices <- input$network_table_rows_all + write.csv(network_table()[indices, ], file) + } + ) + + + # prepare network for visualization + network <- function(data, de_genes, hub_genes, input_genes, mode, id_type, neighbours){ + + # input genes + print(input_genes) + + # filter data + data <- data %>% + dplyr::rename(from = predictor, to = target) %>% + dplyr::filter(from != to) + data <- subset_network(data, input_genes, neighbours) + + # color palettes for edges and nodes + # edge palette --> assumes that data stores only target, predictor and z scores + # edge_colors <- brewer.pal(ncol(data)-1, "Blues")[2:(ncol(data)-1)] + + # Edges + if(mode == "baseline" | mode == "treatment"){ + # color palettes for edges and nodes + edge_colors <- brewer.pal(ncol(data)-1, "Blues")[2:(ncol(data)-1)] + + # count regions per diff edge that are not na + data$edge_reg <- apply(data[,3:ncol(data)], 1, function(x) names(which(!is.na(x))) ) + data$count_reg <- sapply(data$edge_reg, length) + data$title <- paste0("
Connection in: ", sapply(data$edge_reg, paste, collapse = ", "), + "
") + # assign color to edges according to number of regions + data$c <- sapply(data$count_reg, function(x) edge_colors[x]) + + print(head(data)) + # df for edges + relations <- data.table(from=data$from, + to=data$to, + #beta = data$beta_mean, + color = c, + title = data$title) + # df for edge legend + ledges <- data.frame(color = edge_colors, + label = 1:(ncol(data)-6), + font.align = "top") + } else { + # remove columns + data <- data %>% dplyr::select(-contains("beta"), -contains("p_adj")) + + # color palettes for edges and nodes + # edge palette --> assumes that data stores only target, predictor and z scores + edge_colors <- brewer.pal(ncol(data)-1, "Blues")[2:(ncol(data)-1)] + + # count regions per diff edge that are not na + data$edge_reg <- apply(data[,3:ncol(data)], 1, function(x) names(which(!is.na(x))) ) + data$count_reg <- sapply(data$edge_reg, length) + data$title <- paste0("Diff. co-expressed in: ", sapply(data$edge_reg, paste, collapse = ", "), + "
") + # assign color to edges according to number of regions + data$c <- sapply(data$count_reg, function(x) edge_colors[x]) + + print(head(data)) + # df for edges + relations <- data.table(from=data$from, + to=data$to, + #z = data$z, + #p_adj = data$p_adj, + color = data$c, + title = data$title) + # df for edge legend + ledges <- data.frame(color = edge_colors, + label = 1:(ncol(data)-6), + font.align = "top") + } + print(nrow(relations)) + + + # Nodes + # get unique nodes with correct id + nodes <- data.frame("id" = unique(union( + c(relations$from, relations$to), + input_genes + )), stringsAsFactors = FALSE) + if (id_type == "Gene Symbol"){ + print(nodes$id) + nodes$label <- mapIds(org.Mm.eg.db, keys = nodes$id, + column = "SYMBOL", keytype = "ENSEMBL") + } else { + nodes$label <- nodes$id + } + + # count regions where gene is DE or/and hub + nodes_de <- left_join(x = nodes, y = de_genes, + by = c("id" = "Ensembl_ID")) + nodes_hub <- left_join(x = nodes, y = hub_genes, + by = c("id" = "ensembl_id")) + + nodes$de_reg <- apply(nodes_de[,3:ncol(nodes_de)], 1, + function(x) names(which(!is.na(x))) ) + nodes$de_count <- sapply(nodes$de_reg, length) + + nodes$hub_reg <- apply(nodes_hub[,3:ncol(nodes_hub)], 1, + function(x) names(which(!is.na(x))) ) + nodes$hub_count <- sapply(nodes$hub_reg, length) + + # set color according to DE/hub status + nodes$color <- rep("darkblue", nrow(nodes_de)) + nodes$color[nodes$de_count > 0] <- "orange" + nodes$color[nodes$hub_count > 0] <- "darkred" + nodes$color[nodes$de_count > 0 & nodes$hub_count > 0] <- "purple" + + #nodes$opacity <- nodes$de_count + nodes$hub_count + #nodes$opacity <- nodes$opacity/max(nodes$opacity) + + nodes$title <- paste0("",nodes$label,"",
+ "
DE in regions: ", sapply(nodes$de_reg, paste, collapse = ", "),
+ "
Hub in regions: ", sapply(nodes$hub_reg, paste, collapse = ", "),"
Connection in: ", sapply(data$edge_reg, paste, collapse = ", "), + "
") + # assign color to edges according to number of regions + data$c <- sapply(data$count_reg, function(x) edge_colors[x]) + + print(head(data)) + # df for edges + relations <- data.table(from=data$from, + to=data$to, + #beta = data$beta_mean, + color = c, + title = data$title) + # df for edge legend + ledges <- data.frame(color = edge_colors, + label = 1:(ncol(data)-6), + font.align = "top") + } else { + # remove columns + data <- data %>% dplyr::select(-contains("beta"), -contains("p_adj")) + + # color palettes for edges and nodes + # edge palette --> assumes that data stores only target, predictor and z scores + edge_colors <- brewer.pal(ncol(data)-1, "Blues")[2:(ncol(data)-1)] + + # count regions per diff edge that are not na + data$edge_reg <- apply(data[,3:ncol(data)], 1, function(x) names(which(!is.na(x))) ) + data$count_reg <- sapply(data$edge_reg, length) + data$title <- paste0("Diff. co-expressed in: ", sapply(data$edge_reg, paste, collapse = ", "), + "
") + # assign color to edges according to number of regions + data$c <- sapply(data$count_reg, function(x) edge_colors[x]) + + print(head(data)) + # df for edges + relations <- data.table(from=data$from, + to=data$to, + #z = data$z, + #p_adj = data$p_adj, + color = data$c, + title = data$title) + # df for edge legend + ledges <- data.frame(color = edge_colors, + label = 1:(ncol(data)-6), + font.align = "top") + } + print(nrow(relations)) + + + # Nodes + # get unique nodes with correct id + nodes <- data.frame("id" = unique(union( + c(relations$from, relations$to), + input_genes + )), stringsAsFactors = FALSE) + if (id_type == "Gene Symbol"){ + print(nodes$id) + nodes$label <- mapIds(org.Mm.eg.db, keys = nodes$id, + column = "SYMBOL", keytype = "ENSEMBL") + } else { + nodes$label <- nodes$id + } + + # count regions where gene is DE or/and hub + nodes_de <- left_join(x = nodes, y = de_genes, + by = c("id" = "Ensembl_ID")) + nodes_hub <- left_join(x = nodes, y = hub_genes, + by = c("id" = "ensembl_id")) + + nodes$de_reg <- apply(nodes_de[,3:ncol(nodes_de)], 1, + function(x) names(which(!is.na(x))) ) + nodes$de_count <- sapply(nodes$de_reg, length) + + nodes$hub_reg <- apply(nodes_hub[,3:ncol(nodes_hub)], 1, + function(x) names(which(!is.na(x))) ) + nodes$hub_count <- sapply(nodes$hub_reg, length) + + # set color according to DE/hub status + nodes$color <- rep("darkblue", nrow(nodes_de)) + nodes$color[nodes$de_count > 0] <- "orange" + nodes$color[nodes$hub_count > 0] <- "darkred" + nodes$color[nodes$de_count > 0 & nodes$hub_count > 0] <- "purple" + + #nodes$opacity <- nodes$de_count + nodes$hub_count + #nodes$opacity <- nodes$opacity/max(nodes$opacity) + + nodes$title <- paste0("",nodes$label,"",
+ "
DE in regions: ", sapply(nodes$de_reg, paste, collapse = ", "),
+ "
Hub in regions: ", sapply(nodes$hub_reg, paste, collapse = ", "),"