From 347bbda69f9326d3b976929861666d538d66622d Mon Sep 17 00:00:00 2001 From: "Vera N. Karlbauer" Date: Sat, 4 Nov 2023 17:00:36 +0100 Subject: [PATCH 1/4] Changed color scheme for batch detection plots back to default --- scripts/10_correction_batch_effects_unfiltered.Rmd | 3 --- scripts/8_removal_of_outliers_detect_batch_effects.Rmd | 4 ---- scripts/9_correction_batch_effects_filtered.Rmd | 3 --- 3 files changed, 10 deletions(-) diff --git a/scripts/10_correction_batch_effects_unfiltered.Rmd b/scripts/10_correction_batch_effects_unfiltered.Rmd index 1ff132b..31d08a5 100644 --- a/scripts/10_correction_batch_effects_unfiltered.Rmd +++ b/scripts/10_correction_batch_effects_unfiltered.Rmd @@ -141,7 +141,6 @@ if (correction_variable_1 == "PLEASE FILL IN"){ technical_batch_plots[[i]] <- PCs %>% ggplot(aes(x = PC1, y = PC2)) + geom_point(aes(color = .data[[i]], fill = .data[[i]])) + - scale_colour_brewer(palette = "Dark2") + ggtitle(i) + theme(legend.title = element_blank()) print(technical_batch_plots[[i]]) } @@ -301,7 +300,6 @@ if (correction_variable_2 == "PLEASE FILL IN"){ technical_batch_plots[[i]] <- PCs %>% ggplot(aes(x = PC1, y = PC2)) + geom_point(aes(color = .data[[i]], fill = .data[[i]])) + - scale_colour_brewer(palette = "Dark2") + ggtitle(i) + theme(legend.title = element_blank()) print(technical_batch_plots[[i]]) } @@ -461,7 +459,6 @@ if (correction_variable_3 == "PLEASE FILL IN"){ technical_batch_plots[[i]] <- PCs %>% ggplot(aes(x = PC1, y = PC2)) + geom_point(aes(color = .data[[i]], fill = .data[[i]])) + - scale_colour_brewer(palette = "Dark2") + ggtitle(i) + theme(legend.title = element_blank()) print(technical_batch_plots[[i]]) } diff --git a/scripts/8_removal_of_outliers_detect_batch_effects.Rmd b/scripts/8_removal_of_outliers_detect_batch_effects.Rmd index bf12934..73f59b4 100644 --- a/scripts/8_removal_of_outliers_detect_batch_effects.Rmd +++ b/scripts/8_removal_of_outliers_detect_batch_effects.Rmd @@ -198,7 +198,6 @@ for (i in technical_batch_variables){ size = 3, box.padding = unit(0.35, "lines"), segment.colour = "black", point.padding = unit(0.3, "lines")) + - scale_colour_brewer(palette = "Dark2") + ggtitle(i) + theme(legend.title = element_blank()) print(technical_batch_plots[[i]]) } @@ -332,7 +331,6 @@ if(!identical(extreme_outliers_filtered %>% pull(personid),character(0))){ size = 3, box.padding = unit(0.35, "lines"), segment.colour = "black", point.padding = unit(0.3, "lines")) + - scale_colour_brewer(palette = "Dark2") + ggtitle(i) + theme(legend.title = element_blank()) print(technical_batch_plots[[i]]) } @@ -546,7 +544,6 @@ for (i in technical_batch_variables){ size = 3, box.padding = unit(0.35, "lines"), segment.colour = "black", point.padding = unit(0.3, "lines")) + - scale_colour_brewer(palette = "Dark2") + ggtitle(i) + theme(legend.title = element_blank()) print(technical_batch_plots[[i]]) } @@ -681,7 +678,6 @@ if(!identical(extreme_outliers_unfiltered %>% pull(personid),character(0))){ size = 3, box.padding = unit(0.35, "lines"), segment.colour = "black", point.padding = unit(0.3, "lines")) + - scale_colour_brewer(palette = "Dark2") + ggtitle(i) + theme(legend.title = element_blank()) print(technical_batch_plots[[i]]) } diff --git a/scripts/9_correction_batch_effects_filtered.Rmd b/scripts/9_correction_batch_effects_filtered.Rmd index 7462e35..6091378 100644 --- a/scripts/9_correction_batch_effects_filtered.Rmd +++ b/scripts/9_correction_batch_effects_filtered.Rmd @@ -151,7 +151,6 @@ if (correction_variable_1 == "PLEASE FILL IN"){ technical_batch_plots[[i]] <- PCs %>% ggplot(aes(x = PC1, y = PC2)) + geom_point(aes(color = .data[[i]], fill = .data[[i]])) + - scale_colour_brewer(palette = "Dark2") + ggtitle(i) + theme(legend.title = element_blank()) print(technical_batch_plots[[i]]) } @@ -311,7 +310,6 @@ if (correction_variable_2 == "PLEASE FILL IN"){ technical_batch_plots[[i]] <- PCs %>% ggplot(aes(x = PC1, y = PC2)) + geom_point(aes(color = .data[[i]], fill = .data[[i]])) + - scale_colour_brewer(palette = "Dark2") + ggtitle(i) + theme(legend.title = element_blank()) print(technical_batch_plots[[i]]) } @@ -471,7 +469,6 @@ if (correction_variable_3 == "PLEASE FILL IN"){ technical_batch_plots[[i]] <- PCs %>% ggplot(aes(x = PC1, y = PC2)) + geom_point(aes(color = .data[[i]], fill = .data[[i]])) + - scale_colour_brewer(palette = "Dark2") + ggtitle(i) + theme(legend.title = element_blank()) print(technical_batch_plots[[i]]) } From 3182e7d305d6b0e6a211a5529017f2a39409242e Mon Sep 17 00:00:00 2001 From: "Vera N. Karlbauer" Date: Sat, 4 Nov 2023 17:05:03 +0100 Subject: [PATCH 2/4] Numerical ordering of PCs in scree plots --- .../10_correction_batch_effects_unfiltered.Rmd | 6 +++--- ..._removal_of_outliers_detect_batch_effects.Rmd | 16 ++++++++-------- scripts/9_correction_batch_effects_filtered.Rmd | 6 +++--- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/scripts/10_correction_batch_effects_unfiltered.Rmd b/scripts/10_correction_batch_effects_unfiltered.Rmd index 31d08a5..ce10593 100644 --- a/scripts/10_correction_batch_effects_unfiltered.Rmd +++ b/scripts/10_correction_batch_effects_unfiltered.Rmd @@ -78,7 +78,7 @@ if (correction_variable_1 == "PLEASE FILL IN"){ variance_explained %>% dplyr::slice(1:10) %>% - ggplot(aes(x = PC, y = var_explained, group = 1)) + + ggplot(aes(x = factor(PC, level = PC), y = var_explained, group = 1)) + geom_point() + geom_line() + geom_text(aes(label = cumm_var_explained), vjust = 1.5) + @@ -237,7 +237,7 @@ if (correction_variable_2 == "PLEASE FILL IN"){ variance_explained %>% dplyr::slice(1:10) %>% - ggplot(aes(x = PC, y = var_explained, group = 1)) + + ggplot(aes(x = factor(PC, level = PC), y = var_explained, group = 1)) + geom_point() + geom_line() + geom_text(aes(label = cumm_var_explained), vjust = 1.5) + @@ -396,7 +396,7 @@ if (correction_variable_3 == "PLEASE FILL IN"){ variance_explained %>% dplyr::slice(1:10) %>% - ggplot(aes(x = PC, y = var_explained, group = 1)) + + ggplot(aes(x = factor(PC, level = PC), y = var_explained, group = 1)) + geom_point() + geom_line() + geom_text(aes(label = cumm_var_explained), vjust = 1.5) + diff --git a/scripts/8_removal_of_outliers_detect_batch_effects.Rmd b/scripts/8_removal_of_outliers_detect_batch_effects.Rmd index 73f59b4..81f2c90 100644 --- a/scripts/8_removal_of_outliers_detect_batch_effects.Rmd +++ b/scripts/8_removal_of_outliers_detect_batch_effects.Rmd @@ -117,7 +117,7 @@ variance_explained <- data.frame(PC = paste0("PC",1:ncol(PCs)), variance_explained %>% dplyr::slice(1:10) %>% - ggplot(aes(x = PC, y = var_explained, group = 1)) + + ggplot(aes(x = factor(PC, level = PC), y = var_explained, group = 1)) + geom_point() + geom_line() + geom_text(aes(label = cumm_var_explained), vjust = 1.5) + @@ -209,7 +209,7 @@ saveRDS(PCA_object, paste0(user_choices$project_name, "/reports/PCA_object_filte pdf(paste0(user_choices$project_name, "/reports/PCA_screeplot_filtered.pdf")) variance_explained %>% dplyr::slice(1:10) %>% - ggplot(aes(x = PC, y = var_explained, group = 1)) + + ggplot(aes(x = factor(PC, level = PC), y = var_explained, group = 1)) + geom_point() + geom_line() + geom_text(aes(label = cumm_var_explained), vjust = 1.5) + @@ -252,7 +252,7 @@ if(!identical(extreme_outliers_filtered %>% pull(personid),character(0))){ variance_explained %>% dplyr::slice(1:10) %>% - ggplot(aes(x = PC, y = var_explained, group = 1)) + + ggplot(aes(x = factor(PC, level = PC), y = var_explained, group = 1)) + geom_point() + geom_line() + geom_text(aes(label = cumm_var_explained), vjust = 1.5) + @@ -344,7 +344,7 @@ if(!identical(extreme_outliers_filtered %>% pull(personid),character(0))){ pdf(paste0(user_choices$project_name, "/reports/PCA_screeplot_filtered_no_outliers.pdf")) variance_explained %>% dplyr::slice(1:10) %>% - ggplot(aes(x = PC, y = var_explained, group = 1)) + + ggplot(aes(x = factor(PC, level = PC), y = var_explained, group = 1)) + geom_point() + geom_line() + geom_text(aes(label = cumm_var_explained), vjust = 1.5) + @@ -463,7 +463,7 @@ variance_explained <- data.frame(PC = paste0("PC",1:ncol(PCs)), variance_explained %>% dplyr::slice(1:10) %>% - ggplot(aes(x = PC, y = var_explained, group = 1)) + + ggplot(aes(x = factor(PC, level = PC), y = var_explained, group = 1)) + geom_point() + geom_line() + geom_text(aes(label = cumm_var_explained), vjust = 1.5) + @@ -555,7 +555,7 @@ saveRDS(PCA_object, paste0(user_choices$project_name, "/reports/PCA_object_unfil pdf(paste0(user_choices$project_name, "/reports/PCA_screeplot_unfiltered.pdf")) variance_explained %>% dplyr::slice(1:10) %>% - ggplot(aes(x = PC, y = var_explained, group = 1)) + + ggplot(aes(x = factor(PC, level = PC), y = var_explained, group = 1)) + geom_point() + geom_line() + geom_text(aes(label = cumm_var_explained), vjust = 1.5) + @@ -599,7 +599,7 @@ if(!identical(extreme_outliers_unfiltered %>% pull(personid),character(0))){ variance_explained %>% dplyr::slice(1:10) %>% - ggplot(aes(x = PC, y = var_explained, group = 1)) + + ggplot(aes(x = factor(PC, level = PC), y = var_explained, group = 1)) + geom_point() + geom_line() + geom_text(aes(label = cumm_var_explained), vjust = 1.5) + @@ -691,7 +691,7 @@ if(!identical(extreme_outliers_unfiltered %>% pull(personid),character(0))){ pdf(paste0(user_choices$project_name, "/reports/PCA_screeplot_unfiltered_no_outliers.pdf")) variance_explained %>% dplyr::slice(1:10) %>% - ggplot(aes(x = PC, y = var_explained, group = 1)) + + ggplot(aes(x = factor(PC, level = PC), y = var_explained, group = 1)) + geom_point() + geom_line() + geom_text(aes(label = cumm_var_explained), vjust = 1.5) + diff --git a/scripts/9_correction_batch_effects_filtered.Rmd b/scripts/9_correction_batch_effects_filtered.Rmd index 6091378..28c4525 100644 --- a/scripts/9_correction_batch_effects_filtered.Rmd +++ b/scripts/9_correction_batch_effects_filtered.Rmd @@ -88,7 +88,7 @@ if (correction_variable_1 == "PLEASE FILL IN"){ variance_explained %>% dplyr::slice(1:10) %>% - ggplot(aes(x = PC, y = var_explained, group = 1)) + + ggplot(aes(x = factor(PC, level = PC), y = var_explained, group = 1)) + geom_point() + geom_line() + geom_text(aes(label = cumm_var_explained), vjust = 1.5) + @@ -247,7 +247,7 @@ if (correction_variable_2 == "PLEASE FILL IN"){ variance_explained %>% dplyr::slice(1:10) %>% - ggplot(aes(x = PC, y = var_explained, group = 1)) + + ggplot(aes(x = factor(PC, level = PC), y = var_explained, group = 1)) + geom_point() + geom_line() + geom_text(aes(label = cumm_var_explained), vjust = 1.5) + @@ -406,7 +406,7 @@ if (correction_variable_3 == "PLEASE FILL IN"){ variance_explained %>% dplyr::slice(1:10) %>% - ggplot(aes(x = PC, y = var_explained, group = 1)) + + ggplot(aes(x = factor(PC, level = PC), y = var_explained, group = 1)) + geom_point() + geom_line() + geom_text(aes(label = cumm_var_explained), vjust = 1.5) + From acfd8db9046d985999e033a8abc3f6396ac529e2 Mon Sep 17 00:00:00 2001 From: "Vera N. Karlbauer" Date: Sat, 4 Nov 2023 17:16:29 +0100 Subject: [PATCH 3/4] Function for calculating appropriate number of PCs to include for batch identification/correction --- data/Calculate_PC_Cutoff.R | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 data/Calculate_PC_Cutoff.R diff --git a/data/Calculate_PC_Cutoff.R b/data/Calculate_PC_Cutoff.R new file mode 100644 index 0000000..5a93226 --- /dev/null +++ b/data/Calculate_PC_Cutoff.R @@ -0,0 +1,14 @@ +### Function calculating number of PCs to be kept for batch identification and correction +# threshold for N of PCs: start at PC1 and keep including PCs until drop in explained variance from PCi to PCi+2 < 1% +# if threshold >2, include first 2 PCs (needed for plots) +# include max 10 PCs + +pc_cutoff <- function(data){ + for(i in 1:nrow(data)) { + if(i > 10) break + else if((data[i,]$var_explained - data[i + 2,]$var_explained) >=1) PC_number <- i + else if(PC_number < 2) PC_number = 2 + else break + } + return(PC_number) +} From bb2670f44def0f44aeac520f9f1c5de5d64abb66 Mon Sep 17 00:00:00 2001 From: "Vera N. Karlbauer" Date: Sat, 4 Nov 2023 17:16:58 +0100 Subject: [PATCH 4/4] Determine number of included PCs via pc_cutoff function --- .../10_correction_batch_effects_unfiltered.Rmd | 13 +++++++------ ...removal_of_outliers_detect_batch_effects.Rmd | 17 +++++++++-------- scripts/9_correction_batch_effects_filtered.Rmd | 13 +++++++------ 3 files changed, 23 insertions(+), 20 deletions(-) diff --git a/scripts/10_correction_batch_effects_unfiltered.Rmd b/scripts/10_correction_batch_effects_unfiltered.Rmd index ce10593..fae0021 100644 --- a/scripts/10_correction_batch_effects_unfiltered.Rmd +++ b/scripts/10_correction_batch_effects_unfiltered.Rmd @@ -30,6 +30,7 @@ start_time <- Sys.time() ``` ```{r setup, include=FALSE} +source("../data/calculate_pc_cutoff.R") # source function for determining PC cutoff needed_packages <- c("BiocManager", "dplyr", "knitr", "rmarkdown", "tibble", "ggplot2", "ggrepel", "broom", "gplots", "tidyr", "sva", "methods") installed_packages <- needed_packages %in% rownames(installed.packages()) @@ -85,8 +86,8 @@ if (correction_variable_1 == "PLEASE FILL IN"){ labs(x = "Principal Component", y = "Variance Explained (%)", title = "Scree Plot", subtitle = "Note: cumulative variance explained (%) is displayed as text") - R <- variance_explained %>% filter(var_explained > 10) %>% nrow() # threshold for N of PCs - print(paste0(R, " PCs that explained a minimum of >10% change of variance were included")) + R <- pc_cutoff(variance_explained) # automatically calculate number of PCs + print(paste0(R, " PCs were included")) phenodata_subset_df <- as.data.frame(PhenoData_clean@listData) %>% select(personid, plate, slide, array, row, column, arrayid) @@ -244,8 +245,8 @@ if (correction_variable_2 == "PLEASE FILL IN"){ labs(x = "Principal Component", y = "Variance Explained (%)", title = "Scree Plot", subtitle = "Note: cumulative variance explained (%) is displayed as text") - R <- variance_explained %>% filter(var_explained > 10) %>% nrow() # threshold for N of PCs - print(paste0(R, " PCs that explained a minimum of >10% change of variance were included")) + R <- pc_cutoff(variance_explained) # automatically calculate number of PCs + print(paste0(R, " PCs were included")) phenodata_subset_df <- as.data.frame(PhenoData_clean@listData) %>% select(personid, plate, slide, array, row, column, arrayid) @@ -403,8 +404,8 @@ if (correction_variable_3 == "PLEASE FILL IN"){ labs(x = "Principal Component", y = "Variance Explained (%)", title = "Scree Plot", subtitle = "Note: cumulative variance explained (%) is displayed as text") - R <- variance_explained %>% filter(var_explained > 10) %>% nrow() # threshold for N of PCs - print(paste0(R, " PCs that explained >10% change of variance were included")) + R <- pc_cutoff(variance_explained) # automatically calculate number of PCs + print(paste0(R, " PCs were included")) phenodata_subset_df <- as.data.frame(PhenoData_clean@listData) %>% select(personid, plate, slide, array, row, column, arrayid) diff --git a/scripts/8_removal_of_outliers_detect_batch_effects.Rmd b/scripts/8_removal_of_outliers_detect_batch_effects.Rmd index 81f2c90..6b325a7 100644 --- a/scripts/8_removal_of_outliers_detect_batch_effects.Rmd +++ b/scripts/8_removal_of_outliers_detect_batch_effects.Rmd @@ -46,6 +46,7 @@ start_time <- Sys.time() ```{r setup, include=FALSE} +source("../data/calculate_pc_cutoff.R") # source function for determining PC cutoff needed_packages <- c("BiocManager", "dplyr", "knitr", "rmarkdown", "tibble", "ggplot2", "ggrepel", "broom", "gplots", "tidyr", "sva", "methods") installed_packages <- needed_packages %in% rownames(installed.packages()) @@ -124,8 +125,8 @@ variance_explained %>% labs(x = "Principal Component", y = "Portion of variance Explained (%)", title = "Scree Plot", subtitle = "Note: cumulative variance explained (%) is displayed as text") -R <- variance_explained %>% filter(var_explained > 10) %>% nrow() # threshold for N of PCs -print(paste0(R, " PCs that expained >10% of the variance were included")) +R <- pc_cutoff(variance_explained) # automatically calculate number of PCs +print(paste0(R, " PCs were included")) phenodata_subset_df <- as.data.frame(PhenoData_clean@listData) %>% select(personid, plate, slide, array, row, column, arrayid) @@ -259,8 +260,8 @@ if(!identical(extreme_outliers_filtered %>% pull(personid),character(0))){ labs(x = "Principal Component", y = "Variance Explained (%)", title = "Scree Plot", subtitle = "Note: cumulative variance explained (%) is displayed as text") - R <- variance_explained %>% filter(var_explained > 10) %>% nrow() # threshold for N of PCs - print(paste0(R, " PCs that expained >10% of the variance were included")) + R <- pc_cutoff(variance_explained) # automatically calculate number of PCs + print(paste0(R, " PCs were included")) phenodata_subset_df <- as.data.frame(PhenoData_clean@listData) %>% select(personid, plate, slide, array, row, column, arrayid) @@ -470,8 +471,8 @@ variance_explained %>% labs(x = "Principal Component", y = "Portion of variance Explained (%)", title = "Scree Plot", subtitle = "Note: cumulative variance explained (%) is displayed as text") -R <- variance_explained %>% filter(var_explained > 10) %>% nrow() # threshold for N of PCs -print(paste0(R, " PCs that expained >10% of the variance were included")) +R <- pc_cutoff(variance_explained) # automatically calculate number of PCs +print(paste0(R, " PCs were included")) phenodata_subset_df <- as.data.frame(PhenoData_clean@listData) %>% select(personid, plate, slide, array, row, column, arrayid) @@ -606,8 +607,8 @@ if(!identical(extreme_outliers_unfiltered %>% pull(personid),character(0))){ labs(x = "Principal Component", y = "Variance Explained (%)", title = "Scree Plot", subtitle = "Note: cumulative variance explained (%) is displayed as text") - R <- variance_explained %>% filter(var_explained > 10) %>% nrow() # threshold for N of PCs - print(paste0(R, " PCs that expained >10% of the variance were included")) + R <- pc_cutoff(variance_explained) # automatically calculate number of PCs + print(paste0(R, " PCs were included")) phenodata_subset_df <- as.data.frame(PhenoData_clean@listData) %>% select(personid, plate, slide, array, row, column, arrayid) diff --git a/scripts/9_correction_batch_effects_filtered.Rmd b/scripts/9_correction_batch_effects_filtered.Rmd index 28c4525..2081997 100644 --- a/scripts/9_correction_batch_effects_filtered.Rmd +++ b/scripts/9_correction_batch_effects_filtered.Rmd @@ -41,6 +41,7 @@ start_time <- Sys.time() ``` ```{r setup, include=FALSE} +source("../data/calculate_pc_cutoff.R") # source function for determining PC cutoff needed_packages <- c("BiocManager", "dplyr", "knitr", "rmarkdown", "tibble", "ggplot2", "ggrepel", "broom", "gplots", "tidyr", "sva", "methods") installed_packages <- needed_packages %in% rownames(installed.packages()) @@ -95,8 +96,8 @@ if (correction_variable_1 == "PLEASE FILL IN"){ labs(x = "Principal Component", y = "Variance Explained (%)", title = "Scree Plot", subtitle = "Note: cumulative variance explained (%) is displayed as text") - R <- variance_explained %>% filter(var_explained > 10) %>% nrow() # threshold for N of PCs - print(paste0(R, " PCs that explained a minimum of >10% change of variance were included")) + R <- pc_cutoff(variance_explained) # automatically calculate number of PCs + print(paste0(R, " PCs were included")) phenodata_subset_df <- as.data.frame(PhenoData_clean@listData) %>% select(personid, plate, slide, array, row, column, arrayid) @@ -254,8 +255,8 @@ if (correction_variable_2 == "PLEASE FILL IN"){ labs(x = "Principal Component", y = "Variance Explained (%)", title = "Scree Plot", subtitle = "Note: cumulative variance explained (%) is displayed as text") - R <- variance_explained %>% filter(var_explained > 10) %>% nrow() # threshold for N of PCs - print(paste0(R, " PCs that explained a minimum of >10% change of variance were included")) + R <- pc_cutoff(variance_explained) # automatically calculate number of PCs + print(paste0(R, " PCs were included")) phenodata_subset_df <- as.data.frame(PhenoData_clean@listData) %>% select(personid, plate, slide, array, row, column, arrayid) @@ -413,8 +414,8 @@ if (correction_variable_3 == "PLEASE FILL IN"){ labs(x = "Principal Component", y = "Variance Explained (%)", title = "Scree Plot", subtitle = "Note: cumulative variance explained (%) is displayed as text") - R <- variance_explained %>% filter(var_explained > 10) %>% nrow() # threshold for N of PCs - print(paste0(R, " PCs that explained >10% change of variance were included")) + R <- pc_cutoff(variance_explained) # automatically calculate number of PCs + print(paste0(R, " PCs were included")) phenodata_subset_df <- as.data.frame(PhenoData_clean@listData) %>% select(personid, plate, slide, array, row, column, arrayid)