diff --git a/data/Calculate_PC_Cutoff.R b/data/Calculate_PC_Cutoff.R new file mode 100644 index 0000000..5a93226 --- /dev/null +++ b/data/Calculate_PC_Cutoff.R @@ -0,0 +1,14 @@ +### Function calculating number of PCs to be kept for batch identification and correction +# threshold for N of PCs: start at PC1 and keep including PCs until drop in explained variance from PCi to PCi+2 < 1% +# if threshold >2, include first 2 PCs (needed for plots) +# include max 10 PCs + +pc_cutoff <- function(data){ + for(i in 1:nrow(data)) { + if(i > 10) break + else if((data[i,]$var_explained - data[i + 2,]$var_explained) >=1) PC_number <- i + else if(PC_number < 2) PC_number = 2 + else break + } + return(PC_number) +} diff --git a/scripts/10_correction_batch_effects_unfiltered.Rmd b/scripts/10_correction_batch_effects_unfiltered.Rmd index 1ff132b..fae0021 100644 --- a/scripts/10_correction_batch_effects_unfiltered.Rmd +++ b/scripts/10_correction_batch_effects_unfiltered.Rmd @@ -30,6 +30,7 @@ start_time <- Sys.time() ``` ```{r setup, include=FALSE} +source("../data/calculate_pc_cutoff.R") # source function for determining PC cutoff needed_packages <- c("BiocManager", "dplyr", "knitr", "rmarkdown", "tibble", "ggplot2", "ggrepel", "broom", "gplots", "tidyr", "sva", "methods") installed_packages <- needed_packages %in% rownames(installed.packages()) @@ -78,15 +79,15 @@ if (correction_variable_1 == "PLEASE FILL IN"){ variance_explained %>% dplyr::slice(1:10) %>% - ggplot(aes(x = PC, y = var_explained, group = 1)) + + ggplot(aes(x = factor(PC, level = PC), y = var_explained, group = 1)) + geom_point() + geom_line() + geom_text(aes(label = cumm_var_explained), vjust = 1.5) + labs(x = "Principal Component", y = "Variance Explained (%)", title = "Scree Plot", subtitle = "Note: cumulative variance explained (%) is displayed as text") - R <- variance_explained %>% filter(var_explained > 10) %>% nrow() # threshold for N of PCs - print(paste0(R, " PCs that explained a minimum of >10% change of variance were included")) + R <- pc_cutoff(variance_explained) # automatically calculate number of PCs + print(paste0(R, " PCs were included")) phenodata_subset_df <- as.data.frame(PhenoData_clean@listData) %>% select(personid, plate, slide, array, row, column, arrayid) @@ -141,7 +142,6 @@ if (correction_variable_1 == "PLEASE FILL IN"){ technical_batch_plots[[i]] <- PCs %>% ggplot(aes(x = PC1, y = PC2)) + geom_point(aes(color = .data[[i]], fill = .data[[i]])) + - scale_colour_brewer(palette = "Dark2") + ggtitle(i) + theme(legend.title = element_blank()) print(technical_batch_plots[[i]]) } @@ -238,15 +238,15 @@ if (correction_variable_2 == "PLEASE FILL IN"){ variance_explained %>% dplyr::slice(1:10) %>% - ggplot(aes(x = PC, y = var_explained, group = 1)) + + ggplot(aes(x = factor(PC, level = PC), y = var_explained, group = 1)) + geom_point() + geom_line() + geom_text(aes(label = cumm_var_explained), vjust = 1.5) + labs(x = "Principal Component", y = "Variance Explained (%)", title = "Scree Plot", subtitle = "Note: cumulative variance explained (%) is displayed as text") - R <- variance_explained %>% filter(var_explained > 10) %>% nrow() # threshold for N of PCs - print(paste0(R, " PCs that explained a minimum of >10% change of variance were included")) + R <- pc_cutoff(variance_explained) # automatically calculate number of PCs + print(paste0(R, " PCs were included")) phenodata_subset_df <- as.data.frame(PhenoData_clean@listData) %>% select(personid, plate, slide, array, row, column, arrayid) @@ -301,7 +301,6 @@ if (correction_variable_2 == "PLEASE FILL IN"){ technical_batch_plots[[i]] <- PCs %>% ggplot(aes(x = PC1, y = PC2)) + geom_point(aes(color = .data[[i]], fill = .data[[i]])) + - scale_colour_brewer(palette = "Dark2") + ggtitle(i) + theme(legend.title = element_blank()) print(technical_batch_plots[[i]]) } @@ -398,15 +397,15 @@ if (correction_variable_3 == "PLEASE FILL IN"){ variance_explained %>% dplyr::slice(1:10) %>% - ggplot(aes(x = PC, y = var_explained, group = 1)) + + ggplot(aes(x = factor(PC, level = PC), y = var_explained, group = 1)) + geom_point() + geom_line() + geom_text(aes(label = cumm_var_explained), vjust = 1.5) + labs(x = "Principal Component", y = "Variance Explained (%)", title = "Scree Plot", subtitle = "Note: cumulative variance explained (%) is displayed as text") - R <- variance_explained %>% filter(var_explained > 10) %>% nrow() # threshold for N of PCs - print(paste0(R, " PCs that explained >10% change of variance were included")) + R <- pc_cutoff(variance_explained) # automatically calculate number of PCs + print(paste0(R, " PCs were included")) phenodata_subset_df <- as.data.frame(PhenoData_clean@listData) %>% select(personid, plate, slide, array, row, column, arrayid) @@ -461,7 +460,6 @@ if (correction_variable_3 == "PLEASE FILL IN"){ technical_batch_plots[[i]] <- PCs %>% ggplot(aes(x = PC1, y = PC2)) + geom_point(aes(color = .data[[i]], fill = .data[[i]])) + - scale_colour_brewer(palette = "Dark2") + ggtitle(i) + theme(legend.title = element_blank()) print(technical_batch_plots[[i]]) } diff --git a/scripts/8_removal_of_outliers_detect_batch_effects.Rmd b/scripts/8_removal_of_outliers_detect_batch_effects.Rmd index bf12934..6b325a7 100644 --- a/scripts/8_removal_of_outliers_detect_batch_effects.Rmd +++ b/scripts/8_removal_of_outliers_detect_batch_effects.Rmd @@ -46,6 +46,7 @@ start_time <- Sys.time() ```{r setup, include=FALSE} +source("../data/calculate_pc_cutoff.R") # source function for determining PC cutoff needed_packages <- c("BiocManager", "dplyr", "knitr", "rmarkdown", "tibble", "ggplot2", "ggrepel", "broom", "gplots", "tidyr", "sva", "methods") installed_packages <- needed_packages %in% rownames(installed.packages()) @@ -117,15 +118,15 @@ variance_explained <- data.frame(PC = paste0("PC",1:ncol(PCs)), variance_explained %>% dplyr::slice(1:10) %>% - ggplot(aes(x = PC, y = var_explained, group = 1)) + + ggplot(aes(x = factor(PC, level = PC), y = var_explained, group = 1)) + geom_point() + geom_line() + geom_text(aes(label = cumm_var_explained), vjust = 1.5) + labs(x = "Principal Component", y = "Portion of variance Explained (%)", title = "Scree Plot", subtitle = "Note: cumulative variance explained (%) is displayed as text") -R <- variance_explained %>% filter(var_explained > 10) %>% nrow() # threshold for N of PCs -print(paste0(R, " PCs that expained >10% of the variance were included")) +R <- pc_cutoff(variance_explained) # automatically calculate number of PCs +print(paste0(R, " PCs were included")) phenodata_subset_df <- as.data.frame(PhenoData_clean@listData) %>% select(personid, plate, slide, array, row, column, arrayid) @@ -198,7 +199,6 @@ for (i in technical_batch_variables){ size = 3, box.padding = unit(0.35, "lines"), segment.colour = "black", point.padding = unit(0.3, "lines")) + - scale_colour_brewer(palette = "Dark2") + ggtitle(i) + theme(legend.title = element_blank()) print(technical_batch_plots[[i]]) } @@ -210,7 +210,7 @@ saveRDS(PCA_object, paste0(user_choices$project_name, "/reports/PCA_object_filte pdf(paste0(user_choices$project_name, "/reports/PCA_screeplot_filtered.pdf")) variance_explained %>% dplyr::slice(1:10) %>% - ggplot(aes(x = PC, y = var_explained, group = 1)) + + ggplot(aes(x = factor(PC, level = PC), y = var_explained, group = 1)) + geom_point() + geom_line() + geom_text(aes(label = cumm_var_explained), vjust = 1.5) + @@ -253,15 +253,15 @@ if(!identical(extreme_outliers_filtered %>% pull(personid),character(0))){ variance_explained %>% dplyr::slice(1:10) %>% - ggplot(aes(x = PC, y = var_explained, group = 1)) + + ggplot(aes(x = factor(PC, level = PC), y = var_explained, group = 1)) + geom_point() + geom_line() + geom_text(aes(label = cumm_var_explained), vjust = 1.5) + labs(x = "Principal Component", y = "Variance Explained (%)", title = "Scree Plot", subtitle = "Note: cumulative variance explained (%) is displayed as text") - R <- variance_explained %>% filter(var_explained > 10) %>% nrow() # threshold for N of PCs - print(paste0(R, " PCs that expained >10% of the variance were included")) + R <- pc_cutoff(variance_explained) # automatically calculate number of PCs + print(paste0(R, " PCs were included")) phenodata_subset_df <- as.data.frame(PhenoData_clean@listData) %>% select(personid, plate, slide, array, row, column, arrayid) @@ -332,7 +332,6 @@ if(!identical(extreme_outliers_filtered %>% pull(personid),character(0))){ size = 3, box.padding = unit(0.35, "lines"), segment.colour = "black", point.padding = unit(0.3, "lines")) + - scale_colour_brewer(palette = "Dark2") + ggtitle(i) + theme(legend.title = element_blank()) print(technical_batch_plots[[i]]) } @@ -346,7 +345,7 @@ if(!identical(extreme_outliers_filtered %>% pull(personid),character(0))){ pdf(paste0(user_choices$project_name, "/reports/PCA_screeplot_filtered_no_outliers.pdf")) variance_explained %>% dplyr::slice(1:10) %>% - ggplot(aes(x = PC, y = var_explained, group = 1)) + + ggplot(aes(x = factor(PC, level = PC), y = var_explained, group = 1)) + geom_point() + geom_line() + geom_text(aes(label = cumm_var_explained), vjust = 1.5) + @@ -465,15 +464,15 @@ variance_explained <- data.frame(PC = paste0("PC",1:ncol(PCs)), variance_explained %>% dplyr::slice(1:10) %>% - ggplot(aes(x = PC, y = var_explained, group = 1)) + + ggplot(aes(x = factor(PC, level = PC), y = var_explained, group = 1)) + geom_point() + geom_line() + geom_text(aes(label = cumm_var_explained), vjust = 1.5) + labs(x = "Principal Component", y = "Portion of variance Explained (%)", title = "Scree Plot", subtitle = "Note: cumulative variance explained (%) is displayed as text") -R <- variance_explained %>% filter(var_explained > 10) %>% nrow() # threshold for N of PCs -print(paste0(R, " PCs that expained >10% of the variance were included")) +R <- pc_cutoff(variance_explained) # automatically calculate number of PCs +print(paste0(R, " PCs were included")) phenodata_subset_df <- as.data.frame(PhenoData_clean@listData) %>% select(personid, plate, slide, array, row, column, arrayid) @@ -546,7 +545,6 @@ for (i in technical_batch_variables){ size = 3, box.padding = unit(0.35, "lines"), segment.colour = "black", point.padding = unit(0.3, "lines")) + - scale_colour_brewer(palette = "Dark2") + ggtitle(i) + theme(legend.title = element_blank()) print(technical_batch_plots[[i]]) } @@ -558,7 +556,7 @@ saveRDS(PCA_object, paste0(user_choices$project_name, "/reports/PCA_object_unfil pdf(paste0(user_choices$project_name, "/reports/PCA_screeplot_unfiltered.pdf")) variance_explained %>% dplyr::slice(1:10) %>% - ggplot(aes(x = PC, y = var_explained, group = 1)) + + ggplot(aes(x = factor(PC, level = PC), y = var_explained, group = 1)) + geom_point() + geom_line() + geom_text(aes(label = cumm_var_explained), vjust = 1.5) + @@ -602,15 +600,15 @@ if(!identical(extreme_outliers_unfiltered %>% pull(personid),character(0))){ variance_explained %>% dplyr::slice(1:10) %>% - ggplot(aes(x = PC, y = var_explained, group = 1)) + + ggplot(aes(x = factor(PC, level = PC), y = var_explained, group = 1)) + geom_point() + geom_line() + geom_text(aes(label = cumm_var_explained), vjust = 1.5) + labs(x = "Principal Component", y = "Variance Explained (%)", title = "Scree Plot", subtitle = "Note: cumulative variance explained (%) is displayed as text") - R <- variance_explained %>% filter(var_explained > 10) %>% nrow() # threshold for N of PCs - print(paste0(R, " PCs that expained >10% of the variance were included")) + R <- pc_cutoff(variance_explained) # automatically calculate number of PCs + print(paste0(R, " PCs were included")) phenodata_subset_df <- as.data.frame(PhenoData_clean@listData) %>% select(personid, plate, slide, array, row, column, arrayid) @@ -681,7 +679,6 @@ if(!identical(extreme_outliers_unfiltered %>% pull(personid),character(0))){ size = 3, box.padding = unit(0.35, "lines"), segment.colour = "black", point.padding = unit(0.3, "lines")) + - scale_colour_brewer(palette = "Dark2") + ggtitle(i) + theme(legend.title = element_blank()) print(technical_batch_plots[[i]]) } @@ -695,7 +692,7 @@ if(!identical(extreme_outliers_unfiltered %>% pull(personid),character(0))){ pdf(paste0(user_choices$project_name, "/reports/PCA_screeplot_unfiltered_no_outliers.pdf")) variance_explained %>% dplyr::slice(1:10) %>% - ggplot(aes(x = PC, y = var_explained, group = 1)) + + ggplot(aes(x = factor(PC, level = PC), y = var_explained, group = 1)) + geom_point() + geom_line() + geom_text(aes(label = cumm_var_explained), vjust = 1.5) + diff --git a/scripts/9_correction_batch_effects_filtered.Rmd b/scripts/9_correction_batch_effects_filtered.Rmd index 7462e35..2081997 100644 --- a/scripts/9_correction_batch_effects_filtered.Rmd +++ b/scripts/9_correction_batch_effects_filtered.Rmd @@ -41,6 +41,7 @@ start_time <- Sys.time() ``` ```{r setup, include=FALSE} +source("../data/calculate_pc_cutoff.R") # source function for determining PC cutoff needed_packages <- c("BiocManager", "dplyr", "knitr", "rmarkdown", "tibble", "ggplot2", "ggrepel", "broom", "gplots", "tidyr", "sva", "methods") installed_packages <- needed_packages %in% rownames(installed.packages()) @@ -88,15 +89,15 @@ if (correction_variable_1 == "PLEASE FILL IN"){ variance_explained %>% dplyr::slice(1:10) %>% - ggplot(aes(x = PC, y = var_explained, group = 1)) + + ggplot(aes(x = factor(PC, level = PC), y = var_explained, group = 1)) + geom_point() + geom_line() + geom_text(aes(label = cumm_var_explained), vjust = 1.5) + labs(x = "Principal Component", y = "Variance Explained (%)", title = "Scree Plot", subtitle = "Note: cumulative variance explained (%) is displayed as text") - R <- variance_explained %>% filter(var_explained > 10) %>% nrow() # threshold for N of PCs - print(paste0(R, " PCs that explained a minimum of >10% change of variance were included")) + R <- pc_cutoff(variance_explained) # automatically calculate number of PCs + print(paste0(R, " PCs were included")) phenodata_subset_df <- as.data.frame(PhenoData_clean@listData) %>% select(personid, plate, slide, array, row, column, arrayid) @@ -151,7 +152,6 @@ if (correction_variable_1 == "PLEASE FILL IN"){ technical_batch_plots[[i]] <- PCs %>% ggplot(aes(x = PC1, y = PC2)) + geom_point(aes(color = .data[[i]], fill = .data[[i]])) + - scale_colour_brewer(palette = "Dark2") + ggtitle(i) + theme(legend.title = element_blank()) print(technical_batch_plots[[i]]) } @@ -248,15 +248,15 @@ if (correction_variable_2 == "PLEASE FILL IN"){ variance_explained %>% dplyr::slice(1:10) %>% - ggplot(aes(x = PC, y = var_explained, group = 1)) + + ggplot(aes(x = factor(PC, level = PC), y = var_explained, group = 1)) + geom_point() + geom_line() + geom_text(aes(label = cumm_var_explained), vjust = 1.5) + labs(x = "Principal Component", y = "Variance Explained (%)", title = "Scree Plot", subtitle = "Note: cumulative variance explained (%) is displayed as text") - R <- variance_explained %>% filter(var_explained > 10) %>% nrow() # threshold for N of PCs - print(paste0(R, " PCs that explained a minimum of >10% change of variance were included")) + R <- pc_cutoff(variance_explained) # automatically calculate number of PCs + print(paste0(R, " PCs were included")) phenodata_subset_df <- as.data.frame(PhenoData_clean@listData) %>% select(personid, plate, slide, array, row, column, arrayid) @@ -311,7 +311,6 @@ if (correction_variable_2 == "PLEASE FILL IN"){ technical_batch_plots[[i]] <- PCs %>% ggplot(aes(x = PC1, y = PC2)) + geom_point(aes(color = .data[[i]], fill = .data[[i]])) + - scale_colour_brewer(palette = "Dark2") + ggtitle(i) + theme(legend.title = element_blank()) print(technical_batch_plots[[i]]) } @@ -408,15 +407,15 @@ if (correction_variable_3 == "PLEASE FILL IN"){ variance_explained %>% dplyr::slice(1:10) %>% - ggplot(aes(x = PC, y = var_explained, group = 1)) + + ggplot(aes(x = factor(PC, level = PC), y = var_explained, group = 1)) + geom_point() + geom_line() + geom_text(aes(label = cumm_var_explained), vjust = 1.5) + labs(x = "Principal Component", y = "Variance Explained (%)", title = "Scree Plot", subtitle = "Note: cumulative variance explained (%) is displayed as text") - R <- variance_explained %>% filter(var_explained > 10) %>% nrow() # threshold for N of PCs - print(paste0(R, " PCs that explained >10% change of variance were included")) + R <- pc_cutoff(variance_explained) # automatically calculate number of PCs + print(paste0(R, " PCs were included")) phenodata_subset_df <- as.data.frame(PhenoData_clean@listData) %>% select(personid, plate, slide, array, row, column, arrayid) @@ -471,7 +470,6 @@ if (correction_variable_3 == "PLEASE FILL IN"){ technical_batch_plots[[i]] <- PCs %>% ggplot(aes(x = PC1, y = PC2)) + geom_point(aes(color = .data[[i]], fill = .data[[i]])) + - scale_colour_brewer(palette = "Dark2") + ggtitle(i) + theme(legend.title = element_blank()) print(technical_batch_plots[[i]]) }