Skip to content

New features #1

Merged
merged 4 commits into from
Mar 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
14 changes: 14 additions & 0 deletions data/Calculate_PC_Cutoff.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
### Function calculating number of PCs to be kept for batch identification and correction
# threshold for N of PCs: start at PC1 and keep including PCs until drop in explained variance from PCi to PCi+2 < 1%
# if threshold >2, include first 2 PCs (needed for plots)
# include max 10 PCs

pc_cutoff <- function(data){
for(i in 1:nrow(data)) {
if(i > 10) break
else if((data[i,]$var_explained - data[i + 2,]$var_explained) >=1) PC_number <- i
else if(PC_number < 2) PC_number = 2
else break
}
return(PC_number)
}
22 changes: 10 additions & 12 deletions scripts/10_correction_batch_effects_unfiltered.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ start_time <- Sys.time()
```

```{r setup, include=FALSE}
source("../data/calculate_pc_cutoff.R") # source function for determining PC cutoff
needed_packages <- c("BiocManager", "dplyr", "knitr", "rmarkdown", "tibble", "ggplot2",
"ggrepel", "broom", "gplots", "tidyr", "sva", "methods")
installed_packages <- needed_packages %in% rownames(installed.packages())
Expand Down Expand Up @@ -78,15 +79,15 @@ if (correction_variable_1 == "PLEASE FILL IN"){

variance_explained %>%
dplyr::slice(1:10) %>%
ggplot(aes(x = PC, y = var_explained, group = 1)) +
ggplot(aes(x = factor(PC, level = PC), y = var_explained, group = 1)) +
geom_point() +
geom_line() +
geom_text(aes(label = cumm_var_explained), vjust = 1.5) +
labs(x = "Principal Component", y = "Variance Explained (%)", title = "Scree Plot",
subtitle = "Note: cumulative variance explained (%) is displayed as text")

R <- variance_explained %>% filter(var_explained > 10) %>% nrow() # threshold for N of PCs
print(paste0(R, " PCs that explained a minimum of >10% change of variance were included"))
R <- pc_cutoff(variance_explained) # automatically calculate number of PCs
print(paste0(R, " PCs were included"))

phenodata_subset_df <- as.data.frame(PhenoData_clean@listData) %>%
select(personid, plate, slide, array, row, column, arrayid)
Expand Down Expand Up @@ -141,7 +142,6 @@ if (correction_variable_1 == "PLEASE FILL IN"){
technical_batch_plots[[i]] <- PCs %>%
ggplot(aes(x = PC1, y = PC2)) +
geom_point(aes(color = .data[[i]], fill = .data[[i]])) +
scale_colour_brewer(palette = "Dark2") +
ggtitle(i) + theme(legend.title = element_blank())
print(technical_batch_plots[[i]])
}
Expand Down Expand Up @@ -238,15 +238,15 @@ if (correction_variable_2 == "PLEASE FILL IN"){

variance_explained %>%
dplyr::slice(1:10) %>%
ggplot(aes(x = PC, y = var_explained, group = 1)) +
ggplot(aes(x = factor(PC, level = PC), y = var_explained, group = 1)) +
geom_point() +
geom_line() +
geom_text(aes(label = cumm_var_explained), vjust = 1.5) +
labs(x = "Principal Component", y = "Variance Explained (%)", title = "Scree Plot",
subtitle = "Note: cumulative variance explained (%) is displayed as text")

R <- variance_explained %>% filter(var_explained > 10) %>% nrow() # threshold for N of PCs
print(paste0(R, " PCs that explained a minimum of >10% change of variance were included"))
R <- pc_cutoff(variance_explained) # automatically calculate number of PCs
print(paste0(R, " PCs were included"))

phenodata_subset_df <- as.data.frame(PhenoData_clean@listData) %>%
select(personid, plate, slide, array, row, column, arrayid)
Expand Down Expand Up @@ -301,7 +301,6 @@ if (correction_variable_2 == "PLEASE FILL IN"){
technical_batch_plots[[i]] <- PCs %>%
ggplot(aes(x = PC1, y = PC2)) +
geom_point(aes(color = .data[[i]], fill = .data[[i]])) +
scale_colour_brewer(palette = "Dark2") +
ggtitle(i) + theme(legend.title = element_blank())
print(technical_batch_plots[[i]])
}
Expand Down Expand Up @@ -398,15 +397,15 @@ if (correction_variable_3 == "PLEASE FILL IN"){

variance_explained %>%
dplyr::slice(1:10) %>%
ggplot(aes(x = PC, y = var_explained, group = 1)) +
ggplot(aes(x = factor(PC, level = PC), y = var_explained, group = 1)) +
geom_point() +
geom_line() +
geom_text(aes(label = cumm_var_explained), vjust = 1.5) +
labs(x = "Principal Component", y = "Variance Explained (%)", title = "Scree Plot",
subtitle = "Note: cumulative variance explained (%) is displayed as text")

R <- variance_explained %>% filter(var_explained > 10) %>% nrow() # threshold for N of PCs
print(paste0(R, " PCs that explained >10% change of variance were included"))
R <- pc_cutoff(variance_explained) # automatically calculate number of PCs
print(paste0(R, " PCs were included"))

phenodata_subset_df <- as.data.frame(PhenoData_clean@listData) %>%
select(personid, plate, slide, array, row, column, arrayid)
Expand Down Expand Up @@ -461,7 +460,6 @@ if (correction_variable_3 == "PLEASE FILL IN"){
technical_batch_plots[[i]] <- PCs %>%
ggplot(aes(x = PC1, y = PC2)) +
geom_point(aes(color = .data[[i]], fill = .data[[i]])) +
scale_colour_brewer(palette = "Dark2") +
ggtitle(i) + theme(legend.title = element_blank())
print(technical_batch_plots[[i]])
}
Expand Down
37 changes: 17 additions & 20 deletions scripts/8_removal_of_outliers_detect_batch_effects.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ start_time <- Sys.time()


```{r setup, include=FALSE}
source("../data/calculate_pc_cutoff.R") # source function for determining PC cutoff
needed_packages <- c("BiocManager", "dplyr", "knitr", "rmarkdown", "tibble", "ggplot2",
"ggrepel", "broom", "gplots", "tidyr", "sva", "methods")
installed_packages <- needed_packages %in% rownames(installed.packages())
Expand Down Expand Up @@ -117,15 +118,15 @@ variance_explained <- data.frame(PC = paste0("PC",1:ncol(PCs)),

variance_explained %>%
dplyr::slice(1:10) %>%
ggplot(aes(x = PC, y = var_explained, group = 1)) +
ggplot(aes(x = factor(PC, level = PC), y = var_explained, group = 1)) +
geom_point() +
geom_line() +
geom_text(aes(label = cumm_var_explained), vjust = 1.5) +
labs(x = "Principal Component", y = "Portion of variance Explained (%)", title = "Scree Plot",
subtitle = "Note: cumulative variance explained (%) is displayed as text")

R <- variance_explained %>% filter(var_explained > 10) %>% nrow() # threshold for N of PCs
print(paste0(R, " PCs that expained >10% of the variance were included"))
R <- pc_cutoff(variance_explained) # automatically calculate number of PCs
print(paste0(R, " PCs were included"))

phenodata_subset_df <- as.data.frame(PhenoData_clean@listData) %>%
select(personid, plate, slide, array, row, column, arrayid)
Expand Down Expand Up @@ -198,7 +199,6 @@ for (i in technical_batch_variables){
size = 3, box.padding = unit(0.35, "lines"),
segment.colour = "black",
point.padding = unit(0.3, "lines")) +
scale_colour_brewer(palette = "Dark2") +
ggtitle(i) + theme(legend.title = element_blank())
print(technical_batch_plots[[i]])
}
Expand All @@ -210,7 +210,7 @@ saveRDS(PCA_object, paste0(user_choices$project_name, "/reports/PCA_object_filte
pdf(paste0(user_choices$project_name, "/reports/PCA_screeplot_filtered.pdf"))
variance_explained %>%
dplyr::slice(1:10) %>%
ggplot(aes(x = PC, y = var_explained, group = 1)) +
ggplot(aes(x = factor(PC, level = PC), y = var_explained, group = 1)) +
geom_point() +
geom_line() +
geom_text(aes(label = cumm_var_explained), vjust = 1.5) +
Expand Down Expand Up @@ -253,15 +253,15 @@ if(!identical(extreme_outliers_filtered %>% pull(personid),character(0))){

variance_explained %>%
dplyr::slice(1:10) %>%
ggplot(aes(x = PC, y = var_explained, group = 1)) +
ggplot(aes(x = factor(PC, level = PC), y = var_explained, group = 1)) +
geom_point() +
geom_line() +
geom_text(aes(label = cumm_var_explained), vjust = 1.5) +
labs(x = "Principal Component", y = "Variance Explained (%)", title = "Scree Plot",
subtitle = "Note: cumulative variance explained (%) is displayed as text")

R <- variance_explained %>% filter(var_explained > 10) %>% nrow() # threshold for N of PCs
print(paste0(R, " PCs that expained >10% of the variance were included"))
R <- pc_cutoff(variance_explained) # automatically calculate number of PCs
print(paste0(R, " PCs were included"))

phenodata_subset_df <- as.data.frame(PhenoData_clean@listData) %>%
select(personid, plate, slide, array, row, column, arrayid)
Expand Down Expand Up @@ -332,7 +332,6 @@ if(!identical(extreme_outliers_filtered %>% pull(personid),character(0))){
size = 3, box.padding = unit(0.35, "lines"),
segment.colour = "black",
point.padding = unit(0.3, "lines")) +
scale_colour_brewer(palette = "Dark2") +
ggtitle(i) + theme(legend.title = element_blank())
print(technical_batch_plots[[i]])
}
Expand All @@ -346,7 +345,7 @@ if(!identical(extreme_outliers_filtered %>% pull(personid),character(0))){
pdf(paste0(user_choices$project_name, "/reports/PCA_screeplot_filtered_no_outliers.pdf"))
variance_explained %>%
dplyr::slice(1:10) %>%
ggplot(aes(x = PC, y = var_explained, group = 1)) +
ggplot(aes(x = factor(PC, level = PC), y = var_explained, group = 1)) +
geom_point() +
geom_line() +
geom_text(aes(label = cumm_var_explained), vjust = 1.5) +
Expand Down Expand Up @@ -465,15 +464,15 @@ variance_explained <- data.frame(PC = paste0("PC",1:ncol(PCs)),

variance_explained %>%
dplyr::slice(1:10) %>%
ggplot(aes(x = PC, y = var_explained, group = 1)) +
ggplot(aes(x = factor(PC, level = PC), y = var_explained, group = 1)) +
geom_point() +
geom_line() +
geom_text(aes(label = cumm_var_explained), vjust = 1.5) +
labs(x = "Principal Component", y = "Portion of variance Explained (%)", title = "Scree Plot",
subtitle = "Note: cumulative variance explained (%) is displayed as text")

R <- variance_explained %>% filter(var_explained > 10) %>% nrow() # threshold for N of PCs
print(paste0(R, " PCs that expained >10% of the variance were included"))
R <- pc_cutoff(variance_explained) # automatically calculate number of PCs
print(paste0(R, " PCs were included"))

phenodata_subset_df <- as.data.frame(PhenoData_clean@listData) %>%
select(personid, plate, slide, array, row, column, arrayid)
Expand Down Expand Up @@ -546,7 +545,6 @@ for (i in technical_batch_variables){
size = 3, box.padding = unit(0.35, "lines"),
segment.colour = "black",
point.padding = unit(0.3, "lines")) +
scale_colour_brewer(palette = "Dark2") +
ggtitle(i) + theme(legend.title = element_blank())
print(technical_batch_plots[[i]])
}
Expand All @@ -558,7 +556,7 @@ saveRDS(PCA_object, paste0(user_choices$project_name, "/reports/PCA_object_unfil
pdf(paste0(user_choices$project_name, "/reports/PCA_screeplot_unfiltered.pdf"))
variance_explained %>%
dplyr::slice(1:10) %>%
ggplot(aes(x = PC, y = var_explained, group = 1)) +
ggplot(aes(x = factor(PC, level = PC), y = var_explained, group = 1)) +
geom_point() +
geom_line() +
geom_text(aes(label = cumm_var_explained), vjust = 1.5) +
Expand Down Expand Up @@ -602,15 +600,15 @@ if(!identical(extreme_outliers_unfiltered %>% pull(personid),character(0))){

variance_explained %>%
dplyr::slice(1:10) %>%
ggplot(aes(x = PC, y = var_explained, group = 1)) +
ggplot(aes(x = factor(PC, level = PC), y = var_explained, group = 1)) +
geom_point() +
geom_line() +
geom_text(aes(label = cumm_var_explained), vjust = 1.5) +
labs(x = "Principal Component", y = "Variance Explained (%)", title = "Scree Plot",
subtitle = "Note: cumulative variance explained (%) is displayed as text")

R <- variance_explained %>% filter(var_explained > 10) %>% nrow() # threshold for N of PCs
print(paste0(R, " PCs that expained >10% of the variance were included"))
R <- pc_cutoff(variance_explained) # automatically calculate number of PCs
print(paste0(R, " PCs were included"))

phenodata_subset_df <- as.data.frame(PhenoData_clean@listData) %>%
select(personid, plate, slide, array, row, column, arrayid)
Expand Down Expand Up @@ -681,7 +679,6 @@ if(!identical(extreme_outliers_unfiltered %>% pull(personid),character(0))){
size = 3, box.padding = unit(0.35, "lines"),
segment.colour = "black",
point.padding = unit(0.3, "lines")) +
scale_colour_brewer(palette = "Dark2") +
ggtitle(i) + theme(legend.title = element_blank())
print(technical_batch_plots[[i]])
}
Expand All @@ -695,7 +692,7 @@ if(!identical(extreme_outliers_unfiltered %>% pull(personid),character(0))){
pdf(paste0(user_choices$project_name, "/reports/PCA_screeplot_unfiltered_no_outliers.pdf"))
variance_explained %>%
dplyr::slice(1:10) %>%
ggplot(aes(x = PC, y = var_explained, group = 1)) +
ggplot(aes(x = factor(PC, level = PC), y = var_explained, group = 1)) +
geom_point() +
geom_line() +
geom_text(aes(label = cumm_var_explained), vjust = 1.5) +
Expand Down
22 changes: 10 additions & 12 deletions scripts/9_correction_batch_effects_filtered.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ start_time <- Sys.time()
```

```{r setup, include=FALSE}
source("../data/calculate_pc_cutoff.R") # source function for determining PC cutoff
needed_packages <- c("BiocManager", "dplyr", "knitr", "rmarkdown", "tibble", "ggplot2",
"ggrepel", "broom", "gplots", "tidyr", "sva", "methods")
installed_packages <- needed_packages %in% rownames(installed.packages())
Expand Down Expand Up @@ -88,15 +89,15 @@ if (correction_variable_1 == "PLEASE FILL IN"){

variance_explained %>%
dplyr::slice(1:10) %>%
ggplot(aes(x = PC, y = var_explained, group = 1)) +
ggplot(aes(x = factor(PC, level = PC), y = var_explained, group = 1)) +
geom_point() +
geom_line() +
geom_text(aes(label = cumm_var_explained), vjust = 1.5) +
labs(x = "Principal Component", y = "Variance Explained (%)", title = "Scree Plot",
subtitle = "Note: cumulative variance explained (%) is displayed as text")

R <- variance_explained %>% filter(var_explained > 10) %>% nrow() # threshold for N of PCs
print(paste0(R, " PCs that explained a minimum of >10% change of variance were included"))
R <- pc_cutoff(variance_explained) # automatically calculate number of PCs
print(paste0(R, " PCs were included"))

phenodata_subset_df <- as.data.frame(PhenoData_clean@listData) %>%
select(personid, plate, slide, array, row, column, arrayid)
Expand Down Expand Up @@ -151,7 +152,6 @@ if (correction_variable_1 == "PLEASE FILL IN"){
technical_batch_plots[[i]] <- PCs %>%
ggplot(aes(x = PC1, y = PC2)) +
geom_point(aes(color = .data[[i]], fill = .data[[i]])) +
scale_colour_brewer(palette = "Dark2") +
ggtitle(i) + theme(legend.title = element_blank())
print(technical_batch_plots[[i]])
}
Expand Down Expand Up @@ -248,15 +248,15 @@ if (correction_variable_2 == "PLEASE FILL IN"){

variance_explained %>%
dplyr::slice(1:10) %>%
ggplot(aes(x = PC, y = var_explained, group = 1)) +
ggplot(aes(x = factor(PC, level = PC), y = var_explained, group = 1)) +
geom_point() +
geom_line() +
geom_text(aes(label = cumm_var_explained), vjust = 1.5) +
labs(x = "Principal Component", y = "Variance Explained (%)", title = "Scree Plot",
subtitle = "Note: cumulative variance explained (%) is displayed as text")

R <- variance_explained %>% filter(var_explained > 10) %>% nrow() # threshold for N of PCs
print(paste0(R, " PCs that explained a minimum of >10% change of variance were included"))
R <- pc_cutoff(variance_explained) # automatically calculate number of PCs
print(paste0(R, " PCs were included"))

phenodata_subset_df <- as.data.frame(PhenoData_clean@listData) %>%
select(personid, plate, slide, array, row, column, arrayid)
Expand Down Expand Up @@ -311,7 +311,6 @@ if (correction_variable_2 == "PLEASE FILL IN"){
technical_batch_plots[[i]] <- PCs %>%
ggplot(aes(x = PC1, y = PC2)) +
geom_point(aes(color = .data[[i]], fill = .data[[i]])) +
scale_colour_brewer(palette = "Dark2") +
ggtitle(i) + theme(legend.title = element_blank())
print(technical_batch_plots[[i]])
}
Expand Down Expand Up @@ -408,15 +407,15 @@ if (correction_variable_3 == "PLEASE FILL IN"){

variance_explained %>%
dplyr::slice(1:10) %>%
ggplot(aes(x = PC, y = var_explained, group = 1)) +
ggplot(aes(x = factor(PC, level = PC), y = var_explained, group = 1)) +
geom_point() +
geom_line() +
geom_text(aes(label = cumm_var_explained), vjust = 1.5) +
labs(x = "Principal Component", y = "Variance Explained (%)", title = "Scree Plot",
subtitle = "Note: cumulative variance explained (%) is displayed as text")

R <- variance_explained %>% filter(var_explained > 10) %>% nrow() # threshold for N of PCs
print(paste0(R, " PCs that explained >10% change of variance were included"))
R <- pc_cutoff(variance_explained) # automatically calculate number of PCs
print(paste0(R, " PCs were included"))

phenodata_subset_df <- as.data.frame(PhenoData_clean@listData) %>%
select(personid, plate, slide, array, row, column, arrayid)
Expand Down Expand Up @@ -471,7 +470,6 @@ if (correction_variable_3 == "PLEASE FILL IN"){
technical_batch_plots[[i]] <- PCs %>%
ggplot(aes(x = PC1, y = PC2)) +
geom_point(aes(color = .data[[i]], fill = .data[[i]])) +
scale_colour_brewer(palette = "Dark2") +
ggtitle(i) + theme(legend.title = element_blank())
print(technical_batch_plots[[i]])
}
Expand Down