Skip to content

Commit

Permalink
Merge pull request #1 from mpip/new_features
Browse files Browse the repository at this point in the history
New features
  • Loading branch information
NatanYusupov authored Mar 14, 2024
2 parents 7e5618a + bb2670f commit 7219dcb
Show file tree
Hide file tree
Showing 4 changed files with 51 additions and 44 deletions.
14 changes: 14 additions & 0 deletions data/Calculate_PC_Cutoff.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
### Function calculating number of PCs to be kept for batch identification and correction
# threshold for N of PCs: start at PC1 and keep including PCs until drop in explained variance from PCi to PCi+2 < 1%
# if threshold >2, include first 2 PCs (needed for plots)
# include max 10 PCs

pc_cutoff <- function(data){
for(i in 1:nrow(data)) {
if(i > 10) break
else if((data[i,]$var_explained - data[i + 2,]$var_explained) >=1) PC_number <- i
else if(PC_number < 2) PC_number = 2
else break
}
return(PC_number)
}
22 changes: 10 additions & 12 deletions scripts/10_correction_batch_effects_unfiltered.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ start_time <- Sys.time()
```

```{r setup, include=FALSE}
source("../data/calculate_pc_cutoff.R") # source function for determining PC cutoff
needed_packages <- c("BiocManager", "dplyr", "knitr", "rmarkdown", "tibble", "ggplot2",
"ggrepel", "broom", "gplots", "tidyr", "sva", "methods")
installed_packages <- needed_packages %in% rownames(installed.packages())
Expand Down Expand Up @@ -78,15 +79,15 @@ if (correction_variable_1 == "PLEASE FILL IN"){
variance_explained %>%
dplyr::slice(1:10) %>%
ggplot(aes(x = PC, y = var_explained, group = 1)) +
ggplot(aes(x = factor(PC, level = PC), y = var_explained, group = 1)) +
geom_point() +
geom_line() +
geom_text(aes(label = cumm_var_explained), vjust = 1.5) +
labs(x = "Principal Component", y = "Variance Explained (%)", title = "Scree Plot",
subtitle = "Note: cumulative variance explained (%) is displayed as text")
R <- variance_explained %>% filter(var_explained > 10) %>% nrow() # threshold for N of PCs
print(paste0(R, " PCs that explained a minimum of >10% change of variance were included"))
R <- pc_cutoff(variance_explained) # automatically calculate number of PCs
print(paste0(R, " PCs were included"))
phenodata_subset_df <- as.data.frame(PhenoData_clean@listData) %>%
select(personid, plate, slide, array, row, column, arrayid)
Expand Down Expand Up @@ -141,7 +142,6 @@ if (correction_variable_1 == "PLEASE FILL IN"){
technical_batch_plots[[i]] <- PCs %>%
ggplot(aes(x = PC1, y = PC2)) +
geom_point(aes(color = .data[[i]], fill = .data[[i]])) +
scale_colour_brewer(palette = "Dark2") +
ggtitle(i) + theme(legend.title = element_blank())
print(technical_batch_plots[[i]])
}
Expand Down Expand Up @@ -238,15 +238,15 @@ if (correction_variable_2 == "PLEASE FILL IN"){
variance_explained %>%
dplyr::slice(1:10) %>%
ggplot(aes(x = PC, y = var_explained, group = 1)) +
ggplot(aes(x = factor(PC, level = PC), y = var_explained, group = 1)) +
geom_point() +
geom_line() +
geom_text(aes(label = cumm_var_explained), vjust = 1.5) +
labs(x = "Principal Component", y = "Variance Explained (%)", title = "Scree Plot",
subtitle = "Note: cumulative variance explained (%) is displayed as text")
R <- variance_explained %>% filter(var_explained > 10) %>% nrow() # threshold for N of PCs
print(paste0(R, " PCs that explained a minimum of >10% change of variance were included"))
R <- pc_cutoff(variance_explained) # automatically calculate number of PCs
print(paste0(R, " PCs were included"))
phenodata_subset_df <- as.data.frame(PhenoData_clean@listData) %>%
select(personid, plate, slide, array, row, column, arrayid)
Expand Down Expand Up @@ -301,7 +301,6 @@ if (correction_variable_2 == "PLEASE FILL IN"){
technical_batch_plots[[i]] <- PCs %>%
ggplot(aes(x = PC1, y = PC2)) +
geom_point(aes(color = .data[[i]], fill = .data[[i]])) +
scale_colour_brewer(palette = "Dark2") +
ggtitle(i) + theme(legend.title = element_blank())
print(technical_batch_plots[[i]])
}
Expand Down Expand Up @@ -398,15 +397,15 @@ if (correction_variable_3 == "PLEASE FILL IN"){
variance_explained %>%
dplyr::slice(1:10) %>%
ggplot(aes(x = PC, y = var_explained, group = 1)) +
ggplot(aes(x = factor(PC, level = PC), y = var_explained, group = 1)) +
geom_point() +
geom_line() +
geom_text(aes(label = cumm_var_explained), vjust = 1.5) +
labs(x = "Principal Component", y = "Variance Explained (%)", title = "Scree Plot",
subtitle = "Note: cumulative variance explained (%) is displayed as text")
R <- variance_explained %>% filter(var_explained > 10) %>% nrow() # threshold for N of PCs
print(paste0(R, " PCs that explained >10% change of variance were included"))
R <- pc_cutoff(variance_explained) # automatically calculate number of PCs
print(paste0(R, " PCs were included"))
phenodata_subset_df <- as.data.frame(PhenoData_clean@listData) %>%
select(personid, plate, slide, array, row, column, arrayid)
Expand Down Expand Up @@ -461,7 +460,6 @@ if (correction_variable_3 == "PLEASE FILL IN"){
technical_batch_plots[[i]] <- PCs %>%
ggplot(aes(x = PC1, y = PC2)) +
geom_point(aes(color = .data[[i]], fill = .data[[i]])) +
scale_colour_brewer(palette = "Dark2") +
ggtitle(i) + theme(legend.title = element_blank())
print(technical_batch_plots[[i]])
}
Expand Down
37 changes: 17 additions & 20 deletions scripts/8_removal_of_outliers_detect_batch_effects.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ start_time <- Sys.time()


```{r setup, include=FALSE}
source("../data/calculate_pc_cutoff.R") # source function for determining PC cutoff
needed_packages <- c("BiocManager", "dplyr", "knitr", "rmarkdown", "tibble", "ggplot2",
"ggrepel", "broom", "gplots", "tidyr", "sva", "methods")
installed_packages <- needed_packages %in% rownames(installed.packages())
Expand Down Expand Up @@ -117,15 +118,15 @@ variance_explained <- data.frame(PC = paste0("PC",1:ncol(PCs)),
variance_explained %>%
dplyr::slice(1:10) %>%
ggplot(aes(x = PC, y = var_explained, group = 1)) +
ggplot(aes(x = factor(PC, level = PC), y = var_explained, group = 1)) +
geom_point() +
geom_line() +
geom_text(aes(label = cumm_var_explained), vjust = 1.5) +
labs(x = "Principal Component", y = "Portion of variance Explained (%)", title = "Scree Plot",
subtitle = "Note: cumulative variance explained (%) is displayed as text")
R <- variance_explained %>% filter(var_explained > 10) %>% nrow() # threshold for N of PCs
print(paste0(R, " PCs that expained >10% of the variance were included"))
R <- pc_cutoff(variance_explained) # automatically calculate number of PCs
print(paste0(R, " PCs were included"))
phenodata_subset_df <- as.data.frame(PhenoData_clean@listData) %>%
select(personid, plate, slide, array, row, column, arrayid)
Expand Down Expand Up @@ -198,7 +199,6 @@ for (i in technical_batch_variables){
size = 3, box.padding = unit(0.35, "lines"),
segment.colour = "black",
point.padding = unit(0.3, "lines")) +
scale_colour_brewer(palette = "Dark2") +
ggtitle(i) + theme(legend.title = element_blank())
print(technical_batch_plots[[i]])
}
Expand All @@ -210,7 +210,7 @@ saveRDS(PCA_object, paste0(user_choices$project_name, "/reports/PCA_object_filte
pdf(paste0(user_choices$project_name, "/reports/PCA_screeplot_filtered.pdf"))
variance_explained %>%
dplyr::slice(1:10) %>%
ggplot(aes(x = PC, y = var_explained, group = 1)) +
ggplot(aes(x = factor(PC, level = PC), y = var_explained, group = 1)) +
geom_point() +
geom_line() +
geom_text(aes(label = cumm_var_explained), vjust = 1.5) +
Expand Down Expand Up @@ -253,15 +253,15 @@ if(!identical(extreme_outliers_filtered %>% pull(personid),character(0))){
variance_explained %>%
dplyr::slice(1:10) %>%
ggplot(aes(x = PC, y = var_explained, group = 1)) +
ggplot(aes(x = factor(PC, level = PC), y = var_explained, group = 1)) +
geom_point() +
geom_line() +
geom_text(aes(label = cumm_var_explained), vjust = 1.5) +
labs(x = "Principal Component", y = "Variance Explained (%)", title = "Scree Plot",
subtitle = "Note: cumulative variance explained (%) is displayed as text")
R <- variance_explained %>% filter(var_explained > 10) %>% nrow() # threshold for N of PCs
print(paste0(R, " PCs that expained >10% of the variance were included"))
R <- pc_cutoff(variance_explained) # automatically calculate number of PCs
print(paste0(R, " PCs were included"))
phenodata_subset_df <- as.data.frame(PhenoData_clean@listData) %>%
select(personid, plate, slide, array, row, column, arrayid)
Expand Down Expand Up @@ -332,7 +332,6 @@ if(!identical(extreme_outliers_filtered %>% pull(personid),character(0))){
size = 3, box.padding = unit(0.35, "lines"),
segment.colour = "black",
point.padding = unit(0.3, "lines")) +
scale_colour_brewer(palette = "Dark2") +
ggtitle(i) + theme(legend.title = element_blank())
print(technical_batch_plots[[i]])
}
Expand All @@ -346,7 +345,7 @@ if(!identical(extreme_outliers_filtered %>% pull(personid),character(0))){
pdf(paste0(user_choices$project_name, "/reports/PCA_screeplot_filtered_no_outliers.pdf"))
variance_explained %>%
dplyr::slice(1:10) %>%
ggplot(aes(x = PC, y = var_explained, group = 1)) +
ggplot(aes(x = factor(PC, level = PC), y = var_explained, group = 1)) +
geom_point() +
geom_line() +
geom_text(aes(label = cumm_var_explained), vjust = 1.5) +
Expand Down Expand Up @@ -465,15 +464,15 @@ variance_explained <- data.frame(PC = paste0("PC",1:ncol(PCs)),
variance_explained %>%
dplyr::slice(1:10) %>%
ggplot(aes(x = PC, y = var_explained, group = 1)) +
ggplot(aes(x = factor(PC, level = PC), y = var_explained, group = 1)) +
geom_point() +
geom_line() +
geom_text(aes(label = cumm_var_explained), vjust = 1.5) +
labs(x = "Principal Component", y = "Portion of variance Explained (%)", title = "Scree Plot",
subtitle = "Note: cumulative variance explained (%) is displayed as text")
R <- variance_explained %>% filter(var_explained > 10) %>% nrow() # threshold for N of PCs
print(paste0(R, " PCs that expained >10% of the variance were included"))
R <- pc_cutoff(variance_explained) # automatically calculate number of PCs
print(paste0(R, " PCs were included"))
phenodata_subset_df <- as.data.frame(PhenoData_clean@listData) %>%
select(personid, plate, slide, array, row, column, arrayid)
Expand Down Expand Up @@ -546,7 +545,6 @@ for (i in technical_batch_variables){
size = 3, box.padding = unit(0.35, "lines"),
segment.colour = "black",
point.padding = unit(0.3, "lines")) +
scale_colour_brewer(palette = "Dark2") +
ggtitle(i) + theme(legend.title = element_blank())
print(technical_batch_plots[[i]])
}
Expand All @@ -558,7 +556,7 @@ saveRDS(PCA_object, paste0(user_choices$project_name, "/reports/PCA_object_unfil
pdf(paste0(user_choices$project_name, "/reports/PCA_screeplot_unfiltered.pdf"))
variance_explained %>%
dplyr::slice(1:10) %>%
ggplot(aes(x = PC, y = var_explained, group = 1)) +
ggplot(aes(x = factor(PC, level = PC), y = var_explained, group = 1)) +
geom_point() +
geom_line() +
geom_text(aes(label = cumm_var_explained), vjust = 1.5) +
Expand Down Expand Up @@ -602,15 +600,15 @@ if(!identical(extreme_outliers_unfiltered %>% pull(personid),character(0))){
variance_explained %>%
dplyr::slice(1:10) %>%
ggplot(aes(x = PC, y = var_explained, group = 1)) +
ggplot(aes(x = factor(PC, level = PC), y = var_explained, group = 1)) +
geom_point() +
geom_line() +
geom_text(aes(label = cumm_var_explained), vjust = 1.5) +
labs(x = "Principal Component", y = "Variance Explained (%)", title = "Scree Plot",
subtitle = "Note: cumulative variance explained (%) is displayed as text")
R <- variance_explained %>% filter(var_explained > 10) %>% nrow() # threshold for N of PCs
print(paste0(R, " PCs that expained >10% of the variance were included"))
R <- pc_cutoff(variance_explained) # automatically calculate number of PCs
print(paste0(R, " PCs were included"))
phenodata_subset_df <- as.data.frame(PhenoData_clean@listData) %>%
select(personid, plate, slide, array, row, column, arrayid)
Expand Down Expand Up @@ -681,7 +679,6 @@ if(!identical(extreme_outliers_unfiltered %>% pull(personid),character(0))){
size = 3, box.padding = unit(0.35, "lines"),
segment.colour = "black",
point.padding = unit(0.3, "lines")) +
scale_colour_brewer(palette = "Dark2") +
ggtitle(i) + theme(legend.title = element_blank())
print(technical_batch_plots[[i]])
}
Expand All @@ -695,7 +692,7 @@ if(!identical(extreme_outliers_unfiltered %>% pull(personid),character(0))){
pdf(paste0(user_choices$project_name, "/reports/PCA_screeplot_unfiltered_no_outliers.pdf"))
variance_explained %>%
dplyr::slice(1:10) %>%
ggplot(aes(x = PC, y = var_explained, group = 1)) +
ggplot(aes(x = factor(PC, level = PC), y = var_explained, group = 1)) +
geom_point() +
geom_line() +
geom_text(aes(label = cumm_var_explained), vjust = 1.5) +
Expand Down
22 changes: 10 additions & 12 deletions scripts/9_correction_batch_effects_filtered.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ start_time <- Sys.time()
```

```{r setup, include=FALSE}
source("../data/calculate_pc_cutoff.R") # source function for determining PC cutoff
needed_packages <- c("BiocManager", "dplyr", "knitr", "rmarkdown", "tibble", "ggplot2",
"ggrepel", "broom", "gplots", "tidyr", "sva", "methods")
installed_packages <- needed_packages %in% rownames(installed.packages())
Expand Down Expand Up @@ -88,15 +89,15 @@ if (correction_variable_1 == "PLEASE FILL IN"){
variance_explained %>%
dplyr::slice(1:10) %>%
ggplot(aes(x = PC, y = var_explained, group = 1)) +
ggplot(aes(x = factor(PC, level = PC), y = var_explained, group = 1)) +
geom_point() +
geom_line() +
geom_text(aes(label = cumm_var_explained), vjust = 1.5) +
labs(x = "Principal Component", y = "Variance Explained (%)", title = "Scree Plot",
subtitle = "Note: cumulative variance explained (%) is displayed as text")
R <- variance_explained %>% filter(var_explained > 10) %>% nrow() # threshold for N of PCs
print(paste0(R, " PCs that explained a minimum of >10% change of variance were included"))
R <- pc_cutoff(variance_explained) # automatically calculate number of PCs
print(paste0(R, " PCs were included"))
phenodata_subset_df <- as.data.frame(PhenoData_clean@listData) %>%
select(personid, plate, slide, array, row, column, arrayid)
Expand Down Expand Up @@ -151,7 +152,6 @@ if (correction_variable_1 == "PLEASE FILL IN"){
technical_batch_plots[[i]] <- PCs %>%
ggplot(aes(x = PC1, y = PC2)) +
geom_point(aes(color = .data[[i]], fill = .data[[i]])) +
scale_colour_brewer(palette = "Dark2") +
ggtitle(i) + theme(legend.title = element_blank())
print(technical_batch_plots[[i]])
}
Expand Down Expand Up @@ -248,15 +248,15 @@ if (correction_variable_2 == "PLEASE FILL IN"){
variance_explained %>%
dplyr::slice(1:10) %>%
ggplot(aes(x = PC, y = var_explained, group = 1)) +
ggplot(aes(x = factor(PC, level = PC), y = var_explained, group = 1)) +
geom_point() +
geom_line() +
geom_text(aes(label = cumm_var_explained), vjust = 1.5) +
labs(x = "Principal Component", y = "Variance Explained (%)", title = "Scree Plot",
subtitle = "Note: cumulative variance explained (%) is displayed as text")
R <- variance_explained %>% filter(var_explained > 10) %>% nrow() # threshold for N of PCs
print(paste0(R, " PCs that explained a minimum of >10% change of variance were included"))
R <- pc_cutoff(variance_explained) # automatically calculate number of PCs
print(paste0(R, " PCs were included"))
phenodata_subset_df <- as.data.frame(PhenoData_clean@listData) %>%
select(personid, plate, slide, array, row, column, arrayid)
Expand Down Expand Up @@ -311,7 +311,6 @@ if (correction_variable_2 == "PLEASE FILL IN"){
technical_batch_plots[[i]] <- PCs %>%
ggplot(aes(x = PC1, y = PC2)) +
geom_point(aes(color = .data[[i]], fill = .data[[i]])) +
scale_colour_brewer(palette = "Dark2") +
ggtitle(i) + theme(legend.title = element_blank())
print(technical_batch_plots[[i]])
}
Expand Down Expand Up @@ -408,15 +407,15 @@ if (correction_variable_3 == "PLEASE FILL IN"){
variance_explained %>%
dplyr::slice(1:10) %>%
ggplot(aes(x = PC, y = var_explained, group = 1)) +
ggplot(aes(x = factor(PC, level = PC), y = var_explained, group = 1)) +
geom_point() +
geom_line() +
geom_text(aes(label = cumm_var_explained), vjust = 1.5) +
labs(x = "Principal Component", y = "Variance Explained (%)", title = "Scree Plot",
subtitle = "Note: cumulative variance explained (%) is displayed as text")
R <- variance_explained %>% filter(var_explained > 10) %>% nrow() # threshold for N of PCs
print(paste0(R, " PCs that explained >10% change of variance were included"))
R <- pc_cutoff(variance_explained) # automatically calculate number of PCs
print(paste0(R, " PCs were included"))
phenodata_subset_df <- as.data.frame(PhenoData_clean@listData) %>%
select(personid, plate, slide, array, row, column, arrayid)
Expand Down Expand Up @@ -471,7 +470,6 @@ if (correction_variable_3 == "PLEASE FILL IN"){
technical_batch_plots[[i]] <- PCs %>%
ggplot(aes(x = PC1, y = PC2)) +
geom_point(aes(color = .data[[i]], fill = .data[[i]])) +
scale_colour_brewer(palette = "Dark2") +
ggtitle(i) + theme(legend.title = element_blank())
print(technical_batch_plots[[i]])
}
Expand Down

0 comments on commit 7219dcb

Please sign in to comment.