mpip · verakarlbauer · Mar 19, 2025 · Mar 6, 2025 · Mar 7, 2025 · Mar 7, 2025
diff --git a/scripts/10_correction_batch_effects_unfiltered.Rmd b/scripts/10_correction_batch_effects_unfiltered.Rmd
@@ -30,7 +30,7 @@ start_time <- Sys.time()
 ```
 
 ```{r setup, include=FALSE}
-source("../data/calculate_pc_cutoff.R") # source function for determining PC cutoff 
+source("../data/Calculate_PC_Cutoff.R") # source function for determining PC cutoff 
 needed_packages <- c("BiocManager", "dplyr", "knitr", "rmarkdown", "tibble", "ggplot2", 
                      "ggrepel", "broom", "gplots", "tidyr", "sva", "methods")
 user_choices <- readRDS("../data/user_choices.rds")
@@ -69,7 +69,7 @@ phenotype_data <- readRDS(paste0(user_choices$project_name, "/processed_data/phe
 
 ## Correction of technical batch effects with ComBat
 
-```{r, correction of first batch effect and PCA}
+```{r, correction of first batch effect and PCA, results='asis'}
 m_values <- apply(Betas_clean_unfiltered_quantile_bmiq, 2, function(x) log2((x)/(1-x))) # get M-values
 
 if (correction_variable_1 == "PLEASE FILL IN"){
@@ -230,7 +230,11 @@ if (correction_variable_1 == "PLEASE FILL IN"){
 }
 ```
 
-```{r, correction of second batch effect and PCA}
+```{r, print p-value table for batch effects after correction of first batch effect variable}
+anova_lm_pvalue_df %>% paged_table()
+```
+
+```{r, correction of second batch effect and PCA, results='asis'}
 if (correction_variable_2 == "PLEASE FILL IN"){
   print("No second batch correction variable was specified by user, data remains unchanged")
 } else {
@@ -389,7 +393,11 @@ if (correction_variable_2 == "PLEASE FILL IN"){
 }
 ```
 
-```{r, correction of third batch effect and PCA}
+```{r, print p-value table for batch effects after correction of second batch effect variable}
+anova_lm_pvalue_df %>% paged_table()
+```
+
+```{r, correction of third batch effect and PCA, results='asis'}
 if (correction_variable_3 == "PLEASE FILL IN"){
   print("No third batch correction variable was specified by user, data remains unchanged")
 } else {
@@ -548,7 +556,11 @@ if (correction_variable_3 == "PLEASE FILL IN"){
 }
 ```
 
-```{r, convertion M to Beta and save, include=FALSE}
+```{r, print p-value table for batch effects after correction of third batch effect variable}
+anova_lm_pvalue_df %>% paged_table()
+```
+
+```{r, conversion M to Beta and save, include=FALSE}
 if(exists("m_values_combat_3")){
   corrected_data <- m_values_combat_3
   changing_status <- "changed"

diff --git a/scripts/11_annotate_hg19_create_pseudo_epicv1_from_epicv2.Rmd b/scripts/11_annotate_hg19_create_pseudo_epicv1_from_epicv2.Rmd
@@ -1,6 +1,6 @@
 ---
 title: "Script 11: Create hg19 annotation for EPICv2 and pseudo-EPICv1 version from EPICv2 data"
-date: '`r format(Sys.time(), %d %B, %Y")`'
+date: '`r format(Sys.time(), "%d %B, %Y")`'
 output: html_document
 ---
 
@@ -27,11 +27,10 @@ start_time <- Sys.time()
 
 ```{r setup, include=FALSE}
 user_choices <- readRDS("../data/user_choices.rds")
-knitr::opts_knit$set(root.dir = paste0(user_choices$personal_path, "/"))
 knitr::opts_chunk$set(echo = FALSE)
 
-if(user_choices$package_directory != "PLEASE FILL IN"){
-  .libPaths(c(user_choices$package_directory, .libPaths()))
+if(user_choices$package_path != "PLEASE FILL IN"){
+  .libPaths(c(user_choices$package_path, .libPaths()))
 }
 
 needed_packages <- c("BiocManager", "dplyr", "knitr", "rmarkdown", "tibble", "minfi", "janitor")
@@ -57,6 +56,7 @@ load(paste0(user_choices$project_name, "/reports/annotations_clean_unfiltered.Rd
 load(paste0(user_choices$project_name, "/processed_data/PhenoData_clean.Rdata"))
 annotation_hg19 <- read.table(paste0(user_choices$personal_path, "/epic_preprocessing_k2h/data/EPICv2.hg19.manifest.tsv"))
 annotation_hg19 <- row_to_names(annotation_hg19, row_number = 1, remove_row = TRUE, remove_rows_above = TRUE)
+knitr::opts_knit$set(root.dir = paste0(user_choices$personal_path, "/"))
 ```
 
 ```{r, check array type, include = TRUE}
@@ -68,7 +68,7 @@ if (user_choices$array_type == "v1") {
 
 ```{r, hg19 annotation filtered data, include = FALSE}
 annations_clean_filtered_hg19 <- annotation_hg19 %>%
-  dplyr::filter(Probe_ID %in% rownames(Betas_clean_quantile_bmiq_filtered_combated))
+  dplyr::filter(Probe_ID %in% rownames(Betas_clean_filtered_quantile_bmiq_combated))
 save(annations_clean_filtered_hg19, file = paste0(user_choices$project_name, "/reports/annations_clean_filtered_hg19.Rdata"))
 ```
 
@@ -81,10 +81,10 @@ save(annotions_clean_unfiltered_hg19, file = paste0(user_choices$project_name, "
 ```{r, create pseudo-EPICv1 version for filtered data, include=FALSE}
 # create annotation only containing filtered CpGs that are also on v1 and exclude any duplicated loci
 annotations_clean_filtered_pseudo_v1 <- annotations_clean_filtered %>%
-  filter(Name %in% rownames(Betas_clean_filtered_quantile_bmiq_combated)) %>%
+  dplyr::filter(Name %in% rownames(Betas_clean_filtered_quantile_bmiq_combated)) %>%
   # only keep loci on EPICv1, filter wrongly annotated empty value
-  filter(EPICv1_Loci != "") %>%
-  filter(duplicated(EPICv1_Loci) == FALSE)
+  dplyr::filter(EPICv1_Loci != "") %>%
+  dplyr::filter(duplicated(EPICv1_Loci) == FALSE)
 
 # only keep betas that are on v1 and exclude any duplicated loci (keep first CpG)
 keep_betas <- (rownames(Betas_clean_filtered_quantile_bmiq_combated) %in% annotations_clean_filtered_pseudo_v1$Name)
@@ -103,7 +103,7 @@ save(Betas_clean_filtered_quantile_bmiq_combated_pseudo_v1, file = paste0(user_c
   cat(paste0(keep_betas_df %>% filter(keep_betas == FALSE) %>% nrow(), 
              " filtered CpGs were not found on EPICv1 and therefore excluded"), sep = "<br>\n")
 
-  dim_Betas_filtered <- dim(Betas_clean_quantile_bmiq_filtered_combated_pseudo_v1)
+  dim_Betas_filtered <- dim(Betas_clean_filtered_quantile_bmiq_combated_pseudo_v1)
   step_number <- c("14")
   step <- c("Create pseudo-EPICv1 version (filtered data)")
   data_class <- c("Betas")
@@ -117,9 +117,9 @@ save(Betas_clean_filtered_quantile_bmiq_combated_pseudo_v1, file = paste0(user_c
 ```{r, create pseudo-EPICv1 version for unfiltered data, include=FALSE}
 # create annotation only containing unfiltered CpGs that are also on v1 and exclude any duplicated loci
 annotations_clean_unfiltered_pseudo_v1 <- annotations_clean_unfiltered %>%
-  filter(Name %in% rownames(Betas_clean_unfiltered_quantile_bmiq_combated)) %>%
-  filter(EPICv1_Loci != "") %>%
-  filter(duplicated(EPICv1_Loci) == FALSE)
+  dplyr::filter(Name %in% rownames(Betas_clean_unfiltered_quantile_bmiq_combated)) %>%
+  dplyr::filter(EPICv1_Loci != "") %>%
+  dplyr::filter(duplicated(EPICv1_Loci) == FALSE)
 
 # only keep betas that are on v1 and exclude any duplicated loci (keep first CpG)
 keep_betas <- (rownames(Betas_clean_unfiltered_quantile_bmiq_combated) %in% annotations_clean_unfiltered_pseudo_v1$Name)
@@ -138,7 +138,7 @@ save(Betas_clean_unfiltered_quantile_bmiq_combated_pseudo_v1, file = paste0(user
   cat(paste0(keep_betas_df %>% filter(keep_betas == FALSE) %>% nrow(), 
              " unfiltered CpGs were not found on EPICv1 and therefore excluded"), sep = "<br>\n")
 
-  dim_Betas_unfiltered <- dim(Betas_clean_quantile_bmiq_unfiltered_combated_pseudo_v1)
+  dim_Betas_unfiltered <- dim(Betas_clean_unfiltered_quantile_bmiq_combated_pseudo_v1)
   step_number <- c("15")
   step <- c("Create pseudo-EPICv1 version (unfiltered data)")
   data_class <- c("Betas")

diff --git a/scripts/1_definitions_and_setup.Rmd b/scripts/1_definitions_and_setup.Rmd
@@ -43,6 +43,10 @@ This script asks the user to define or choose:
   + person id  
   + sex  
 * additional batch variables (up to 3)  
+* optionally:
+  + path for R packages
+  + path to the genotype data location
+  + prefix (project name) of the genotype files
 
 <br>  
 **Notes/Definitions:**  
@@ -52,9 +56,9 @@ The path name should not include the name of the github repository
 **Path to phenotype data:** The path to the data table containing phenotype information  
 The path name should include the name of data table itself. This file must be **.csv**  
 **Path to idat files:** The path to the raw files obtained from array  
-**Path to R package directories (optional):** The path to the personal directory for installing and loading packages. 
+**Path to R package directories (optional):** The path to a directory for installing and loading R packages. 
 This is needed if installation of R packages to the default R library are not permitted due to the lack of admin rights. 
-Thsi will enable R to install and load any needed packages without problems. 
+This will enable R to install and load any needed packages without problems. 
 Please make sure you have the needed writing permissions in this specified directory  
 **Array type:** "v1" or "v2". This stands for the different versions of the EPIC array  
 **Population ethnicity:** Choose "AFR", "AMR", "ASN", "EUR" or "Global". Global 
@@ -78,6 +82,9 @@ be listed as F, M, or W. If your sex data is described in a different way
 be considered in a batch check in script 8: plate, slide, array, row and column. Please 
 provide additional possible batch variables if needed as a character vector (up to 3, 
 add more if necessary). If not, please leave this field as is.  
++**Path to genotype files:** The path to the QC-ed genotype data in plink file format (.bed, .bim, .fam)  
++**Genotype project name:** Prefix of the genotype files, denoting the project (e.g. my_study_2020)  
+
 <br>  
 
 ## Directory structure
@@ -143,10 +150,11 @@ user_choices <- data.frame(
   "name_well_column" = name_well_column,
   "name_personid_column" = name_personid_column,
   "name_sex_column" = name_sex_column, 
-  "name_treatment_column" = name_treatment_column,
   "additional_batch_variable_1" = additional_batch_variable_1,
   "additional_batch_variable_2" = additional_batch_variable_2,
-  "additional_batch_variable_3" = additional_batch_variable_3
+  "additional_batch_variable_3" = additional_batch_variable_3,
+  "genotype_path" = genotype_path,
+  "genotype_projectname" = genotype_projectname
 )
 saveRDS(user_choices, "../data/user_choices.rds")
 

diff --git a/scripts/2_import_raw_DNAm_data.Rmd b/scripts/2_import_raw_DNAm_data.Rmd
@@ -29,7 +29,7 @@ if (any(installed_biocmanager_packages == FALSE)) {
 lapply(needed_biocmanager_packages, library, character.only = TRUE)
 
 if (user_choices$array_type == "v2") {
-  if (!("jokergoo/IlluminaHumanMethylationEPICv2manifest" %in% rownames(installed.packages()))) {
+  if (!("IlluminaHumanMethylationEPICv2manifest" %in% rownames(installed.packages()))) {
     BiocManager::install("jokergoo/IlluminaHumanMethylationEPICv2manifest")
   }
   if (!("IlluminaHumanMethylationEPICv2anno.20a1.hg38" %in% rownames(installed.packages()))) {

diff --git a/scripts/3_quality_control.Rmd b/scripts/3_quality_control.Rmd
@@ -27,7 +27,6 @@ library(minfi)
 
 knitr::opts_knit$set(root.dir = paste0(user_choices$personal_path, "/"))
 knitr::opts_chunk$set(echo = FALSE)
-phenotype_data <- readRDS(paste0(user_choices$project_name, "/processed_data/phenotype_data.rds"))
 ```
 
 ```{r load data, include=FALSE}
@@ -147,7 +146,7 @@ title_df <- title_df %>%
   dplyr::select(arrayid, personid)
 
 for (i in 1:ncol(RGSet_quality)){ 
-  title_df_current <- title_df %>% filter(arrayid == rownames(pData(RGSet_quality))[i] %>% pull(personid))
+  title_df_current <- title_df %>% filter(arrayid == rownames(pData(RGSet_quality))[i]) %>% pull(personid)
   titel <- paste0(rownames(pData(RGSet_quality))[i], " / ", title_df_current)
   densityPlot(as.matrix(Betas_quality[,i]), main = titel)
   print(i)

diff --git a/scripts/4_check_matching_of_epigenetic_sex.Rmd b/scripts/4_check_matching_of_epigenetic_sex.Rmd
@@ -54,6 +54,7 @@ fail the sex match check
 
 ```{r, predict sex with DNAm data, include= FALSE}
 predictedSex <- getSex(gRatioSet_quality, cutoff=-2)
+phenotype_data <- subset(phenotype_data, arrayid %in% rownames(predictedSex),)
 
 sex_df <- phenotype_data %>%
   select(personid, arrayid, reported_sex = sex) %>%