From 76ff99022a9f655c2ec1c8038960ce38b4546abc Mon Sep 17 00:00:00 2001
From: "Vera N. Karlbauer" <vera@karlbauer.org>
Date: Tue, 13 Aug 2024 18:24:54 +0200
Subject: [PATCH] Initial commit: hg19 annotation and pseudo-EPICv1 version for
 EPICv2 data only

---
 ...al_create_pseudo_epicv1_version_for_v2.Rmd | 191 ++++++++++++++++++
 1 file changed, 191 insertions(+)
 create mode 100644 scripts/11_optional_create_pseudo_epicv1_version_for_v2.Rmd
diff --git a/scripts/11_optional_create_pseudo_epicv1_version_for_v2.Rmd b/scripts/11_optional_create_pseudo_epicv1_version_for_v2.Rmd
new file mode 100644
index 0000000..f98128e
--- /dev/null
+++ b/scripts/11_optional_create_pseudo_epicv1_version_for_v2.Rmd
@@ -0,0 +1,191 @@
+---
+title: "Script 11 (optional): Pseudo-EPICv1 version of EPICv2 data"
+date: '`r format(Sys.time(), "%d %B, %Y")`'
+output: html_document
+---
+
+# Description
+
+This script  
+* creates a hg19 annotation of filtered and unfiltered data from the EPICv2 array (instead of the hg38 annotation
+provided by Illumina). The hg19 annotation is taken from: http://zwdzwd.github.io/InfiniumAnnotation/EPIC_hm450_hg19.html
+* creates a "pseudo-EPICv1" version of EPIC v2 data which only includes the probes found on the EPICv1 array.  
+The probe names are replaced with the probe names from the EPICv1 array. 
+
+Both of these steps are performed for filtered and unfiltered data.
+The pseudo-EPICv1 version is useful when using software designed for EPICv1, e.g. for clock calculation via methylclock or cell type estimation via EpiDISH.  
+
+<br>  
+**Instructions**  
+Please only run this script if your data was generated using the EPICv2 array.
+
+```{r runtime start, include=FALSE}
+start_time <- Sys.time()
+```
+
+```{r setup, include=FALSE}
+user_choices <- readRDS("../data/user_choices.rds")
+knitr::opts_knit$set(root.dir = paste0(user_choices$personal_path, "/"))
+knitr::opts_chunk$set(echo = FALSE)
+
+if(user_choices$package_directory != "PLEASE FILL IN"){
+  .libPaths(c(user_choices$package_directory, .libPaths()))
+}
+
+needed_packages <- c("BiocManager", "dplyr", "knitr", "rmarkdown", "tibble", "minfi", "janitor")
+installed_packages <- needed_packages %in% rownames(installed.packages())
+if (!("minfi" %in% rownames(installed.packages()))) {
+  BiocManager::install("minfi")
+}
+if (any(installed_packages == FALSE)) {
+  install.packages(needed_packages[!installed_packages], repos = "http://cran.us.r-project.org")
+}
+lapply(needed_packages, library, character.only = TRUE)
+
+library(minfi)
+library(janitor)
+```
+
+```{r load data, include=FALSE}
+summary_table_preprocessing <- readRDS(paste0(user_choices$project_name, "/reports/summary_table_preprocessing.rds"))
+load(paste0(user_choices$project_name, "/processed_data/Betas_clean_unfiltered_quantile_bmiq_combated.Rdata"))
+load(paste0(user_choices$project_name, "/processed_data/Betas_clean_filtered_quantile_bmiq_combated.Rdata"))
+load(paste0(user_choices$project_name, "/reports/annotations_clean_filtered.Rdata"))
+load(paste0(user_choices$project_name, "/reports/annotations_clean_unfiltered.Rdata"))
+load(paste0(user_choices$project_name, "/processed_data/PhenoData_clean.Rdata"))
+annotation_hg19 <- read.table(paste0(user_choices$personal_path, "/epic_preprocessing_k2h/data/EPICv2.hg19.manifest.tsv"))
+annotation_hg19 <- row_to_names(annotation_hg19, row_number = 1, remove_row = TRUE, remove_rows_above = TRUE)
+```
+
+```{r, check array type, include = TRUE}
+if (user_choices$array_type == "v1") {
+  cat("Please do not run script 11 if your data was generated using an EPICv1 array")
+  knit_exit()
+  }
+```
+
+```{r, hg19 annotation filtered data, include = FALSE}
+annations_clean_filtered_hg19 <- annotation_hg19 %>%
+  dplyr::filter(Probe_ID %in% rownames(Betas_clean_quantile_bmiq_filtered_combated))
+save(annations_clean_filtered_hg19, file = paste0(user_choices$project_name, "/reports/annations_clean_filtered_hg19.Rdata"))
+```
+
+```{r, hg19 annotation unfiltered data, include = FALSE}
+annotions_clean_unfiltered_hg19 <- annotation_hg19 %>%
+  dplyr::filter(Probe_ID %in% rownames(Betas_clean_unfiltered_quantile_bmiq_combated))
+save(annotions_clean_unfiltered_hg19, file = paste0(user_choices$project_name, "/reports/annotions_clean_unfiltered_hg19.Rdata")) 
+```
+
+```{r, create pseudo-EPICv1 version for filtered data, include=FALSE}
+# create annotation only containing filtered CpGs that are also on v1 and exclude any duplicated loci
+annotations_clean_filtered_pseudo_v1 <- annotations_clean_filtered %>%
+  filter(Name %in% rownames(Betas_clean_filtered_quantile_bmiq_combated)) %>%
+  filter(EPICv1_Loci != "") %>%
+  filter(duplicated(EPICv1_Loci) == FALSE)
+
+# only keep betas that are on v1 and exclude any duplicated loci (keep first CpG)
+keep_betas <- (rownames(Betas_clean_filtered_quantile_bmiq_combated) %in% annotations_clean_filtered_pseudo_v1$Name)
+Betas_clean_filtered_quantile_bmiq_combated_pseudo_v1 <- Betas_clean_filtered_quantile_bmiq_combated[keep_betas,]
+# order Beta rownames after annotation
+Betas_clean_filtered_quantile_bmiq_combated_pseudo_v1 <- Betas_clean_filtered_quantile_bmiq_combated_pseudo_v1[match(annotations_clean_filtered_pseudo_v1$Name, rownames(Betas_clean_filtered_quantile_bmiq_combated_pseudo_v1)),]
+# rename Beta rownames
+rownames(Betas_clean_filtered_quantile_bmiq_combated_pseudo_v1) <- annotations_clean_filtered_pseudo_v1$EPICv1_Loci
+
+# save Betas
+save(Betas_clean_filtered_quantile_bmiq_combated_pseudo_v1, file = paste0(user_choices$project_name, "/processed_data/final_data/Betas_clean_filtered_quantile_bmiq_combated_pseudo_v1.Rdata"))
+```
+
+```{r, info output for filtered pseudo-EPICv1 version, results='asis'}
+  keep_betas_df <- as.data.frame(keep_betas)
+  cat(paste0(keep_betas_df %>% filter(keep_betas == FALSE) %>% nrow(), 
+             " filtered CpGs were not found on EPICv1 and therefore excluded"), sep = "<br>\n")
+  
+  dim_Betas_filtered <- dim(Betas_clean_quantile_bmiq_filtered_combated_pseudo_v1)
+  step_number <- c("14")
+  step <- c("Create pseudo-EPICv1 version (filtered data)")
+  data_class <- c("Betas")
+  samples <- c(dim_Betas_filtered[2])
+  probes <- c(dim_Betas_filtered[1])
+  
+  table_preprocessing_adding <- data.frame(step_number, step, data_class, samples, probes)
+  summary_table_preprocessing <- bind_rows(summary_table_preprocessing, table_preprocessing_adding)
+
+```
+
+```{r, create pseudo-EPICv1 version for unfiltered data, include=FALSE}
+# create annotation only containing unfiltered CpGs that are also on v1 and exclude any duplicated loci
+annotations_clean_unfiltered_pseudo_v1 <- annotations_clean_unfiltered %>%
+  filter(Name %in% rownames(Betas_clean_unfiltered_quantile_bmiq_combated)) %>%
+  filter(EPICv1_Loci != "") %>%
+  filter(duplicated(EPICv1_Loci) == FALSE)
+
+# only keep betas that are on v1 and exclude any duplicated loci (keep first CpG)
+keep_betas <- (rownames(Betas_clean_unfiltered_quantile_bmiq_combated) %in% annotations_clean_unfiltered_pseudo_v1$Name)
+Betas_clean_unfiltered_quantile_bmiq_combated_pseudo_v1 <- Betas_clean_unfiltered_quantile_bmiq_combated[keep_betas,]
+# order Beta rownames after annotation
+Betas_clean_unfiltered_quantile_bmiq_combated_pseudo_v1 <- Betas_clean_unfiltered_quantile_bmiq_combated_pseudo_v1[match(annotations_clean_unfiltered_pseudo_v1$Name, rownames(Betas_clean_unfiltered_quantile_bmiq_combated_pseudo_v1)),]
+# rename Beta rownames
+rownames(Betas_clean_unfiltered_quantile_bmiq_combated_pseudo_v1) <- annotations_clean_unfiltered_pseudo_v1$EPICv1_Loci
+
+# save Betas
+save(Betas_clean_unfiltered_quantile_bmiq_combated_pseudo_v1, file = paste0(user_choices$project_name, "/processed_data/final_data/Betas_clean_unfiltered_quantile_bmiq_combated_pseudo_v1.Rdata"))
+```
+
+```{r, info output for unfiltered pseudo-EPICv1 version, results='asis'}
+  keep_betas_df <- as.data.frame(keep_betas)
+  cat(paste0(keep_betas_df %>% filter(keep_betas == FALSE) %>% nrow(), 
+             " unfiltered CpGs were not found on EPICv1 and therefore excluded"), sep = "<br>\n")
+  
+  dim_Betas_unfiltered <- dim(Betas_clean_quantile_bmiq_unfiltered_combated_pseudo_v1)
+  step_number <- c("15")
+  step <- c("Create pseudo-EPICv1 version (unfiltered data)")
+  data_class <- c("Betas")
+  samples <- c(dim_Betas_unfiltered[2])
+  probes <- c(dim_Betas_unfiltered[1])
+  
+  table_preprocessing_adding <- data.frame(step_number, step, data_class, samples, probes)
+  summary_table_preprocessing <- bind_rows(summary_table_preprocessing, table_preprocessing_adding)
+```
+
+## User report
+
+```{r, info output for user, results='asis'}
+cat("Session info text file was updated", sep = "<br>\n")
+```
+
+## Preprocessing summary table
+
+```{r, preprocessing summary table}
+summary_table_preprocessing %>% paged_table()
+```
+
+```{r, document session info into a text file, include=FALSE}
+connection <- file(paste0(user_choices$personal_path, "/", user_choices$project_name, 
+                          "/session_info_all.txt"), "a+")
+writeLines("######################################################################", connection)
+writeLines("############################  Script 6:  #############################", connection)
+writeLines("######################################################################", connection)
+sessioninfo <- sessionInfo()
+writeLines("\nR Version:", connection)
+writeLines(paste0("   ", sessioninfo$R.version$version.string), connection)
+writeLines("\nAttached packages:", connection)
+# nicely format packages (with version)
+package_version <- unlist(lapply(sessioninfo$otherPkgs, function(x){paste0("   ",x$Package, " (", x$Version, ")")}))
+names(package_version) <- NULL
+for(i in 1:length(package_version)) {
+  writeLines(package_version[i], connection)
+}
+writeLines("\n", connection)
+close(connection)
+```
+
+```{r runtime end, include=FALSE}
+end_time <- Sys.time()
+run_time <- as.numeric(round((end_time - start_time), 1), units = "mins")
+```
+
+## Completion time
+
+```{r, completion time, results = 'asis'}
+cat("The time for the script to run was:", run_time, "minutes", sep = " ")
+```