From 76ff99022a9f655c2ec1c8038960ce38b4546abc Mon Sep 17 00:00:00 2001 From: "Vera N. Karlbauer" Date: Tue, 13 Aug 2024 18:24:54 +0200 Subject: [PATCH] Initial commit: hg19 annotation and pseudo-EPICv1 version for EPICv2 data only --- ...al_create_pseudo_epicv1_version_for_v2.Rmd | 191 ++++++++++++++++++ 1 file changed, 191 insertions(+) create mode 100644 scripts/11_optional_create_pseudo_epicv1_version_for_v2.Rmd diff --git a/scripts/11_optional_create_pseudo_epicv1_version_for_v2.Rmd b/scripts/11_optional_create_pseudo_epicv1_version_for_v2.Rmd new file mode 100644 index 0000000..f98128e --- /dev/null +++ b/scripts/11_optional_create_pseudo_epicv1_version_for_v2.Rmd @@ -0,0 +1,191 @@ +--- +title: "Script 11 (optional): Pseudo-EPICv1 version of EPICv2 data" +date: '`r format(Sys.time(), "%d %B, %Y")`' +output: html_document +--- + +# Description + +This script +* creates a hg19 annotation of filtered and unfiltered data from the EPICv2 array (instead of the hg38 annotation +provided by Illumina). The hg19 annotation is taken from: http://zwdzwd.github.io/InfiniumAnnotation/EPIC_hm450_hg19.html +* creates a "pseudo-EPICv1" version of EPIC v2 data which only includes the probes found on the EPICv1 array. +The probe names are replaced with the probe names from the EPICv1 array. + +Both of these steps are performed for filtered and unfiltered data. +The pseudo-EPICv1 version is useful when using software designed for EPICv1, e.g. for clock calculation via methylclock or cell type estimation via EpiDISH. + +
+**Instructions** +Please only run this script if your data was generated using the EPICv2 array. + +```{r runtime start, include=FALSE} +start_time <- Sys.time() +``` + +```{r setup, include=FALSE} +user_choices <- readRDS("../data/user_choices.rds") +knitr::opts_knit$set(root.dir = paste0(user_choices$personal_path, "/")) +knitr::opts_chunk$set(echo = FALSE) + +if(user_choices$package_directory != "PLEASE FILL IN"){ + .libPaths(c(user_choices$package_directory, .libPaths())) +} + +needed_packages <- c("BiocManager", "dplyr", "knitr", "rmarkdown", "tibble", "minfi", "janitor") +installed_packages <- needed_packages %in% rownames(installed.packages()) +if (!("minfi" %in% rownames(installed.packages()))) { + BiocManager::install("minfi") +} +if (any(installed_packages == FALSE)) { + install.packages(needed_packages[!installed_packages], repos = "http://cran.us.r-project.org") +} +lapply(needed_packages, library, character.only = TRUE) + +library(minfi) +library(janitor) +``` + +```{r load data, include=FALSE} +summary_table_preprocessing <- readRDS(paste0(user_choices$project_name, "/reports/summary_table_preprocessing.rds")) +load(paste0(user_choices$project_name, "/processed_data/Betas_clean_unfiltered_quantile_bmiq_combated.Rdata")) +load(paste0(user_choices$project_name, "/processed_data/Betas_clean_filtered_quantile_bmiq_combated.Rdata")) +load(paste0(user_choices$project_name, "/reports/annotations_clean_filtered.Rdata")) +load(paste0(user_choices$project_name, "/reports/annotations_clean_unfiltered.Rdata")) +load(paste0(user_choices$project_name, "/processed_data/PhenoData_clean.Rdata")) +annotation_hg19 <- read.table(paste0(user_choices$personal_path, "/epic_preprocessing_k2h/data/EPICv2.hg19.manifest.tsv")) +annotation_hg19 <- row_to_names(annotation_hg19, row_number = 1, remove_row = TRUE, remove_rows_above = TRUE) +``` + +```{r, check array type, include = TRUE} +if (user_choices$array_type == "v1") { + cat("Please do not run script 11 if your data was generated using an EPICv1 array") + knit_exit() + } +``` + +```{r, hg19 annotation filtered data, include = FALSE} +annations_clean_filtered_hg19 <- annotation_hg19 %>% + dplyr::filter(Probe_ID %in% rownames(Betas_clean_quantile_bmiq_filtered_combated)) +save(annations_clean_filtered_hg19, file = paste0(user_choices$project_name, "/reports/annations_clean_filtered_hg19.Rdata")) +``` + +```{r, hg19 annotation unfiltered data, include = FALSE} +annotions_clean_unfiltered_hg19 <- annotation_hg19 %>% + dplyr::filter(Probe_ID %in% rownames(Betas_clean_unfiltered_quantile_bmiq_combated)) +save(annotions_clean_unfiltered_hg19, file = paste0(user_choices$project_name, "/reports/annotions_clean_unfiltered_hg19.Rdata")) +``` + +```{r, create pseudo-EPICv1 version for filtered data, include=FALSE} +# create annotation only containing filtered CpGs that are also on v1 and exclude any duplicated loci +annotations_clean_filtered_pseudo_v1 <- annotations_clean_filtered %>% + filter(Name %in% rownames(Betas_clean_filtered_quantile_bmiq_combated)) %>% + filter(EPICv1_Loci != "") %>% + filter(duplicated(EPICv1_Loci) == FALSE) + +# only keep betas that are on v1 and exclude any duplicated loci (keep first CpG) +keep_betas <- (rownames(Betas_clean_filtered_quantile_bmiq_combated) %in% annotations_clean_filtered_pseudo_v1$Name) +Betas_clean_filtered_quantile_bmiq_combated_pseudo_v1 <- Betas_clean_filtered_quantile_bmiq_combated[keep_betas,] +# order Beta rownames after annotation +Betas_clean_filtered_quantile_bmiq_combated_pseudo_v1 <- Betas_clean_filtered_quantile_bmiq_combated_pseudo_v1[match(annotations_clean_filtered_pseudo_v1$Name, rownames(Betas_clean_filtered_quantile_bmiq_combated_pseudo_v1)),] +# rename Beta rownames +rownames(Betas_clean_filtered_quantile_bmiq_combated_pseudo_v1) <- annotations_clean_filtered_pseudo_v1$EPICv1_Loci + +# save Betas +save(Betas_clean_filtered_quantile_bmiq_combated_pseudo_v1, file = paste0(user_choices$project_name, "/processed_data/final_data/Betas_clean_filtered_quantile_bmiq_combated_pseudo_v1.Rdata")) +``` + +```{r, info output for filtered pseudo-EPICv1 version, results='asis'} + keep_betas_df <- as.data.frame(keep_betas) + cat(paste0(keep_betas_df %>% filter(keep_betas == FALSE) %>% nrow(), + " filtered CpGs were not found on EPICv1 and therefore excluded"), sep = "
\n") + + dim_Betas_filtered <- dim(Betas_clean_quantile_bmiq_filtered_combated_pseudo_v1) + step_number <- c("14") + step <- c("Create pseudo-EPICv1 version (filtered data)") + data_class <- c("Betas") + samples <- c(dim_Betas_filtered[2]) + probes <- c(dim_Betas_filtered[1]) + + table_preprocessing_adding <- data.frame(step_number, step, data_class, samples, probes) + summary_table_preprocessing <- bind_rows(summary_table_preprocessing, table_preprocessing_adding) + +``` + +```{r, create pseudo-EPICv1 version for unfiltered data, include=FALSE} +# create annotation only containing unfiltered CpGs that are also on v1 and exclude any duplicated loci +annotations_clean_unfiltered_pseudo_v1 <- annotations_clean_unfiltered %>% + filter(Name %in% rownames(Betas_clean_unfiltered_quantile_bmiq_combated)) %>% + filter(EPICv1_Loci != "") %>% + filter(duplicated(EPICv1_Loci) == FALSE) + +# only keep betas that are on v1 and exclude any duplicated loci (keep first CpG) +keep_betas <- (rownames(Betas_clean_unfiltered_quantile_bmiq_combated) %in% annotations_clean_unfiltered_pseudo_v1$Name) +Betas_clean_unfiltered_quantile_bmiq_combated_pseudo_v1 <- Betas_clean_unfiltered_quantile_bmiq_combated[keep_betas,] +# order Beta rownames after annotation +Betas_clean_unfiltered_quantile_bmiq_combated_pseudo_v1 <- Betas_clean_unfiltered_quantile_bmiq_combated_pseudo_v1[match(annotations_clean_unfiltered_pseudo_v1$Name, rownames(Betas_clean_unfiltered_quantile_bmiq_combated_pseudo_v1)),] +# rename Beta rownames +rownames(Betas_clean_unfiltered_quantile_bmiq_combated_pseudo_v1) <- annotations_clean_unfiltered_pseudo_v1$EPICv1_Loci + +# save Betas +save(Betas_clean_unfiltered_quantile_bmiq_combated_pseudo_v1, file = paste0(user_choices$project_name, "/processed_data/final_data/Betas_clean_unfiltered_quantile_bmiq_combated_pseudo_v1.Rdata")) +``` + +```{r, info output for unfiltered pseudo-EPICv1 version, results='asis'} + keep_betas_df <- as.data.frame(keep_betas) + cat(paste0(keep_betas_df %>% filter(keep_betas == FALSE) %>% nrow(), + " unfiltered CpGs were not found on EPICv1 and therefore excluded"), sep = "
\n") + + dim_Betas_unfiltered <- dim(Betas_clean_quantile_bmiq_unfiltered_combated_pseudo_v1) + step_number <- c("15") + step <- c("Create pseudo-EPICv1 version (unfiltered data)") + data_class <- c("Betas") + samples <- c(dim_Betas_unfiltered[2]) + probes <- c(dim_Betas_unfiltered[1]) + + table_preprocessing_adding <- data.frame(step_number, step, data_class, samples, probes) + summary_table_preprocessing <- bind_rows(summary_table_preprocessing, table_preprocessing_adding) +``` + +## User report + +```{r, info output for user, results='asis'} +cat("Session info text file was updated", sep = "
\n") +``` + +## Preprocessing summary table + +```{r, preprocessing summary table} +summary_table_preprocessing %>% paged_table() +``` + +```{r, document session info into a text file, include=FALSE} +connection <- file(paste0(user_choices$personal_path, "/", user_choices$project_name, + "/session_info_all.txt"), "a+") +writeLines("######################################################################", connection) +writeLines("############################ Script 6: #############################", connection) +writeLines("######################################################################", connection) +sessioninfo <- sessionInfo() +writeLines("\nR Version:", connection) +writeLines(paste0(" ", sessioninfo$R.version$version.string), connection) +writeLines("\nAttached packages:", connection) +# nicely format packages (with version) +package_version <- unlist(lapply(sessioninfo$otherPkgs, function(x){paste0(" ",x$Package, " (", x$Version, ")")})) +names(package_version) <- NULL +for(i in 1:length(package_version)) { + writeLines(package_version[i], connection) +} +writeLines("\n", connection) +close(connection) +``` + +```{r runtime end, include=FALSE} +end_time <- Sys.time() +run_time <- as.numeric(round((end_time - start_time), 1), units = "mins") +``` + +## Completion time + +```{r, completion time, results = 'asis'} +cat("The time for the script to run was:", run_time, "minutes", sep = " ") +```