From e2e8455091de771b995819f1a05b361b736a0c39 Mon Sep 17 00:00:00 2001 From: Natan Yusupov Date: Fri, 12 Apr 2024 09:43:51 +0200 Subject: [PATCH] adapting conseauences of filtering before normalization --- scripts/2_import_raw_DNAm_data.Rmd | 1 + scripts/3_quality_control.Rmd | 3 +++ scripts/4_check_matching_of_epigenetic_sex.Rmd | 6 +++--- scripts/5_samples_exclusion.Rmd | 4 ++++ scripts/6_filtering_cpgs.Rmd | 15 +++++++-------- scripts/7_normalization.Rmd | 18 ++++++++++++------ 6 files changed, 30 insertions(+), 17 deletions(-) diff --git a/scripts/2_import_raw_DNAm_data.Rmd b/scripts/2_import_raw_DNAm_data.Rmd index 7569720..b4abfc6 100644 --- a/scripts/2_import_raw_DNAm_data.Rmd +++ b/scripts/2_import_raw_DNAm_data.Rmd @@ -116,6 +116,7 @@ saveRDS(summary_table_preprocessing, RGSet_original Mset_original RatioSet_original +gRatioSet_original ``` ## User report diff --git a/scripts/3_quality_control.Rmd b/scripts/3_quality_control.Rmd index 51f0f90..6145248 100644 --- a/scripts/3_quality_control.Rmd +++ b/scripts/3_quality_control.Rmd @@ -29,6 +29,7 @@ knitr::opts_chunk$set(echo = FALSE) ```{r load data, include=FALSE} load(paste0(user_choices$project_name, "/processed_data/RGSet_original.Rdata")) load(paste0(user_choices$project_name, "/processed_data/Betas_original.Rdata")) +load(paste0(user_choices$project_name, "/processed_data/gRatioSet_original.Rdata")) phenotype_data <- read.csv(paste0(user_choices$project_name, "/processed_data/phenotype_data.csv"), sep = ",") summary_table_preprocessing <- readRDS(paste0(user_choices$project_name, "/reports/summary_table_preprocessing.rds")) ``` @@ -97,6 +98,8 @@ keep <- colMeans(detP_original) < user_choices$detP_threshold RGSet_quality <- RGSet_original[,keep] save(RGSet_quality, file = paste0(user_choices$project_name, "/processed_data/RGSet_quality.Rdata")) +gRatioSet_quality <- gRatioSet_original[,keep] +save(gRatioSet_quality, file = paste0(user_choices$project_name, "/processed_data/gRatioSet_quality.Rdata")) Betas_quality <- Betas_original[,keep] save(Betas_quality, file = paste0(user_choices$project_name, "/processed_data/Betas_quality.Rdata")) detP_quality <- detP_original[,keep] diff --git a/scripts/4_check_matching_of_epigenetic_sex.Rmd b/scripts/4_check_matching_of_epigenetic_sex.Rmd index 7554c19..8407c04 100644 --- a/scripts/4_check_matching_of_epigenetic_sex.Rmd +++ b/scripts/4_check_matching_of_epigenetic_sex.Rmd @@ -27,7 +27,7 @@ knitr::opts_chunk$set(echo = FALSE) ``` ```{r load data, include=FALSE} -load(paste0(user_choices$project_name, "/processed_data/gRatioSet_original.Rdata")) +load(paste0(user_choices$project_name, "/processed_data/gRatioSet_quality.Rdata")) phenotype_data <- read.csv(paste0(user_choices$project_name, "/processed_data/phenotype_data.csv"), sep = ",") ``` @@ -35,7 +35,7 @@ phenotype_data <- read.csv(paste0(user_choices$project_name, "/processed_data/ph This script: -* predicts sex with DNA methylation data (gRatioSet_original, chip-wide medians +* predicts sex with DNA methylation data (gRatioSet_quality, chip-wide medians of measurements on sex chromosomes) * compares epigenetically predicted sex with phenotype information * reports samples with mismatches. These will be excluded in script 5 @@ -50,7 +50,7 @@ fail the sex match check ## Prediction of sex with DNA methylation data ```{r, predict sex with DNAm data, include= FALSE} -predictedSex <- getSex(gRatioSet_original, cutoff=-2) +predictedSex <- getSex(gRatioSet_quality, cutoff=-2) sex_df <- phenotype_data %>% select(personid, arrayid, reported_sex = sex) %>% diff --git a/scripts/5_samples_exclusion.Rmd b/scripts/5_samples_exclusion.Rmd index 736b938..2395476 100644 --- a/scripts/5_samples_exclusion.Rmd +++ b/scripts/5_samples_exclusion.Rmd @@ -31,6 +31,7 @@ detP_mean_fails_df <- readRDS(paste0(user_choices$project_name, "/reports/detP_m sex_df <- readRDS(paste0(user_choices$project_name, "/reports/predicted_sex.rds")) summary_table_preprocessing <- readRDS(paste0(user_choices$project_name, "/reports/summary_table_preprocessing.rds")) load(paste0(user_choices$project_name, "/processed_data/RGSet_quality.Rdata")) +load(paste0(user_choices$project_name, "/processed_data/gRatioSet_quality.Rdata")) load(paste0(user_choices$project_name, "/processed_data/Betas_quality.Rdata")) load(paste0(user_choices$project_name, "/reports/detP_quality.Rdata")) ``` @@ -93,16 +94,19 @@ saveRDS(exclusion_before_norm_vector, paste0(user_choices$project_name, "/report # subset objects if (!identical(exclusion_arrayids, NULL)){ RGSet_clean <- RGSet_quality[ , !(colnames(RGSet_quality) %in% exclusion_arrayids)] + gRatioSet_clean <- gRatioSet_quality[ , !(colnames(gRatioSet_quality) %in% exclusion_arrayids)] Betas_clean <- Betas_quality[ , !(colnames(Betas_quality) %in% exclusion_arrayids)] detP_clean <- detP_quality[, !(colnames(detP_quality) %in% exclusion_arrayids)] } else { RGSet_clean <- RGSet_quality + gRatioSet_clean <- gRatioSet_quality Betas_clean <- Betas_quality detP_clean <- detP_quality } PhenoData_clean <- pData(RGSet_clean) # save objects save(RGSet_clean, file = paste0(user_choices$project_name, "/processed_data/RGSet_clean.Rdata")) +save(gRatioSet_clean, file = paste0(user_choices$project_name, "/processed_data/gRatioSet_clean.Rdata")) save(Betas_clean, file = paste0(user_choices$project_name, "/processed_data/Betas_clean.Rdata")) save(detP_clean, file = paste0(user_choices$project_name, "/processed_data/detP_clean.Rdata")) save(PhenoData_clean, file = paste0(user_choices$project_name, "/processed_data/PhenoData_clean.Rdata")) diff --git a/scripts/6_filtering_cpgs.Rmd b/scripts/6_filtering_cpgs.Rmd index dfe4e50..4f1786c 100644 --- a/scripts/6_filtering_cpgs.Rmd +++ b/scripts/6_filtering_cpgs.Rmd @@ -29,6 +29,7 @@ knitr::opts_chunk$set(echo = FALSE) ```{r load data, include=FALSE} summary_table_preprocessing <- readRDS(paste0(user_choices$project_name, "/reports/summary_table_preprocessing.rds")) load(paste0(user_choices$project_name, "/processed_data/RGSet_clean.Rdata")) +load(paste0(user_choices$project_name, "/processed_data/gRatioSet_clean.Rdata")) load(paste0(user_choices$project_name, "/processed_data/Betas_clean.Rdata")) load(paste0(user_choices$project_name, "/processed_data/PhenoData_clean.Rdata")) load(paste0(user_choices$project_name, "/processed_data/detP_clean.Rdata")) @@ -62,10 +63,10 @@ detP_clean_filtered <- detP_clean[match(rownames(Betas_clean),rownames(detP_clea keep_betas <- rowSums(detP_clean_filtered < 0.01) == ncol(Betas_clean) Betas_clean_filtered <- Betas_clean[keep_betas,] -# ensure probes are in the same order in cleaned RGChannelSet and detP objects, then subset -detP_clean_filtered2 <- detP_clean[match(featureNames(RGSet_clean),rownames(detP_clean)),] -keep_rgset <- rowSums(detP_clean_filtered2 < 0.01) == ncol(RGSet_clean) -RGSet_clean_filtered <- RGSet_clean[keep_rgset,] +# ensure probes are in the same order in cleaned gRatioSet (RGSet with annotations) and detP objects, then subset +detP_clean_filtered2 <- detP_clean[match(featureNames(gRatioSet_clean),rownames(detP_clean)),] +keep_rgset <- rowSums(detP_clean_filtered < 0.01) == ncol(gRatioSet_clean) +RGSet_clean_filtered <- gRatioSet_clean[keep_rgset,] ``` ```{r, info output for user failed probes, results='asis'} @@ -146,8 +147,7 @@ summary_table_preprocessing <- bind_rows(summary_table_preprocessing, table_prep ```{r filter out probes affected by snps, include=FALSE} if (user_choices$array_type == "v1") { RGSet_clean_filtered <- dropLociWithSnps(RGSet_clean_filtered) # function requires a GenomicRatioSet - - Betas_clean_filtered <- Betas_clean_filtered[rownames(Betas_clean_filtered) %in% featureNames(RGSet_clean_filtered),] + Betas_clean_filtered <- Betas_clean_filtered[rownames(Betas_clean_filtered) %in% featureNames(RGSet_clean_filtered),] } ``` @@ -284,13 +284,12 @@ saveRDS(summary_table_preprocessing, paste0(user_choices$project_name, "/reports ``` ```{r, save data, include=FALSE} -save(Betas_clean_filtered, file = paste0(user_choices$project_name, "/processed_data/Betas_clean_filtered.Rdata")) save(RGSet_clean_filtered, file = paste0(user_choices$project_name, "/processed_data/RGSet_clean_filtered.Rdata")) Betas_clean_filtered <- getBeta(RGSet_clean_filtered) # beta values save(Betas_clean_filtered, file = paste0(user_choices$project_name, "/processed_data/Betas_clean_filtered.Rdata")) -Ms_clean_filtered <- getM(RGSet_clean_filtered) +Ms_clean_filtered <- getM(RGSet_clean_filtered) # M values save(Ms_clean_filtered, file = paste0(user_choices$project_name, "/processed_data/Ms_clean_filtered.Rdata")) ``` diff --git a/scripts/7_normalization.Rmd b/scripts/7_normalization.Rmd index 99ad304..ed5c206 100644 --- a/scripts/7_normalization.Rmd +++ b/scripts/7_normalization.Rmd @@ -73,6 +73,12 @@ save(annotations_clean_filtered, file = paste0(user_choices$project_name, "/repo ```{r normalize probes, include=FALSE} # Normalization of filtered data + +# Matrix of probes removed in all prior filtering steps +exclusion_matrix <- Betas_clean[!rownames(Betas_clean) %in% rownames(Betas_clean_filtered), ] +# Exclude all probes filtered script 5 from steps before to improve preprocessing +RGSet_clean_filtered <- subsetByLoci(RGSet_clean, excludeLoci = rownames(exclusion_matrix)) + RGSet_clean_filtered_quantile <- preprocessQuantile(RGSet_clean_filtered) save(RGSet_clean_filtered_quantile, file = paste0(user_choices$project_name, "/processed_data/RGSet_clean_filtered_quantile.Rdata")) # output: GenomicRatioSet @@ -80,9 +86,9 @@ save(RGSet_clean_filtered_quantile, file = paste0(user_choices$project_name, Betas_clean_filtered_quantile <- getBeta(RGSet_clean_filtered_quantile) save(Betas_clean_filtered_quantile, file = paste0(user_choices$project_name, "/processed_data/Betas_clean_filtered_quantile.Rdata")) -Mset_clean_filtered_qunatile <- getM(RGSet_clean_filtered_quantile) -save(Mset_clean_filtered_qunatile, file = paste0(user_choices$project_name, - "/processed_data/Mset_clean_filtered_qunatile.Rdata")) +Ms_clean_filtered_qunatile <- getM(RGSet_clean_filtered_quantile) +save(Ms_clean_filtered_qunatile, file = paste0(user_choices$project_name, + "/processed_data/Ms_clean_filtered_qunatile.Rdata")) # further normalization with BMIQ: probeType <- as.data.frame(annotations_clean_filtered[rownames(Betas_clean_filtered_quantile),c("Name","Type")]) probeType$probeType = ifelse(probeType$Type %in% "I", 1, 2) @@ -99,9 +105,9 @@ save(RGSet_clean_unfiltered_quantile, file = paste0(user_choices$project_name, Betas_clean_unfiltered_quantile <- getBeta(RGSet_clean_unfiltered_quantile) save(Betas_clean_unfiltered_quantile, file = paste0(user_choices$project_name, "/processed_data/Betas_clean_unfiltered_quantile.Rdata")) -Mset_clean_unfiltered_qunatile <- getM(RGSet_clean_unfiltered_quantile) -save(Mset_clean_unfiltered_qunatile, file = paste0(user_choices$project_name, - "/processed_data/Mset_clean_unfiltered_qunatile.Rdata")) +Ms_clean_unfiltered_qunatile <- getM(RGSet_clean_unfiltered_quantile) +save(Ms_clean_unfiltered_qunatile, file = paste0(user_choices$project_name, + "/processed_data/Ms_clean_unfiltered_qunatile.Rdata")) # further normalization with BMIQ: probeType <- as.data.frame(annotations_clean[rownames(Betas_clean_unfiltered_quantile),c("Name","Type")]) probeType$probeType = ifelse(probeType$Type %in% "I", 1, 2)