diff --git a/scripts/6_filtering_cpgs.Rmd b/scripts/6_filtering_cpgs.Rmd
index 5319162..865c403 100644
--- a/scripts/6_filtering_cpgs.Rmd
+++ b/scripts/6_filtering_cpgs.Rmd
@@ -25,25 +25,29 @@ if (!("minfi" %in% rownames(installed.packages()))) {
}
library(minfi)
-knitr::opts_knit$set(root.dir = paste0(user_choices$personal_path, "/"))
knitr::opts_chunk$set(echo = FALSE)
```
-```{r load data, include=FALSE}
-summary_table_preprocessing <- readRDS(paste0(user_choices$project_name, "/reports/summary_table_preprocessing.rds"))
-load(paste0(user_choices$project_name, "/processed_data/RGSet_clean.Rdata"))
-load(paste0(user_choices$project_name, "/processed_data/gRatioSet_clean.Rdata"))
-load(paste0(user_choices$project_name, "/processed_data/Betas_clean.Rdata"))
-load(paste0(user_choices$project_name, "/processed_data/PhenoData_clean.Rdata"))
-load(paste0(user_choices$project_name, "/processed_data/detP_clean.Rdata"))
+```{r load data 1, include=FALSE}
# Info about cross-hybridizing probes comes from Chen et al. 2013, DOI: 10.4161/epi.23470:
-load("data/ChenProbeIDs.rdata")
+load("../data/ChenProbeIDs.rdata")
# Info about mapping inaccuracies for EPIC v2.0 from Illumina product information:
if (user_choices$array_type == "v2") {
v2_mapping_inaccuracy <- read.csv("../data/EPIC-8v2-0_A1-190MappingInaccuracies.csv")}
# Info about flagged probes for EPIC v2.0 from Illumina product information:
if (user_choices$array_type == "v2") {
v2_flagged_probes <- read.csv("../data/EPIC-8v2-0_A1-FlaggedProbes.csv")}
+
+knitr::opts_knit$set(root.dir = paste0(user_choices$personal_path, "/"))
+```
+
+```{r load data 2, include=FALSE}
+summary_table_preprocessing <- readRDS(paste0(user_choices$project_name, "/reports/summary_table_preprocessing.rds"))
+load(paste0(user_choices$project_name, "/processed_data/RGSet_clean.Rdata"))
+load(paste0(user_choices$project_name, "/processed_data/gRatioSet_clean.Rdata"))
+load(paste0(user_choices$project_name, "/processed_data/Betas_clean.Rdata"))
+load(paste0(user_choices$project_name, "/processed_data/PhenoData_clean.Rdata"))
+load(paste0(user_choices$project_name, "/processed_data/detP_clean.Rdata"))
```
# Description
@@ -127,13 +131,13 @@ if (user_choices$array_type == "v2") {
keep_betas_df <- as.data.frame(keep_betas)
cat(paste0(nrow(exclude_replicates), " replicate probes were removed"))
- dim_gRatioSet_filtered <- dim(gRatioSet_clean)
- dim_Betas_filtered <- dim(Betas_clean)
+ dim_gRatioSet_clean_filtered <- dim(gRatioSet_clean)
+ dim_Betas_clean_filtered <- dim(Betas_clean)
step_number <- c("4", "4")
step <- c("Filter replicates", "Filter replicates")
data_class <- c("gRatioSet", "Betas")
- samples <- c(dim_gRatioSet_filtered[2], dim_Betas_filtered[2])
- probes <- c(dim_gRatioSet_filtered[1], dim_Betas_filtered[1])
+ samples <- c(dim_gRatioSet_clean_filtered[2], dim_Betas_clean_filtered[2])
+ probes <- c(dim_gRatioSet_clean_filtered[1], dim_Betas_clean_filtered[1])
table_preprocessing_adding <- data.frame(step_number, step, data_class, samples, probes)
summary_table_preprocessing <- bind_rows(summary_table_preprocessing, table_preprocessing_adding)
@@ -150,7 +154,7 @@ Betas_clean_filtered <- Betas_clean[keep_betas,]
# ensure probes are in the same order in cleaned gRatioSet (RGSet with annotations) and detP objects, then subset
detP_clean_filtered2 <- detP_clean[match(featureNames(gRatioSet_clean),rownames(detP_clean)),]
-keep_gratioset <- rowSums(detP_clean_filtered < 0.01) == ncol(gRatioSet_clean)
+keep_gratioset <- rowSums(detP_clean_filtered2 < 0.01) == ncol(gRatioSet_clean)
gRatioSet_clean_filtered <- gRatioSet_clean[keep_gratioset,]
```
@@ -159,13 +163,13 @@ keep_betas_df <- as.data.frame(keep_betas)
cat(paste0(keep_betas_df %>% filter(keep_betas == FALSE) %>% nrow(),
" probes failed in one or more samples"), sep = "
\n")
-dim_gRatioSet_filtered <- dim(gRatioSet_clean_filtered)
-dim_Betas_filtered <- dim(Betas_clean_filtered)
+dim_gRatioSet_clean_filtered <- dim(gRatioSet_clean_filtered)
+dim_Betas_clean_filtered <- dim(Betas_clean_filtered)
step_number <- c("5", "5")
step <- c("Filter technical", "Filter technical")
data_class <- c("gRatioSet", "Betas")
-samples <- c(dim_gRatioSet_filtered[2], dim_Betas_filtered[2])
-probes <- c(dim_gRatioSet_filtered[1], dim_Betas_filtered[1])
+samples <- c(dim_gRatioSet_clean_filtered[2], dim_Betas_clean_filtered[2])
+probes <- c(dim_gRatioSet_clean_filtered[1], dim_Betas_clean_filtered[1])
table_preprocessing_adding <- data.frame(step_number, step, data_class, samples, probes)
summary_table_preprocessing <- bind_rows(summary_table_preprocessing, table_preprocessing_adding)
@@ -188,13 +192,13 @@ keep_betas_df <- as.data.frame(keep_betas)
cat(paste0(keep_betas_df %>% filter(keep_betas == FALSE) %>% nrow(),
" probes were on the X chromosome"), sep = "
\n")
-dim_gRatioSet_filtered <- dim(gRatioSet_clean_filtered)
-dim_Betas_filtered <- dim(Betas_clean_filtered)
+dim_gRatioSet_clean_filtered <- dim(gRatioSet_clean_filtered)
+dim_Betas_clean_filtered <- dim(Betas_clean_filtered)
step_number <- c("6", "6")
step <- c("Filter X Chromosome", "Filter X Chromosome")
data_class <- c("gRatioSet", "Betas")
-samples <- c(dim_gRatioSet_filtered[2], dim_Betas_filtered[2])
-probes <- c(dim_gRatioSet_filtered[1], dim_Betas_filtered[1])
+samples <- c(dim_gRatioSet_clean_filtered[2], dim_Betas_clean_filtered[2])
+probes <- c(dim_gRatioSet_clean_filtered[1], dim_Betas_clean_filtered[1])
table_preprocessing_adding <- data.frame(step_number, step, data_class, samples, probes)
summary_table_preprocessing <- bind_rows(summary_table_preprocessing, table_preprocessing_adding)
@@ -215,13 +219,13 @@ keep_betas_df <- as.data.frame(keep_betas)
cat(paste0(keep_betas_df %>% filter(keep_betas == FALSE) %>% nrow(),
" probes were on the Y chromosome"), sep = "
\n")
-dim_gRatioSet_filtered <- dim(gRatioSet_clean_filtered)
-dim_Betas_filtered <- dim(Betas_clean_filtered)
+dim_gRatioSet_clean_filtered <- dim(gRatioSet_clean_filtered)
+dim_Betas_clean_filtered <- dim(Betas_clean_filtered)
step_number <- c("7", "7")
step <- c("Filter Y Chromosome", "Filter Y Chromosome")
data_class <- c("gRatioSet", "Betas")
-samples <- c(dim_gRatioSet_filtered[2], dim_Betas_filtered[2])
-probes <- c(dim_gRatioSet_filtered[1], dim_Betas_filtered[1])
+samples <- c(dim_gRatioSet_clean_filtered[2], dim_Betas_clean_filtered[2])
+probes <- c(dim_gRatioSet_clean_filtered[1], dim_Betas_clean_filtered[1])
table_preprocessing_adding <- data.frame(step_number, step, data_class, samples, probes)
summary_table_preprocessing <- bind_rows(summary_table_preprocessing, table_preprocessing_adding)
@@ -235,13 +239,13 @@ Betas_clean_filtered <- Betas_clean_filtered[rownames(Betas_clean_filtered) %in%
```
```{r, info output for user probes affected by snps, results='asis'}
-dim_gRatioSet_filtered <- dim(gRatioSet_clean_filtered)
-dim_Betas_filtered <- dim(gRatioSet_clean_filtered)
+dim_gRatioSet_clean_filtered <- dim(gRatioSet_clean_filtered)
+dim_Betas_clean_filtered <- dim(gRatioSet_clean_filtered)
step_number <- c("8", "8")
step <- c("Filter SNP Affected", "Filter SNP Affected")
data_class <- c("gRatioSet", "Betas")
-samples <- c(dim_gRatioSet_filtered[2], dim_Betas_filtered[2])
-probes <- c(dim_gRatioSet_filtered[1], dim_Betas_filtered[1])
+samples <- c(dim_gRatioSet_clean_filtered[2], dim_Betas_clean_filtered[2])
+probes <- c(dim_gRatioSet_clean_filtered[1], dim_Betas_clean_filtered[1])
table_preprocessing_adding <- data.frame(step_number, step, data_class, samples, probes)
summary_table_preprocessing <- bind_rows(summary_table_preprocessing, table_preprocessing_adding)
@@ -283,13 +287,13 @@ if (user_choices$array_type == "v1") {
cat(paste0(keep_betas_df %>% filter(keep_betas == FALSE) %>% nrow(),
" probes were cross-hybridizing according to Chen et al."), sep = "
\n")
- dim_gRatioSet_filtered <- dim(gRatioSet_clean_filtered)
- dim_Betas_filtered <- dim(Betas_clean_filtered)
+ dim_gRatioSet_clean_filtered <- dim(gRatioSet_clean_filtered)
+ dim_Betas_clean_filtered <- dim(Betas_clean_filtered)
step_number <- c("9", "9")
step <- c("Filter Cross-Hybridizing Chen", "Filter Cross-Hybridizing Chen")
data_class <- c("gRatioSet", "Betas")
- samples <- c(dim_gRatioSet_filtered[2], dim_Betas_filtered[2])
- probes <- c(dim_gRatioSet_filtered[1], dim_Betas_filtered[1])
+ samples <- c(dim_gRatioSet_clean_filtered[2], dim_Betas_clean_filtered[2])
+ probes <- c(dim_gRatioSet_clean_filtered[1], dim_Betas_clean_filtered[1])
table_preprocessing_adding <- data.frame(step_number, step, data_class, samples, probes)
summary_table_preprocessing <- bind_rows(summary_table_preprocessing, table_preprocessing_adding)
@@ -305,8 +309,8 @@ if (user_choices$array_type == "v1") {
keep_betas <- !(rownames(Betas_clean_filtered) %in% exclude_McCartney$V1)
Betas_clean_filtered <- Betas_clean_filtered[keep_betas,]
- keep_gratioset <- !(featureNames(dim_gRatioSet_filtered) %in% exclude_McCartney$V1)
- dim_gRatioSet_filtered <- dim_gRatioSet_filtered[keep_gratioset,]
+ keep_gratioset <- !(featureNames(dim_gRatioSet_clean_filtered) %in% exclude_McCartney$V1)
+ gRatioSet_clean_filtered <- gRatioSet_clean_filtered[keep_gratioset,]
}
```
@@ -316,13 +320,13 @@ if (user_choices$array_type == "v1") {
cat(paste0(keep_betas_df %>% filter(keep_betas == FALSE) %>% nrow(),
" probes may be cross-hybridizing according to McCartney et al."), sep = "
\n")
- dim_gRatioSet_filtered <- dim(gRatioSet_clean_filtered)
- dim_Betas_filtered <- dim(Betas_clean_filtered)
+ dim_gRatioSet_clean_filtered <- dim(gRatioSet_clean_filtered)
+ dim_Betas_clean_filtered <- dim(Betas_clean_filtered)
step_number <- c("10", "10")
step <- c("Filter Cross-Hybridizing McCartney", "Filter Cross-Hybridizing McCartney")
data_class <- c("gRatioSet", "Betas")
- samples <- c(dim_gRatioSet_filtered[2], dim_Betas_filtered[2])
- probes <- c(dim_gRatioSet_filtered[1], dim_Betas_filtered[1])
+ samples <- c(dim_gRatioSet_clean_filtered[2], dim_Betas_clean_filtered[2])
+ probes <- c(dim_gRatioSet_clean_filtered[1], dim_Betas_clean_filtered[1])
table_preprocessing_adding <- data.frame(step_number, step, data_class, samples, probes)
summary_table_preprocessing <- bind_rows(summary_table_preprocessing, table_preprocessing_adding)
@@ -351,13 +355,13 @@ if (user_choices$array_type == "v1") {
cat(paste0(keep_betas_df %>% filter(keep_betas == FALSE) %>% nrow(),
" probes were polymorphic"), sep = "
\n")
- dim_gRatioSet_filtered <- dim(gRatioSet_clean_filtered)
- dim_Betas_filtered <- dim(Betas_clean_filtered)
+ dim_gRatioSet_clean_filtered <- dim(gRatioSet_clean_filtered)
+ dim_Betas_clean_filtered <- dim(Betas_clean_filtered)
step_number <- c("11", "11")
step <- c("Filter Polymorphic", "Filter Polymorphic")
data_class <- c("gRatioSet", "Betas")
- samples <- c(dim_gRatioSet_filtered[2], dim_Betas_filtered[2])
- probes <- c(dim_gRatioSet_filtered[1], dim_Betas_filtered[1])
+ samples <- c(dim_gRatioSet_clean_filtered[2], dim_Betas_clean_filtered[2])
+ probes <- c(dim_gRatioSet_clean_filtered[1], dim_Betas_clean_filtered[1])
table_preprocessing_adding <- data.frame(step_number, step, data_class, samples, probes)
summary_table_preprocessing <- bind_rows(summary_table_preprocessing, table_preprocessing_adding)
@@ -369,10 +373,10 @@ saveRDS(summary_table_preprocessing, paste0(user_choices$project_name, "/reports
```{r, filter out mapping inaccuracies, include=FALSE}
if (user_choices$array_type == "v2") {
- keep_betas <- !(rownames(Betas_clean_filtered) %in% v2_mapping_inacc$IlmnID)
+ keep_betas <- !(rownames(Betas_clean_filtered) %in% v2_mapping_inaccuracy$IlmnID)
Betas_clean_filtered <- Betas_clean_filtered[keep_betas,]
- keep_gratioset <- !(featureNames(gRatioSet_clean_filtered) %in% v2_mapping_inacc$IlmnID)
+ keep_gratioset <- !(featureNames(gRatioSet_clean_filtered) %in% v2_mapping_inaccuracy$IlmnID)
gRatioSet_clean_filtered <- gRatioSet_clean_filtered[keep_gratioset,]
}
```
@@ -383,13 +387,13 @@ if (user_choices$array_type == "v2") {
cat(paste0(keep_betas_df %>% filter(keep_betas == FALSE) %>% nrow(),
" CpGs show known mapping inaccuracies"), sep = "
\n")
- dim_gRatioSet_filtered <- dim(gRatioSet_clean_filtered)
- dim_Betas_filtered <- dim(Betas_clean_filtered)
+ dim_gRatioSet_clean_filtered <- dim(gRatioSet_clean_filtered)
+ dim_Betas_clean_filtered <- dim(Betas_clean_filtered)
step_number <- c("12", "12")
step <- c("Filter Mapping Inaccuracies", "Filter Mapping Inaccuracies")
data_class <- c("gRatioSet", "Betas")
- samples <- c(dim_gRatioSet_filtered[2], dim_Betas_filtered[2])
- probes <- c(dim_gRatioSet_filtered[1], dim_Betas_filtered[1])
+ samples <- c(dim_gRatioSet_clean_filtered[2], dim_Betas_clean_filtered[2])
+ probes <- c(dim_gRatioSet_clean_filtered[1], dim_Betas_clean_filtered[1])
table_preprocessing_adding <- data.frame(step_number, step, data_class, samples, probes)
summary_table_preprocessing <- bind_rows(summary_table_preprocessing, table_preprocessing_adding)
@@ -414,13 +418,13 @@ if (user_choices$array_type == "v2") {
cat(paste0(keep_betas_df %>% filter(keep_betas == FALSE) %>% nrow(),
" CpGs were flagged by Illumina"), sep = "
\n")
- dim_gRatioSet_filtered <- dim(gRatioSet_clean_filtered)
- dim_Betas_filtered <- dim(Betas_clean_filtered)
+ dim_gRatioSet_clean_filtered <- dim(gRatioSet_clean_filtered)
+ dim_Betas_clean_filtered <- dim(Betas_clean_filtered)
step_number <- c("13", "13")
step <- c("Filter Flagged Probes", "Filter Flagged Probes")
data_class <- c("gRatioSet", "Betas")
- samples <- c(dim_gRatioSet_filtered[2], dim_Betas_filtered[2])
- probes <- c(dim_gRatioSet_filtered[1], dim_Betas_filtered[1])
+ samples <- c(dim_gRatioSet_clean_filtered[2], dim_Betas_clean_filtered[2])
+ probes <- c(dim_gRatioSet_clean_filtered[1], dim_Betas_clean_filtered[1])
table_preprocessing_adding <- data.frame(step_number, step, data_class, samples, probes)
summary_table_preprocessing <- bind_rows(summary_table_preprocessing, table_preprocessing_adding)
@@ -428,7 +432,7 @@ if (user_choices$array_type == "v2") {
```
```{r, save data, include=FALSE}
-save(gRatioSet_filtered, file = paste0(user_choices$project_name, "/processed_data/gRatioSet_clean_filtered.Rdata"))
+save(gRatioSet_clean_filtered, file = paste0(user_choices$project_name, "/processed_data/gRatioSet_clean_filtered.Rdata"))
Betas_clean_filtered <- getBeta(gRatioSet_clean_filtered) # beta values
save(Betas_clean_filtered, file = paste0(user_choices$project_name, "/processed_data/Betas_clean_filtered.Rdata"))