diff --git a/scripts/6_filtering_cpgs.Rmd b/scripts/6_filtering_cpgs.Rmd index b98aefb..55bdf32 100644 --- a/scripts/6_filtering_cpgs.Rmd +++ b/scripts/6_filtering_cpgs.Rmd @@ -48,23 +48,90 @@ v2_flagged_probes <- read.csv("../data/EPIC-8v2-0_A1-FlaggedProbes.csv")} # Description -This script filters out poor performing probes with unreliable signal from +This script annotates probes and filters out poor performing probes with unreliable signal from beta values and the genomic ratio set (RGSet with annotations). We will remove probes that: - +* have replicates and do not perform well (v2 only, keep replicate with best detP-value) * have technical failures in one or more samples (i.e. those with detection *p* value of probes > 0.01) * are on sex chromosomes * might be affected by common SNPs * cross-hybridize (i.e. those that map to multiple places in genome) as described by Chen et al. (v1.0 only) * cross-hybridize as described by McCartney et al. (v1.0 only) * are polymorphic (v1.0 only) -* show mapping inaccuracies or were flagged by Illumina (v2 only) -* have replicates and do not perform well (v2 only, keep replicate with best detP-value) +* show mapping inaccuracies or were flagged by Illumina (v2 only) +Please note: replicate removal is performed for the filtered AND unfiltered data (v2 only. ```{r get annotation of probes, include=FALSE} annotations_clean = getAnnotation(RGSet_clean) ``` +## Replicate removal +### Detect replicates (v2 only) + +```{r, detect replicates} +# add non-unique CpG name +CpG_names <- as.data.frame(rownames(Betas_clean)) +colnames(CpG_names) <- c("IlmnID") +CpG_names$CpG_name <- str_sub(CpG_names$IlmnID, 1, 10) +# isolate replicate IDs +replicates <- CpG_names %>% + group_by(CpG_name) %>% + filter(n()>1) %>% + distinct(CpG_name) + +# link to detP values +CpG_names_detP <- left_join(CpG_names, as.data.frame(cbind(rownames(detP_clean), rowMeans(detP_clean))), by = join_by("IlmnID" == "V1")) +colnames(CpG_names_detP) <- c("IlmnID", "CpG_name", "detP_value") + +cat(paste0(nrow(replicates), " replicate CpGs were detected")) +``` + +```{r, exclude replicates and save data, include = FALSE} +## create list of replicates with lowest detP values, if detPs are equal, take first replicate +keep_replicates <- CpG_names_detP %>% + filter(CpG_name %in% replicates$CpG_name) %>% + group_by(CpG_name) %>% + slice_min(detP_value) %>% + ungroup() +keep_replicates <- keep_replicates %>% + filter(duplicated(CpG_name) == FALSE) + +# create list of replicates to exclude (CpGs in replicate list but probe not in keep list) +exclude_replicates <- CpG_names_detP %>% + filter(CpG_name %in% replicates$CpG_name) %>% + filter(!(IlmnID %in% keep_replicates$IlmnID)) + +# exclude replicates from betas, RGSet, detP and save data +RGSet_clean <- subsetByLoci(RGSet_clean, excludeLoci = exclude_replicates$IlmnID) +save(RGSet_clean, file = paste0(user_choices$project_name, "/processed_data/RGSet_clean.Rdata")) + +keep_betas <- !(rownames(Betas_clean) %in% exclude_replicates$IlmnID) +Betas_clean <- Betas_clean[keep_betas,] +save(Betas_clean, file = paste0(user_choices$project_name, "/processed_data/Betas_clean.Rdata")) + +keep_detP <- !(rownames(detP_clean) %in% exclude_replicates$IlmnID) +detP_clean <- detP_clean[keep_detP,] +save(detP_clean, file = paste0(user_choices$project_name, "/processed_data/detP_clean.Rdata")) +``` + +### Exclude replicates + +```{r, info output for replicate removal, results='asis'} +keep_betas_df <- as.data.frame(keep_betas) +cat(paste0(nrow(exclude_replicates), " replicate probes were removed")) + +dim_RGSet_filtered <- dim(RGSet_clean) +dim_Betas_filtered <- dim(Betas_clean) +step_number <- c("4", "4") +step <- c("Filter replicates", "Filter replicates") +data_class <- c("RGSet", "Betas") +samples <- c(dim_RGSet_filtered[2], dim_Betas_filtered[2]) +probes <- c(dim_RGSet_filtered[1], dim_Betas_filtered[1]) + +table_preprocessing_adding <- data.frame(step_number, step, data_class, samples, probes) +summary_table_preprocessing <- bind_rows(summary_table_preprocessing, table_preprocessing_adding) +``` + ## Removal of failed probes in one or more samples ```{r filter out failed probes, include=FALSE} @@ -86,7 +153,7 @@ cat(paste0(keep_betas_df %>% filter(keep_betas == FALSE) %>% nrow(), dim_gRatioSet_filtered <- dim(gRatioSet_clean_filtered) dim_Betas_filtered <- dim(Betas_clean_filtered) -step_number <- c("4", "4") +step_number <- c("5", "5") step <- c("Filter technical", "Filter technical") data_class <- c("gRatioSet", "Betas") samples <- c(dim_gRatioSet_filtered[2], dim_Betas_filtered[2]) @@ -115,7 +182,7 @@ cat(paste0(keep_betas_df %>% filter(keep_betas == FALSE) %>% nrow(), dim_gRatioSet_filtered <- dim(gRatioSet_clean_filtered) dim_Betas_filtered <- dim(Betas_clean_filtered) -step_number <- c("5", "5") +step_number <- c("6", "6") step <- c("Filter X Chromosome", "Filter X Chromosome") data_class <- c("gRatioSet", "Betas") samples <- c(dim_gRatioSet_filtered[2], dim_Betas_filtered[2]) @@ -142,7 +209,7 @@ cat(paste0(keep_betas_df %>% filter(keep_betas == FALSE) %>% nrow(), dim_gRatioSet_filtered <- dim(gRatioSet_clean_filtered) dim_Betas_filtered <- dim(Betas_clean_filtered) -step_number <- c("6", "6") +step_number <- c("7", "7") step <- c("Filter Y Chromosome", "Filter Y Chromosome") data_class <- c("gRatioSet", "Betas") samples <- c(dim_gRatioSet_filtered[2], dim_Betas_filtered[2]) @@ -155,41 +222,38 @@ summary_table_preprocessing <- bind_rows(summary_table_preprocessing, table_prep ## Removal of probes possibly affected by common SNPs ```{r filter out probes affected by snps, include=FALSE} -if (user_choices$array_type == "v1") { - gRatioSet_clean_filtered <- dropLociWithSnps(gRatioSet_clean_filtered) # function requires a GenomicRatioSet - Betas_clean_filtered <- Betas_clean_filtered[rownames(Betas_clean_filtered) %in% featureNames(gRatioSet_clean_filtered),] -} +gRatioSet_clean_filtered <- dropLociWithSnps(gRatioSet_clean_filtered) # function requires a GenomicRatioSet +Betas_clean_filtered <- Betas_clean_filtered[rownames(Betas_clean_filtered) %in% featureNames(gRatioSet_clean_filtered),] ``` ```{r, info output for user probes affected by snps, results='asis'} -if (user_choices$array_type == "v1") { - dim_gRatioSet_filtered <- dim(gRatioSet_clean_filtered) - dim_Betas_filtered <- dim(gRatioSet_clean_filtered) - step_number <- c("7", "7") - step <- c("Filter SNP Affected", "Filter SNP Affected") - data_class <- c("gRatioSet", "Betas") - samples <- c(dim_gRatioSet_filtered[2], dim_Betas_filtered[2]) - probes <- c(dim_gRatioSet_filtered[1], dim_Betas_filtered[1]) +dim_gRatioSet_filtered <- dim(gRatioSet_clean_filtered) +dim_Betas_filtered <- dim(gRatioSet_clean_filtered) +step_number <- c("8", "8") +step <- c("Filter SNP Affected", "Filter SNP Affected") +data_class <- c("gRatioSet", "Betas") +samples <- c(dim_gRatioSet_filtered[2], dim_Betas_filtered[2]) +probes <- c(dim_gRatioSet_filtered[1], dim_Betas_filtered[1]) - table_preprocessing_adding <- data.frame(step_number, step, data_class, samples, probes) - summary_table_preprocessing <- bind_rows(summary_table_preprocessing, table_preprocessing_adding) +table_preprocessing_adding <- data.frame(step_number, step, data_class, samples, probes) +summary_table_preprocessing <- bind_rows(summary_table_preprocessing, table_preprocessing_adding) - probes_7_step <- as.numeric(summary_table_preprocessing %>% - filter(data_class == "Betas", step_number == "7") %>% +probes_8_step <- as.numeric(summary_table_preprocessing %>% + filter(data_class == "Betas", step_number == "8") %>% pull(probes)) - probes_6_step <- as.numeric(summary_table_preprocessing %>% - filter(data_class == "Betas", step_number == "6") %>% +probes_7_step <- as.numeric(summary_table_preprocessing %>% + filter(data_class == "Betas", step_number == "7") %>% pull(probes)) - delta_betas_probes <- probes_6_step - probes_7_step +delta_betas_probes <- probes_7_step - probes_8_step - cat(paste0(as.character(delta_betas_probes), +cat(paste0(as.character(delta_betas_probes), " probes were possibly affected by SNPs"), sep = "<br>\n") -} + ``` -## Removal of cross-hybridizing probes (Chen et al.) +## Removal of cross-hybridizing probes (Chen et al., v1 only) ```{r filter out cross-hybridizing probes chen, include=FALSE} if (user_choices$array_type == "v1") { @@ -213,7 +277,7 @@ if (user_choices$array_type == "v1") { dim_gRatioSet_filtered <- dim(gRatioSet_clean_filtered) dim_Betas_filtered <- dim(Betas_clean_filtered) - step_number <- c("8", "9") + step_number <- c("9", "9") step <- c("Filter Cross-Hybridizing Chen", "Filter Cross-Hybridizing Chen") data_class <- c("gRatioSet", "Betas") samples <- c(dim_gRatioSet_filtered[2], dim_Betas_filtered[2]) @@ -224,7 +288,7 @@ if (user_choices$array_type == "v1") { } ``` -## Removal of Cross-hybridizing probes (McCartney et al.) +## Removal of Cross-hybridizing probes (McCartney et al., v1 only) ```{r, filter out cross-hybridizing probes mccartney, include=FALSE} if (user_choices$array_type == "v1") { @@ -246,7 +310,7 @@ if (user_choices$array_type == "v1") { dim_gRatioSet_filtered <- dim(gRatioSet_clean_filtered) dim_Betas_filtered <- dim(Betas_clean_filtered) - step_number <- c("9", "9") + step_number <- c("10", "10") step <- c("Filter Cross-Hybridizing McCartney", "Filter Cross-Hybridizing McCartney") data_class <- c("gRatioSet", "Betas") samples <- c(dim_gRatioSet_filtered[2], dim_Betas_filtered[2]) @@ -257,7 +321,7 @@ if (user_choices$array_type == "v1") { } ``` -## Removal of polymorphic probes +## Removal of polymorphic probes (v1 only) ```{r, filter out polymorphic probes, include=FALSE} if (user_choices$array_type == "v1") { @@ -281,7 +345,7 @@ if (user_choices$array_type == "v1") { dim_gRatioSet_filtered <- dim(gRatioSet_clean_filtered) dim_Betas_filtered <- dim(Betas_clean_filtered) - step_number <- c("10", "10") + step_number <- c("11", "11") step <- c("Filter Polymorphic", "Filter Polymorphic") data_class <- c("gRatioSet", "Betas") samples <- c(dim_gRatioSet_filtered[2], dim_Betas_filtered[2]) @@ -293,6 +357,66 @@ if (user_choices$array_type == "v1") { saveRDS(summary_table_preprocessing, paste0(user_choices$project_name, "/reports/summary_table_preprocessing.rds")) ``` +## Removal of mapping inaccuracies (v2 only) + +```{r, filter out mapping inaccuracies, include=FALSE} +if (user_choices$array_type == "v2") { + keep_betas <- !(rownames(Betas_clean_filtered) %in% v2_mapping_inacc$IlmnID) + Betas_clean_filtered <- Betas_clean_filtered[keep_betas,] + + RGSet_clean_filtered <- subsetByLoci(RGSet_clean_filtered, excludeLoci = v2_mapping_inacc$IlmnID) +} +``` + +```{r, info output for user mapping inaccuracies, results='asis'} +if (user_choices$array_type == "v2") { + keep_betas_df <- as.data.frame(keep_betas) + cat(paste0(keep_betas_df %>% filter(keep_betas == FALSE) %>% nrow(), + " CpGs show known mapping inaccuracies"), sep = "<br>\n") + + dim_RGSet_filtered <- dim(RGSet_clean_filtered) + dim_Betas_filtered <- dim(Betas_clean_filtered) + step_number <- c("12", "12") + step <- c("Filter Mapping Inaccuracies", "Filter Mapping Inaccuracies") + data_class <- c("RGSet", "Betas") + samples <- c(dim_RGSet_filtered[2], dim_Betas_filtered[2]) + probes <- c(dim_RGSet_filtered[1], dim_Betas_filtered[1]) + + table_preprocessing_adding <- data.frame(step_number, step, data_class, samples, probes) + summary_table_preprocessing <- bind_rows(summary_table_preprocessing, table_preprocessing_adding) +} +``` + +## Removal of flagged probes (v2 only) + +```{r, filter out flagged probes, include=FALSE} +if (user_choices$array_type == "v2") { + keep_betas <- !(rownames(Betas_clean_filtered) %in% v2_flagged_probes$IlmnID) + Betas_clean_filtered <- Betas_clean_filtered[keep_betas,] + + RGSet_clean_filtered <- subsetByLoci(RGSet_clean_filtered, excludeLoci = v2_flagged_probes$IlmnID) +} +``` + +```{r, info output for user flagged probes, results='asis'} +if (user_choices$array_type == "v2") { + keep_betas_df <- as.data.frame(keep_betas) + cat(paste0(keep_betas_df %>% filter(keep_betas == FALSE) %>% nrow(), + " CpGs were flagged by Illumina"), sep = "<br>\n") + + dim_RGSet_filtered <- dim(RGSet_clean_filtered) + dim_Betas_filtered <- dim(Betas_clean_filtered) + step_number <- c("13", "13") + step <- c("Filter Flagged Probes", "Filter Flagged Probes") + data_class <- c("RGSet", "Betas") + samples <- c(dim_RGSet_filtered[2], dim_Betas_filtered[2]) + probes <- c(dim_RGSet_filtered[1], dim_Betas_filtered[1]) + + table_preprocessing_adding <- data.frame(step_number, step, data_class, samples, probes) + summary_table_preprocessing <- bind_rows(summary_table_preprocessing, table_preprocessing_adding) +} +``` + ```{r, save data, include=FALSE} save(dim_gRatioSet_filtered, file = paste0(user_choices$project_name, "/processed_data/gRatioSet_clean_filtered.Rdata")) @@ -306,8 +430,6 @@ save(Ms_clean_filtered, file = paste0(user_choices$project_name, "/processed_dat ```{r, save annotations, include=FALSE} save(annotations_clean, file = paste0(user_choices$project_name, "/reports/annotations_clean.Rdata")) - - ``` ## Beta values distribution report