Skip to content

Commit

Permalink
Filter v2 replicates, filter SNPs for both v1 and v2, updated documen…
Browse files Browse the repository at this point in the history
…tation
  • Loading branch information
Vera N. Karlbauer committed Aug 13, 2024
1 parent 5f7b9a3 commit 6636b22
Showing 1 changed file with 158 additions and 36 deletions.
194 changes: 158 additions & 36 deletions scripts/6_filtering_cpgs.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -48,23 +48,90 @@ v2_flagged_probes <- read.csv("../data/EPIC-8v2-0_A1-FlaggedProbes.csv")}

# Description

This script filters out poor performing probes with unreliable signal from
This script annotates probes and filters out poor performing probes with unreliable signal from
beta values and the genomic ratio set (RGSet with annotations).
We will remove probes that:

* have replicates and do not perform well (v2 only, keep replicate with best detP-value)
* have technical failures in one or more samples (i.e. those with detection *p* value of probes > 0.01)
* are on sex chromosomes
* might be affected by common SNPs
* cross-hybridize (i.e. those that map to multiple places in genome) as described by Chen et al. (v1.0 only)
* cross-hybridize as described by McCartney et al. (v1.0 only)
* are polymorphic (v1.0 only)
* show mapping inaccuracies or were flagged by Illumina (v2 only)
* have replicates and do not perform well (v2 only, keep replicate with best detP-value)
* show mapping inaccuracies or were flagged by Illumina (v2 only)
Please note: replicate removal is performed for the filtered AND unfiltered data (v2 only.

```{r get annotation of probes, include=FALSE}
annotations_clean = getAnnotation(RGSet_clean)
```

## Replicate removal
### Detect replicates (v2 only)

```{r, detect replicates}
# add non-unique CpG name
CpG_names <- as.data.frame(rownames(Betas_clean))
colnames(CpG_names) <- c("IlmnID")
CpG_names$CpG_name <- str_sub(CpG_names$IlmnID, 1, 10)
# isolate replicate IDs
replicates <- CpG_names %>%
group_by(CpG_name) %>%
filter(n()>1) %>%
distinct(CpG_name)
# link to detP values
CpG_names_detP <- left_join(CpG_names, as.data.frame(cbind(rownames(detP_clean), rowMeans(detP_clean))), by = join_by("IlmnID" == "V1"))
colnames(CpG_names_detP) <- c("IlmnID", "CpG_name", "detP_value")
cat(paste0(nrow(replicates), " replicate CpGs were detected"))
```

```{r, exclude replicates and save data, include = FALSE}
## create list of replicates with lowest detP values, if detPs are equal, take first replicate
keep_replicates <- CpG_names_detP %>%
filter(CpG_name %in% replicates$CpG_name) %>%
group_by(CpG_name) %>%
slice_min(detP_value) %>%
ungroup()
keep_replicates <- keep_replicates %>%
filter(duplicated(CpG_name) == FALSE)
# create list of replicates to exclude (CpGs in replicate list but probe not in keep list)
exclude_replicates <- CpG_names_detP %>%
filter(CpG_name %in% replicates$CpG_name) %>%
filter(!(IlmnID %in% keep_replicates$IlmnID))
# exclude replicates from betas, RGSet, detP and save data
RGSet_clean <- subsetByLoci(RGSet_clean, excludeLoci = exclude_replicates$IlmnID)
save(RGSet_clean, file = paste0(user_choices$project_name, "/processed_data/RGSet_clean.Rdata"))
keep_betas <- !(rownames(Betas_clean) %in% exclude_replicates$IlmnID)
Betas_clean <- Betas_clean[keep_betas,]
save(Betas_clean, file = paste0(user_choices$project_name, "/processed_data/Betas_clean.Rdata"))
keep_detP <- !(rownames(detP_clean) %in% exclude_replicates$IlmnID)
detP_clean <- detP_clean[keep_detP,]
save(detP_clean, file = paste0(user_choices$project_name, "/processed_data/detP_clean.Rdata"))
```

### Exclude replicates

```{r, info output for replicate removal, results='asis'}
keep_betas_df <- as.data.frame(keep_betas)
cat(paste0(nrow(exclude_replicates), " replicate probes were removed"))
dim_RGSet_filtered <- dim(RGSet_clean)
dim_Betas_filtered <- dim(Betas_clean)
step_number <- c("4", "4")
step <- c("Filter replicates", "Filter replicates")
data_class <- c("RGSet", "Betas")
samples <- c(dim_RGSet_filtered[2], dim_Betas_filtered[2])
probes <- c(dim_RGSet_filtered[1], dim_Betas_filtered[1])
table_preprocessing_adding <- data.frame(step_number, step, data_class, samples, probes)
summary_table_preprocessing <- bind_rows(summary_table_preprocessing, table_preprocessing_adding)
```

## Removal of failed probes in one or more samples

```{r filter out failed probes, include=FALSE}
Expand All @@ -86,7 +153,7 @@ cat(paste0(keep_betas_df %>% filter(keep_betas == FALSE) %>% nrow(),
dim_gRatioSet_filtered <- dim(gRatioSet_clean_filtered)
dim_Betas_filtered <- dim(Betas_clean_filtered)
step_number <- c("4", "4")
step_number <- c("5", "5")
step <- c("Filter technical", "Filter technical")
data_class <- c("gRatioSet", "Betas")
samples <- c(dim_gRatioSet_filtered[2], dim_Betas_filtered[2])
Expand Down Expand Up @@ -115,7 +182,7 @@ cat(paste0(keep_betas_df %>% filter(keep_betas == FALSE) %>% nrow(),
dim_gRatioSet_filtered <- dim(gRatioSet_clean_filtered)
dim_Betas_filtered <- dim(Betas_clean_filtered)
step_number <- c("5", "5")
step_number <- c("6", "6")
step <- c("Filter X Chromosome", "Filter X Chromosome")
data_class <- c("gRatioSet", "Betas")
samples <- c(dim_gRatioSet_filtered[2], dim_Betas_filtered[2])
Expand All @@ -142,7 +209,7 @@ cat(paste0(keep_betas_df %>% filter(keep_betas == FALSE) %>% nrow(),
dim_gRatioSet_filtered <- dim(gRatioSet_clean_filtered)
dim_Betas_filtered <- dim(Betas_clean_filtered)
step_number <- c("6", "6")
step_number <- c("7", "7")
step <- c("Filter Y Chromosome", "Filter Y Chromosome")
data_class <- c("gRatioSet", "Betas")
samples <- c(dim_gRatioSet_filtered[2], dim_Betas_filtered[2])
Expand All @@ -155,41 +222,38 @@ summary_table_preprocessing <- bind_rows(summary_table_preprocessing, table_prep
## Removal of probes possibly affected by common SNPs

```{r filter out probes affected by snps, include=FALSE}
if (user_choices$array_type == "v1") {
gRatioSet_clean_filtered <- dropLociWithSnps(gRatioSet_clean_filtered) # function requires a GenomicRatioSet
Betas_clean_filtered <- Betas_clean_filtered[rownames(Betas_clean_filtered) %in% featureNames(gRatioSet_clean_filtered),]
}
gRatioSet_clean_filtered <- dropLociWithSnps(gRatioSet_clean_filtered) # function requires a GenomicRatioSet
Betas_clean_filtered <- Betas_clean_filtered[rownames(Betas_clean_filtered) %in% featureNames(gRatioSet_clean_filtered),]
```

```{r, info output for user probes affected by snps, results='asis'}
if (user_choices$array_type == "v1") {
dim_gRatioSet_filtered <- dim(gRatioSet_clean_filtered)
dim_Betas_filtered <- dim(gRatioSet_clean_filtered)
step_number <- c("7", "7")
step <- c("Filter SNP Affected", "Filter SNP Affected")
data_class <- c("gRatioSet", "Betas")
samples <- c(dim_gRatioSet_filtered[2], dim_Betas_filtered[2])
probes <- c(dim_gRatioSet_filtered[1], dim_Betas_filtered[1])
dim_gRatioSet_filtered <- dim(gRatioSet_clean_filtered)
dim_Betas_filtered <- dim(gRatioSet_clean_filtered)
step_number <- c("8", "8")
step <- c("Filter SNP Affected", "Filter SNP Affected")
data_class <- c("gRatioSet", "Betas")
samples <- c(dim_gRatioSet_filtered[2], dim_Betas_filtered[2])
probes <- c(dim_gRatioSet_filtered[1], dim_Betas_filtered[1])
table_preprocessing_adding <- data.frame(step_number, step, data_class, samples, probes)
summary_table_preprocessing <- bind_rows(summary_table_preprocessing, table_preprocessing_adding)
table_preprocessing_adding <- data.frame(step_number, step, data_class, samples, probes)
summary_table_preprocessing <- bind_rows(summary_table_preprocessing, table_preprocessing_adding)
probes_7_step <- as.numeric(summary_table_preprocessing %>%
filter(data_class == "Betas", step_number == "7") %>%
probes_8_step <- as.numeric(summary_table_preprocessing %>%
filter(data_class == "Betas", step_number == "8") %>%
pull(probes))
probes_6_step <- as.numeric(summary_table_preprocessing %>%
filter(data_class == "Betas", step_number == "6") %>%
probes_7_step <- as.numeric(summary_table_preprocessing %>%
filter(data_class == "Betas", step_number == "7") %>%
pull(probes))
delta_betas_probes <- probes_6_step - probes_7_step
delta_betas_probes <- probes_7_step - probes_8_step
cat(paste0(as.character(delta_betas_probes),
cat(paste0(as.character(delta_betas_probes),
" probes were possibly affected by SNPs"), sep = "<br>\n")
}
```

## Removal of cross-hybridizing probes (Chen et al.)
## Removal of cross-hybridizing probes (Chen et al., v1 only)

```{r filter out cross-hybridizing probes chen, include=FALSE}
if (user_choices$array_type == "v1") {
Expand All @@ -213,7 +277,7 @@ if (user_choices$array_type == "v1") {
dim_gRatioSet_filtered <- dim(gRatioSet_clean_filtered)
dim_Betas_filtered <- dim(Betas_clean_filtered)
step_number <- c("8", "9")
step_number <- c("9", "9")
step <- c("Filter Cross-Hybridizing Chen", "Filter Cross-Hybridizing Chen")
data_class <- c("gRatioSet", "Betas")
samples <- c(dim_gRatioSet_filtered[2], dim_Betas_filtered[2])
Expand All @@ -224,7 +288,7 @@ if (user_choices$array_type == "v1") {
}
```

## Removal of Cross-hybridizing probes (McCartney et al.)
## Removal of Cross-hybridizing probes (McCartney et al., v1 only)

```{r, filter out cross-hybridizing probes mccartney, include=FALSE}
if (user_choices$array_type == "v1") {
Expand All @@ -246,7 +310,7 @@ if (user_choices$array_type == "v1") {
dim_gRatioSet_filtered <- dim(gRatioSet_clean_filtered)
dim_Betas_filtered <- dim(Betas_clean_filtered)
step_number <- c("9", "9")
step_number <- c("10", "10")
step <- c("Filter Cross-Hybridizing McCartney", "Filter Cross-Hybridizing McCartney")
data_class <- c("gRatioSet", "Betas")
samples <- c(dim_gRatioSet_filtered[2], dim_Betas_filtered[2])
Expand All @@ -257,7 +321,7 @@ if (user_choices$array_type == "v1") {
}
```

## Removal of polymorphic probes
## Removal of polymorphic probes (v1 only)

```{r, filter out polymorphic probes, include=FALSE}
if (user_choices$array_type == "v1") {
Expand All @@ -281,7 +345,7 @@ if (user_choices$array_type == "v1") {
dim_gRatioSet_filtered <- dim(gRatioSet_clean_filtered)
dim_Betas_filtered <- dim(Betas_clean_filtered)
step_number <- c("10", "10")
step_number <- c("11", "11")
step <- c("Filter Polymorphic", "Filter Polymorphic")
data_class <- c("gRatioSet", "Betas")
samples <- c(dim_gRatioSet_filtered[2], dim_Betas_filtered[2])
Expand All @@ -293,6 +357,66 @@ if (user_choices$array_type == "v1") {
saveRDS(summary_table_preprocessing, paste0(user_choices$project_name, "/reports/summary_table_preprocessing.rds"))
```

## Removal of mapping inaccuracies (v2 only)

```{r, filter out mapping inaccuracies, include=FALSE}
if (user_choices$array_type == "v2") {
keep_betas <- !(rownames(Betas_clean_filtered) %in% v2_mapping_inacc$IlmnID)
Betas_clean_filtered <- Betas_clean_filtered[keep_betas,]
RGSet_clean_filtered <- subsetByLoci(RGSet_clean_filtered, excludeLoci = v2_mapping_inacc$IlmnID)
}
```

```{r, info output for user mapping inaccuracies, results='asis'}
if (user_choices$array_type == "v2") {
keep_betas_df <- as.data.frame(keep_betas)
cat(paste0(keep_betas_df %>% filter(keep_betas == FALSE) %>% nrow(),
" CpGs show known mapping inaccuracies"), sep = "<br>\n")
dim_RGSet_filtered <- dim(RGSet_clean_filtered)
dim_Betas_filtered <- dim(Betas_clean_filtered)
step_number <- c("12", "12")
step <- c("Filter Mapping Inaccuracies", "Filter Mapping Inaccuracies")
data_class <- c("RGSet", "Betas")
samples <- c(dim_RGSet_filtered[2], dim_Betas_filtered[2])
probes <- c(dim_RGSet_filtered[1], dim_Betas_filtered[1])
table_preprocessing_adding <- data.frame(step_number, step, data_class, samples, probes)
summary_table_preprocessing <- bind_rows(summary_table_preprocessing, table_preprocessing_adding)
}
```

## Removal of flagged probes (v2 only)

```{r, filter out flagged probes, include=FALSE}
if (user_choices$array_type == "v2") {
keep_betas <- !(rownames(Betas_clean_filtered) %in% v2_flagged_probes$IlmnID)
Betas_clean_filtered <- Betas_clean_filtered[keep_betas,]
RGSet_clean_filtered <- subsetByLoci(RGSet_clean_filtered, excludeLoci = v2_flagged_probes$IlmnID)
}
```

```{r, info output for user flagged probes, results='asis'}
if (user_choices$array_type == "v2") {
keep_betas_df <- as.data.frame(keep_betas)
cat(paste0(keep_betas_df %>% filter(keep_betas == FALSE) %>% nrow(),
" CpGs were flagged by Illumina"), sep = "<br>\n")
dim_RGSet_filtered <- dim(RGSet_clean_filtered)
dim_Betas_filtered <- dim(Betas_clean_filtered)
step_number <- c("13", "13")
step <- c("Filter Flagged Probes", "Filter Flagged Probes")
data_class <- c("RGSet", "Betas")
samples <- c(dim_RGSet_filtered[2], dim_Betas_filtered[2])
probes <- c(dim_RGSet_filtered[1], dim_Betas_filtered[1])
table_preprocessing_adding <- data.frame(step_number, step, data_class, samples, probes)
summary_table_preprocessing <- bind_rows(summary_table_preprocessing, table_preprocessing_adding)
}
```

```{r, save data, include=FALSE}
save(dim_gRatioSet_filtered, file = paste0(user_choices$project_name, "/processed_data/gRatioSet_clean_filtered.Rdata"))
Expand All @@ -306,8 +430,6 @@ save(Ms_clean_filtered, file = paste0(user_choices$project_name, "/processed_dat
```{r, save annotations, include=FALSE}
save(annotations_clean, file = paste0(user_choices$project_name, "/reports/annotations_clean.Rdata"))
```

## Beta values distribution report
Expand Down

0 comments on commit 6636b22

Please sign in to comment.