Skip to content

Commit

Permalink
gRatioSet name changed where appropriate, documentation in summary ta…
Browse files Browse the repository at this point in the history
…ble updated, description script 6, 2 files for v2.0 filtering added to data
  • Loading branch information
NatanYusupov committed May 10, 2024
1 parent 1099e4f commit 5d02e93
Show file tree
Hide file tree
Showing 8 changed files with 50,505 additions and 71 deletions.
191 changes: 191 additions & 0 deletions data/EPIC-8v2-0_A1-190MappingInaccuracies.csv

Large diffs are not rendered by default.

50,210 changes: 50,210 additions & 0 deletions data/EPIC-8v2-0_A1-FlaggedProbes.csv

Large diffs are not rendered by default.

5 changes: 4 additions & 1 deletion scripts/10_correction_batch_effects_unfiltered.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,10 @@ start_time <- Sys.time()
source("../data/calculate_pc_cutoff.R") # source function for determining PC cutoff
needed_packages <- c("BiocManager", "dplyr", "knitr", "rmarkdown", "tibble", "ggplot2",
"ggrepel", "broom", "gplots", "tidyr", "sva", "methods")
user_choices <- readRDS("../data/user_choices.rds")
if(user_choices$package_path != "PLEASE FILL IN"){
.libPaths(c(user_choices$package_path, .libPaths()))
}
installed_packages <- needed_packages %in% rownames(installed.packages())
if (any(installed_packages == FALSE)) {
install.packages(needed_packages[!installed_packages], repos = "http://cran.us.r-project.org")
Expand All @@ -44,7 +48,6 @@ if (!("minfi" %in% rownames(installed.packages()))) {
}
library(minfi)
user_choices <- readRDS("../data/user_choices.rds")
knitr::opts_knit$set(root.dir = paste0(user_choices$personal_path, "/"))
knitr::opts_chunk$set(echo = FALSE)
```
Expand Down
5 changes: 4 additions & 1 deletion scripts/5_samples_exclusion.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@ start_time <- Sys.time()

```{r setup, include=FALSE}
needed_packages <- c("dplyr", "knitr", "rmarkdown")
user_choices <- readRDS("../data/user_choices.rds")
if(user_choices$package_path != "PLEASE FILL IN"){
.libPaths(c(user_choices$package_path, .libPaths()))
}
installed_packages <- needed_packages %in% rownames(installed.packages())
if (any(installed_packages == FALSE)) {
install.packages(needed_packages[!installed_packages], repos = "http://cran.us.r-project.org")
Expand All @@ -21,7 +25,6 @@ if (!("minfi" %in% rownames(installed.packages()))) {
}
library(minfi)
user_choices <- readRDS("../data/user_choices.rds")
knitr::opts_knit$set(root.dir = paste0(user_choices$personal_path, "/"))
knitr::opts_chunk$set(echo = FALSE)
```
Expand Down
123 changes: 70 additions & 53 deletions scripts/6_filtering_cpgs.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@ start_time <- Sys.time()

```{r setup, include=FALSE}
needed_packages <- c("BiocManager", "dplyr", "knitr", "rmarkdown", "tibble", "utils")
user_choices <- readRDS("../data/user_choices.rds")
if(user_choices$package_path != "PLEASE FILL IN"){
.libPaths(c(user_choices$package_path, .libPaths()))
}
installed_packages <- needed_packages %in% rownames(installed.packages())
if (any(installed_packages == FALSE)) {
install.packages(needed_packages[!installed_packages], repos = "http://cran.us.r-project.org")
Expand All @@ -21,7 +25,6 @@ if (!("minfi" %in% rownames(installed.packages()))) {
}
library(minfi)
user_choices <- readRDS("../data/user_choices.rds")
knitr::opts_knit$set(root.dir = paste0(user_choices$personal_path, "/"))
knitr::opts_chunk$set(echo = FALSE)
```
Expand All @@ -35,24 +38,31 @@ load(paste0(user_choices$project_name, "/processed_data/PhenoData_clean.Rdata"))
load(paste0(user_choices$project_name, "/processed_data/detP_clean.Rdata"))
# Info about cross-hybridizing probes comes from Chen et al. 2013, DOI: 10.4161/epi.23470:
load("data/ChenProbeIDs.rdata")
# Info about mapping inaccuracies for EPIC v2.0 from Illumina product information:
if (user_choices$array_type == "v2") {
v2_mapping_inaccuracy <- read.csv("../data/EPIC-8v2-0_A1-190MappingInaccuracies.csv")}
# Info about flagged probes for EPIC v2.0 from Illumina product information:
if (user_choices$array_type == "v2") {
v2_flagged_probes <- read.csv("../data/EPIC-8v2-0_A1-FlaggedProbes.csv")}
```

# Description

This script filters out poor performing probes with unreliable signal from
beta values and the genomic ratio set.
beta values and the genomic ratio set (RGSet with annotations).
We will remove probes that:

* have technical failures in one or more samples (i.e. those with detection *p* value > threshold set by user)
* have technical failures in one or more samples (i.e. those with detection *p* value of probes > 0.01)
* are on sex chromosomes
* might be affected by common SNPs
* cross-hybridize (i.e. those that map to multiple places in genome) as described by Chen et al.
* cross-hybridize (described by McCartney et al.)
* are polymorphic
* cross-hybridize (i.e. those that map to multiple places in genome) as described by Chen et al. (v1.0 only)
* cross-hybridize as described by McCartney et al. (v1.0 only)
* are polymorphic (v1.0 only)
* show mapping inaccuracies or were flagged by Illumina (v2 only)
* have replicates and do not perform well (v2 only, keep replicate with best detP-value)

```{r get annotation of probes, include=FALSE}
annotations_clean = getAnnotation(RGSet_clean)
save(annotations_clean, file = paste0(user_choices$project_name, "/reports/annotations_clean.Rdata"))
```

## Removal of failed probes in one or more samples
Expand All @@ -65,22 +75,22 @@ Betas_clean_filtered <- Betas_clean[keep_betas,]
# ensure probes are in the same order in cleaned gRatioSet (RGSet with annotations) and detP objects, then subset
detP_clean_filtered2 <- detP_clean[match(featureNames(gRatioSet_clean),rownames(detP_clean)),]
keep_rgset <- rowSums(detP_clean_filtered < 0.01) == ncol(gRatioSet_clean)
RGSet_clean_filtered <- gRatioSet_clean[keep_rgset,]
keep_gratioset <- rowSums(detP_clean_filtered < 0.01) == ncol(gRatioSet_clean)
gRatioSet_clean_filtered <- gRatioSet_clean[keep_gratioset,]
```

```{r, info output for user failed probes, results='asis'}
keep_betas_df <- as.data.frame(keep_betas)
cat(paste0(keep_betas_df %>% filter(keep_betas == FALSE) %>% nrow(),
" probes failed in one or more samples"), sep = "<br>\n")
dim_RGSet_filtered <- dim(RGSet_clean_filtered)
dim_gRatioSet_filtered <- dim(gRatioSet_clean_filtered)
dim_Betas_filtered <- dim(Betas_clean_filtered)
step_number <- c("4", "4")
step <- c("Filter technical", "Filter technical")
data_class <- c("RGSet", "Betas")
samples <- c(dim_RGSet_filtered[2], dim_Betas_filtered[2])
probes <- c(dim_RGSet_filtered[1], dim_Betas_filtered[1])
data_class <- c("gRatioSet", "Betas")
samples <- c(dim_gRatioSet_filtered[2], dim_Betas_filtered[2])
probes <- c(dim_gRatioSet_filtered[1], dim_Betas_filtered[1])
table_preprocessing_adding <- data.frame(step_number, step, data_class, samples, probes)
summary_table_preprocessing <- bind_rows(summary_table_preprocessing, table_preprocessing_adding)
Expand All @@ -94,22 +104,22 @@ summary_table_preprocessing <- bind_rows(summary_table_preprocessing, table_prep
keep_betas <- !(rownames(Betas_clean_filtered) %in% annotations_clean$Name[annotations_clean$chr == "chrX"])
Betas_clean_filtered <- Betas_clean_filtered[keep_betas,]
keep_rgset <- !(featureNames(RGSet_clean_filtered) %in% annotations_clean$Name[annotations_clean$chr == "chrX"])
RGSet_clean_filtered <- RGSet_clean_filtered[keep_rgset,]
keep_gratioset <- !(featureNames(gRatioSet_clean_filtered) %in% annotations_clean$Name[annotations_clean$chr == "chrX"])
gRatioSet_clean_filtered <- gRatioSet_clean_filtered[keep_gratioset,]
```

```{r, info output for user x chromosomes, results='asis'}
keep_betas_df <- as.data.frame(keep_betas)
cat(paste0(keep_betas_df %>% filter(keep_betas == FALSE) %>% nrow(),
" probes were on the X chromosome"), sep = "<br>\n")
dim_RGSet_filtered <- dim(RGSet_clean_filtered)
dim_gRatioSet_filtered <- dim(gRatioSet_clean_filtered)
dim_Betas_filtered <- dim(Betas_clean_filtered)
step_number <- c("5", "5")
step <- c("Filter X Chromosome", "Filter X Chromosome")
data_class <- c("RGSet", "Betas")
samples <- c(dim_RGSet_filtered[2], dim_Betas_filtered[2])
probes <- c(dim_RGSet_filtered[1], dim_Betas_filtered[1])
data_class <- c("gRatioSet", "Betas")
samples <- c(dim_gRatioSet_filtered[2], dim_Betas_filtered[2])
probes <- c(dim_gRatioSet_filtered[1], dim_Betas_filtered[1])
table_preprocessing_adding <- data.frame(step_number, step, data_class, samples, probes)
summary_table_preprocessing <- bind_rows(summary_table_preprocessing, table_preprocessing_adding)
Expand All @@ -121,22 +131,22 @@ summary_table_preprocessing <- bind_rows(summary_table_preprocessing, table_prep
keep_betas <- !(rownames(Betas_clean_filtered) %in% annotations_clean$Name[annotations_clean$chr == "chrY"])
Betas_clean_filtered <- Betas_clean_filtered[keep_betas,]
keep_rgset <- !(featureNames(RGSet_clean_filtered) %in% annotations_clean$Name[annotations_clean$chr == "chrY"])
RGSet_clean_filtered <- RGSet_clean_filtered[keep_rgset,]
keep_gratioset <- !(featureNames(gRatioSet_clean_filtered) %in% annotations_clean$Name[annotations_clean$chr == "chrY"])
gRatioSet_clean_filtered <- gRatioSet_clean_filtered[keep_gratioset,]
```

```{r, info output for user y chromosomes, results='asis'}
keep_betas_df <- as.data.frame(keep_betas)
cat(paste0(keep_betas_df %>% filter(keep_betas == FALSE) %>% nrow(),
" probes were on the Y chromosome"), sep = "<br>\n")
dim_RGSet_filtered <- dim(RGSet_clean_filtered)
dim_gRatioSet_filtered <- dim(gRatioSet_clean_filtered)
dim_Betas_filtered <- dim(Betas_clean_filtered)
step_number <- c("6", "6")
step <- c("Filter Y Chromosome", "Filter Y Chromosome")
data_class <- c("RGSet", "Betas")
samples <- c(dim_RGSet_filtered[2], dim_Betas_filtered[2])
probes <- c(dim_RGSet_filtered[1], dim_Betas_filtered[1])
data_class <- c("gRatioSet", "Betas")
samples <- c(dim_gRatioSet_filtered[2], dim_Betas_filtered[2])
probes <- c(dim_gRatioSet_filtered[1], dim_Betas_filtered[1])
table_preprocessing_adding <- data.frame(step_number, step, data_class, samples, probes)
summary_table_preprocessing <- bind_rows(summary_table_preprocessing, table_preprocessing_adding)
Expand All @@ -146,20 +156,20 @@ summary_table_preprocessing <- bind_rows(summary_table_preprocessing, table_prep

```{r filter out probes affected by snps, include=FALSE}
if (user_choices$array_type == "v1") {
RGSet_clean_filtered <- dropLociWithSnps(RGSet_clean_filtered) # function requires a GenomicRatioSet
Betas_clean_filtered <- Betas_clean_filtered[rownames(Betas_clean_filtered) %in% featureNames(RGSet_clean_filtered),]
gRatioSet_clean_filtered <- dropLociWithSnps(gRatioSet_clean_filtered) # function requires a GenomicRatioSet
Betas_clean_filtered <- Betas_clean_filtered[rownames(Betas_clean_filtered) %in% featureNames(gRatioSet_clean_filtered),]
}
```

```{r, info output for user probes affected by snps, results='asis'}
if (user_choices$array_type == "v1") {
dim_RGSet_filtered <- dim(RGSet_clean_filtered)
dim_Betas_filtered <- dim(Betas_clean_filtered)
dim_gRatioSet_filtered <- dim(gRatioSet_clean_filtered)
dim_Betas_filtered <- dim(gRatioSet_clean_filtered)
step_number <- c("7", "7")
step <- c("Filter SNP Affected", "Filter SNP Affected")
data_class <- c("RGSet", "Betas")
samples <- c(dim_RGSet_filtered[2], dim_Betas_filtered[2])
probes <- c(dim_RGSet_filtered[1], dim_Betas_filtered[1])
data_class <- c("gRatioSet", "Betas")
samples <- c(dim_gRatioSet_filtered[2], dim_Betas_filtered[2])
probes <- c(dim_gRatioSet_filtered[1], dim_Betas_filtered[1])
table_preprocessing_adding <- data.frame(step_number, step, data_class, samples, probes)
summary_table_preprocessing <- bind_rows(summary_table_preprocessing, table_preprocessing_adding)
Expand Down Expand Up @@ -190,8 +200,8 @@ if (user_choices$array_type == "v1") {
keep_betas <- !(rownames(Betas_clean_filtered) %in% exclude_Chen$Name)
Betas_clean_filtered <- Betas_clean_filtered[keep_betas,]
keep_rgset <- !(featureNames(RGSet_clean_filtered) %in% exclude_Chen$Name)
RGSet_clean_filtered <- RGSet_clean_filtered[keep_rgset,]
keep_gratioset <- !(featureNames(gRatioSet_clean_filtered) %in% exclude_Chen$Name)
gRatioSet_clean_filtered <- gRatioSet_clean_filtered[keep_gratioset,]
}
```

Expand All @@ -201,13 +211,13 @@ if (user_choices$array_type == "v1") {
cat(paste0(keep_betas_df %>% filter(keep_betas == FALSE) %>% nrow(),
" probes were cross-hybridizing according to Chen et al."), sep = "<br>\n")
dim_RGSet_filtered <- dim(RGSet_clean_filtered)
dim_gRatioSet_filtered <- dim(gRatioSet_clean_filtered)
dim_Betas_filtered <- dim(Betas_clean_filtered)
step_number <- c("8", "9")
step <- c("Filter Cross-Hybridizing Chen", "Filter Cross-Hybridizing Chen")
data_class <- c("RGSet", "Betas")
samples <- c(dim_RGSet_filtered[2], dim_Betas_filtered[2])
probes <- c(dim_RGSet_filtered[1], dim_Betas_filtered[1])
data_class <- c("gRatioSet", "Betas")
samples <- c(dim_gRatioSet_filtered[2], dim_Betas_filtered[2])
probes <- c(dim_gRatioSet_filtered[1], dim_Betas_filtered[1])
table_preprocessing_adding <- data.frame(step_number, step, data_class, samples, probes)
summary_table_preprocessing <- bind_rows(summary_table_preprocessing, table_preprocessing_adding)
Expand All @@ -223,8 +233,8 @@ if (user_choices$array_type == "v1") {
keep_betas <- !(rownames(Betas_clean_filtered) %in% exclude_McCartney$V1)
Betas_clean_filtered <- Betas_clean_filtered[keep_betas,]
keep_rgset <- !(featureNames(RGSet_clean_filtered) %in% exclude_McCartney$V1)
RGSet_clean_filtered <- RGSet_clean_filtered[keep_rgset,]
keep_gratioset <- !(featureNames(dim_gRatioSet_filtered) %in% exclude_McCartney$V1)
dim_gRatioSet_filtered <- dim_gRatioSet_filtered[keep_gratioset,]
}
```

Expand All @@ -234,13 +244,13 @@ if (user_choices$array_type == "v1") {
cat(paste0(keep_betas_df %>% filter(keep_betas == FALSE) %>% nrow(),
" probes may be cross-hybridizing according to McCartney et al."), sep = "<br>\n")
dim_RGSet_filtered <- dim(RGSet_clean_filtered)
dim_gRatioSet_filtered <- dim(gRatioSet_clean_filtered)
dim_Betas_filtered <- dim(Betas_clean_filtered)
step_number <- c("9", "9")
step <- c("Filter Cross-Hybridizing McCartney", "Filter Cross-Hybridizing McCartney")
data_class <- c("RGSet", "Betas")
samples <- c(dim_RGSet_filtered[2], dim_Betas_filtered[2])
probes <- c(dim_RGSet_filtered[1], dim_Betas_filtered[1])
data_class <- c("gRatioSet", "Betas")
samples <- c(dim_gRatioSet_filtered[2], dim_Betas_filtered[2])
probes <- c(dim_gRatioSet_filtered[1], dim_Betas_filtered[1])
table_preprocessing_adding <- data.frame(step_number, step, data_class, samples, probes)
summary_table_preprocessing <- bind_rows(summary_table_preprocessing, table_preprocessing_adding)
Expand All @@ -258,8 +268,8 @@ if (user_choices$array_type == "v1") {
keep_betas <- !(rownames(Betas_clean_filtered) %in% exclude_polymorphic$IlmnID)
Betas_clean_filtered <- Betas_clean_filtered[keep_betas,]
keep_rgset <- !(featureNames(RGSet_clean_filtered) %in% exclude_polymorphic$IlmnID)
RGSet_clean_filtered <- RGSet_clean_filtered[keep_rgset,]
keep_gratioset <- !(featureNames(gRatioSet_clean_filtered) %in% exclude_polymorphic$IlmnID)
gRatioSet_clean_filtered <- gRatioSet_clean_filtered[keep_gratioset,]
}
```

Expand All @@ -269,13 +279,13 @@ if (user_choices$array_type == "v1") {
cat(paste0(keep_betas_df %>% filter(keep_betas == FALSE) %>% nrow(),
" probes were polymorphic"), sep = "<br>\n")
dim_RGSet_filtered <- dim(RGSet_clean_filtered)
dim_gRatioSet_filtered <- dim(gRatioSet_clean_filtered)
dim_Betas_filtered <- dim(Betas_clean_filtered)
step_number <- c("10", "10")
step <- c("Filter Polymorphic", "Filter Polymorphic")
data_class <- c("RGSet", "Betas")
samples <- c(dim_RGSet_filtered[2], dim_Betas_filtered[2])
probes <- c(dim_RGSet_filtered[1], dim_Betas_filtered[1])
data_class <- c("gRatioSet", "Betas")
samples <- c(dim_gRatioSet_filtered[2], dim_Betas_filtered[2])
probes <- c(dim_gRatioSet_filtered[1], dim_Betas_filtered[1])
table_preprocessing_adding <- data.frame(step_number, step, data_class, samples, probes)
summary_table_preprocessing <- bind_rows(summary_table_preprocessing, table_preprocessing_adding)
Expand All @@ -284,13 +294,20 @@ saveRDS(summary_table_preprocessing, paste0(user_choices$project_name, "/reports
```

```{r, save data, include=FALSE}
save(RGSet_clean_filtered, file = paste0(user_choices$project_name, "/processed_data/RGSet_clean_filtered.Rdata"))
save(dim_gRatioSet_filtered, file = paste0(user_choices$project_name, "/processed_data/gRatioSet_clean_filtered.Rdata"))
Betas_clean_filtered <- getBeta(RGSet_clean_filtered) # beta values
Betas_clean_filtered <- getBeta(gRatioSet_clean_filtered) # beta values
save(Betas_clean_filtered, file = paste0(user_choices$project_name, "/processed_data/Betas_clean_filtered.Rdata"))
Ms_clean_filtered <- getM(RGSet_clean_filtered) # M values
Ms_clean_filtered <- getM(gRatioSet_clean_filtered) # M values
save(Ms_clean_filtered, file = paste0(user_choices$project_name, "/processed_data/Ms_clean_filtered.Rdata"))
```

```{r, save annotations, include=FALSE}
save(annotations_clean, file = paste0(user_choices$project_name, "/reports/annotations_clean.Rdata"))
```

## Beta values distribution report
Expand Down
Loading

0 comments on commit 5d02e93

Please sign in to comment.