Skip to content

Bug fixes #2

Merged
merged 11 commits into from
Mar 19, 2025
22 changes: 17 additions & 5 deletions scripts/10_correction_batch_effects_unfiltered.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ start_time <- Sys.time()
```

```{r setup, include=FALSE}
source("../data/calculate_pc_cutoff.R") # source function for determining PC cutoff
source("../data/Calculate_PC_Cutoff.R") # source function for determining PC cutoff
needed_packages <- c("BiocManager", "dplyr", "knitr", "rmarkdown", "tibble", "ggplot2",
"ggrepel", "broom", "gplots", "tidyr", "sva", "methods")
user_choices <- readRDS("../data/user_choices.rds")
Expand Down Expand Up @@ -69,7 +69,7 @@ phenotype_data <- readRDS(paste0(user_choices$project_name, "/processed_data/phe

## Correction of technical batch effects with ComBat

```{r, correction of first batch effect and PCA}
```{r, correction of first batch effect and PCA, results='asis'}
m_values <- apply(Betas_clean_unfiltered_quantile_bmiq, 2, function(x) log2((x)/(1-x))) # get M-values

if (correction_variable_1 == "PLEASE FILL IN"){
Expand Down Expand Up @@ -230,7 +230,11 @@ if (correction_variable_1 == "PLEASE FILL IN"){
}
```

```{r, correction of second batch effect and PCA}
```{r, print p-value table for batch effects after correction of first batch effect variable}
anova_lm_pvalue_df %>% paged_table()
```

```{r, correction of second batch effect and PCA, results='asis'}
if (correction_variable_2 == "PLEASE FILL IN"){
print("No second batch correction variable was specified by user, data remains unchanged")
} else {
Expand Down Expand Up @@ -389,7 +393,11 @@ if (correction_variable_2 == "PLEASE FILL IN"){
}
```

```{r, correction of third batch effect and PCA}
```{r, print p-value table for batch effects after correction of second batch effect variable}
anova_lm_pvalue_df %>% paged_table()
```

```{r, correction of third batch effect and PCA, results='asis'}
if (correction_variable_3 == "PLEASE FILL IN"){
print("No third batch correction variable was specified by user, data remains unchanged")
} else {
Expand Down Expand Up @@ -548,7 +556,11 @@ if (correction_variable_3 == "PLEASE FILL IN"){
}
```

```{r, convertion M to Beta and save, include=FALSE}
```{r, print p-value table for batch effects after correction of third batch effect variable}
anova_lm_pvalue_df %>% paged_table()
```

```{r, conversion M to Beta and save, include=FALSE}
if(exists("m_values_combat_3")){
corrected_data <- m_values_combat_3
changing_status <- "changed"
Expand Down
26 changes: 13 additions & 13 deletions scripts/11_annotate_hg19_create_pseudo_epicv1_from_epicv2.Rmd
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
---
title: "Script 11: Create hg19 annotation for EPICv2 and pseudo-EPICv1 version from EPICv2 data"
date: '`r format(Sys.time(), %d %B, %Y")`'
date: '`r format(Sys.time(), "%d %B, %Y")`'
output: html_document
---

Expand All @@ -27,11 +27,10 @@ start_time <- Sys.time()

```{r setup, include=FALSE}
user_choices <- readRDS("../data/user_choices.rds")
knitr::opts_knit$set(root.dir = paste0(user_choices$personal_path, "/"))
knitr::opts_chunk$set(echo = FALSE)

if(user_choices$package_directory != "PLEASE FILL IN"){
.libPaths(c(user_choices$package_directory, .libPaths()))
if(user_choices$package_path != "PLEASE FILL IN"){
.libPaths(c(user_choices$package_path, .libPaths()))
}

needed_packages <- c("BiocManager", "dplyr", "knitr", "rmarkdown", "tibble", "minfi", "janitor")
Expand All @@ -57,6 +56,7 @@ load(paste0(user_choices$project_name, "/reports/annotations_clean_unfiltered.Rd
load(paste0(user_choices$project_name, "/processed_data/PhenoData_clean.Rdata"))
annotation_hg19 <- read.table(paste0(user_choices$personal_path, "/epic_preprocessing_k2h/data/EPICv2.hg19.manifest.tsv"))
annotation_hg19 <- row_to_names(annotation_hg19, row_number = 1, remove_row = TRUE, remove_rows_above = TRUE)
knitr::opts_knit$set(root.dir = paste0(user_choices$personal_path, "/"))
```

```{r, check array type, include = TRUE}
Expand All @@ -68,7 +68,7 @@ if (user_choices$array_type == "v1") {

```{r, hg19 annotation filtered data, include = FALSE}
annations_clean_filtered_hg19 <- annotation_hg19 %>%
dplyr::filter(Probe_ID %in% rownames(Betas_clean_quantile_bmiq_filtered_combated))
dplyr::filter(Probe_ID %in% rownames(Betas_clean_filtered_quantile_bmiq_combated))
save(annations_clean_filtered_hg19, file = paste0(user_choices$project_name, "/reports/annations_clean_filtered_hg19.Rdata"))
```

Expand All @@ -81,10 +81,10 @@ save(annotions_clean_unfiltered_hg19, file = paste0(user_choices$project_name, "
```{r, create pseudo-EPICv1 version for filtered data, include=FALSE}
# create annotation only containing filtered CpGs that are also on v1 and exclude any duplicated loci
annotations_clean_filtered_pseudo_v1 <- annotations_clean_filtered %>%
filter(Name %in% rownames(Betas_clean_filtered_quantile_bmiq_combated)) %>%
dplyr::filter(Name %in% rownames(Betas_clean_filtered_quantile_bmiq_combated)) %>%
# only keep loci on EPICv1, filter wrongly annotated empty value
filter(EPICv1_Loci != "") %>%
filter(duplicated(EPICv1_Loci) == FALSE)
dplyr::filter(EPICv1_Loci != "") %>%
dplyr::filter(duplicated(EPICv1_Loci) == FALSE)

# only keep betas that are on v1 and exclude any duplicated loci (keep first CpG)
keep_betas <- (rownames(Betas_clean_filtered_quantile_bmiq_combated) %in% annotations_clean_filtered_pseudo_v1$Name)
Expand All @@ -103,7 +103,7 @@ save(Betas_clean_filtered_quantile_bmiq_combated_pseudo_v1, file = paste0(user_c
cat(paste0(keep_betas_df %>% filter(keep_betas == FALSE) %>% nrow(),
" filtered CpGs were not found on EPICv1 and therefore excluded"), sep = "<br>\n")

dim_Betas_filtered <- dim(Betas_clean_quantile_bmiq_filtered_combated_pseudo_v1)
dim_Betas_filtered <- dim(Betas_clean_filtered_quantile_bmiq_combated_pseudo_v1)
step_number <- c("14")
step <- c("Create pseudo-EPICv1 version (filtered data)")
data_class <- c("Betas")
Expand All @@ -117,9 +117,9 @@ save(Betas_clean_filtered_quantile_bmiq_combated_pseudo_v1, file = paste0(user_c
```{r, create pseudo-EPICv1 version for unfiltered data, include=FALSE}
# create annotation only containing unfiltered CpGs that are also on v1 and exclude any duplicated loci
annotations_clean_unfiltered_pseudo_v1 <- annotations_clean_unfiltered %>%
filter(Name %in% rownames(Betas_clean_unfiltered_quantile_bmiq_combated)) %>%
filter(EPICv1_Loci != "") %>%
filter(duplicated(EPICv1_Loci) == FALSE)
dplyr::filter(Name %in% rownames(Betas_clean_unfiltered_quantile_bmiq_combated)) %>%
dplyr::filter(EPICv1_Loci != "") %>%
dplyr::filter(duplicated(EPICv1_Loci) == FALSE)

# only keep betas that are on v1 and exclude any duplicated loci (keep first CpG)
keep_betas <- (rownames(Betas_clean_unfiltered_quantile_bmiq_combated) %in% annotations_clean_unfiltered_pseudo_v1$Name)
Expand All @@ -138,7 +138,7 @@ save(Betas_clean_unfiltered_quantile_bmiq_combated_pseudo_v1, file = paste0(user
cat(paste0(keep_betas_df %>% filter(keep_betas == FALSE) %>% nrow(),
" unfiltered CpGs were not found on EPICv1 and therefore excluded"), sep = "<br>\n")

dim_Betas_unfiltered <- dim(Betas_clean_quantile_bmiq_unfiltered_combated_pseudo_v1)
dim_Betas_unfiltered <- dim(Betas_clean_unfiltered_quantile_bmiq_combated_pseudo_v1)
step_number <- c("15")
step <- c("Create pseudo-EPICv1 version (unfiltered data)")
data_class <- c("Betas")
Expand Down
16 changes: 12 additions & 4 deletions scripts/1_definitions_and_setup.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,10 @@ This script asks the user to define or choose:
+ person id
+ sex
* additional batch variables (up to 3)
* optionally:
+ path for R packages
+ path to the genotype data location
+ prefix (project name) of the genotype files

<br>
**Notes/Definitions:**
Expand All @@ -52,9 +56,9 @@ The path name should not include the name of the github repository
**Path to phenotype data:** The path to the data table containing phenotype information
The path name should include the name of data table itself. This file must be **.csv**
**Path to idat files:** The path to the raw files obtained from array
**Path to R package directories (optional):** The path to the personal directory for installing and loading packages.
**Path to R package directories (optional):** The path to a directory for installing and loading R packages.
This is needed if installation of R packages to the default R library are not permitted due to the lack of admin rights.
Thsi will enable R to install and load any needed packages without problems.
This will enable R to install and load any needed packages without problems.
Please make sure you have the needed writing permissions in this specified directory
**Array type:** "v1" or "v2". This stands for the different versions of the EPIC array
**Population ethnicity:** Choose "AFR", "AMR", "ASN", "EUR" or "Global". Global
Expand All @@ -78,6 +82,9 @@ be listed as F, M, or W. If your sex data is described in a different way
be considered in a batch check in script 8: plate, slide, array, row and column. Please
provide additional possible batch variables if needed as a character vector (up to 3,
add more if necessary). If not, please leave this field as is.
+**Path to genotype files:** The path to the QC-ed genotype data in plink file format (.bed, .bim, .fam)
+**Genotype project name:** Prefix of the genotype files, denoting the project (e.g. my_study_2020)

<br>

## Directory structure
Expand Down Expand Up @@ -143,10 +150,11 @@ user_choices <- data.frame(
"name_well_column" = name_well_column,
"name_personid_column" = name_personid_column,
"name_sex_column" = name_sex_column,
"name_treatment_column" = name_treatment_column,
"additional_batch_variable_1" = additional_batch_variable_1,
"additional_batch_variable_2" = additional_batch_variable_2,
"additional_batch_variable_3" = additional_batch_variable_3
"additional_batch_variable_3" = additional_batch_variable_3,
"genotype_path" = genotype_path,
"genotype_projectname" = genotype_projectname
)
saveRDS(user_choices, "../data/user_choices.rds")

Expand Down
2 changes: 1 addition & 1 deletion scripts/2_import_raw_DNAm_data.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ if (any(installed_biocmanager_packages == FALSE)) {
lapply(needed_biocmanager_packages, library, character.only = TRUE)

if (user_choices$array_type == "v2") {
if (!("jokergoo/IlluminaHumanMethylationEPICv2manifest" %in% rownames(installed.packages()))) {
if (!("IlluminaHumanMethylationEPICv2manifest" %in% rownames(installed.packages()))) {
BiocManager::install("jokergoo/IlluminaHumanMethylationEPICv2manifest")
}
if (!("IlluminaHumanMethylationEPICv2anno.20a1.hg38" %in% rownames(installed.packages()))) {
Expand Down
3 changes: 1 addition & 2 deletions scripts/3_quality_control.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ library(minfi)

knitr::opts_knit$set(root.dir = paste0(user_choices$personal_path, "/"))
knitr::opts_chunk$set(echo = FALSE)
phenotype_data <- readRDS(paste0(user_choices$project_name, "/processed_data/phenotype_data.rds"))
```

```{r load data, include=FALSE}
Expand Down Expand Up @@ -147,7 +146,7 @@ title_df <- title_df %>%
dplyr::select(arrayid, personid)

for (i in 1:ncol(RGSet_quality)){
title_df_current <- title_df %>% filter(arrayid == rownames(pData(RGSet_quality))[i] %>% pull(personid))
title_df_current <- title_df %>% filter(arrayid == rownames(pData(RGSet_quality))[i]) %>% pull(personid)
titel <- paste0(rownames(pData(RGSet_quality))[i], " / ", title_df_current)
densityPlot(as.matrix(Betas_quality[,i]), main = titel)
print(i)
Expand Down
1 change: 1 addition & 0 deletions scripts/4_check_matching_of_epigenetic_sex.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ fail the sex match check

```{r, predict sex with DNAm data, include= FALSE}
predictedSex <- getSex(gRatioSet_quality, cutoff=-2)
phenotype_data <- subset(phenotype_data, arrayid %in% rownames(predictedSex),)

sex_df <- phenotype_data %>%
select(personid, arrayid, reported_sex = sex) %>%
Expand Down