diff --git a/scripts/10_correction_batch_effects_unfiltered.Rmd b/scripts/10_correction_batch_effects_unfiltered.Rmd
index dce65a0..f59a991 100644
--- a/scripts/10_correction_batch_effects_unfiltered.Rmd
+++ b/scripts/10_correction_batch_effects_unfiltered.Rmd
@@ -30,7 +30,7 @@ start_time <- Sys.time()
```
```{r setup, include=FALSE}
-source("../data/calculate_pc_cutoff.R") # source function for determining PC cutoff
+source("../data/Calculate_PC_Cutoff.R") # source function for determining PC cutoff
needed_packages <- c("BiocManager", "dplyr", "knitr", "rmarkdown", "tibble", "ggplot2",
"ggrepel", "broom", "gplots", "tidyr", "sva", "methods")
user_choices <- readRDS("../data/user_choices.rds")
@@ -69,7 +69,7 @@ phenotype_data <- readRDS(paste0(user_choices$project_name, "/processed_data/phe
## Correction of technical batch effects with ComBat
-```{r, correction of first batch effect and PCA}
+```{r, correction of first batch effect and PCA, results='asis'}
m_values <- apply(Betas_clean_unfiltered_quantile_bmiq, 2, function(x) log2((x)/(1-x))) # get M-values
if (correction_variable_1 == "PLEASE FILL IN"){
@@ -230,7 +230,11 @@ if (correction_variable_1 == "PLEASE FILL IN"){
}
```
-```{r, correction of second batch effect and PCA}
+```{r, print p-value table for batch effects after correction of first batch effect variable}
+anova_lm_pvalue_df %>% paged_table()
+```
+
+```{r, correction of second batch effect and PCA, results='asis'}
if (correction_variable_2 == "PLEASE FILL IN"){
print("No second batch correction variable was specified by user, data remains unchanged")
} else {
@@ -389,7 +393,11 @@ if (correction_variable_2 == "PLEASE FILL IN"){
}
```
-```{r, correction of third batch effect and PCA}
+```{r, print p-value table for batch effects after correction of second batch effect variable}
+anova_lm_pvalue_df %>% paged_table()
+```
+
+```{r, correction of third batch effect and PCA, results='asis'}
if (correction_variable_3 == "PLEASE FILL IN"){
print("No third batch correction variable was specified by user, data remains unchanged")
} else {
@@ -548,7 +556,11 @@ if (correction_variable_3 == "PLEASE FILL IN"){
}
```
-```{r, convertion M to Beta and save, include=FALSE}
+```{r, print p-value table for batch effects after correction of third batch effect variable}
+anova_lm_pvalue_df %>% paged_table()
+```
+
+```{r, conversion M to Beta and save, include=FALSE}
if(exists("m_values_combat_3")){
corrected_data <- m_values_combat_3
changing_status <- "changed"
diff --git a/scripts/11_annotate_hg19_create_pseudo_epicv1_from_epicv2.Rmd b/scripts/11_annotate_hg19_create_pseudo_epicv1_from_epicv2.Rmd
index 2c57f79..34b1e51 100644
--- a/scripts/11_annotate_hg19_create_pseudo_epicv1_from_epicv2.Rmd
+++ b/scripts/11_annotate_hg19_create_pseudo_epicv1_from_epicv2.Rmd
@@ -1,6 +1,6 @@
---
title: "Script 11: Create hg19 annotation for EPICv2 and pseudo-EPICv1 version from EPICv2 data"
-date: '`r format(Sys.time(), %d %B, %Y")`'
+date: '`r format(Sys.time(), "%d %B, %Y")`'
output: html_document
---
@@ -27,11 +27,10 @@ start_time <- Sys.time()
```{r setup, include=FALSE}
user_choices <- readRDS("../data/user_choices.rds")
-knitr::opts_knit$set(root.dir = paste0(user_choices$personal_path, "/"))
knitr::opts_chunk$set(echo = FALSE)
-if(user_choices$package_directory != "PLEASE FILL IN"){
- .libPaths(c(user_choices$package_directory, .libPaths()))
+if(user_choices$package_path != "PLEASE FILL IN"){
+ .libPaths(c(user_choices$package_path, .libPaths()))
}
needed_packages <- c("BiocManager", "dplyr", "knitr", "rmarkdown", "tibble", "minfi", "janitor")
@@ -57,6 +56,7 @@ load(paste0(user_choices$project_name, "/reports/annotations_clean_unfiltered.Rd
load(paste0(user_choices$project_name, "/processed_data/PhenoData_clean.Rdata"))
annotation_hg19 <- read.table(paste0(user_choices$personal_path, "/epic_preprocessing_k2h/data/EPICv2.hg19.manifest.tsv"))
annotation_hg19 <- row_to_names(annotation_hg19, row_number = 1, remove_row = TRUE, remove_rows_above = TRUE)
+knitr::opts_knit$set(root.dir = paste0(user_choices$personal_path, "/"))
```
```{r, check array type, include = TRUE}
@@ -68,7 +68,7 @@ if (user_choices$array_type == "v1") {
```{r, hg19 annotation filtered data, include = FALSE}
annations_clean_filtered_hg19 <- annotation_hg19 %>%
- dplyr::filter(Probe_ID %in% rownames(Betas_clean_quantile_bmiq_filtered_combated))
+ dplyr::filter(Probe_ID %in% rownames(Betas_clean_filtered_quantile_bmiq_combated))
save(annations_clean_filtered_hg19, file = paste0(user_choices$project_name, "/reports/annations_clean_filtered_hg19.Rdata"))
```
@@ -81,10 +81,10 @@ save(annotions_clean_unfiltered_hg19, file = paste0(user_choices$project_name, "
```{r, create pseudo-EPICv1 version for filtered data, include=FALSE}
# create annotation only containing filtered CpGs that are also on v1 and exclude any duplicated loci
annotations_clean_filtered_pseudo_v1 <- annotations_clean_filtered %>%
- filter(Name %in% rownames(Betas_clean_filtered_quantile_bmiq_combated)) %>%
+ dplyr::filter(Name %in% rownames(Betas_clean_filtered_quantile_bmiq_combated)) %>%
# only keep loci on EPICv1, filter wrongly annotated empty value
- filter(EPICv1_Loci != "") %>%
- filter(duplicated(EPICv1_Loci) == FALSE)
+ dplyr::filter(EPICv1_Loci != "") %>%
+ dplyr::filter(duplicated(EPICv1_Loci) == FALSE)
# only keep betas that are on v1 and exclude any duplicated loci (keep first CpG)
keep_betas <- (rownames(Betas_clean_filtered_quantile_bmiq_combated) %in% annotations_clean_filtered_pseudo_v1$Name)
@@ -103,7 +103,7 @@ save(Betas_clean_filtered_quantile_bmiq_combated_pseudo_v1, file = paste0(user_c
cat(paste0(keep_betas_df %>% filter(keep_betas == FALSE) %>% nrow(),
" filtered CpGs were not found on EPICv1 and therefore excluded"), sep = "
\n")
- dim_Betas_filtered <- dim(Betas_clean_quantile_bmiq_filtered_combated_pseudo_v1)
+ dim_Betas_filtered <- dim(Betas_clean_filtered_quantile_bmiq_combated_pseudo_v1)
step_number <- c("14")
step <- c("Create pseudo-EPICv1 version (filtered data)")
data_class <- c("Betas")
@@ -117,9 +117,9 @@ save(Betas_clean_filtered_quantile_bmiq_combated_pseudo_v1, file = paste0(user_c
```{r, create pseudo-EPICv1 version for unfiltered data, include=FALSE}
# create annotation only containing unfiltered CpGs that are also on v1 and exclude any duplicated loci
annotations_clean_unfiltered_pseudo_v1 <- annotations_clean_unfiltered %>%
- filter(Name %in% rownames(Betas_clean_unfiltered_quantile_bmiq_combated)) %>%
- filter(EPICv1_Loci != "") %>%
- filter(duplicated(EPICv1_Loci) == FALSE)
+ dplyr::filter(Name %in% rownames(Betas_clean_unfiltered_quantile_bmiq_combated)) %>%
+ dplyr::filter(EPICv1_Loci != "") %>%
+ dplyr::filter(duplicated(EPICv1_Loci) == FALSE)
# only keep betas that are on v1 and exclude any duplicated loci (keep first CpG)
keep_betas <- (rownames(Betas_clean_unfiltered_quantile_bmiq_combated) %in% annotations_clean_unfiltered_pseudo_v1$Name)
@@ -138,7 +138,7 @@ save(Betas_clean_unfiltered_quantile_bmiq_combated_pseudo_v1, file = paste0(user
cat(paste0(keep_betas_df %>% filter(keep_betas == FALSE) %>% nrow(),
" unfiltered CpGs were not found on EPICv1 and therefore excluded"), sep = "
\n")
- dim_Betas_unfiltered <- dim(Betas_clean_quantile_bmiq_unfiltered_combated_pseudo_v1)
+ dim_Betas_unfiltered <- dim(Betas_clean_unfiltered_quantile_bmiq_combated_pseudo_v1)
step_number <- c("15")
step <- c("Create pseudo-EPICv1 version (unfiltered data)")
data_class <- c("Betas")
diff --git a/scripts/1_definitions_and_setup.Rmd b/scripts/1_definitions_and_setup.Rmd
index 404f356..0627cdf 100644
--- a/scripts/1_definitions_and_setup.Rmd
+++ b/scripts/1_definitions_and_setup.Rmd
@@ -43,6 +43,10 @@ This script asks the user to define or choose:
+ person id
+ sex
* additional batch variables (up to 3)
+* optionally:
+ + path for R packages
+ + path to the genotype data location
+ + prefix (project name) of the genotype files
**Notes/Definitions:**
@@ -52,9 +56,9 @@ The path name should not include the name of the github repository
**Path to phenotype data:** The path to the data table containing phenotype information
The path name should include the name of data table itself. This file must be **.csv**
**Path to idat files:** The path to the raw files obtained from array
-**Path to R package directories (optional):** The path to the personal directory for installing and loading packages.
+**Path to R package directories (optional):** The path to a directory for installing and loading R packages.
This is needed if installation of R packages to the default R library are not permitted due to the lack of admin rights.
-Thsi will enable R to install and load any needed packages without problems.
+This will enable R to install and load any needed packages without problems.
Please make sure you have the needed writing permissions in this specified directory
**Array type:** "v1" or "v2". This stands for the different versions of the EPIC array
**Population ethnicity:** Choose "AFR", "AMR", "ASN", "EUR" or "Global". Global
@@ -78,6 +82,9 @@ be listed as F, M, or W. If your sex data is described in a different way
be considered in a batch check in script 8: plate, slide, array, row and column. Please
provide additional possible batch variables if needed as a character vector (up to 3,
add more if necessary). If not, please leave this field as is.
++**Path to genotype files:** The path to the QC-ed genotype data in plink file format (.bed, .bim, .fam)
++**Genotype project name:** Prefix of the genotype files, denoting the project (e.g. my_study_2020)
+
## Directory structure
@@ -143,10 +150,11 @@ user_choices <- data.frame(
"name_well_column" = name_well_column,
"name_personid_column" = name_personid_column,
"name_sex_column" = name_sex_column,
- "name_treatment_column" = name_treatment_column,
"additional_batch_variable_1" = additional_batch_variable_1,
"additional_batch_variable_2" = additional_batch_variable_2,
- "additional_batch_variable_3" = additional_batch_variable_3
+ "additional_batch_variable_3" = additional_batch_variable_3,
+ "genotype_path" = genotype_path,
+ "genotype_projectname" = genotype_projectname
)
saveRDS(user_choices, "../data/user_choices.rds")
diff --git a/scripts/2_import_raw_DNAm_data.Rmd b/scripts/2_import_raw_DNAm_data.Rmd
index 3c84dd1..6b31569 100644
--- a/scripts/2_import_raw_DNAm_data.Rmd
+++ b/scripts/2_import_raw_DNAm_data.Rmd
@@ -29,7 +29,7 @@ if (any(installed_biocmanager_packages == FALSE)) {
lapply(needed_biocmanager_packages, library, character.only = TRUE)
if (user_choices$array_type == "v2") {
- if (!("jokergoo/IlluminaHumanMethylationEPICv2manifest" %in% rownames(installed.packages()))) {
+ if (!("IlluminaHumanMethylationEPICv2manifest" %in% rownames(installed.packages()))) {
BiocManager::install("jokergoo/IlluminaHumanMethylationEPICv2manifest")
}
if (!("IlluminaHumanMethylationEPICv2anno.20a1.hg38" %in% rownames(installed.packages()))) {
diff --git a/scripts/3_quality_control.Rmd b/scripts/3_quality_control.Rmd
index 8ef3db5..a22a324 100644
--- a/scripts/3_quality_control.Rmd
+++ b/scripts/3_quality_control.Rmd
@@ -27,7 +27,6 @@ library(minfi)
knitr::opts_knit$set(root.dir = paste0(user_choices$personal_path, "/"))
knitr::opts_chunk$set(echo = FALSE)
-phenotype_data <- readRDS(paste0(user_choices$project_name, "/processed_data/phenotype_data.rds"))
```
```{r load data, include=FALSE}
@@ -147,7 +146,7 @@ title_df <- title_df %>%
dplyr::select(arrayid, personid)
for (i in 1:ncol(RGSet_quality)){
- title_df_current <- title_df %>% filter(arrayid == rownames(pData(RGSet_quality))[i] %>% pull(personid))
+ title_df_current <- title_df %>% filter(arrayid == rownames(pData(RGSet_quality))[i]) %>% pull(personid)
titel <- paste0(rownames(pData(RGSet_quality))[i], " / ", title_df_current)
densityPlot(as.matrix(Betas_quality[,i]), main = titel)
print(i)
diff --git a/scripts/4_check_matching_of_epigenetic_sex.Rmd b/scripts/4_check_matching_of_epigenetic_sex.Rmd
index 736dafd..bf8ac07 100644
--- a/scripts/4_check_matching_of_epigenetic_sex.Rmd
+++ b/scripts/4_check_matching_of_epigenetic_sex.Rmd
@@ -54,6 +54,7 @@ fail the sex match check
```{r, predict sex with DNAm data, include= FALSE}
predictedSex <- getSex(gRatioSet_quality, cutoff=-2)
+phenotype_data <- subset(phenotype_data, arrayid %in% rownames(predictedSex),)
sex_df <- phenotype_data %>%
select(personid, arrayid, reported_sex = sex) %>%
diff --git a/scripts/6_filtering_cpgs.Rmd b/scripts/6_filtering_cpgs.Rmd
index 5319162..865c403 100644
--- a/scripts/6_filtering_cpgs.Rmd
+++ b/scripts/6_filtering_cpgs.Rmd
@@ -25,25 +25,29 @@ if (!("minfi" %in% rownames(installed.packages()))) {
}
library(minfi)
-knitr::opts_knit$set(root.dir = paste0(user_choices$personal_path, "/"))
knitr::opts_chunk$set(echo = FALSE)
```
-```{r load data, include=FALSE}
-summary_table_preprocessing <- readRDS(paste0(user_choices$project_name, "/reports/summary_table_preprocessing.rds"))
-load(paste0(user_choices$project_name, "/processed_data/RGSet_clean.Rdata"))
-load(paste0(user_choices$project_name, "/processed_data/gRatioSet_clean.Rdata"))
-load(paste0(user_choices$project_name, "/processed_data/Betas_clean.Rdata"))
-load(paste0(user_choices$project_name, "/processed_data/PhenoData_clean.Rdata"))
-load(paste0(user_choices$project_name, "/processed_data/detP_clean.Rdata"))
+```{r load data 1, include=FALSE}
# Info about cross-hybridizing probes comes from Chen et al. 2013, DOI: 10.4161/epi.23470:
-load("data/ChenProbeIDs.rdata")
+load("../data/ChenProbeIDs.rdata")
# Info about mapping inaccuracies for EPIC v2.0 from Illumina product information:
if (user_choices$array_type == "v2") {
v2_mapping_inaccuracy <- read.csv("../data/EPIC-8v2-0_A1-190MappingInaccuracies.csv")}
# Info about flagged probes for EPIC v2.0 from Illumina product information:
if (user_choices$array_type == "v2") {
v2_flagged_probes <- read.csv("../data/EPIC-8v2-0_A1-FlaggedProbes.csv")}
+
+knitr::opts_knit$set(root.dir = paste0(user_choices$personal_path, "/"))
+```
+
+```{r load data 2, include=FALSE}
+summary_table_preprocessing <- readRDS(paste0(user_choices$project_name, "/reports/summary_table_preprocessing.rds"))
+load(paste0(user_choices$project_name, "/processed_data/RGSet_clean.Rdata"))
+load(paste0(user_choices$project_name, "/processed_data/gRatioSet_clean.Rdata"))
+load(paste0(user_choices$project_name, "/processed_data/Betas_clean.Rdata"))
+load(paste0(user_choices$project_name, "/processed_data/PhenoData_clean.Rdata"))
+load(paste0(user_choices$project_name, "/processed_data/detP_clean.Rdata"))
```
# Description
@@ -127,13 +131,13 @@ if (user_choices$array_type == "v2") {
keep_betas_df <- as.data.frame(keep_betas)
cat(paste0(nrow(exclude_replicates), " replicate probes were removed"))
- dim_gRatioSet_filtered <- dim(gRatioSet_clean)
- dim_Betas_filtered <- dim(Betas_clean)
+ dim_gRatioSet_clean_filtered <- dim(gRatioSet_clean)
+ dim_Betas_clean_filtered <- dim(Betas_clean)
step_number <- c("4", "4")
step <- c("Filter replicates", "Filter replicates")
data_class <- c("gRatioSet", "Betas")
- samples <- c(dim_gRatioSet_filtered[2], dim_Betas_filtered[2])
- probes <- c(dim_gRatioSet_filtered[1], dim_Betas_filtered[1])
+ samples <- c(dim_gRatioSet_clean_filtered[2], dim_Betas_clean_filtered[2])
+ probes <- c(dim_gRatioSet_clean_filtered[1], dim_Betas_clean_filtered[1])
table_preprocessing_adding <- data.frame(step_number, step, data_class, samples, probes)
summary_table_preprocessing <- bind_rows(summary_table_preprocessing, table_preprocessing_adding)
@@ -150,7 +154,7 @@ Betas_clean_filtered <- Betas_clean[keep_betas,]
# ensure probes are in the same order in cleaned gRatioSet (RGSet with annotations) and detP objects, then subset
detP_clean_filtered2 <- detP_clean[match(featureNames(gRatioSet_clean),rownames(detP_clean)),]
-keep_gratioset <- rowSums(detP_clean_filtered < 0.01) == ncol(gRatioSet_clean)
+keep_gratioset <- rowSums(detP_clean_filtered2 < 0.01) == ncol(gRatioSet_clean)
gRatioSet_clean_filtered <- gRatioSet_clean[keep_gratioset,]
```
@@ -159,13 +163,13 @@ keep_betas_df <- as.data.frame(keep_betas)
cat(paste0(keep_betas_df %>% filter(keep_betas == FALSE) %>% nrow(),
" probes failed in one or more samples"), sep = "
\n")
-dim_gRatioSet_filtered <- dim(gRatioSet_clean_filtered)
-dim_Betas_filtered <- dim(Betas_clean_filtered)
+dim_gRatioSet_clean_filtered <- dim(gRatioSet_clean_filtered)
+dim_Betas_clean_filtered <- dim(Betas_clean_filtered)
step_number <- c("5", "5")
step <- c("Filter technical", "Filter technical")
data_class <- c("gRatioSet", "Betas")
-samples <- c(dim_gRatioSet_filtered[2], dim_Betas_filtered[2])
-probes <- c(dim_gRatioSet_filtered[1], dim_Betas_filtered[1])
+samples <- c(dim_gRatioSet_clean_filtered[2], dim_Betas_clean_filtered[2])
+probes <- c(dim_gRatioSet_clean_filtered[1], dim_Betas_clean_filtered[1])
table_preprocessing_adding <- data.frame(step_number, step, data_class, samples, probes)
summary_table_preprocessing <- bind_rows(summary_table_preprocessing, table_preprocessing_adding)
@@ -188,13 +192,13 @@ keep_betas_df <- as.data.frame(keep_betas)
cat(paste0(keep_betas_df %>% filter(keep_betas == FALSE) %>% nrow(),
" probes were on the X chromosome"), sep = "
\n")
-dim_gRatioSet_filtered <- dim(gRatioSet_clean_filtered)
-dim_Betas_filtered <- dim(Betas_clean_filtered)
+dim_gRatioSet_clean_filtered <- dim(gRatioSet_clean_filtered)
+dim_Betas_clean_filtered <- dim(Betas_clean_filtered)
step_number <- c("6", "6")
step <- c("Filter X Chromosome", "Filter X Chromosome")
data_class <- c("gRatioSet", "Betas")
-samples <- c(dim_gRatioSet_filtered[2], dim_Betas_filtered[2])
-probes <- c(dim_gRatioSet_filtered[1], dim_Betas_filtered[1])
+samples <- c(dim_gRatioSet_clean_filtered[2], dim_Betas_clean_filtered[2])
+probes <- c(dim_gRatioSet_clean_filtered[1], dim_Betas_clean_filtered[1])
table_preprocessing_adding <- data.frame(step_number, step, data_class, samples, probes)
summary_table_preprocessing <- bind_rows(summary_table_preprocessing, table_preprocessing_adding)
@@ -215,13 +219,13 @@ keep_betas_df <- as.data.frame(keep_betas)
cat(paste0(keep_betas_df %>% filter(keep_betas == FALSE) %>% nrow(),
" probes were on the Y chromosome"), sep = "
\n")
-dim_gRatioSet_filtered <- dim(gRatioSet_clean_filtered)
-dim_Betas_filtered <- dim(Betas_clean_filtered)
+dim_gRatioSet_clean_filtered <- dim(gRatioSet_clean_filtered)
+dim_Betas_clean_filtered <- dim(Betas_clean_filtered)
step_number <- c("7", "7")
step <- c("Filter Y Chromosome", "Filter Y Chromosome")
data_class <- c("gRatioSet", "Betas")
-samples <- c(dim_gRatioSet_filtered[2], dim_Betas_filtered[2])
-probes <- c(dim_gRatioSet_filtered[1], dim_Betas_filtered[1])
+samples <- c(dim_gRatioSet_clean_filtered[2], dim_Betas_clean_filtered[2])
+probes <- c(dim_gRatioSet_clean_filtered[1], dim_Betas_clean_filtered[1])
table_preprocessing_adding <- data.frame(step_number, step, data_class, samples, probes)
summary_table_preprocessing <- bind_rows(summary_table_preprocessing, table_preprocessing_adding)
@@ -235,13 +239,13 @@ Betas_clean_filtered <- Betas_clean_filtered[rownames(Betas_clean_filtered) %in%
```
```{r, info output for user probes affected by snps, results='asis'}
-dim_gRatioSet_filtered <- dim(gRatioSet_clean_filtered)
-dim_Betas_filtered <- dim(gRatioSet_clean_filtered)
+dim_gRatioSet_clean_filtered <- dim(gRatioSet_clean_filtered)
+dim_Betas_clean_filtered <- dim(gRatioSet_clean_filtered)
step_number <- c("8", "8")
step <- c("Filter SNP Affected", "Filter SNP Affected")
data_class <- c("gRatioSet", "Betas")
-samples <- c(dim_gRatioSet_filtered[2], dim_Betas_filtered[2])
-probes <- c(dim_gRatioSet_filtered[1], dim_Betas_filtered[1])
+samples <- c(dim_gRatioSet_clean_filtered[2], dim_Betas_clean_filtered[2])
+probes <- c(dim_gRatioSet_clean_filtered[1], dim_Betas_clean_filtered[1])
table_preprocessing_adding <- data.frame(step_number, step, data_class, samples, probes)
summary_table_preprocessing <- bind_rows(summary_table_preprocessing, table_preprocessing_adding)
@@ -283,13 +287,13 @@ if (user_choices$array_type == "v1") {
cat(paste0(keep_betas_df %>% filter(keep_betas == FALSE) %>% nrow(),
" probes were cross-hybridizing according to Chen et al."), sep = "
\n")
- dim_gRatioSet_filtered <- dim(gRatioSet_clean_filtered)
- dim_Betas_filtered <- dim(Betas_clean_filtered)
+ dim_gRatioSet_clean_filtered <- dim(gRatioSet_clean_filtered)
+ dim_Betas_clean_filtered <- dim(Betas_clean_filtered)
step_number <- c("9", "9")
step <- c("Filter Cross-Hybridizing Chen", "Filter Cross-Hybridizing Chen")
data_class <- c("gRatioSet", "Betas")
- samples <- c(dim_gRatioSet_filtered[2], dim_Betas_filtered[2])
- probes <- c(dim_gRatioSet_filtered[1], dim_Betas_filtered[1])
+ samples <- c(dim_gRatioSet_clean_filtered[2], dim_Betas_clean_filtered[2])
+ probes <- c(dim_gRatioSet_clean_filtered[1], dim_Betas_clean_filtered[1])
table_preprocessing_adding <- data.frame(step_number, step, data_class, samples, probes)
summary_table_preprocessing <- bind_rows(summary_table_preprocessing, table_preprocessing_adding)
@@ -305,8 +309,8 @@ if (user_choices$array_type == "v1") {
keep_betas <- !(rownames(Betas_clean_filtered) %in% exclude_McCartney$V1)
Betas_clean_filtered <- Betas_clean_filtered[keep_betas,]
- keep_gratioset <- !(featureNames(dim_gRatioSet_filtered) %in% exclude_McCartney$V1)
- dim_gRatioSet_filtered <- dim_gRatioSet_filtered[keep_gratioset,]
+ keep_gratioset <- !(featureNames(dim_gRatioSet_clean_filtered) %in% exclude_McCartney$V1)
+ gRatioSet_clean_filtered <- gRatioSet_clean_filtered[keep_gratioset,]
}
```
@@ -316,13 +320,13 @@ if (user_choices$array_type == "v1") {
cat(paste0(keep_betas_df %>% filter(keep_betas == FALSE) %>% nrow(),
" probes may be cross-hybridizing according to McCartney et al."), sep = "
\n")
- dim_gRatioSet_filtered <- dim(gRatioSet_clean_filtered)
- dim_Betas_filtered <- dim(Betas_clean_filtered)
+ dim_gRatioSet_clean_filtered <- dim(gRatioSet_clean_filtered)
+ dim_Betas_clean_filtered <- dim(Betas_clean_filtered)
step_number <- c("10", "10")
step <- c("Filter Cross-Hybridizing McCartney", "Filter Cross-Hybridizing McCartney")
data_class <- c("gRatioSet", "Betas")
- samples <- c(dim_gRatioSet_filtered[2], dim_Betas_filtered[2])
- probes <- c(dim_gRatioSet_filtered[1], dim_Betas_filtered[1])
+ samples <- c(dim_gRatioSet_clean_filtered[2], dim_Betas_clean_filtered[2])
+ probes <- c(dim_gRatioSet_clean_filtered[1], dim_Betas_clean_filtered[1])
table_preprocessing_adding <- data.frame(step_number, step, data_class, samples, probes)
summary_table_preprocessing <- bind_rows(summary_table_preprocessing, table_preprocessing_adding)
@@ -351,13 +355,13 @@ if (user_choices$array_type == "v1") {
cat(paste0(keep_betas_df %>% filter(keep_betas == FALSE) %>% nrow(),
" probes were polymorphic"), sep = "
\n")
- dim_gRatioSet_filtered <- dim(gRatioSet_clean_filtered)
- dim_Betas_filtered <- dim(Betas_clean_filtered)
+ dim_gRatioSet_clean_filtered <- dim(gRatioSet_clean_filtered)
+ dim_Betas_clean_filtered <- dim(Betas_clean_filtered)
step_number <- c("11", "11")
step <- c("Filter Polymorphic", "Filter Polymorphic")
data_class <- c("gRatioSet", "Betas")
- samples <- c(dim_gRatioSet_filtered[2], dim_Betas_filtered[2])
- probes <- c(dim_gRatioSet_filtered[1], dim_Betas_filtered[1])
+ samples <- c(dim_gRatioSet_clean_filtered[2], dim_Betas_clean_filtered[2])
+ probes <- c(dim_gRatioSet_clean_filtered[1], dim_Betas_clean_filtered[1])
table_preprocessing_adding <- data.frame(step_number, step, data_class, samples, probes)
summary_table_preprocessing <- bind_rows(summary_table_preprocessing, table_preprocessing_adding)
@@ -369,10 +373,10 @@ saveRDS(summary_table_preprocessing, paste0(user_choices$project_name, "/reports
```{r, filter out mapping inaccuracies, include=FALSE}
if (user_choices$array_type == "v2") {
- keep_betas <- !(rownames(Betas_clean_filtered) %in% v2_mapping_inacc$IlmnID)
+ keep_betas <- !(rownames(Betas_clean_filtered) %in% v2_mapping_inaccuracy$IlmnID)
Betas_clean_filtered <- Betas_clean_filtered[keep_betas,]
- keep_gratioset <- !(featureNames(gRatioSet_clean_filtered) %in% v2_mapping_inacc$IlmnID)
+ keep_gratioset <- !(featureNames(gRatioSet_clean_filtered) %in% v2_mapping_inaccuracy$IlmnID)
gRatioSet_clean_filtered <- gRatioSet_clean_filtered[keep_gratioset,]
}
```
@@ -383,13 +387,13 @@ if (user_choices$array_type == "v2") {
cat(paste0(keep_betas_df %>% filter(keep_betas == FALSE) %>% nrow(),
" CpGs show known mapping inaccuracies"), sep = "
\n")
- dim_gRatioSet_filtered <- dim(gRatioSet_clean_filtered)
- dim_Betas_filtered <- dim(Betas_clean_filtered)
+ dim_gRatioSet_clean_filtered <- dim(gRatioSet_clean_filtered)
+ dim_Betas_clean_filtered <- dim(Betas_clean_filtered)
step_number <- c("12", "12")
step <- c("Filter Mapping Inaccuracies", "Filter Mapping Inaccuracies")
data_class <- c("gRatioSet", "Betas")
- samples <- c(dim_gRatioSet_filtered[2], dim_Betas_filtered[2])
- probes <- c(dim_gRatioSet_filtered[1], dim_Betas_filtered[1])
+ samples <- c(dim_gRatioSet_clean_filtered[2], dim_Betas_clean_filtered[2])
+ probes <- c(dim_gRatioSet_clean_filtered[1], dim_Betas_clean_filtered[1])
table_preprocessing_adding <- data.frame(step_number, step, data_class, samples, probes)
summary_table_preprocessing <- bind_rows(summary_table_preprocessing, table_preprocessing_adding)
@@ -414,13 +418,13 @@ if (user_choices$array_type == "v2") {
cat(paste0(keep_betas_df %>% filter(keep_betas == FALSE) %>% nrow(),
" CpGs were flagged by Illumina"), sep = "
\n")
- dim_gRatioSet_filtered <- dim(gRatioSet_clean_filtered)
- dim_Betas_filtered <- dim(Betas_clean_filtered)
+ dim_gRatioSet_clean_filtered <- dim(gRatioSet_clean_filtered)
+ dim_Betas_clean_filtered <- dim(Betas_clean_filtered)
step_number <- c("13", "13")
step <- c("Filter Flagged Probes", "Filter Flagged Probes")
data_class <- c("gRatioSet", "Betas")
- samples <- c(dim_gRatioSet_filtered[2], dim_Betas_filtered[2])
- probes <- c(dim_gRatioSet_filtered[1], dim_Betas_filtered[1])
+ samples <- c(dim_gRatioSet_clean_filtered[2], dim_Betas_clean_filtered[2])
+ probes <- c(dim_gRatioSet_clean_filtered[1], dim_Betas_clean_filtered[1])
table_preprocessing_adding <- data.frame(step_number, step, data_class, samples, probes)
summary_table_preprocessing <- bind_rows(summary_table_preprocessing, table_preprocessing_adding)
@@ -428,7 +432,7 @@ if (user_choices$array_type == "v2") {
```
```{r, save data, include=FALSE}
-save(gRatioSet_filtered, file = paste0(user_choices$project_name, "/processed_data/gRatioSet_clean_filtered.Rdata"))
+save(gRatioSet_clean_filtered, file = paste0(user_choices$project_name, "/processed_data/gRatioSet_clean_filtered.Rdata"))
Betas_clean_filtered <- getBeta(gRatioSet_clean_filtered) # beta values
save(Betas_clean_filtered, file = paste0(user_choices$project_name, "/processed_data/Betas_clean_filtered.Rdata"))
diff --git a/scripts/7_normalization.Rmd b/scripts/7_normalization.Rmd
index f9e8abf..583c20b 100644
--- a/scripts/7_normalization.Rmd
+++ b/scripts/7_normalization.Rmd
@@ -31,7 +31,7 @@ knitr::opts_knit$set(root.dir = paste0(user_choices$personal_path, "/"))
knitr::opts_chunk$set(echo = FALSE)
if (user_choices$array_type == "v2") {
- if (!("jokergoo/IlluminaHumanMethylationEPICv2manifest" %in% rownames(installed.packages()))) {
+ if (!("IlluminaHumanMethylationEPICv2manifest" %in% rownames(installed.packages()))) {
BiocManager::install("jokergoo/IlluminaHumanMethylationEPICv2manifest")
}
if (!("IlluminaHumanMethylationEPICv2anno.20a1.hg38" %in% rownames(installed.packages()))) {
@@ -84,27 +84,23 @@ save(gRatioSet_clean_filtered_quantile, file = paste0(user_choices$project_name,
Betas_clean_filtered_quantile <- getBeta(gRatioSet_clean_filtered_quantile)
save(Betas_clean_filtered_quantile, file = paste0(user_choices$project_name,
- "/processed_data/Betas_clean_filtered_quantile.Rdata"))
-Ms_clean_filtered_qunatile <- getM(gRatioSet_clean_filtered_quantile)
-save(Ms_clean_filtered_qunatile, file = paste0(user_choices$project_name,
- "/processed_data/Ms_clean_filtered_qunatile.Rdata"))
+ "/processed_data/Betas_clean_filtered_quantile.Rdata"))
+Ms_clean_filtered_quantile <- getM(gRatioSet_clean_filtered_quantile)
+save(Ms_clean_filtered_quantile, file = paste0(user_choices$project_name,
+ "/processed_data/Ms_clean_filtered_quantile.Rdata"))
# further normalization with BMIQ:
-probeType <- as.data.frame(annotations_clean_filtered[rownames(Betas_clean_filtered_quantile),c("Name","Type")])
+probeType <- as.data.frame(subset(annotations_clean_filtered, Name %in% rownames(Betas_clean_filtered_quantile), select=c("Name","Type")))
probeType$probeType = ifelse(probeType$Type %in% "I", 1, 2)
Betas_clean_filtered_quantile_bmiq <- apply(Betas_clean_filtered_quantile[,1:length(colnames(Betas_clean_filtered_quantile))],2,
function(a) BMIQ(a,probeType$probeType,plots=FALSE)$nbeta) # sourced from script "BMIQ_1.6_Teschendorff.R"
-save(Betas_clean_filtered_quantile_bmiq, file = paste0(user_choices$project_name,
- "/processed_data/Betas_clean_filtered_quantile_bmiq.Rdata"))
+save(Betas_clean_filtered_quantile_bmiq, file = paste0(user_choices$project_name, "/processed_data/Betas_clean_filtered_quantile_bmiq.Rdata"))
```
-```{r. normalize unfiltered probes, include=FALSE}
+```{r normalize unfiltered probes, include=FALSE}
# Normalization of unfiltered data
-
# Functional normalization
-# Note: sex is set to "F" for all samples since sex chromosomes were already removed in script 6
-# Note: This does not affect the normalization or the phenotype data, it simply stops preprocessQuantile() from producing an error
-gRatioSet_clean_unfiltered_quantile <- preprocessQuantile(RGSet_clean, sex = "F")
+gRatioSet_clean_unfiltered_quantile <- preprocessQuantile(RGSet_clean)
save(gRatioSet_clean_unfiltered_quantile, file = paste0(user_choices$project_name,
"/processed_data/gRatioSet_clean_unfiltered_quantile.Rdata")) # output: GenomicRatioSet
diff --git a/scripts/8_removal_of_outliers_detect_batch_effects.Rmd b/scripts/8_removal_of_outliers_detect_batch_effects.Rmd
index 6bbf7ec..8d176f6 100644
--- a/scripts/8_removal_of_outliers_detect_batch_effects.Rmd
+++ b/scripts/8_removal_of_outliers_detect_batch_effects.Rmd
@@ -46,7 +46,7 @@ start_time <- Sys.time()
```{r setup, include=FALSE}
-source("../data/calculate_pc_cutoff.R") # source function for determining PC cutoff
+source("../data/Calculate_PC_Cutoff.R") # source function for determining PC cutoff
needed_packages <- c("BiocManager", "dplyr", "knitr", "rmarkdown", "tibble", "ggplot2",
"ggrepel", "broom", "gplots", "tidyr", "sva", "methods")
user_choices <- readRDS("../data/user_choices.rds")
diff --git a/scripts/9_correction_batch_effects_filtered.Rmd b/scripts/9_correction_batch_effects_filtered.Rmd
index a8e1daf..17500de 100644
--- a/scripts/9_correction_batch_effects_filtered.Rmd
+++ b/scripts/9_correction_batch_effects_filtered.Rmd
@@ -41,7 +41,7 @@ start_time <- Sys.time()
```
```{r setup, include=FALSE}
-source("../data/calculate_pc_cutoff.R") # source function for determining PC cutoff
+source("../data/Calculate_PC_Cutoff.R") # source function for determining PC cutoff
needed_packages <- c("BiocManager", "dplyr", "knitr", "rmarkdown", "tibble", "ggplot2",
"ggrepel", "broom", "gplots", "tidyr", "sva", "methods")
user_choices <- readRDS("../data/user_choices.rds")
@@ -79,7 +79,7 @@ phenotype_data <- readRDS(paste0(user_choices$project_name, "/processed_data/phe
## Correction of technical batch effects with ComBat
-```{r, correction of first batch effect and PCA}
+```{r, correction of first batch effect and PCA, results='asis'}
m_values <- apply(Betas_clean_filtered_quantile_bmiq, 2, function(x) log2((x)/(1-x))) # get M-values
if (correction_variable_1 == "PLEASE FILL IN"){
@@ -240,7 +240,11 @@ if (correction_variable_1 == "PLEASE FILL IN"){
}
```
-```{r, correction of second batch effect and PCA}
+```{r, print p-value table for batch effects after correction of first batch effect variable}
+anova_lm_pvalue_df %>% paged_table()
+```
+
+```{r, correction of second batch effect and PCA, results='asis'}
if (correction_variable_2 == "PLEASE FILL IN"){
print("No second batch correction variable was specified by user, data remains unchanged")
} else {
@@ -399,6 +403,10 @@ if (correction_variable_2 == "PLEASE FILL IN"){
}
```
+```{r, print p-value table for batch effects after correction of second batch effect variable}
+anova_lm_pvalue_df %>% paged_table()
+```
+
```{r, correction of third batch effect and PCA}
if (correction_variable_3 == "PLEASE FILL IN"){
print("No third batch correction variable was specified by user, data remains unchanged")
@@ -557,6 +565,10 @@ if (correction_variable_3 == "PLEASE FILL IN"){
}
```
+```{r, print p-value table for batch effects after correction of third batch effect variable}
+anova_lm_pvalue_df %>% paged_table()
+```
+
```{r, convertion M to Beta and save, include=FALSE}
if(exists("m_values_combat_3")){
corrected_data <- m_values_combat_3