diff --git a/scripts/10_correction_batch_effects_unfiltered.Rmd b/scripts/10_correction_batch_effects_unfiltered.Rmd index dce65a0..f59a991 100644 --- a/scripts/10_correction_batch_effects_unfiltered.Rmd +++ b/scripts/10_correction_batch_effects_unfiltered.Rmd @@ -30,7 +30,7 @@ start_time <- Sys.time() ``` ```{r setup, include=FALSE} -source("../data/calculate_pc_cutoff.R") # source function for determining PC cutoff +source("../data/Calculate_PC_Cutoff.R") # source function for determining PC cutoff needed_packages <- c("BiocManager", "dplyr", "knitr", "rmarkdown", "tibble", "ggplot2", "ggrepel", "broom", "gplots", "tidyr", "sva", "methods") user_choices <- readRDS("../data/user_choices.rds") @@ -69,7 +69,7 @@ phenotype_data <- readRDS(paste0(user_choices$project_name, "/processed_data/phe ## Correction of technical batch effects with ComBat -```{r, correction of first batch effect and PCA} +```{r, correction of first batch effect and PCA, results='asis'} m_values <- apply(Betas_clean_unfiltered_quantile_bmiq, 2, function(x) log2((x)/(1-x))) # get M-values if (correction_variable_1 == "PLEASE FILL IN"){ @@ -230,7 +230,11 @@ if (correction_variable_1 == "PLEASE FILL IN"){ } ``` -```{r, correction of second batch effect and PCA} +```{r, print p-value table for batch effects after correction of first batch effect variable} +anova_lm_pvalue_df %>% paged_table() +``` + +```{r, correction of second batch effect and PCA, results='asis'} if (correction_variable_2 == "PLEASE FILL IN"){ print("No second batch correction variable was specified by user, data remains unchanged") } else { @@ -389,7 +393,11 @@ if (correction_variable_2 == "PLEASE FILL IN"){ } ``` -```{r, correction of third batch effect and PCA} +```{r, print p-value table for batch effects after correction of second batch effect variable} +anova_lm_pvalue_df %>% paged_table() +``` + +```{r, correction of third batch effect and PCA, results='asis'} if (correction_variable_3 == "PLEASE FILL IN"){ print("No third batch correction variable was specified by user, data remains unchanged") } else { @@ -548,7 +556,11 @@ if (correction_variable_3 == "PLEASE FILL IN"){ } ``` -```{r, convertion M to Beta and save, include=FALSE} +```{r, print p-value table for batch effects after correction of third batch effect variable} +anova_lm_pvalue_df %>% paged_table() +``` + +```{r, conversion M to Beta and save, include=FALSE} if(exists("m_values_combat_3")){ corrected_data <- m_values_combat_3 changing_status <- "changed" diff --git a/scripts/11_annotate_hg19_create_pseudo_epicv1_from_epicv2.Rmd b/scripts/11_annotate_hg19_create_pseudo_epicv1_from_epicv2.Rmd index 2c57f79..34b1e51 100644 --- a/scripts/11_annotate_hg19_create_pseudo_epicv1_from_epicv2.Rmd +++ b/scripts/11_annotate_hg19_create_pseudo_epicv1_from_epicv2.Rmd @@ -1,6 +1,6 @@ --- title: "Script 11: Create hg19 annotation for EPICv2 and pseudo-EPICv1 version from EPICv2 data" -date: '`r format(Sys.time(), %d %B, %Y")`' +date: '`r format(Sys.time(), "%d %B, %Y")`' output: html_document --- @@ -27,11 +27,10 @@ start_time <- Sys.time() ```{r setup, include=FALSE} user_choices <- readRDS("../data/user_choices.rds") -knitr::opts_knit$set(root.dir = paste0(user_choices$personal_path, "/")) knitr::opts_chunk$set(echo = FALSE) -if(user_choices$package_directory != "PLEASE FILL IN"){ - .libPaths(c(user_choices$package_directory, .libPaths())) +if(user_choices$package_path != "PLEASE FILL IN"){ + .libPaths(c(user_choices$package_path, .libPaths())) } needed_packages <- c("BiocManager", "dplyr", "knitr", "rmarkdown", "tibble", "minfi", "janitor") @@ -57,6 +56,7 @@ load(paste0(user_choices$project_name, "/reports/annotations_clean_unfiltered.Rd load(paste0(user_choices$project_name, "/processed_data/PhenoData_clean.Rdata")) annotation_hg19 <- read.table(paste0(user_choices$personal_path, "/epic_preprocessing_k2h/data/EPICv2.hg19.manifest.tsv")) annotation_hg19 <- row_to_names(annotation_hg19, row_number = 1, remove_row = TRUE, remove_rows_above = TRUE) +knitr::opts_knit$set(root.dir = paste0(user_choices$personal_path, "/")) ``` ```{r, check array type, include = TRUE} @@ -68,7 +68,7 @@ if (user_choices$array_type == "v1") { ```{r, hg19 annotation filtered data, include = FALSE} annations_clean_filtered_hg19 <- annotation_hg19 %>% - dplyr::filter(Probe_ID %in% rownames(Betas_clean_quantile_bmiq_filtered_combated)) + dplyr::filter(Probe_ID %in% rownames(Betas_clean_filtered_quantile_bmiq_combated)) save(annations_clean_filtered_hg19, file = paste0(user_choices$project_name, "/reports/annations_clean_filtered_hg19.Rdata")) ``` @@ -81,10 +81,10 @@ save(annotions_clean_unfiltered_hg19, file = paste0(user_choices$project_name, " ```{r, create pseudo-EPICv1 version for filtered data, include=FALSE} # create annotation only containing filtered CpGs that are also on v1 and exclude any duplicated loci annotations_clean_filtered_pseudo_v1 <- annotations_clean_filtered %>% - filter(Name %in% rownames(Betas_clean_filtered_quantile_bmiq_combated)) %>% + dplyr::filter(Name %in% rownames(Betas_clean_filtered_quantile_bmiq_combated)) %>% # only keep loci on EPICv1, filter wrongly annotated empty value - filter(EPICv1_Loci != "") %>% - filter(duplicated(EPICv1_Loci) == FALSE) + dplyr::filter(EPICv1_Loci != "") %>% + dplyr::filter(duplicated(EPICv1_Loci) == FALSE) # only keep betas that are on v1 and exclude any duplicated loci (keep first CpG) keep_betas <- (rownames(Betas_clean_filtered_quantile_bmiq_combated) %in% annotations_clean_filtered_pseudo_v1$Name) @@ -103,7 +103,7 @@ save(Betas_clean_filtered_quantile_bmiq_combated_pseudo_v1, file = paste0(user_c cat(paste0(keep_betas_df %>% filter(keep_betas == FALSE) %>% nrow(), " filtered CpGs were not found on EPICv1 and therefore excluded"), sep = "
\n") - dim_Betas_filtered <- dim(Betas_clean_quantile_bmiq_filtered_combated_pseudo_v1) + dim_Betas_filtered <- dim(Betas_clean_filtered_quantile_bmiq_combated_pseudo_v1) step_number <- c("14") step <- c("Create pseudo-EPICv1 version (filtered data)") data_class <- c("Betas") @@ -117,9 +117,9 @@ save(Betas_clean_filtered_quantile_bmiq_combated_pseudo_v1, file = paste0(user_c ```{r, create pseudo-EPICv1 version for unfiltered data, include=FALSE} # create annotation only containing unfiltered CpGs that are also on v1 and exclude any duplicated loci annotations_clean_unfiltered_pseudo_v1 <- annotations_clean_unfiltered %>% - filter(Name %in% rownames(Betas_clean_unfiltered_quantile_bmiq_combated)) %>% - filter(EPICv1_Loci != "") %>% - filter(duplicated(EPICv1_Loci) == FALSE) + dplyr::filter(Name %in% rownames(Betas_clean_unfiltered_quantile_bmiq_combated)) %>% + dplyr::filter(EPICv1_Loci != "") %>% + dplyr::filter(duplicated(EPICv1_Loci) == FALSE) # only keep betas that are on v1 and exclude any duplicated loci (keep first CpG) keep_betas <- (rownames(Betas_clean_unfiltered_quantile_bmiq_combated) %in% annotations_clean_unfiltered_pseudo_v1$Name) @@ -138,7 +138,7 @@ save(Betas_clean_unfiltered_quantile_bmiq_combated_pseudo_v1, file = paste0(user cat(paste0(keep_betas_df %>% filter(keep_betas == FALSE) %>% nrow(), " unfiltered CpGs were not found on EPICv1 and therefore excluded"), sep = "
\n") - dim_Betas_unfiltered <- dim(Betas_clean_quantile_bmiq_unfiltered_combated_pseudo_v1) + dim_Betas_unfiltered <- dim(Betas_clean_unfiltered_quantile_bmiq_combated_pseudo_v1) step_number <- c("15") step <- c("Create pseudo-EPICv1 version (unfiltered data)") data_class <- c("Betas") diff --git a/scripts/1_definitions_and_setup.Rmd b/scripts/1_definitions_and_setup.Rmd index 404f356..0627cdf 100644 --- a/scripts/1_definitions_and_setup.Rmd +++ b/scripts/1_definitions_and_setup.Rmd @@ -43,6 +43,10 @@ This script asks the user to define or choose: + person id + sex * additional batch variables (up to 3) +* optionally: + + path for R packages + + path to the genotype data location + + prefix (project name) of the genotype files
**Notes/Definitions:** @@ -52,9 +56,9 @@ The path name should not include the name of the github repository **Path to phenotype data:** The path to the data table containing phenotype information The path name should include the name of data table itself. This file must be **.csv** **Path to idat files:** The path to the raw files obtained from array -**Path to R package directories (optional):** The path to the personal directory for installing and loading packages. +**Path to R package directories (optional):** The path to a directory for installing and loading R packages. This is needed if installation of R packages to the default R library are not permitted due to the lack of admin rights. -Thsi will enable R to install and load any needed packages without problems. +This will enable R to install and load any needed packages without problems. Please make sure you have the needed writing permissions in this specified directory **Array type:** "v1" or "v2". This stands for the different versions of the EPIC array **Population ethnicity:** Choose "AFR", "AMR", "ASN", "EUR" or "Global". Global @@ -78,6 +82,9 @@ be listed as F, M, or W. If your sex data is described in a different way be considered in a batch check in script 8: plate, slide, array, row and column. Please provide additional possible batch variables if needed as a character vector (up to 3, add more if necessary). If not, please leave this field as is. ++**Path to genotype files:** The path to the QC-ed genotype data in plink file format (.bed, .bim, .fam) ++**Genotype project name:** Prefix of the genotype files, denoting the project (e.g. my_study_2020) +
## Directory structure @@ -143,10 +150,11 @@ user_choices <- data.frame( "name_well_column" = name_well_column, "name_personid_column" = name_personid_column, "name_sex_column" = name_sex_column, - "name_treatment_column" = name_treatment_column, "additional_batch_variable_1" = additional_batch_variable_1, "additional_batch_variable_2" = additional_batch_variable_2, - "additional_batch_variable_3" = additional_batch_variable_3 + "additional_batch_variable_3" = additional_batch_variable_3, + "genotype_path" = genotype_path, + "genotype_projectname" = genotype_projectname ) saveRDS(user_choices, "../data/user_choices.rds") diff --git a/scripts/2_import_raw_DNAm_data.Rmd b/scripts/2_import_raw_DNAm_data.Rmd index 3c84dd1..6b31569 100644 --- a/scripts/2_import_raw_DNAm_data.Rmd +++ b/scripts/2_import_raw_DNAm_data.Rmd @@ -29,7 +29,7 @@ if (any(installed_biocmanager_packages == FALSE)) { lapply(needed_biocmanager_packages, library, character.only = TRUE) if (user_choices$array_type == "v2") { - if (!("jokergoo/IlluminaHumanMethylationEPICv2manifest" %in% rownames(installed.packages()))) { + if (!("IlluminaHumanMethylationEPICv2manifest" %in% rownames(installed.packages()))) { BiocManager::install("jokergoo/IlluminaHumanMethylationEPICv2manifest") } if (!("IlluminaHumanMethylationEPICv2anno.20a1.hg38" %in% rownames(installed.packages()))) { diff --git a/scripts/3_quality_control.Rmd b/scripts/3_quality_control.Rmd index 8ef3db5..a22a324 100644 --- a/scripts/3_quality_control.Rmd +++ b/scripts/3_quality_control.Rmd @@ -27,7 +27,6 @@ library(minfi) knitr::opts_knit$set(root.dir = paste0(user_choices$personal_path, "/")) knitr::opts_chunk$set(echo = FALSE) -phenotype_data <- readRDS(paste0(user_choices$project_name, "/processed_data/phenotype_data.rds")) ``` ```{r load data, include=FALSE} @@ -147,7 +146,7 @@ title_df <- title_df %>% dplyr::select(arrayid, personid) for (i in 1:ncol(RGSet_quality)){ - title_df_current <- title_df %>% filter(arrayid == rownames(pData(RGSet_quality))[i] %>% pull(personid)) + title_df_current <- title_df %>% filter(arrayid == rownames(pData(RGSet_quality))[i]) %>% pull(personid) titel <- paste0(rownames(pData(RGSet_quality))[i], " / ", title_df_current) densityPlot(as.matrix(Betas_quality[,i]), main = titel) print(i) diff --git a/scripts/4_check_matching_of_epigenetic_sex.Rmd b/scripts/4_check_matching_of_epigenetic_sex.Rmd index 736dafd..bf8ac07 100644 --- a/scripts/4_check_matching_of_epigenetic_sex.Rmd +++ b/scripts/4_check_matching_of_epigenetic_sex.Rmd @@ -54,6 +54,7 @@ fail the sex match check ```{r, predict sex with DNAm data, include= FALSE} predictedSex <- getSex(gRatioSet_quality, cutoff=-2) +phenotype_data <- subset(phenotype_data, arrayid %in% rownames(predictedSex),) sex_df <- phenotype_data %>% select(personid, arrayid, reported_sex = sex) %>% diff --git a/scripts/6_filtering_cpgs.Rmd b/scripts/6_filtering_cpgs.Rmd index 5319162..865c403 100644 --- a/scripts/6_filtering_cpgs.Rmd +++ b/scripts/6_filtering_cpgs.Rmd @@ -25,25 +25,29 @@ if (!("minfi" %in% rownames(installed.packages()))) { } library(minfi) -knitr::opts_knit$set(root.dir = paste0(user_choices$personal_path, "/")) knitr::opts_chunk$set(echo = FALSE) ``` -```{r load data, include=FALSE} -summary_table_preprocessing <- readRDS(paste0(user_choices$project_name, "/reports/summary_table_preprocessing.rds")) -load(paste0(user_choices$project_name, "/processed_data/RGSet_clean.Rdata")) -load(paste0(user_choices$project_name, "/processed_data/gRatioSet_clean.Rdata")) -load(paste0(user_choices$project_name, "/processed_data/Betas_clean.Rdata")) -load(paste0(user_choices$project_name, "/processed_data/PhenoData_clean.Rdata")) -load(paste0(user_choices$project_name, "/processed_data/detP_clean.Rdata")) +```{r load data 1, include=FALSE} # Info about cross-hybridizing probes comes from Chen et al. 2013, DOI: 10.4161/epi.23470: -load("data/ChenProbeIDs.rdata") +load("../data/ChenProbeIDs.rdata") # Info about mapping inaccuracies for EPIC v2.0 from Illumina product information: if (user_choices$array_type == "v2") { v2_mapping_inaccuracy <- read.csv("../data/EPIC-8v2-0_A1-190MappingInaccuracies.csv")} # Info about flagged probes for EPIC v2.0 from Illumina product information: if (user_choices$array_type == "v2") { v2_flagged_probes <- read.csv("../data/EPIC-8v2-0_A1-FlaggedProbes.csv")} + +knitr::opts_knit$set(root.dir = paste0(user_choices$personal_path, "/")) +``` + +```{r load data 2, include=FALSE} +summary_table_preprocessing <- readRDS(paste0(user_choices$project_name, "/reports/summary_table_preprocessing.rds")) +load(paste0(user_choices$project_name, "/processed_data/RGSet_clean.Rdata")) +load(paste0(user_choices$project_name, "/processed_data/gRatioSet_clean.Rdata")) +load(paste0(user_choices$project_name, "/processed_data/Betas_clean.Rdata")) +load(paste0(user_choices$project_name, "/processed_data/PhenoData_clean.Rdata")) +load(paste0(user_choices$project_name, "/processed_data/detP_clean.Rdata")) ``` # Description @@ -127,13 +131,13 @@ if (user_choices$array_type == "v2") { keep_betas_df <- as.data.frame(keep_betas) cat(paste0(nrow(exclude_replicates), " replicate probes were removed")) - dim_gRatioSet_filtered <- dim(gRatioSet_clean) - dim_Betas_filtered <- dim(Betas_clean) + dim_gRatioSet_clean_filtered <- dim(gRatioSet_clean) + dim_Betas_clean_filtered <- dim(Betas_clean) step_number <- c("4", "4") step <- c("Filter replicates", "Filter replicates") data_class <- c("gRatioSet", "Betas") - samples <- c(dim_gRatioSet_filtered[2], dim_Betas_filtered[2]) - probes <- c(dim_gRatioSet_filtered[1], dim_Betas_filtered[1]) + samples <- c(dim_gRatioSet_clean_filtered[2], dim_Betas_clean_filtered[2]) + probes <- c(dim_gRatioSet_clean_filtered[1], dim_Betas_clean_filtered[1]) table_preprocessing_adding <- data.frame(step_number, step, data_class, samples, probes) summary_table_preprocessing <- bind_rows(summary_table_preprocessing, table_preprocessing_adding) @@ -150,7 +154,7 @@ Betas_clean_filtered <- Betas_clean[keep_betas,] # ensure probes are in the same order in cleaned gRatioSet (RGSet with annotations) and detP objects, then subset detP_clean_filtered2 <- detP_clean[match(featureNames(gRatioSet_clean),rownames(detP_clean)),] -keep_gratioset <- rowSums(detP_clean_filtered < 0.01) == ncol(gRatioSet_clean) +keep_gratioset <- rowSums(detP_clean_filtered2 < 0.01) == ncol(gRatioSet_clean) gRatioSet_clean_filtered <- gRatioSet_clean[keep_gratioset,] ``` @@ -159,13 +163,13 @@ keep_betas_df <- as.data.frame(keep_betas) cat(paste0(keep_betas_df %>% filter(keep_betas == FALSE) %>% nrow(), " probes failed in one or more samples"), sep = "
\n") -dim_gRatioSet_filtered <- dim(gRatioSet_clean_filtered) -dim_Betas_filtered <- dim(Betas_clean_filtered) +dim_gRatioSet_clean_filtered <- dim(gRatioSet_clean_filtered) +dim_Betas_clean_filtered <- dim(Betas_clean_filtered) step_number <- c("5", "5") step <- c("Filter technical", "Filter technical") data_class <- c("gRatioSet", "Betas") -samples <- c(dim_gRatioSet_filtered[2], dim_Betas_filtered[2]) -probes <- c(dim_gRatioSet_filtered[1], dim_Betas_filtered[1]) +samples <- c(dim_gRatioSet_clean_filtered[2], dim_Betas_clean_filtered[2]) +probes <- c(dim_gRatioSet_clean_filtered[1], dim_Betas_clean_filtered[1]) table_preprocessing_adding <- data.frame(step_number, step, data_class, samples, probes) summary_table_preprocessing <- bind_rows(summary_table_preprocessing, table_preprocessing_adding) @@ -188,13 +192,13 @@ keep_betas_df <- as.data.frame(keep_betas) cat(paste0(keep_betas_df %>% filter(keep_betas == FALSE) %>% nrow(), " probes were on the X chromosome"), sep = "
\n") -dim_gRatioSet_filtered <- dim(gRatioSet_clean_filtered) -dim_Betas_filtered <- dim(Betas_clean_filtered) +dim_gRatioSet_clean_filtered <- dim(gRatioSet_clean_filtered) +dim_Betas_clean_filtered <- dim(Betas_clean_filtered) step_number <- c("6", "6") step <- c("Filter X Chromosome", "Filter X Chromosome") data_class <- c("gRatioSet", "Betas") -samples <- c(dim_gRatioSet_filtered[2], dim_Betas_filtered[2]) -probes <- c(dim_gRatioSet_filtered[1], dim_Betas_filtered[1]) +samples <- c(dim_gRatioSet_clean_filtered[2], dim_Betas_clean_filtered[2]) +probes <- c(dim_gRatioSet_clean_filtered[1], dim_Betas_clean_filtered[1]) table_preprocessing_adding <- data.frame(step_number, step, data_class, samples, probes) summary_table_preprocessing <- bind_rows(summary_table_preprocessing, table_preprocessing_adding) @@ -215,13 +219,13 @@ keep_betas_df <- as.data.frame(keep_betas) cat(paste0(keep_betas_df %>% filter(keep_betas == FALSE) %>% nrow(), " probes were on the Y chromosome"), sep = "
\n") -dim_gRatioSet_filtered <- dim(gRatioSet_clean_filtered) -dim_Betas_filtered <- dim(Betas_clean_filtered) +dim_gRatioSet_clean_filtered <- dim(gRatioSet_clean_filtered) +dim_Betas_clean_filtered <- dim(Betas_clean_filtered) step_number <- c("7", "7") step <- c("Filter Y Chromosome", "Filter Y Chromosome") data_class <- c("gRatioSet", "Betas") -samples <- c(dim_gRatioSet_filtered[2], dim_Betas_filtered[2]) -probes <- c(dim_gRatioSet_filtered[1], dim_Betas_filtered[1]) +samples <- c(dim_gRatioSet_clean_filtered[2], dim_Betas_clean_filtered[2]) +probes <- c(dim_gRatioSet_clean_filtered[1], dim_Betas_clean_filtered[1]) table_preprocessing_adding <- data.frame(step_number, step, data_class, samples, probes) summary_table_preprocessing <- bind_rows(summary_table_preprocessing, table_preprocessing_adding) @@ -235,13 +239,13 @@ Betas_clean_filtered <- Betas_clean_filtered[rownames(Betas_clean_filtered) %in% ``` ```{r, info output for user probes affected by snps, results='asis'} -dim_gRatioSet_filtered <- dim(gRatioSet_clean_filtered) -dim_Betas_filtered <- dim(gRatioSet_clean_filtered) +dim_gRatioSet_clean_filtered <- dim(gRatioSet_clean_filtered) +dim_Betas_clean_filtered <- dim(gRatioSet_clean_filtered) step_number <- c("8", "8") step <- c("Filter SNP Affected", "Filter SNP Affected") data_class <- c("gRatioSet", "Betas") -samples <- c(dim_gRatioSet_filtered[2], dim_Betas_filtered[2]) -probes <- c(dim_gRatioSet_filtered[1], dim_Betas_filtered[1]) +samples <- c(dim_gRatioSet_clean_filtered[2], dim_Betas_clean_filtered[2]) +probes <- c(dim_gRatioSet_clean_filtered[1], dim_Betas_clean_filtered[1]) table_preprocessing_adding <- data.frame(step_number, step, data_class, samples, probes) summary_table_preprocessing <- bind_rows(summary_table_preprocessing, table_preprocessing_adding) @@ -283,13 +287,13 @@ if (user_choices$array_type == "v1") { cat(paste0(keep_betas_df %>% filter(keep_betas == FALSE) %>% nrow(), " probes were cross-hybridizing according to Chen et al."), sep = "
\n") - dim_gRatioSet_filtered <- dim(gRatioSet_clean_filtered) - dim_Betas_filtered <- dim(Betas_clean_filtered) + dim_gRatioSet_clean_filtered <- dim(gRatioSet_clean_filtered) + dim_Betas_clean_filtered <- dim(Betas_clean_filtered) step_number <- c("9", "9") step <- c("Filter Cross-Hybridizing Chen", "Filter Cross-Hybridizing Chen") data_class <- c("gRatioSet", "Betas") - samples <- c(dim_gRatioSet_filtered[2], dim_Betas_filtered[2]) - probes <- c(dim_gRatioSet_filtered[1], dim_Betas_filtered[1]) + samples <- c(dim_gRatioSet_clean_filtered[2], dim_Betas_clean_filtered[2]) + probes <- c(dim_gRatioSet_clean_filtered[1], dim_Betas_clean_filtered[1]) table_preprocessing_adding <- data.frame(step_number, step, data_class, samples, probes) summary_table_preprocessing <- bind_rows(summary_table_preprocessing, table_preprocessing_adding) @@ -305,8 +309,8 @@ if (user_choices$array_type == "v1") { keep_betas <- !(rownames(Betas_clean_filtered) %in% exclude_McCartney$V1) Betas_clean_filtered <- Betas_clean_filtered[keep_betas,] - keep_gratioset <- !(featureNames(dim_gRatioSet_filtered) %in% exclude_McCartney$V1) - dim_gRatioSet_filtered <- dim_gRatioSet_filtered[keep_gratioset,] + keep_gratioset <- !(featureNames(dim_gRatioSet_clean_filtered) %in% exclude_McCartney$V1) + gRatioSet_clean_filtered <- gRatioSet_clean_filtered[keep_gratioset,] } ``` @@ -316,13 +320,13 @@ if (user_choices$array_type == "v1") { cat(paste0(keep_betas_df %>% filter(keep_betas == FALSE) %>% nrow(), " probes may be cross-hybridizing according to McCartney et al."), sep = "
\n") - dim_gRatioSet_filtered <- dim(gRatioSet_clean_filtered) - dim_Betas_filtered <- dim(Betas_clean_filtered) + dim_gRatioSet_clean_filtered <- dim(gRatioSet_clean_filtered) + dim_Betas_clean_filtered <- dim(Betas_clean_filtered) step_number <- c("10", "10") step <- c("Filter Cross-Hybridizing McCartney", "Filter Cross-Hybridizing McCartney") data_class <- c("gRatioSet", "Betas") - samples <- c(dim_gRatioSet_filtered[2], dim_Betas_filtered[2]) - probes <- c(dim_gRatioSet_filtered[1], dim_Betas_filtered[1]) + samples <- c(dim_gRatioSet_clean_filtered[2], dim_Betas_clean_filtered[2]) + probes <- c(dim_gRatioSet_clean_filtered[1], dim_Betas_clean_filtered[1]) table_preprocessing_adding <- data.frame(step_number, step, data_class, samples, probes) summary_table_preprocessing <- bind_rows(summary_table_preprocessing, table_preprocessing_adding) @@ -351,13 +355,13 @@ if (user_choices$array_type == "v1") { cat(paste0(keep_betas_df %>% filter(keep_betas == FALSE) %>% nrow(), " probes were polymorphic"), sep = "
\n") - dim_gRatioSet_filtered <- dim(gRatioSet_clean_filtered) - dim_Betas_filtered <- dim(Betas_clean_filtered) + dim_gRatioSet_clean_filtered <- dim(gRatioSet_clean_filtered) + dim_Betas_clean_filtered <- dim(Betas_clean_filtered) step_number <- c("11", "11") step <- c("Filter Polymorphic", "Filter Polymorphic") data_class <- c("gRatioSet", "Betas") - samples <- c(dim_gRatioSet_filtered[2], dim_Betas_filtered[2]) - probes <- c(dim_gRatioSet_filtered[1], dim_Betas_filtered[1]) + samples <- c(dim_gRatioSet_clean_filtered[2], dim_Betas_clean_filtered[2]) + probes <- c(dim_gRatioSet_clean_filtered[1], dim_Betas_clean_filtered[1]) table_preprocessing_adding <- data.frame(step_number, step, data_class, samples, probes) summary_table_preprocessing <- bind_rows(summary_table_preprocessing, table_preprocessing_adding) @@ -369,10 +373,10 @@ saveRDS(summary_table_preprocessing, paste0(user_choices$project_name, "/reports ```{r, filter out mapping inaccuracies, include=FALSE} if (user_choices$array_type == "v2") { - keep_betas <- !(rownames(Betas_clean_filtered) %in% v2_mapping_inacc$IlmnID) + keep_betas <- !(rownames(Betas_clean_filtered) %in% v2_mapping_inaccuracy$IlmnID) Betas_clean_filtered <- Betas_clean_filtered[keep_betas,] - keep_gratioset <- !(featureNames(gRatioSet_clean_filtered) %in% v2_mapping_inacc$IlmnID) + keep_gratioset <- !(featureNames(gRatioSet_clean_filtered) %in% v2_mapping_inaccuracy$IlmnID) gRatioSet_clean_filtered <- gRatioSet_clean_filtered[keep_gratioset,] } ``` @@ -383,13 +387,13 @@ if (user_choices$array_type == "v2") { cat(paste0(keep_betas_df %>% filter(keep_betas == FALSE) %>% nrow(), " CpGs show known mapping inaccuracies"), sep = "
\n") - dim_gRatioSet_filtered <- dim(gRatioSet_clean_filtered) - dim_Betas_filtered <- dim(Betas_clean_filtered) + dim_gRatioSet_clean_filtered <- dim(gRatioSet_clean_filtered) + dim_Betas_clean_filtered <- dim(Betas_clean_filtered) step_number <- c("12", "12") step <- c("Filter Mapping Inaccuracies", "Filter Mapping Inaccuracies") data_class <- c("gRatioSet", "Betas") - samples <- c(dim_gRatioSet_filtered[2], dim_Betas_filtered[2]) - probes <- c(dim_gRatioSet_filtered[1], dim_Betas_filtered[1]) + samples <- c(dim_gRatioSet_clean_filtered[2], dim_Betas_clean_filtered[2]) + probes <- c(dim_gRatioSet_clean_filtered[1], dim_Betas_clean_filtered[1]) table_preprocessing_adding <- data.frame(step_number, step, data_class, samples, probes) summary_table_preprocessing <- bind_rows(summary_table_preprocessing, table_preprocessing_adding) @@ -414,13 +418,13 @@ if (user_choices$array_type == "v2") { cat(paste0(keep_betas_df %>% filter(keep_betas == FALSE) %>% nrow(), " CpGs were flagged by Illumina"), sep = "
\n") - dim_gRatioSet_filtered <- dim(gRatioSet_clean_filtered) - dim_Betas_filtered <- dim(Betas_clean_filtered) + dim_gRatioSet_clean_filtered <- dim(gRatioSet_clean_filtered) + dim_Betas_clean_filtered <- dim(Betas_clean_filtered) step_number <- c("13", "13") step <- c("Filter Flagged Probes", "Filter Flagged Probes") data_class <- c("gRatioSet", "Betas") - samples <- c(dim_gRatioSet_filtered[2], dim_Betas_filtered[2]) - probes <- c(dim_gRatioSet_filtered[1], dim_Betas_filtered[1]) + samples <- c(dim_gRatioSet_clean_filtered[2], dim_Betas_clean_filtered[2]) + probes <- c(dim_gRatioSet_clean_filtered[1], dim_Betas_clean_filtered[1]) table_preprocessing_adding <- data.frame(step_number, step, data_class, samples, probes) summary_table_preprocessing <- bind_rows(summary_table_preprocessing, table_preprocessing_adding) @@ -428,7 +432,7 @@ if (user_choices$array_type == "v2") { ``` ```{r, save data, include=FALSE} -save(gRatioSet_filtered, file = paste0(user_choices$project_name, "/processed_data/gRatioSet_clean_filtered.Rdata")) +save(gRatioSet_clean_filtered, file = paste0(user_choices$project_name, "/processed_data/gRatioSet_clean_filtered.Rdata")) Betas_clean_filtered <- getBeta(gRatioSet_clean_filtered) # beta values save(Betas_clean_filtered, file = paste0(user_choices$project_name, "/processed_data/Betas_clean_filtered.Rdata")) diff --git a/scripts/7_normalization.Rmd b/scripts/7_normalization.Rmd index f9e8abf..583c20b 100644 --- a/scripts/7_normalization.Rmd +++ b/scripts/7_normalization.Rmd @@ -31,7 +31,7 @@ knitr::opts_knit$set(root.dir = paste0(user_choices$personal_path, "/")) knitr::opts_chunk$set(echo = FALSE) if (user_choices$array_type == "v2") { - if (!("jokergoo/IlluminaHumanMethylationEPICv2manifest" %in% rownames(installed.packages()))) { + if (!("IlluminaHumanMethylationEPICv2manifest" %in% rownames(installed.packages()))) { BiocManager::install("jokergoo/IlluminaHumanMethylationEPICv2manifest") } if (!("IlluminaHumanMethylationEPICv2anno.20a1.hg38" %in% rownames(installed.packages()))) { @@ -84,27 +84,23 @@ save(gRatioSet_clean_filtered_quantile, file = paste0(user_choices$project_name, Betas_clean_filtered_quantile <- getBeta(gRatioSet_clean_filtered_quantile) save(Betas_clean_filtered_quantile, file = paste0(user_choices$project_name, - "/processed_data/Betas_clean_filtered_quantile.Rdata")) -Ms_clean_filtered_qunatile <- getM(gRatioSet_clean_filtered_quantile) -save(Ms_clean_filtered_qunatile, file = paste0(user_choices$project_name, - "/processed_data/Ms_clean_filtered_qunatile.Rdata")) + "/processed_data/Betas_clean_filtered_quantile.Rdata")) +Ms_clean_filtered_quantile <- getM(gRatioSet_clean_filtered_quantile) +save(Ms_clean_filtered_quantile, file = paste0(user_choices$project_name, + "/processed_data/Ms_clean_filtered_quantile.Rdata")) # further normalization with BMIQ: -probeType <- as.data.frame(annotations_clean_filtered[rownames(Betas_clean_filtered_quantile),c("Name","Type")]) +probeType <- as.data.frame(subset(annotations_clean_filtered, Name %in% rownames(Betas_clean_filtered_quantile), select=c("Name","Type"))) probeType$probeType = ifelse(probeType$Type %in% "I", 1, 2) Betas_clean_filtered_quantile_bmiq <- apply(Betas_clean_filtered_quantile[,1:length(colnames(Betas_clean_filtered_quantile))],2, function(a) BMIQ(a,probeType$probeType,plots=FALSE)$nbeta) # sourced from script "BMIQ_1.6_Teschendorff.R" -save(Betas_clean_filtered_quantile_bmiq, file = paste0(user_choices$project_name, - "/processed_data/Betas_clean_filtered_quantile_bmiq.Rdata")) +save(Betas_clean_filtered_quantile_bmiq, file = paste0(user_choices$project_name, "/processed_data/Betas_clean_filtered_quantile_bmiq.Rdata")) ``` -```{r. normalize unfiltered probes, include=FALSE} +```{r normalize unfiltered probes, include=FALSE} # Normalization of unfiltered data - # Functional normalization -# Note: sex is set to "F" for all samples since sex chromosomes were already removed in script 6 -# Note: This does not affect the normalization or the phenotype data, it simply stops preprocessQuantile() from producing an error -gRatioSet_clean_unfiltered_quantile <- preprocessQuantile(RGSet_clean, sex = "F") +gRatioSet_clean_unfiltered_quantile <- preprocessQuantile(RGSet_clean) save(gRatioSet_clean_unfiltered_quantile, file = paste0(user_choices$project_name, "/processed_data/gRatioSet_clean_unfiltered_quantile.Rdata")) # output: GenomicRatioSet diff --git a/scripts/8_removal_of_outliers_detect_batch_effects.Rmd b/scripts/8_removal_of_outliers_detect_batch_effects.Rmd index 6bbf7ec..8d176f6 100644 --- a/scripts/8_removal_of_outliers_detect_batch_effects.Rmd +++ b/scripts/8_removal_of_outliers_detect_batch_effects.Rmd @@ -46,7 +46,7 @@ start_time <- Sys.time() ```{r setup, include=FALSE} -source("../data/calculate_pc_cutoff.R") # source function for determining PC cutoff +source("../data/Calculate_PC_Cutoff.R") # source function for determining PC cutoff needed_packages <- c("BiocManager", "dplyr", "knitr", "rmarkdown", "tibble", "ggplot2", "ggrepel", "broom", "gplots", "tidyr", "sva", "methods") user_choices <- readRDS("../data/user_choices.rds") diff --git a/scripts/9_correction_batch_effects_filtered.Rmd b/scripts/9_correction_batch_effects_filtered.Rmd index a8e1daf..17500de 100644 --- a/scripts/9_correction_batch_effects_filtered.Rmd +++ b/scripts/9_correction_batch_effects_filtered.Rmd @@ -41,7 +41,7 @@ start_time <- Sys.time() ``` ```{r setup, include=FALSE} -source("../data/calculate_pc_cutoff.R") # source function for determining PC cutoff +source("../data/Calculate_PC_Cutoff.R") # source function for determining PC cutoff needed_packages <- c("BiocManager", "dplyr", "knitr", "rmarkdown", "tibble", "ggplot2", "ggrepel", "broom", "gplots", "tidyr", "sva", "methods") user_choices <- readRDS("../data/user_choices.rds") @@ -79,7 +79,7 @@ phenotype_data <- readRDS(paste0(user_choices$project_name, "/processed_data/phe ## Correction of technical batch effects with ComBat -```{r, correction of first batch effect and PCA} +```{r, correction of first batch effect and PCA, results='asis'} m_values <- apply(Betas_clean_filtered_quantile_bmiq, 2, function(x) log2((x)/(1-x))) # get M-values if (correction_variable_1 == "PLEASE FILL IN"){ @@ -240,7 +240,11 @@ if (correction_variable_1 == "PLEASE FILL IN"){ } ``` -```{r, correction of second batch effect and PCA} +```{r, print p-value table for batch effects after correction of first batch effect variable} +anova_lm_pvalue_df %>% paged_table() +``` + +```{r, correction of second batch effect and PCA, results='asis'} if (correction_variable_2 == "PLEASE FILL IN"){ print("No second batch correction variable was specified by user, data remains unchanged") } else { @@ -399,6 +403,10 @@ if (correction_variable_2 == "PLEASE FILL IN"){ } ``` +```{r, print p-value table for batch effects after correction of second batch effect variable} +anova_lm_pvalue_df %>% paged_table() +``` + ```{r, correction of third batch effect and PCA} if (correction_variable_3 == "PLEASE FILL IN"){ print("No third batch correction variable was specified by user, data remains unchanged") @@ -557,6 +565,10 @@ if (correction_variable_3 == "PLEASE FILL IN"){ } ``` +```{r, print p-value table for batch effects after correction of third batch effect variable} +anova_lm_pvalue_df %>% paged_table() +``` + ```{r, convertion M to Beta and save, include=FALSE} if(exists("m_values_combat_3")){ corrected_data <- m_values_combat_3