Skip to content
Permalink
main
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
# Analysis script for the paper:
# "Extensive evaluation of DNA methylation of functional elements in the murine
# Fkbp5 locus using high accuracy DNA methylation measurement via targeted
# bisulfite sequencing"
#
# ==============================================================================
#
# Author: Natan Yusupov, natan_yusupov@psych.mpg.de
#
# Description: The script analyses DNAm data of 157 CpGs from three murine
# tissues/brain regions as described in details in:
# Yusupov N, van Doeselaar L, Röh S, Wiechmann T, Ködel M, Sauer S, Rex-Haffner M,
# Schmidt MV, Binder EB. Extensive evaluation of DNA methylation of functional
# elements in the murine Fkbp5 locus using high-accuracy DNA methylation
# measurement via targeted bisulfite sequencing. Eur J Neurosci. 2023 Jul 6.
# doi: 10.1111/ejn.16078. PMID: 37414581.
#
# Summary of steps performed in the script "analysis_script.R":
# 1. Load libraries and data
# 2. Label primers with corresponding functional annotations of the locus
# 3. Data preparation:
# A. Order CpGs in genomic order
# B. Separate to DNAm data and covariates data
# C. Transform data to M-Values
# D. Prepare data in long format
# 4. Calculate mean and standard deviation per CpG for each tissue/brain region and ELS status
# 5. Calculate delta mean of DNAm for: FC - Blood, HIP - Blood, HIP - FC
# 6. Plot DNAm patterns across tissues/brain regions (samples)
# 7. Plot DNAm patterns across tissues/brain regions (mean and standard deviation)
# 8. Plot delta DNAm patterns across tissues/brain regions
# 9. Filter CpGs with low variance at baseline (IQR=<1)
# 10. Perform linear regression in each tissue/brain region to study ELS effects on DNAm of each CpG
# 11. Visualize significant CpGs
#
# ==============================================================================
# 1. Load libraries and data
library(tidyverse)
library(broom)
library(ggpubr)
data_df <- read.csv("data_df.csv", check.names = F) %>%
mutate(row = as.factor(row), column = as.factor(column), ELS_status = as.factor(ELS_status),
tissue = as.factor(tissue))
cpgs_amplicon_info <- read.csv("cpgs_amplicon_info.csv")
# 2. Label primers with corresponding functional annotations of the locus
intron_8 <- cpgs_amplicon_info %>%
filter(amplicon %in% c("11_fk5_gre_i8.1", "11_fk5_gre_i8.2")) %>%
select(cpg_genomic) %>% pull() %>% sort() %>% as.character()
intron_5 <- cpgs_amplicon_info %>%
filter(amplicon %in% c("10_fk5_gre_i5.1", "10_fk5_gre_i5.2", "10_fk5_gre_i5.3","9_fk5_gre_i5")) %>%
select(cpg_genomic) %>% pull() %>% sort() %>% as.character()
intron_1 <- cpgs_amplicon_info %>%
filter(amplicon %in% c("5_fk5_gre_i1", "6_fk5_gre_i1","7_fk5_gre_i1","8_fk5_gre_i1")) %>%
select(cpg_genomic) %>% pull() %>% sort() %>% as.character()
tss <- cpgs_amplicon_info %>% filter(amplicon == "18_fk5_TSS") %>%
select(cpg_genomic) %>% pull() %>% sort() %>% as.character()
proximal_enhancer <- cpgs_amplicon_info %>%
filter(amplicon %in% c("1_fk5_gre_pE", "2_fk5_gre_pE_1", "2_fk5_gre_pE_2", "4_fk5_gre_pE")) %>%
select(cpg_genomic) %>% pull() %>% sort() %>% as.character()
ctcf_5UTR <- cpgs_amplicon_info %>%
filter(amplicon %in% c("12_fk5_ctcf_5UTR.1", "12_fk5_ctcf_5UTR.2", "13_fk5_ctcf_5UTR")) %>%
select(cpg_genomic) %>% pull() %>% sort() %>% as.character()
proximal_enhancer_1 <- cpgs_amplicon_info %>%
filter(amplicon == "1_fk5_gre_pE") %>%
select(cpg_genomic) %>% pull() %>% sort() %>% as.character()
proximal_enhancer_2 <- cpgs_amplicon_info %>%
filter(amplicon %in% c("2_fk5_gre_pE_1", "2_fk5_gre_pE_2")) %>%
select(cpg_genomic) %>% pull() %>% sort() %>% as.character()
proximal_enhancer_3 <- cpgs_amplicon_info %>%
filter(amplicon == "4_fk5_gre_pE") %>%
select(cpg_genomic) %>% pull() %>% sort() %>% as.character()
# 3. data preparation
# A. Order CpGs in genomic order
target_cpgs <- c(intron_8, intron_5, intron_1, tss, proximal_enhancer, ctcf_5UTR)
# B. Separate to DNAm data and covariates data
methylation_df <- data_df %>%
dplyr::select(sample, contains(target_cpgs)) %>%
column_to_rownames(var = "sample")
covariates <- data_df %>%
select(sample, row, column, ELS_status, tissue)
# C. Transform data to M-Values
methylation_beta <- apply(methylation_df, 2, function(x) x/100) # Beta values
methylation_mval <- apply(methylation_beta, 2, function(x) log2((x)/(1-x))) # M values
# D. Prepare data in long format
methylation_df_long <- methylation_df %>%
rownames_to_column("sample") %>%
pivot_longer(cols = -sample, names_to = "cpg", values_to = "methylation") %>%
separate(sample, sep = "-", into = c("samplenumber", "tissue"), remove = F) %>%
mutate(functional_region = as.factor(case_when(
cpg %in% intron_8 ~ "Intron 8",
cpg %in% intron_5 ~ "Intron 5",
cpg %in% intron_1 ~ "Intron 1",
cpg %in% tss ~ "TSS",
cpg %in% proximal_enhancer ~ "Proximal Enhancer",
cpg %in% ctcf_5UTR ~ "CTCF 5'UTR",
TRUE ~ "error")),
functional_region_detailed = as.factor(case_when(
cpg %in% intron_8 ~ "Intron 8",
cpg %in% intron_5 ~ "Intron 5",
cpg %in% intron_1 ~ "Intron 1",
cpg %in% tss ~ "TSS",
cpg %in% proximal_enhancer_1 ~ "Proximal Enhancer: Part 1",
cpg %in% proximal_enhancer_2 ~ "Proximal Enhancer: Part 2",
cpg %in% proximal_enhancer_3 ~ "Proximal Enhancer: Part 3",
cpg %in% ctcf_5UTR ~ "CTCF 5'UTR",
TRUE ~ "error")),
cpg = as.factor(cpg),
tissue = as.factor(tissue))
methylation_mval_long <- methylation_mval %>%
as.data.frame() %>%
rownames_to_column("sample") %>%
pivot_longer(cols = -sample, names_to = "cpg", values_to = "methylation") %>%
separate(sample, sep = "-", into = c("samplenumber", "tissue"), remove = F) %>%
mutate(functional_region = as.factor(case_when(
cpg %in% intron_8 ~ "Intron 8",
cpg %in% intron_5 ~ "Intron 5",
cpg %in% intron_1 ~ "Intron 1",
cpg %in% tss ~ "TSS",
cpg %in% proximal_enhancer ~ "Proximal Enhancer",
cpg %in% ctcf_5UTR ~ "CTCF 5'UTR",
TRUE ~ "error")),
functional_region_detailed = as.factor(case_when(
cpg %in% intron_8 ~ "Intron 8",
cpg %in% intron_5 ~ "Intron 5",
cpg %in% intron_1 ~ "Intron 1",
cpg %in% tss ~ "TSS",
cpg %in% proximal_enhancer_1 ~ "Proximal Enhancer: Part 1",
cpg %in% proximal_enhancer_2 ~ "Proximal Enhancer: Part 2",
cpg %in% proximal_enhancer_3 ~ "Proximal Enhancer: Part 3",
cpg %in% ctcf_5UTR ~ "CTCF 5'UTR",
TRUE ~ "error")),
cpg = as.factor(cpg),
tissue = as.factor(tissue))
# 4. Calculate mean and standard deviation per CpG for each tissue/brain region and ELS status
methylation_mean_sd_tissue <- data_df %>%
group_by(tissue) %>%
dplyr::summarise(across(contains(target_cpgs),
.fns = list(mean = ~ mean(.x, na.rm = TRUE), sd = ~ sd(.x, na.rm = TRUE)),
.names = "{.fn}_{.col}")) %>%
pivot_longer(cols = -tissue, names_to = "which" , values_to = "methylation") %>%
separate(which, into = c("which","cpg"), sep = "_") %>%
spread(key = which, value = methylation) %>%
mutate(functional_region = as.factor(case_when(
cpg %in% intron_8 ~ "Intron 8",
cpg %in% intron_5 ~ "Intron 5",
cpg %in% intron_1 ~ "Intron 1",
cpg %in% tss ~ "TSS",
cpg %in% proximal_enhancer ~ "Proximal Enhancer",
cpg %in% ctcf_5UTR ~ "CTCF 5'UTR",
TRUE ~ "error")))
methylation_mean_sd_status <- data_df %>%
group_by(tissue, ELS_status) %>%
dplyr::summarise(across(contains(target_cpgs),
.fns = list(mean = ~ mean(.x, na.rm = TRUE),
sd = ~ sd(.x, na.rm = TRUE)),
.names = "{.fn}_{.col}"), .groups = "keep") %>%
pivot_longer(cols = -c("tissue","ELS_status"),
names_to = "which" , values_to = "methylation") %>%
separate(which, into = c("which","cpg"), sep = "_") %>%
spread(key = which, value = methylation) %>%
mutate(functional_region = as.factor(case_when(
cpg %in% intron_8 ~ "Intron 8",
cpg %in% intron_5 ~ "Intron 5",
cpg %in% intron_1 ~ "Intron 1",
cpg %in% tss ~ "TSS",
cpg %in% proximal_enhancer ~ "Proximal Enhancer",
cpg %in% ctcf_5UTR ~ "CTCF 5'UTR",
TRUE ~ "error")),
functional_region_detailed = as.factor(case_when(
cpg %in% intron_8 ~ "Intron 8",
cpg %in% intron_5 ~ "Intron 5",
cpg %in% intron_1 ~ "Intron 1",
cpg %in% tss ~ "TSS",
cpg %in% proximal_enhancer_1 ~ "Proximal Enhancer: Part 1",
cpg %in% proximal_enhancer_2 ~ "Proximal Enhancer: Part 2",
cpg %in% proximal_enhancer_3 ~ "Proximal Enhancer: Part 3",
cpg %in% ctcf_5UTR ~ "CTCF 5'UTR",
TRUE ~ "error")),
cpg = as.factor(cpg))
# 5. Calculate delta mean of DNAm for: FC - Blood, HIP - Blood, HIP - FC
delta_mouse <- data_df %>%
filter(ELS_status == "Control") %>%
select(-ELS_status, -sample) %>%
group_by(tissue) %>%
dplyr::summarise(across("28544982":"28665256",
.fns = ~ mean(.x, na.rm = TRUE))) %>%
pivot_longer(cols = -tissue, names_to = "cpg" , values_to = "mean")
delta_mouse_blood <- delta_mouse %>%
filter(tissue == "Blood") %>% select(cpg, mean_blood = "mean")
delta_mouse_fc <- delta_mouse %>%
filter(tissue == "FC") %>% select(cpg, mean_fc = "mean")
delta_mouse_hip <- delta_mouse %>%
filter(tissue == "HIP") %>% select(cpg, mean_hip = "mean")
delta_final_df <- delta_mouse_blood %>%
left_join(delta_mouse_fc, by = "cpg") %>%
left_join(delta_mouse_hip, by = "cpg") %>%
mutate(delta_fc_blood = mean_fc - mean_blood,
delta_hip_blood = mean_hip - mean_blood,
delta_fc_hip = mean_fc - mean_hip,
functional_region = as.factor(case_when(
cpg %in% intron_8 ~ "Intron 8",
cpg %in% intron_5 ~ "Intron 5",
cpg %in% intron_1 ~ "Intron 1",
cpg %in% tss ~ "TSS",
cpg %in% proximal_enhancer ~ "Proximal Enhancer",
cpg %in% ctcf_5UTR ~ "CTCF 5'UTR",
TRUE ~ "error")),
functional_region_detailed = as.factor(case_when(
cpg %in% intron_8 ~ "Intron 8",
cpg %in% intron_5 ~ "Intron 5",
cpg %in% intron_1 ~ "Intron 1",
cpg %in% tss ~ "TSS",
cpg %in% proximal_enhancer_1 ~ "Proximal Enhancer: Part 1",
cpg %in% proximal_enhancer_2 ~ "Proximal Enhancer: Part 2",
cpg %in% proximal_enhancer_3 ~ "Proximal Enhancer: Part 3",
cpg %in% ctcf_5UTR ~ "CTCF 5'UTR",
TRUE ~ "error")))
delta_for_plot <- data.frame(cpg = rep(delta_final_df$cpg, 3),
functional_region_detailed = rep(delta_final_df$functional_region_detailed, 3),
delta = c(delta_final_df$delta_fc_blood,
delta_final_df$delta_hip_blood,
delta_final_df$delta_fc_hip),
category = factor(c(rep("FC-Blood", 157),
rep("HIP-Blood", 157),
rep("FC-HIP", 157)),
levels = c("FC-Blood", "HIP-Blood", "FC-HIP")))
# 6. Plot DNAm patterns across tissues/brain regions (samples)
color_theme <- c("darkred", "cornflowerblue", "lightgoldenrod3") # set color themes
plot_per_functional_unit <- function(functional_unit){ # function to make plots
methylation_df_long %>%
filter(functional_region_detailed == functional_unit) %>%
ggplot(aes(x = cpg, y = methylation, col = sample, group = sample)) +
geom_point(alpha = 0.5) +
geom_line(alpha = 0.3) +
labs(x = "CpG", y = "DNA Methylation [%]",
title = paste0("DNA Methylation Patterns Across Tissues: ", functional_unit)) +
ylim(-5, 105) +
facet_wrap(~tissue) +
theme_bw() +
theme(plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
axis.title.y = element_text(size = 10, face = "bold"),
axis.title.x = element_text(size = 10, face = "bold"),
axis.text.x = element_text(size = 8, hjust = 1, angle = 45),
legend.position = "none")
}
plot_per_functional_unit("Intron 8")
plot_per_functional_unit("Intron 5")
plot_per_functional_unit("Intron 1")
plot_per_functional_unit("TSS")
plot_per_functional_unit("CTCF 5'UTR")
methylation_df_long %>%
filter(functional_region == "Proximal Enhancer", tissue == "Blood") %>%
ggplot(aes(x = cpg, y = methylation, col = sample, group = sample)) +
geom_point(alpha = 0.5) +
geom_line(alpha = 0.3) +
labs(x = "CpG", y = "DNA Methylation [%]",
title = "DNA Methylation Patterns Blood: Proximal Enhancer") +
ylim(-5, 105) +
theme_bw() +
theme(plot.title = element_text(size = 18, face = "bold", hjust = 0.5),
axis.title.y = element_text(size = 16, face = "bold"),
axis.title.x = element_text(size = 16, face = "bold"),
axis.text.x = element_text(size = 8, hjust = 1, angle = 90),
axis.text.y = element_text(size = 8),
legend.position = "none")
methylation_df_long %>%
filter(functional_region == "Proximal Enhancer", tissue == "FC") %>%
ggplot(aes(x = cpg, y = methylation, col = sample, group = sample)) +
geom_point(alpha = 0.5) +
geom_line(alpha = 0.3) +
labs(x = "CpG", y = "DNAm Methylation [%]",
title = "DNA Methylation Patterns FC: Proximal Enhancer") +
ylim(-5, 105) +
theme_bw() +
theme(plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
axis.title.y = element_text(size = 16, face = "bold"),
axis.title.x = element_text(size = 16, face = "bold"),
axis.text.x = element_text(size = 8, hjust = 1, angle = 90),
legend.position = "none")
methylation_df_long %>%
filter(functional_region == "Proximal Enhancer", tissue == "HIP") %>%
ggplot(aes(x = cpg, y = methylation, col = sample, group = sample)) +
geom_point(alpha = 0.5) +
geom_line(alpha = 0.3) +
labs(x = "CpG", y = "DNA Methylation [%]",
title = "DNA Methylation Patterns HIP: Proximal Enhancer") +
ylim(-5, 105) +
theme_bw() +
theme(plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
axis.title.y = element_text(size = 16, face = "bold"),
axis.title.x = element_text(size = 16, face = "bold"),
axis.text.x = element_text(size = 8, hjust = 1, angle = 90),
legend.position = "none")
# 7. Plot DNAm patterns across tissues/brain regions (mean and standard deviation)
plot_per_functional_unit <- function(functional_unit){
methylation_mean_sd_status %>% # only unreated samples here
filter(functional_region_detailed == functional_unit, ELS_status == "Control") %>%
ggplot(aes(x = cpg, y = mean, col = tissue, group = tissue)) +
geom_point(alpha = 0.7) +
geom_line(size = 1) +
scale_color_manual(values=color_theme) +
scale_y_continuous(limits = c(0,100), breaks=seq(0,100,10)) +
geom_errorbar(aes(ymin = mean-sd, ymax = mean+sd), width = .2, size = 1) +
labs(x = "CpG", y = "DNA Methylation [%]",
title = functional_unit) +
theme_bw() +
theme(plot.title = element_text(size = 20, face = "bold", hjust = 0.5),
axis.title.y = element_text(size = 16, face = "bold"),
axis.title.x = element_text(size = 16, face = "bold"),
axis.text.x = element_text(size = 14, hjust = 1, angle = 45),
axis.text.y = element_text(size = 12),
legend.background = element_rect(fill = "lightgray", size = 0.5,
linetype = "solid", colour = "darkgray"),
legend.title = element_blank(),
legend.position = "bottom")
}
plot_per_functional_unit("Intron 8")
plot_per_functional_unit("Intron 5")
plot_per_functional_unit("Intron 1")
plot_per_functional_unit("CTCF 5'UTR")
methylation_mean_sd_status %>%
filter(functional_region == "TSS", ELS_status == "Control") %>%
ggplot(aes(x = cpg, y = mean, col = tissue, group = tissue)) +
geom_point(alpha = 0.7) +
geom_line(size = 0.4) +
scale_color_manual(values=color_theme) +
geom_errorbar(aes(ymin = mean-sd, ymax = mean+sd), width = .2) +
labs(x = "CpG", y = "DNA Methylation [%]",
title = "TSS") +
theme_bw() +
scale_y_continuous(limits = c(-5,100), breaks=seq(0,100,10)) +
theme(plot.title = element_text(size = 20, face = "bold", hjust = 0.5),
axis.title.y = element_text(size = 16, face = "bold"),
axis.title.x = element_text(size = 16, face = "bold"),
axis.text.x = element_text(size = 14, hjust = 1, angle = 45),
axis.text.y = element_text(size = 14),
legend.background = element_rect(fill = "lightgray", size = 0.5,
linetype = "solid", colour = "darkgray"),
legend.title = element_blank(),
legend.position = "bottom")
methylation_mean_sd_status %>%
filter(functional_region_detailed == "Proximal Enhancer: Part 1", ELS_status == "Control") %>%
ggplot(aes(x = cpg, y = mean, col = tissue, group = tissue)) +
geom_point(alpha = 0.7) +
geom_line(size = 1) +
scale_color_manual(values=color_theme) +
geom_errorbar(aes(ymin = mean-sd, ymax = mean+sd), width = .2, size = 1) +
labs(x = "CpG", y = "DNA Methylation [%]",
title = "Proximal Enhancer: Part 1") +
theme_bw() +
scale_y_continuous(limits = c(-5,100), breaks=seq(0,100,10)) +
theme(plot.title = element_text(size = 20, face = "bold", hjust = 0.5),
axis.title.y = element_text(size = 16, face = "bold"),
axis.title.x = element_text(size = 16, face = "bold"),
axis.text.x = element_text(size = 14, hjust = 1, angle = 45),
axis.text.y = element_text(size = 14),
legend.background = element_rect(fill = "lightgray", size = 0.5,
linetype = "solid", colour = "darkgray"),
legend.title = element_blank(),
legend.position = "bottom")
methylation_mean_sd_status %>%
filter(functional_region_detailed == "Proximal Enhancer: Part 2", ELS_status == "Control") %>%
ggplot(aes(x = cpg, y = mean, col = tissue, group = tissue)) +
geom_point(alpha = 0.7) +
geom_line(size = 1) +
scale_color_manual(values=color_theme) +
geom_errorbar(aes(ymin = mean-sd, ymax = mean+sd), width = .2, size = 1) +
labs(x = "CpG", y = "DNA Methylation [%]",
title = "Proximal Enhancer: Part 2") +
theme_bw() +
scale_y_continuous(limits = c(-5,100), breaks=seq(0,100,10)) +
theme(plot.title = element_text(size = 20, face = "bold", hjust = 0.5),
axis.title.y = element_text(size = 16, face = "bold"),
axis.title.x = element_text(size = 16, face = "bold"),
axis.text.x = element_text(size = 14, hjust = 1, angle = 45),
axis.text.y = element_text(size = 14),
legend.background = element_rect(fill = "lightgray", size = 0.5,
linetype = "solid", colour = "darkgray"),
legend.title = element_blank(),
legend.position = "bottom")
methylation_mean_sd_status %>%
filter(functional_region_detailed == "Proximal Enhancer: Part 3", ELS_status == "Control") %>%
ggplot(aes(x = cpg, y = mean, col = tissue, group = tissue)) +
geom_point(alpha = 0.7) +
geom_line(size = 1) +
scale_color_manual(values=color_theme) +
geom_errorbar(aes(ymin = mean-sd, ymax = mean+sd), width = .2, size = 1) +
labs(x = "CpG", y = "DNA Methylation [%]",
title = "Proximal Enhancer: Part 3") +
theme_bw() +
scale_y_continuous(limits = c(-5,100), breaks=seq(0,100,10)) +
theme(plot.title = element_text(size = 20, face = "bold", hjust = 0.5),
axis.title.y = element_text(size = 16, face = "bold"),
axis.title.x = element_text(size = 16, face = "bold"),
axis.text.x = element_text(size = 14, hjust = 1, angle = 45),
axis.text.y = element_text(size = 14),
legend.background = element_rect(fill = "lightgray", size = 0.5,
linetype = "solid", colour = "darkgray"),
legend.title = element_blank(),
legend.position = "bottom")
# 8. Plot delta DNAm patterns across tissues/brain regions
plot_delta <- function(functional_unit){
delta_for_plot %>%
filter(functional_region_detailed == functional_unit) %>%
ggplot(aes(x = cpg, y = delta, group = category)) +
geom_line(size = 1, aes(linetype = category, color = category)) +
geom_point(aes(color = category)) +
geom_hline(color = "darkred", yintercept = 0) +
scale_y_continuous(limits = c(-80,80), breaks=seq(-80,80,10)) +
scale_linetype_manual(values=c("solid", "twodash", "dotted")) +
scale_color_manual(values = c("#3B528BFF", "#21908CFF", "#5DC963FF")) +
labs(x = "CpG", y = Delta ~ "DNA Methylation [%]",
title = functional_unit) +
theme_bw() +
theme(plot.title = element_text(size = 20, face = "bold", hjust = 0.5),
axis.title.y = element_text(size = 16, face = "bold"),
axis.title.x = element_text(size = 16, face = "bold"),
axis.text.x = element_text(size = 14, hjust = 1, angle = 45),
axis.text.y = element_text(size = 12),
legend.background = element_rect(fill = "lightgray", size = 0.5,
linetype = "solid", colour = "darkgray"),
legend.title = element_blank(),
legend.position = "bottom")
}
plot_delta("Intron 8")
plot_delta("Intron 5")
plot_delta("Intron 1")
plot_delta("TSS")
plot_delta("Proximal Enhancer: Part 3")
plot_delta("Proximal Enhancer: Part 2")
plot_delta("Proximal Enhancer: Part 1")
plot_delta("CTCF 5'UTR")
# 9. Filter CpGs with low variance at baseline (IQR=<1)
iqr_cpg_tissue <- data_df %>%
filter(ELS_status == "Control") %>%
select(-c(ELS_status,row, column)) %>%
group_by(tissue) %>%
summarise(across(-c(sample), IQR, na.rm = TRUE)) %>%
pivot_longer(cols = -tissue, names_to = "cpg", values_to = "IQR") %>%
arrange(desc(IQR)) %>%
mutate(functional_region = as.factor(case_when(
cpg %in% intron_8 ~ "Intron 8",
cpg %in% intron_5 ~ "Intron 5",
cpg %in% intron_1 ~ "Intron 1",
cpg %in% tss ~ "TSS",
cpg %in% proximal_enhancer ~ "Proximal Enhancer",
cpg %in% ctcf_5UTR ~ "CTCF 5'UTR",
TRUE ~ "error")))
iqr_cpg_tissue %>%
ggplot(aes(x = IQR)) +
geom_histogram() +
geom_vline(color = "darkred", xintercept = 1) +
facet_wrap(~tissue) +
theme(plot.title = element_text(size = 12, face = "bold", hjust = 0.5),
axis.title.y = element_text(size = 10, face = "bold"),
axis.title.x = element_text(size = 10, face = "bold"),
axis.text.x = element_text(size = 5, hjust = 1, angle = 45),
legend.background = element_rect(fill = "lightgray", size = 0.5,
linetype = "solid", colour = "darkgray"),
legend.title = element_blank(),
legend.position = "bottom") +
theme_bw() +
scale_x_continuous(breaks = seq(0, 25, by = 1)) +
scale_y_continuous(breaks = seq(0, 100, by = 10))
cpgs_remain_blood <- iqr_cpg_tissue %>% filter(tissue == "Blood", IQR > 1) %>% pull(cpg)
print(paste0("CpGs remained for final analysis in blood: ", length(cpgs_remain_blood)))
cpgs_remain_fc <- iqr_cpg_tissue %>% filter(tissue == "FC", IQR > 1) %>% pull(cpg)
print(paste0("CpGs remained for final analysis in FC: ", length(cpgs_remain_fc)))
cpgs_remain_hip <- iqr_cpg_tissue %>% filter(tissue == "HIP", IQR > 1) %>% pull(cpg)
print(paste0("CpGs remained for final analysis in HIP: ", length(cpgs_remain_hip)))
reduced_df_mval_long_blood <- methylation_mval_long %>% filter(tissue == "Blood", cpg %in% cpgs_remain_blood)
reduced_df_mval_long_fc <- methylation_mval_long %>% filter(tissue == "FC", cpg %in% cpgs_remain_fc)
reduced_df_mval_long_hip <- methylation_mval_long %>% filter(tissue == "HIP", cpg %in% cpgs_remain_hip)
reduced_df_mval_all_tissues <- rbind(reduced_df_mval_long_blood, reduced_df_mval_long_fc, reduced_df_mval_long_hip)
# 10. Perform linear regression in each tissue/brain region to study ELS effects on DNAm of each CpG
lm_summary <- function(tissue_region){ # function to run linear regressions in each tissue separately
data_sub <- reduced_df_mval_all_tissues %>%
pivot_wider(id_cols = sample, names_from = cpg, values_from = methylation) %>%
left_join(covariates %>% select(sample, tissue, ELS_status)) %>%
filter(tissue == tissue_region)
results <- list()
for(i in outcome){
f <- as.formula(paste0("`", i, "`", "~ ELS_status"))
tidy <- tidy(lm(formula = f, data = data_sub, na.action = na.omit))
results[[i]] <- tidy
}
results
}
## Blood
outcome <- cpgs_remain_blood %>% sort()
lm_blood <- lm_summary("Blood")
lm_blood_df <- purrr::map_dfr(lm_blood, ~ .x, .id = "cpg") %>%
mutate(q.value = p.adjust(p.value, method = "fdr"),
p.signif = case_when(p.value < 0.05 & p.value >= 0.01 ~ "*",
p.value < 0.01 & p.value >= 0.001 ~ "**",
p.value < 0.001 & p.value >= 0.0001 ~ "***",
p.value < 0.0001 ~ "****",
p.value > 0.05 ~ "ns"),
q.signif = case_when(q.value < 0.05 & q.value >= 0.01 ~ "*",
q.value < 0.01 & q.value >= 0.001 ~ "**",
q.value < 0.001 & q.value >= 0.0001 ~ "***",
q.value < 0.0001 ~ "****",
q.value > 0.05 ~ "ns"),
functional_region = as.factor(case_when(
cpg %in% intron_8 ~ "Intron 8",
cpg %in% intron_5 ~ "Intron 5",
cpg %in% intron_1 ~ "Intron 1",
cpg %in% tss ~ "TSS",
cpg %in% proximal_enhancer ~ "Proximal Enhancer",
cpg %in% ctcf_5UTR ~ "CTCF 5'UTR",
TRUE ~ "error")))
lm_blood_df %>% filter(term != "(Intercept)", (p.signif != "ns" | q.signif != "ns"))
## Frontal cortex
outcome <- cpgs_remain_fc %>% sort()
lm_fc <- lm_summary("FC")
lm_fc_df <- purrr::map_dfr(lm_fc, ~ .x, .id = "cpg") %>%
mutate(q.value = p.adjust(p.value, method = "fdr"),
p.signif = case_when(p.value < 0.05 & p.value >= 0.01 ~ "*",
p.value < 0.01 & p.value >= 0.001 ~ "**",
p.value < 0.001 & p.value >= 0.0001 ~ "***",
p.value < 0.0001 ~ "****",
p.value > 0.05 ~ "ns"),
q.signif = case_when(q.value < 0.05 & q.value >= 0.01 ~ "*",
q.value < 0.01 & q.value >= 0.001 ~ "**",
q.value < 0.001 & q.value >= 0.0001 ~ "***",
q.value < 0.0001 ~ "****",
q.value > 0.05 ~ "ns"),
functional_region = as.factor(case_when(
cpg %in% intron_8 ~ "Intron 8",
cpg %in% intron_5 ~ "Intron 5",
cpg %in% intron_1 ~ "Intron 1",
cpg %in% tss ~ "TSS",
cpg %in% proximal_enhancer ~ "Proximal Enhancer",
cpg %in% ctcf_5UTR ~ "CTCF 5'UTR",
TRUE ~ "error")))
lm_fc_df %>% filter(term != "(Intercept)", (p.signif != "ns" | q.signif != "ns"))
## Hippocampus
outcome <- cpgs_remain_hip %>% sort()
lm_hip <- lm_summary("HIP")
lm_hip_df <- purrr::map_dfr(lm_hip, ~ .x, .id = "cpg") %>%
mutate(q.value = p.adjust(p.value, method = "fdr"),
p.signif = case_when(p.value < 0.05 & p.value >= 0.01 ~ "*",
p.value < 0.01 & p.value >= 0.001 ~ "**",
p.value < 0.001 & p.value >= 0.0001 ~ "***",
p.value < 0.0001 ~ "****",
p.value > 0.05 ~ "ns"),
q.signif = case_when(q.value < 0.05 & q.value >= 0.01 ~ "*",
q.value < 0.01 & q.value >= 0.001 ~ "**",
q.value < 0.001 & q.value >= 0.0001 ~ "***",
q.value < 0.0001 ~ "****",
q.value > 0.05 ~ "ns"),
functional_region = as.factor(case_when(
cpg %in% intron_8 ~ "Intron 8",
cpg %in% intron_5 ~ "Intron 5",
cpg %in% intron_1 ~ "Intron 1",
cpg %in% tss ~ "TSS",
cpg %in% proximal_enhancer ~ "Proximal Enhancer",
cpg %in% ctcf_5UTR ~ "CTCF 5'UTR",
TRUE ~ "error")))
lm_hip_df %>% filter(term != "(Intercept)", (p.signif != "ns" | q.signif != "ns"))
# 11. Visualize significant CpGs
## Blood
data_df %>%
ggplot(aes(x = ELS_status, y = `28557969`)) +
geom_boxplot(aes(fill = ELS_status), alpha = 0.7, outlier.size = 1) +
geom_point(aes(fill = ELS_status), alpha = 0.7,
position = position_jitterdodge(jitter.height = 0,
jitter.width = 0.2), size = 1) +
facet_wrap(~ tissue) +
scale_fill_manual(values = c("snow2", "seagreen4")) +
labs(title = "Intron 5 - 28557969", y = "DNA Methylation [%]") +
theme_bw() +
theme(plot.title = element_text(size = 20, face = "bold", hjust = 0.5),
axis.title.y = element_text(size = 16, face = "bold"),
axis.title.x = element_blank(),
axis.text.x = element_text(size = 16, face = "bold"),
axis.text.y = element_text(size = 14),
legend.position = "none") +
stat_compare_means(aes(group = ELS_status, label = paste0("p = ", ..p.format.., ..p.signif..)),
method = "t.test", size = 5) +
scale_y_continuous(breaks = seq(round(min(data_df$`28557969`, na.rm = T)),
round(max(data_df$`28557969`, na.rm = T)), by = 4))
data_df %>%
ggplot(aes(x = ELS_status, y = `28579496`)) +
geom_boxplot(aes(fill = ELS_status), alpha = 0.7, outlier.size = 1) +
geom_point(aes(fill = ELS_status), alpha = 0.7,
position = position_jitterdodge(jitter.height = 0,
jitter.width = 0.2), size = 1) +
facet_wrap(~ tissue) +
scale_fill_manual(values = c("snow2", "seagreen4")) +
labs(title ="Intron 1 - 28579496", y = "DNA Methylation [%]") +
theme_bw() +
theme(plot.title = element_text(size = 20, face = "bold", hjust = 0.5),
axis.title.y = element_text(size = 16, face = "bold"),
axis.title.x = element_blank(),
axis.text.x = element_text(size = 16, face = "bold"),
axis.text.y = element_text(size = 14),
legend.position = "none") +
stat_compare_means(aes(group = ELS_status, label = paste0("p = ", ..p.format.., ..p.signif..)),
method = "t.test", size = 5) +
scale_y_continuous(breaks = seq(round(min(data_df$`28579496`, na.rm = T)),
round(max(data_df$`28579496`, na.rm = T)), by = 4))
## FC
data_df %>%
ggplot(aes(x = ELS_status, y = `28557729`)) +
geom_boxplot(aes(fill = ELS_status), alpha = 0.7, outlier.size = 1) +
geom_point(aes(fill = ELS_status), alpha = 0.7,
position = position_jitterdodge(jitter.height = 0,
jitter.width = 0.2), size = 1) +
facet_wrap(~ tissue) +
scale_fill_manual(values = c("snow2", "seagreen4")) +
labs(title = "Intron 5 - 28557729", y = "DNA Methylation [%]") +
theme_bw() +
theme(plot.title = element_text(size = 20, face = "bold", hjust = 0.5),
axis.title.y = element_text(size = 16, face = "bold"),
axis.title.x = element_blank(),
axis.text.x = element_text(size = 16, face = "bold"),
axis.text.y = element_text(size = 14),
legend.position = "none") +
stat_compare_means(aes(group = ELS_status, label = paste0("p = ", ..p.format.., ..p.signif..)),
method = "t.test", size = 5) +
scale_y_continuous(breaks = seq(round(min(data_df$`28557729`, na.rm = T)),
round(max(data_df$`28557729`, na.rm = T)), by = 4))
data_df %>%
ggplot(aes(x = ELS_status, y = `28649771`)) +
geom_boxplot(aes(fill = ELS_status), alpha = 0.7, outlier.size = 1) +
geom_point(aes(fill = ELS_status), alpha = 0.7,
position = position_jitterdodge(jitter.height = 0,
jitter.width = 0.2), size = 1) +
facet_wrap(~ tissue) +
scale_fill_manual(values = c("snow2", "seagreen4")) +
labs(title ="Proximal Enhancer - 28649771", y = "DNA Methylation [%]") +
theme_bw() +
theme(plot.title = element_text(size = 20, face = "bold", hjust = 0.5),
axis.title.y = element_text(size = 16, face = "bold"),
axis.title.x = element_blank(),
axis.text.x = element_text(size = 16, face = "bold"),
axis.text.y = element_text(size = 14),
legend.position = "none") +
stat_compare_means(aes(group = ELS_status, label = paste0("p = ", ..p.format.., ..p.signif..)),
method = "t.test", size = 5) +
scale_y_continuous(breaks = seq(round(min(data_df$`28649771`, na.rm = T)),
round(max(data_df$`28649771`, na.rm = T)), by = 4))
data_df %>%
ggplot(aes(x = ELS_status, y = `28649785`)) +
geom_boxplot(aes(fill = ELS_status), alpha = 0.7, outlier.size = 1) +
geom_point(aes(fill = ELS_status), alpha = 0.7,
position = position_jitterdodge(jitter.height = 0,
jitter.width = 0.2), size = 1) +
facet_wrap(~ tissue) +
scale_fill_manual(values = c("snow2", "seagreen4")) +
labs(title ="Proximal Enhancer - 28649785", y = "DNA Methylation [%]") +
theme_bw() +
theme(plot.title = element_text(size = 20, face = "bold", hjust = 0.5),
axis.title.y = element_text(size = 16, face = "bold"),
axis.title.x = element_blank(),
axis.text.x = element_text(size = 16, face = "bold"),
axis.text.y = element_text(size = 14),
legend.position = "none") +
stat_compare_means(aes(group = ELS_status, label = paste0("p = ", ..p.format.., ..p.signif..)),
method = "t.test", size = 5) +
scale_y_continuous(breaks = seq(round(min(data_df$`28649785`, na.rm = T)),
round(max(data_df$`28649785`, na.rm = T)), by = 4))
## HIP
data_df %>%
ggplot(aes(x = ELS_status, y = `28657196`)) +
geom_boxplot(aes(fill = ELS_status), alpha = 0.7, outlier.size = 1) +
geom_point(aes(fill = ELS_status), alpha = 0.7,
position = position_jitterdodge(jitter.height = 0,
jitter.width = 0.2), size = 1) +
facet_wrap(~ tissue) +
scale_fill_manual(values = c("snow2", "seagreen4")) +
labs(title = "CTCF 5'UTR - 28657196", y = "DNA Methylation [%]") +
theme_bw() +
theme(plot.title = element_text(size = 20, face = "bold", hjust = 0.5),
axis.title.y = element_text(size = 16, face = "bold"),
axis.title.x = element_blank(),
axis.text.x = element_text(size = 16, face = "bold"),
axis.text.y = element_text(size = 14),
legend.position = "none") +
stat_compare_means(aes(group = ELS_status, label = paste0("p = ", ..p.format.., ..p.signif..)),
method = "t.test", size = 5) +
scale_y_continuous(breaks = seq(round(min(data_df$`28657196`, na.rm = T)),
round(max(data_df$`28657196`, na.rm = T)), by = 4))
data_df %>%
ggplot(aes(x = ELS_status, y = `28657385`)) +
geom_boxplot(aes(fill = ELS_status), alpha = 0.7, outlier.size = 1) +
geom_point(aes(fill = ELS_status), alpha = 0.7,
position = position_jitterdodge(jitter.height = 0,
jitter.width = 0.2), size = 1) +
facet_wrap(~ tissue) +
scale_fill_manual(values = c("snow2", "seagreen4")) +
labs(title = "CTCF 5'UTR - 28657385", y = "DNA Methylation [%]") +
theme_bw() +
theme(plot.title = element_text(size = 20, face = "bold", hjust = 0.5),
axis.title.y = element_text(size = 16, face = "bold"),
axis.title.x = element_blank(),
axis.text.x = element_text(size = 16, face = "bold"),
axis.text.y = element_text(size = 14),
legend.position = "none") +
stat_compare_means(aes(group = ELS_status, label = paste0("p = ", ..p.format.., ..p.signif..)),
method = "t.test", size = 5) +
scale_y_continuous(breaks = seq(round(min(data_df$`28657385`, na.rm = T)),
round(max(data_df$`28657385`, na.rm = T)), by = 2))
# Session information
utils:::print.sessionInfo(sessionInfo()[-8])
# End of script