diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000..bbdfeb5 Binary files /dev/null and b/.DS_Store differ diff --git a/README.md b/README.md new file mode 100644 index 0000000..df0e5d0 --- /dev/null +++ b/README.md @@ -0,0 +1,5 @@ +# Schizophrenia risk variants modulate transcription factor binding and gene expression in cortical cell types + +Schizophrenia is a complex neuropsychiatric disorder with a strong genetic component. Genome-wide association studies (GWAS) have identified numerous risk variants, but their functional impact on gene regulation remains largely unknown. We investigate the disruption and enhancement of transcription factor (TF) binding motifs by schizophrenia-associated GWAS SNPs in 15 cortical cell types of the human brain. By integrating single-nucleus ATAC-seq and RNA-seq data from 71 donors (36 affected by schizophrenia) with GWAS summary statistics, we identify specific TF motifs whose binding affinities are altered by risk and protective SNPs. We find that risk SNPs predominantly disrupt TF binding, while protective SNPs show a more balanced effect. Furthermore, we demonstrate that disrupted TF motifs can lead to altered expression of target genes, including NAGA in excitatory neurons and HLA-B in oligodendrocyte precursor cells. These genes have been previously implicated in schizophrenia and our study provides a mechanism for their dysregulation through altered TF binding. Our findings highlight the importance of considering cell type-specific effects and provide a genome-wide map of TF motif disruptions in schizophrenia, offering insights into the regulatory mechanisms underlying disease risk. + +This repository contains all scripts used for the analysis of TF binding disruption and enhancement through schizophrenia GWAS SNPs and the differential expression analysis of respective target genes between carriers and non-carriers of the respective alternative allele. diff --git a/TF_binding/01_CellTypeSpecificPeaks_SCZ.R b/TF_binding/01_CellTypeSpecificPeaks_SCZ.R new file mode 100644 index 0000000..007dc61 --- /dev/null +++ b/TF_binding/01_CellTypeSpecificPeaks_SCZ.R @@ -0,0 +1,171 @@ +## Identify marker features based on the peak matrix +## only on SCZ cases and controls + +# run with conda environment sc-atac + +library(ArchR) +library(dplyr) +library(tibble) +library(tidyr) +library(RColorBrewer) +library(BSgenome.Hsapiens.UCSC.hg38) + +# folder of ArchR project +archr_folder <- "../ArchRSubset_SCZ" + +# 1. Read data + +projPM <- loadArchRProject(path = paste0(archr_folder,"/")) + +# 2. Check out available matrices, peak matrix was added to ArchR project in 14_1_markerFeatures.R, but marker features were not saved to object or file +#projPM_peak <- addPeakMatrix(projPM) +getAvailableMatrices(projPM) +# [1] "GeneIntegrationMatrix" "GeneScoreMatrix" "MotifMatrix" +# [4] "MotifMatrix_SCZ" "PeakMatrix" "TileMatrix" +peak_mat <- getMatrixFromProject(projPM, useMatrix="PeakMatrix") +min(peak_mat@colData$ReadsInPeaks) # 482 + +# 3. Add marker features for each cell type compared to all others + +markersPeaks <- getMarkerFeatures( + ArchRProj = projPM, + useMatrix = "PeakMatrix", + groupBy = "celltypes_refined", + bias = c("TSSEnrichment", "log10(nFrags)"), + testMethod = "wilcoxon" +) + +saveRDS(markersPeaks, "data/markersPeaks.rds") +markersPeaks <- readRDS("data/markersPeaks.rds") + +markerList <- getMarkers(markersPeaks, + cutOff = "FDR <= 0.05 & Log2FC >= 1") +markerList +markerList_GRanges <- getMarkers(markersPeaks, + cutOff = "FDR <= 0.05 & Log2FC >= 1", + returnGR = TRUE) +markerList_GRanges + +# 3a. Plot number of marker peaks per cell type +n_markers <- sapply(markerList, nrow) +df_n_markers <- data.frame(celltype=names(n_markers), + value=n_markers, row.names=NULL) +write.csv(df_n_markers, "data/01_Number-MarkerPeaks.csv") + +# Barplot with number of marker peaks per celltype +ggplot(df_n_markers, aes(x=celltype, y=value)) + + geom_bar(stat="identity", fill="steelblue")+ + geom_text(aes(label=value), vjust=1.6, color="white", size=3.5)+ + xlab("") + + ylab("Number of marker peaks") + + scale_x_discrete(guide = guide_axis(angle = 45)) + + theme_minimal() + + theme( + strip.text.x = element_text(size = 15), + axis.title.x = element_text(size = 15), + axis.title.y = element_text(size = 15), + axis.text.x = element_text(size = 12), + axis.text.y = element_text(size = 12), + legend.text = element_text(size = 12), + legend.title = element_text(size = 12) + ) +ggsave(paste0(projPM@projectMetadata$outputDirectory, "/Plots/TFbinding/01_Barplot_MarkerPeaks.pdf"), + height = 8, + width = 10) + +# 4. Add motif annotations to each peak + +projPM <- addMotifAnnotations(ArchRProj = projPM, + motifSet = "JASPAR2020", + #collection = "CORE", + annoName = "Motif") + #force = TRUE) # force was necessary after subsetting to SCZ samples + + +# 5. Motif Enrichment in Marker Peaks + +enrichMotifs <- peakAnnoEnrichment( + seMarker = markersPeaks, + ArchRProj = projPM, + peakAnnotation = "Motif", + cutOff = "FDR <= 0.05 & Log2FC >= 1" +) +enrichMotifs +head(enrichMotifs@assays@data$mlog10p) + +saveRDS(enrichMotifs, "data/enrichMotifs.rds") +enrichMotifs <- readRDS("data/enrichMotifs.rds") + +# 5.1 Plot heatmap of enriched motifs per cell type + +heatmapEM <- plotEnrichHeatmap(enrichMotifs, + n = 7, # limit of motifs shown per cell type + transpose = TRUE) +ComplexHeatmap::draw(heatmapEM, + heatmap_legend_side = "bot", + annotation_legend_side = "bot") +plotPDF(heatmapEM, + name = "TFbinding/01_Motifs-Enriched-Marker-Heatmap", + width = 8, + height = 6, + ArchRProj = projPM, + addDOC = FALSE) + +# 5.2 TF binding enrichments with P < 1*10^-10 + +# extract motif-celltype pairs with respective P value +tfenr <- as.data.frame(enrichMotifs@assays@data$mlog10p) %>% + rownames_to_column(var = "motif") %>% + pivot_longer(!motif, names_to="celltype", values_to="mlog10p") %>% + filter(mlog10p >= 10) +write.csv(tfenr, "data/01_MotifEnrichments_threshold.csv") + +# count enriched motifs per celltype +tf_enr_count <- tfenr %>% + group_by(celltype) %>% + count(celltype) +write.csv(tf_enr_count, "data/01_MotifEnrichments_count.csv") + +# # A tibble: 15 × 2 +# # Groups: celltype [15] +# celltype n +# +# 1 Astro_FB 127 +# 2 Astro_PP 128 +# 3 Endothelial 86 +# 4 Exc_L2-3 111 +# 5 Exc_L3-5 143 +# 6 Exc_L4-6_1 127 +# 7 Exc_L4-6_2 98 +# 8 In_LAMP5 57 +# 9 In_PVALB_Ba 78 +# 10 In_PVALB_Ch 150 +# 11 In_SST 79 +# 12 In_VIP 122 +# 13 Microglia 153 +# 14 OPC 113 +# 15 Oligodendrocyte 111 + +# Barplot with number of motif enrichments with P < 1*10^-10 per celltype +ggplot(tf_enr_count, aes(x=celltype, y=n)) + + geom_bar(stat="identity", fill="steelblue")+ + geom_text(aes(label=n), vjust=1.6, color="white", size=3.5)+ + xlab("") + + ylab("Number of enriched motifs") + + scale_x_discrete(guide = guide_axis(angle = 45)) + + theme_minimal() + + theme( + strip.text.x = element_text(size = 15), + axis.title.x = element_text(size = 15), + axis.title.y = element_text(size = 15), + axis.text.x = element_text(size = 12), + axis.text.y = element_text(size = 12), + legend.text = element_text(size = 12), + legend.title = element_text(size = 12) + ) +ggsave(paste0(projPM@projectMetadata$outputDirectory, "/Plots/TFbinding/01_Barplot_Enriched_Motifs.pdf"), + height = 8, + width = 10) + +# 6. Save ArchR project with motif annotations +saveArchRProject(ArchRProj = projPM) \ No newline at end of file diff --git a/TF_binding/02_MotifDeviation_chromVAR_SCZ.R b/TF_binding/02_MotifDeviation_chromVAR_SCZ.R new file mode 100644 index 0000000..035b7be --- /dev/null +++ b/TF_binding/02_MotifDeviation_chromVAR_SCZ.R @@ -0,0 +1,481 @@ +## ChromVAR Deviations Enrichment on a per-cell basis +## only on SCZ cases and controls + +# run with conda environment sc-atac (sc-atac-tf --> for lower part with RNA expression) + +library(ArchR) +library(Seurat) +library(dplyr) +library(tidyr) +library(RColorBrewer) +library(BSgenome.Hsapiens.UCSC.hg38) + +# folder of ArchR project +archr_folder <- "../ArchRSubset_SCZ" + +# 1. Read data + +projPM <- loadArchRProject(path = paste0(archr_folder,"/")) + +# 1a. Add imputation weights as they are missing +projPM <- addImputeWeights(projPM) + +# 2. Add set of background peaks which are used for computation of deviations +# --> function is based on similarity in GC-content and number of fragments across all samples + +projPM <- addBgdPeaks(projPM) + +# 3. Compute per-cell deviations across all motif annotations + +projPM <- addDeviationsMatrix( + ArchRProj = projPM, + peakAnnotation = "Motif", + force = TRUE +) + +# 3a. Save ArchR project with deviations matrix +saveArchRProject(ArchRProj = projPM) + +# 4. Access deviation scores +plotVarDev_df <- getVarDeviations(projPM, + name = "MotifMatrix_SCZ", + plot = FALSE) # returns dataframe, otherwise ggplot object +plotVarDev_plot <- getVarDeviations(projPM, + name = "MotifMatrix_SCZ", + plot = TRUE) # returns ggplot object with TRUE, otherwise dataframe +plotPDF(plotVarDev_plot, + name = "TFbinding/02_Variable-Motif-Deviation-Scores", + width = 5, + height = 5, + ArchRProj = projPM, + addDOC = FALSE) + +# 4.1 Explore deviation scores +tmp <- as.data.frame(plotVarDev_df) %>% filter(combinedVars >= 1) +dim(tmp) # [1] 619 6 +dim(plotVarDev_df) # [1] 633 6 +head(plotVarDev_df, n = 10) + +# 5. Inspect subset of motifs for downstream analysis +motifs <- plotVarDev_df$name[1:10] +markerMotifs <- getFeatures(projPM, + select = paste(motifs, collapse="|"), + useMatrix = "MotifMatrix") +markerMotifs + +# get just the features corresponding to z-scores +markerMotifs <- grep("z:", markerMotifs, value = TRUE) +markerMotifs + +# plot distribution of chromVAR deviation scores for each cell type +p <- plotGroups(ArchRProj = projPM, + groupBy = "celltypes_refined", + colorBy = "MotifMatrix", + name = markerMotifs, + imputeWeights = getImputeWeights(projPM) +) + +p2 <- lapply(seq_along(p), function(x){ + if(x != 1){ + p[[x]] + guides(color = FALSE, fill = FALSE) + + theme_ArchR(baseSize = 6) + + theme(plot.margin = unit(c(0.1, 0.1, 0.1, 0.1), "cm")) + + theme( + axis.text.y=element_blank(), + axis.ticks.y=element_blank(), + axis.title.y=element_blank() + ) + ylab("") + }else{ + p[[x]] + guides(color = FALSE, fill = FALSE) + + theme_ArchR(baseSize = 6) + + theme(plot.margin = unit(c(0.1, 0.1, 0.1, 0.1), "cm")) + + theme( + axis.ticks.y=element_blank(), + axis.title.y=element_blank() + ) + ylab("") + } +}) +p3 <- do.call(cowplot::plot_grid, c(list(nrow = 1, rel_widths = c(2, rep(1, length(p2) - 1))),p2)) + +plotPDF(p3, name = "TFbinding/02_Plot-Groups-Deviations-w-Imputation", + width = 12, + height = 10, + ArchRProj = projPM, + addDOC = FALSE) + +# plot UMAP +p <- plotEmbedding( + ArchRProj = projPM, + colorBy = "MotifMatrix", + name = sort(markerMotifs), + embedding = "UMAP", + imputeWeights = getImputeWeights(projPM) +) + +p2 <- lapply(p, function(x){ + x + guides(color = FALSE, fill = FALSE) + + theme_ArchR(baseSize = 6.5) + + theme(plot.margin = unit(c(0, 0, 0, 0), "cm")) + + theme( + axis.text.x=element_blank(), + axis.ticks.x=element_blank(), + axis.text.y=element_blank(), + axis.ticks.y=element_blank() + ) +}) +p3 <- do.call(cowplot::plot_grid, c(list(ncol = 3),p2)) +plotPDF(p3, + name = "TFbinding/02_Plot-UMAP-Deviations-w-Imputation", + width = 10, + height = 10, + ArchRProj = projPM, + addDOC = FALSE) + + +# plot respective gene scores for these TFs +motifs_format <- str_remove(motifs, "_[0-9]+") +markerRNA <- getFeatures(projPM, select = paste(motifs_format, collapse="|"), useMatrix = "GeneScoreMatrix") +markerRNA <- markerRNA[markerRNA %ni% c("CELF3", "FOSL1", "FOSB", "SPICE1")] +markerRNA + +p <- plotEmbedding( + ArchRProj = projPM, + colorBy = "GeneScoreMatrix", + name = sort(markerRNA), + embedding = "UMAP", + imputeWeights = getImputeWeights(projPM) +) + +p2 <- lapply(p, function(x){ + x + guides(color = FALSE, fill = FALSE) + + theme_ArchR(baseSize = 6.5) + + theme(plot.margin = unit(c(0, 0, 0, 0), "cm")) + + theme( + axis.text.x=element_blank(), + axis.ticks.x=element_blank(), + axis.text.y=element_blank(), + axis.ticks.y=element_blank() + ) +}) +p3 <- do.call(cowplot::plot_grid, c(list(ncol = 3),p2)) +plotPDF(p3, + name = "TFbinding/02_Plot-UMAP-GeneScores-w-Imputation", + width = 10, + height = 10, + ArchRProj = projPM, + addDOC = FALSE) + + +# plot respective RNA expression of matched cells +motifs_format <- str_remove(motifs, "_[0-9]+") +markerRNA <- getFeatures(projPM, select = paste(motifs_format, collapse="|"), useMatrix = "GeneIntegrationMatrix") +markerRNA <- markerRNA[markerRNA %ni% c("CELF3", "FOSL1", "FOSB", "SPICE1")] +markerRNA + +p <- plotEmbedding( + ArchRProj = projPM, + colorBy = "GeneIntegrationMatrix", + name = sort(markerRNA), + embedding = "UMAP", + continuousSet = "blueYellow", + imputeWeights = getImputeWeights(projPM) +) + +p2 <- lapply(p, function(x){ + x + guides(color = FALSE, fill = FALSE) + + theme_ArchR(baseSize = 6.5) + + theme(plot.margin = unit(c(0, 0, 0, 0), "cm")) + + theme( + axis.text.x=element_blank(), + axis.ticks.x=element_blank(), + axis.text.y=element_blank(), + axis.ticks.y=element_blank() + ) +}) +p3 <- do.call(cowplot::plot_grid, c(list(ncol = 3),p2)) +plotPDF(p3, + name = "TFbinding/02_Plot-UMAP-GeneIntegration-w-Imputation", + width = 10, + height = 10, + ArchRProj = projPM, + addDOC = FALSE) + + +# 6. Get TF motifs with scaled deviation > 1 per cell type + +# get deviation matrix from ArrowFiles +devmatrix <- getMatrixFromProject(projPM, + useMatrix = "MotifMatrix_SCZ") + +# subset to cell type-specific deviation matrix, get scaled mean deviation +# score per motif per cell type and filter to those > 1 +dev_ct <- list() +for (ct in unique(colData(devmatrix)$celltypes_refined)){ + + devmatrix_ct <- devmatrix[, colData(devmatrix)$celltypes_refined == ct] + assays(devmatrix_ct)$z + assays(devmatrix_ct)$deviation + + dev_ct_scale <- as.data.frame(scale(rowMeans(assays(devmatrix_ct)$deviation))) %>% + rownames_to_column("motif") %>% + filter(V1 > 1) + + dev_ct[[ct]] <- dev_ct_scale + +} + +dev_ct_df <- bind_rows(dev_ct, .id="celltype") +write.csv(dev_ct_df, "data/02_MotifDeviation_chromVAR_greater1.csv") +rm(devmatrix) +rm(devmatrix_ct) + +# 6.1 Compare chromVAR motifs to enriched motifs +dev_ct_df <- read.csv("data/02_MotifDeviation_chromVAR_greater1.csv") +tfenr <- read.csv("data/01_MotifEnrichments_threshold.csv") +motifs_enr_count <- tfenr %>% + group_by(celltype) %>% + count() +# # A tibble: 15 × 2 +# # Groups: celltype [15] +# celltype n +# +# 1 Astro_FB 127 +# 2 Astro_PP 128 +# 3 Endothelial 86 +# 4 Exc_L2-3 111 +# 5 Exc_L3-5 143 +# 6 Exc_L4-6_1 127 +# 7 Exc_L4-6_2 98 +# 8 In_LAMP5 57 +# 9 In_PVALB_Ba 78 +# 10 In_PVALB_Ch 150 +# 11 In_SST 79 +# 12 In_VIP 122 +# 13 Microglia 153 +# 14 OPC 113 +# 15 Oligodendrocyte 111 + +# join of shared motifs that are enriched and highly accessible in same celltype +motifs_joined <- tfenr %>% + inner_join(dev_ct_df, by=c("celltype", "motif")) +dim(motifs_joined) # [1] 526 6 +write.csv(motifs_joined, "data/02_MotifsEnrichedANDAccessible.csv") + +# count overlapping motifs per celltype +motifs_joined_count <- motifs_joined %>% + group_by(celltype) %>% + count(celltype) +motifs_joined_count +write.csv(motifs_joined_count, "data/02_MotifsEnrichedANDAccessible_count.csv") +# # A tibble: 15 × 2 +# # Groups: celltype [15] +# celltype n +# +# 1 Astro_FB 42 +# 2 Astro_PP 41 +# 3 Endothelial 12 +# 4 Exc_L2-3 65 +# 5 Exc_L3-5 53 +# 6 Exc_L4-6_1 26 +# 7 Exc_L4-6_2 56 +# 8 In_LAMP5 8 +# 9 In_PVALB_Ba 38 +# 10 In_PVALB_Ch 63 +# 11 In_SST 36 +# 12 In_VIP 13 +# 13 Microglia 47 +# 14 OPC 10 +# 15 Oligodendrocyte 16 + +# Barplot with number of shared motifs per celltype +ggplot(motifs_joined_count, aes(x=celltype, y=n)) + + geom_bar(stat="identity", fill="steelblue")+ + geom_text(aes(label=n), vjust=1.6, color="white", size=3.5)+ + xlab("") + + ylab("Number of motifs") + + scale_x_discrete(guide = guide_axis(angle = 45)) + + theme_minimal() + + theme( + strip.text.x = element_text(size = 15), + axis.title.x = element_text(size = 15), + axis.title.y = element_text(size = 15), + axis.text.x = element_text(size = 12), + axis.text.y = element_text(size = 12), + legend.text = element_text(size = 12), + legend.title = element_text(size = 12) + ) +ggsave(paste0(projPM@projectMetadata$outputDirectory, "/Plots/TFbinding/02_Barplot_Motifs_EnrichedAndAccessible.pdf"), + height = 8, + width = 10) + +# 7. Find motifs with deviation > 1 AND gene expression + +# # Use GeneIntegrationMatrix --> failed +# rnamatrix <- getMatrixFromProject(projPM, +# useMatrix = "GeneIntegrationMatrix", +# threads = 8) +# # Matrix is too large to read in: +# # Error in cbind.Matrix(x, y, deparse.level = 0L) : +# # p[length(p)] cannot exceed 2^31-1 + +# read RNA data +rna <- readRDS("../../../scanpy_adata/adata_labelTransfer_celltypes_samplesFilt_AnnaAnnotation_RELN_woObs.rds") +Idents(object = rna) +Idents(object = rna) <- "ctAnna_r1" + +# add column with gene name to TF motif table +# case of heterdimers: all "components" need to expressed in > X% of cells +motifs_joined <- motifs_joined %>% + mutate(gene_name = str_remove(motif, "_[0-9]+")) %>% + mutate(gene_name = str_remove(gene_name, "\\.var\\.[0-9]")) %>% + separate(gene_name, into = c("motif_gene1", "motif_gene2", "motif_gene3"), sep="\\.\\.", remove=FALSE) + + +list_exp <- list() +for (ct in unique(motifs_joined$celltype)){ + + # subset single-cell RNA object to cells from celltype + rna_ct <- subset(x=rna, idents = ct) + #rna_ct@assays$RNA@data + #rna_ct@assays$RNA@counts + + # get TF genes for this celltype + genes_ct <- c(motifs_joined[motifs_joined$celltype == ct,]$motif_gene1, + motifs_joined[motifs_joined$celltype == ct,]$motif_gene2, + motifs_joined[motifs_joined$celltype == ct,]$motif_gene3) + genes_ct_rna <- intersect(genes_ct, row.names(rna_ct@assays$RNA@meta.features)) + + # get percentage of cells in celltype expression respective gene + tf_exp_vec <- sapply(genes_ct_rna, + function(x) sum(GetAssayData(object = rna_ct, slot = "data")[x,]>0)/nrow(rna_ct@meta.data)) + tf_exp <- data.frame("gene_name" = names(tf_exp_vec), "perc_exp" = tf_exp_vec, + row.names = NULL) + list_exp[[ct]] <- tf_exp + +} +tf_exp_df <- bind_rows(list_exp, .id="celltype") + +# 5% cutoff for gene expression +tf_exp_5perc <- tf_exp_df %>% filter(perc_exp >= 0.05) +# tab_print_5perc <- tf_exp_5perc %>% +# left_join(motifs_joined, by=c("celltype", "gene_name")) +# dim(tab_print_5perc) +# # [1] 153 10 +tab_print_5perc <- motifs_joined %>% + inner_join(tf_exp_5perc, by=c("celltype", "motif_gene1"="gene_name")) %>% + left_join(tf_exp_5perc, by=c("celltype", "motif_gene2"="gene_name")) %>% + left_join(tf_exp_5perc, by=c("celltype", "motif_gene3"="gene_name")) %>% + filter(is.na(motif_gene2) | perc_exp.y >= 0.05) %>% + filter(is.na(motif_gene3) | perc_exp >= 0.05) +dim(tab_print_5perc) +# [1] 157 10 +write.csv(tab_print_5perc, "data/02_Motifs_Expression5perc.csv") +tab_print_5perc <- read.csv("data/02_Motifs_Expression5perc.csv") +tf_exp_5perc_count <- tab_print_5perc %>% + group_by(celltype) %>% + count(celltype) +write.csv(tf_exp_5perc_count, "data/02_Motifs_Expression5perc_count.csv") + +# 10% cutoff for gene expression +tf_exp_10perc <- tf_exp_df %>% filter(perc_exp >= 0.10) +# tab_print_10perc <- tf_exp_10perc %>% +# left_join(motifs_joined, by=c("celltype", "gene_name")) +tab_print_10perc <- motifs_joined %>% + inner_join(tf_exp_10perc, by=c("celltype", "motif_gene1"="gene_name")) %>% + left_join(tf_exp_10perc, by=c("celltype", "motif_gene2"="gene_name")) %>% + left_join(tf_exp_10perc, by=c("celltype", "motif_gene3"="gene_name")) %>% + filter(is.na(motif_gene2) | perc_exp.y >= 0.05) %>% + filter(is.na(motif_gene3) | perc_exp >= 0.05) +dim(tab_print_10perc) +# [1] 106 13 +write.csv(tab_print_10perc, "data/02_Motifs_Expression10perc.csv") +tab_print_10perc <- read.csv("data/02_Motifs_Expression10perc.csv") +tf_exp_10perc_count <- tab_print_10perc %>% + group_by(celltype) %>% + count(celltype) +write.csv(tf_exp_10perc_count, "data/02_Motifs_Expression10perc_count.csv") + +# Barplot with number of TFs fulfillung all criteria +ggplot(tf_exp_10perc_count, aes(x=celltype, y=n)) + + geom_bar(stat="identity", fill="steelblue")+ + geom_text(aes(label=n), vjust=1.6, color="white", size=3.5)+ + xlab("") + + ylab("Number of motifs") + + scale_x_discrete(guide = guide_axis(angle = 45)) + + theme_minimal() + + theme( + strip.text.x = element_text(size = 15), + axis.title.x = element_text(size = 15), + axis.title.y = element_text(size = 15), + axis.text.x = element_text(size = 12), + axis.text.y = element_text(size = 12), + legend.text = element_text(size = 12), + legend.title = element_text(size = 12) + ) +ggsave(paste0(projPM@projectMetadata$outputDirectory, + "/Plots/TFbinding/02_Barplot_Motifs_EnrichedAndAccessibleAndExp10Perc.pdf"), + height = 8, + width = 10) + +ggplot(tf_exp_5perc_count, aes(x=celltype, y=n)) + + geom_bar(stat="identity", fill="steelblue")+ + geom_text(aes(label=n), vjust=1.6, color="white", size=3.5)+ + xlab("") + + ylab("Number of motifs") + + scale_x_discrete(guide = guide_axis(angle = 45)) + + theme_minimal() + + theme( + strip.text.x = element_text(size = 15), + axis.title.x = element_text(size = 15), + axis.title.y = element_text(size = 15), + axis.text.x = element_text(size = 12), + axis.text.y = element_text(size = 12), + legend.text = element_text(size = 12), + legend.title = element_text(size = 12) + ) +ggsave(paste0(projPM@projectMetadata$outputDirectory, + "/Plots/TFbinding/02_Barplot_Motifs_EnrichedAndAccessibleAndExp5Perc.pdf"), + height = 8, + width = 10) + + +# 8. Barplot with number of enriched, accessible and expressed motifs/TFs +df_all <- bind_rows(list("enriched" = motifs_enr_count, + "accessible" = motifs_joined_count, + "expressed" = tf_exp_5perc_count), + .id = "filter_step") +df_all$filter_step <- factor(df_all$filter_step, levels=c("enriched", "accessible", "expressed")) + +ggplot(df_all, aes(x=celltype, y=n, fill = filter_step)) + + geom_bar(stat="identity", position="dodge")+ + geom_text(aes(label=n, group = filter_step), + position = position_dodge(width = 0.9), + vjust=1.6, color="white", size=2.5)+ + xlab("") + + ylab("Number of motifs") + + scale_x_discrete(guide = guide_axis(angle = 45)) + + scale_fill_manual(name="", + breaks=c("enriched", "accessible", "expressed"), + labels=c("Enriched", + "Enriched and \nAccessible", + "Enriched and \nAccessible and \nGene Expressed"), + values=c("#2a2956", "#ac2711", "#df8547")) + + #values=c("#5f2423", "#ec2730", "#dcbe3b")) + + theme_minimal() + + theme( + strip.text.x = element_text(size = 15), + axis.title.x = element_text(size = 15), + axis.title.y = element_text(size = 15), + axis.text.x = element_text(size = 12), + axis.text.y = element_text(size = 12), + legend.text = element_text(size = 12), + legend.title = element_text(size = 12) + ) +ggsave(paste0(projPM@projectMetadata$outputDirectory, + "/Plots/TFbinding/02_Barplot_Motifs_Enriched-Accessible-Exp5Perc.pdf"), + height = 8, + width = 10) + + + + diff --git a/TF_binding/03_GWAS-SNPcontainingCelltypePeakSet_SCZ.R b/TF_binding/03_GWAS-SNPcontainingCelltypePeakSet_SCZ.R new file mode 100644 index 0000000..6171785 --- /dev/null +++ b/TF_binding/03_GWAS-SNPcontainingCelltypePeakSet_SCZ.R @@ -0,0 +1,195 @@ +## Extract peaks containing GWAS SNPs + +# run with conda environment sc-atac-tf + +library(ArchR) +library(Seurat) +library(dplyr) +library(tidyr) +library(RColorBrewer) +library(BSgenome.Hsapiens.UCSC.hg38) +library(VariantAnnotation) +library(gwasvcf) +library(plyranges) + +basedir <- "/psycl/g/mpsagbinder/mgp/workspace/SingleNuc_PostmortemBrain/" +# folder of ArchR project +archr_folder <- "../ArchRSubset_SCZ" + +### FUNCTIONALTIES + +# function to change chr names from "chr1" to "NC_000..." +change_chr_name <- function(peak_set){ + + # join the chromosomes with official names from NCBI FASTA + # --> necessary to extract peak sequences from FASTA file + change_chr <- data.frame("chrom" = as.vector(peak_set@seqnames)) %>% + left_join(official_chroms, by=c("chrom" = "UCSC.style.name")) + + # expand chromosome levels to the official names and later delete them after changing + seqlevels(peak_set) <- c(seqlevels(peak_set), unique(change_chr$RefSeq.seq.accession)) + peak_set <- peak_set %>% + mutate(seqnames = factor(change_chr$RefSeq.seq.accession, levels = seqlevels(peak_set))) + seqlevels(peak_set) <- unique(change_chr$RefSeq.seq.accession) + + return(peak_set) +} + +# function to subset celltype specific marker peaks to those containing a GWAS SNP +subsetCelltypePeaks <- function(ct, final_peaks){ + + ## Risk and protective SNPs + # subset peaks from Peak Set to those containing a GWAS SNP + peaks_snp <- subsetByOverlaps(final_peaks, gwas_granges) # peaks with GWAS SNP in it + # change chromosome names + peaks_snp <- change_chr_name(peaks_snp) + # write peak set (GRanges) to bed file with function from rtracklayer + export.bed(peaks_snp, paste0("data/03_OverlapSNPsAndPeaks/03_GWAS-SNPcontainingPeaks_", ct, "_riskAndProtect.bed")) + + # subset GWAS SNPs to those overlapping a peak for later analysis + snps_peak <- subsetByOverlaps(rowRanges(vcf), final_peaks) + # write SNPs (GRanges) to bed file with function from rtracklayer + export.bed(snps_peak, paste0("data/03_OverlapSNPsAndPeaks/03_GWAS-SNPsInPeaks_", ct, "_riskAndProtect.bed")) + + ## Risk SNPs + # subset peaks from Peak Set to those containing a GWAS SNP + peaks_snp <- subsetByOverlaps(final_peaks, risk_granges) # peaks with GWAS SNP in it + # change chromosome names + peaks_snp <- change_chr_name(peaks_snp) + # write peak set (GRanges) to bed file with function from rtracklayer + export.bed(peaks_snp, paste0("data/03_OverlapSNPsAndPeaks/03_GWAS-SNPcontainingPeaks_", ct, "_risk.bed")) + + # subset GWAS SNPs to those overlapping a peak for later analysis + snps_peak <- subsetByOverlaps(risk_granges, final_peaks) + # write SNPs (GRanges) to bed file with function from rtracklayer + export.bed(snps_peak, paste0("data/03_OverlapSNPsAndPeaks/03_GWAS-SNPsInPeaks_", ct, "_risk.bed")) + + ## Potective SNPs + # subset peaks from Peak Set to those containing a GWAS SNP + peaks_snp <- subsetByOverlaps(final_peaks, protect_granges) # peaks with GWAS SNP in it + # change chromosome names + peaks_snp <- change_chr_name(peaks_snp) + # write peak set (GRanges) to bed file with function from rtracklayer + export.bed(peaks_snp, paste0("data/03_OverlapSNPsAndPeaks/03_GWAS-SNPcontainingPeaks_", ct, "_protect.bed")) + + # subset GWAS SNPs to those overlapping a peak for later analysis + snps_peak <- subsetByOverlaps(protect_granges, final_peaks) + # write SNPs (GRanges) to bed file with function from rtracklayer + export.bed(snps_peak, paste0("data/03_OverlapSNPsAndPeaks/03_GWAS-SNPsInPeaks_", ct, "_protect.bed")) + + return(peaks_snp) +} + +### 1. Read data + +projPM <- loadArchRProject(path = paste0(archr_folder,"/")) + +### 2. Read GWAS SNPs (all significant ones regardless of LD) + +vcf <- readVcf(paste0(basedir, "genotype/GWAS_sumstats/SCZ2022/SCZ2022_sig_hg38_normChr.vcf")) +gwas_granges <- rowRanges(vcf) # GRanges object +effect_sizes <- geno(vcf)$ES + +# split into protective and risk SNPs +snps_risk <- row.names(effect_sizes)[which(effect_sizes > 0)] +snps_protect <- row.names(effect_sizes)[which(effect_sizes < 0)] +risk_granges <- gwas_granges[snps_risk,] +protect_granges <- gwas_granges[snps_protect,] + +# look at distance to nearest neighbour +near_risk <- distanceToNearest(risk_granges) +min(near_risk@elementMetadata@listData$distance) +near_protect <- distanceToNearest(protect_granges) +min(near_protect@elementMetadata@listData$distance) + +# read official chromosome names as used in the reference genome FASTA file +official_chroms <- read.table( + "data/sequence_report.tsv", + sep="\t", + header=TRUE +) + +### 3. Get peaks that are accessible per celltype + +# peak matrix across all cell types +peak_mat <- getMatrixFromProject(ArchRProj = projPM, useMatrix = "PeakMatrix") + +# function to get GWAS-SNP overlapping peak set per celltype +snpOverlappingPeaks <- function(ct, peakMat){ + + cells_ct <- rownames(projPM@cellColData[projPM@cellColData$celltypes_refined == ct,]) + # projPM_ct <- subsetCells(ArchRProj = projPM, cellNames = cells_ct) + # projPM_ct <- addFeatureCounts( + # ArchRProj = projPM_ct, + # features = projPM@peakSet, + # name = paste0(ct, "_") + # ) + + # subset peak matrix to cells from respective cell type + peak_mat_ct <- subset(peakMat, select = rownames(colData(peakMat)) %in% cells_ct) + n_cells_ct <- ncol(peak_mat_ct) + + # count cells with at least one count for respective peak + ncells_counts <- rowSums(assays(peak_mat_ct)[["PeakMatrix"]]>0) + quantile(ncells_counts, probs = c(0,0.25,0.5,0.75,1)) + # 0% 25% 50% 75% 100% + # 0 8 12 27 10034 + indices_peaks <- which(ncells_counts >= n_cells_ct*0.05) + peakset_ct_counts <- projPM@peakSet[indices_peaks,] + + # peaks that were identified + peakset_ct <- projPM@peakSet[names(projPM@peakSet) == ct,] + + # union of peaks with count in >= 5% of cells in cell type and peaks that were identified in cell type + union_peakset_ct <- union(peakset_ct_counts, peakset_ct) + + # numbers of scREs + stat_scRE_ct <- list("celltype" = ct, + "n_counts" = length(peakset_ct_counts), + "n_ident" = length(peakset_ct), + "n_comb" = length(union_peakset_ct)) + + # function also writes peak sets with SNPs in it to files + final_peakset_ct <- subsetCelltypePeaks(ct, union_peakset_ct) + + return(stat_scRE_ct) +} + +### 4. Subset cell type-specific marker peaks + +stat_scRE <- mclapply(unique(peak_mat@colData$celltypes_refined), snpOverlappingPeaks, peak_mat) +df_stat_scRE <- purrr::map_df(stat_scRE, tibble::as_tibble) # from "list of lists" to dataframe +df_long <- df_stat_scRE %>% + pivot_longer(cols = n_counts:n_comb) +df_long$name <- factor(df_long$name, levels = c("n_ident", "n_counts", "n_comb")) + +### 5. Plot number of scREs + +ggplot(df_long, aes(x=celltype, y=value, fill=name)) + + geom_bar(stat="identity", position="dodge")+ + # geom_text(aes(label=value, group = name), + # position = position_dodge(width = 0.9), + # vjust=1.6, color="white", size=2.5)+ + xlab("") + + ylab("Number of peaks") + + scale_x_discrete(guide = guide_axis(angle = 45)) + + scale_fill_manual(name="", + breaks=c("n_ident", "n_counts", "n_comb"), + labels=c("Peak Calling", + "Counts in >= 5% \nof Nuclei", + "Total scREs"), + #values=c("#2a2956", "#ac2711", "#df8547")) + + values=c("#2c3452", "#5293e5", "#1447a8")) + + theme_minimal() + + theme( + strip.text.x = element_text(size = 15), + axis.title.x = element_text(size = 15), + axis.title.y = element_text(size = 15), + axis.text.x = element_text(size = 12), + axis.text.y = element_text(size = 12), + legend.text = element_text(size = 12), + legend.title = element_text(size = 12) + ) +ggsave(paste0("plots/Dataset/03_Numbers-scREs.pdf"), + height = 8, + width = 10) diff --git a/TF_binding/04_MapSNPidsToCorrectGenome.R b/TF_binding/04_MapSNPidsToCorrectGenome.R new file mode 100644 index 0000000..05530f3 --- /dev/null +++ b/TF_binding/04_MapSNPidsToCorrectGenome.R @@ -0,0 +1,298 @@ +## Mapping of SCZ2022 SNPs to correct genome (hg 38), +## expansion of significant hits using LD scores +## and writing it to vcf format as preparation for GATK FastaAlternateReferenceMaker + +# 0. Load packages and path setup +library(biomaRt) # update? +library(gwasvcf) # write summary statistics to vcf file +library(VariantAnnotation) +library(dplyr) +library(stringr) +library(magrittr) +library(ldblock) +library(GenomeInfoDb) +library(readxl) +library(ggplot2) + +basedir <- "~/Documents/PostmortemBrain/workspace/" +gwas_folder <- "~/Documents/PostmortemBrain/genotype/GWAS_sumstats/SCZ2022/" + +## 1. Read GWAS data + +# read SCZ2022 sumstats (hg 19) +sumstats <- read.table(paste0(gwas_folder, "SCZ2022.txt"), + header = TRUE) +sumstats_sig <- sumstats %>% + filter(PVAL <= 5*10^-8) +nrow(sumstats_sig) # 20457 + +# read Supp. Table 1 from Publication with 313 independent SNPs (hg19) +supptab1 <- read_xls(paste0(gwas_folder, "PaperTrubetskoy_SupplementaryTable1.xls"), + sheet = "ST1 624 LD clumps_disc") %>% + filter(P <= 5*10^-8) %>% + mutate(CHR = paste0("chr", CHR)) + +# read SNIPA (LD proxy search) results for 313 independent SNPs with cutoff 0.8 (hg19) +snipa <- read.table(paste0(gwas_folder, "SNIPA_proxySearch.csv"), + header = TRUE) +nrow(snipa) # 10812 +length(unique(snipa$QRSID)) # 301 --> for 301 of 313 SNPs Proxies in LD were found + +# how many snipa SNPs are among the 20000? +overlap_snipa <- intersect(snipa$RSID, sumstats_sig$ID) + + +## 2. LD expansion of 313 independent SNPs + +# adapt expandSnpSet function from ldblock package so that it returns rs IDs of +# SNPs in LD +expandSnpSet_adapt <- function (rsl, lb = 0.8, ldstruct, chrn = "chr17", popn = "CEU", + txtgzfn = dir(system.file("hapmap", package = "ldblock"), + full.names = TRUE)) +{ + if (missing(ldstruct)) + curhm = hmld(txtgzfn, poptag = popn, chrom = chrn)@ldmat + else curhm = ldstruct@ldmat + bad = setdiff(rsl, rownames(curhm)) + if (length(bad) > 0) + warning(paste0("dropping ", length(bad), " rsn not matched in ld matrix")) + rsl = setdiff(rsl, bad) + hits = apply(curhm[rsl, ], 1, function(x) colnames(curhm)[which(x >= lb)]) + sort(unique(c(unlist(hits), rsl, bad))) +} + +# expand SNPs using LD per chromosome with LD reference panel of hg19 +list_snps <- list() +for (chr in 1:22){ + chro <- paste0("chr", chr) + supptab1_chr <- supptab1 %>% filter(CHR == chro) + ceu_chr <- hmld(paste0(basedir, "/RefGenome_LDStructure/ld_", chro, "_CEU.txt.gz"), + poptag = "CEU", chrom = chro) + # catch error for chr 17 as only 1 SNP is in LD panel + possibleError <- tryCatch( + expr = { + exset <- expandSnpSet_adapt(supptab1_chr$SNP, ldstruct = ceu_chr, lb=.8) + }, error = function(e){ + message('Caught an error!') + print(e) + } + ) + + if(inherits(possibleError, "error")) next + + print(all(supptab1_chr$SNP %in% exset)) + list_snps[[chro]] <- exset +} + +ld_snps <- c(supptab1$SNP, unlist(list_snps)) +length(ld_snps) # 6995 + +# --> these are less than the 20457 genome wide significant ones from the GWAS sumstats +# --> we are gonna proceed with the 20457 + + +## 3. Map SNP ids to hg38 genome + +snp_mart = useMart("ENSEMBL_MART_SNP", dataset="hsapiens_snp") +listMarts() +# biomart version +# 1 ENSEMBL_MART_ENSEMBL Ensembl Genes 112 +# 2 ENSEMBL_MART_MOUSE Mouse strains 112 +# 3 ENSEMBL_MART_SNP Ensembl Variation 112 +# 4 ENSEMBL_MART_FUNCGEN Ensembl Regulation 112 +# --> Ensembl Variation 112 is based on GRCh38 coordinates + +# experiment with these commands +listFilters(snp_mart) +listAttributes(snp_mart) +attributePages(snp_mart) +listDatasets(snp_mart) + +# attributes we are interested in for SNPs +snp_attributes = c("refsnp_id", "chr_name", "chrom_start") + +# get these attributes for hg38 from BioMart +snp_locations = getBM(attributes=snp_attributes, filters="snp_filter", + values=sumstats_sig$ID, mart=snp_mart) +head(snp_locations) + +write.table(snp_locations, + paste0(gwas_folder, "SCZ2022_sig_hg38.txt"), + sep = "\t", + quote = FALSE, + row.names = FALSE) + + +## 4. Combine GWAS results with hg38 positions and correct chromosome names + +# read SNP locations +snp_locations <- read.table( + paste0(gwas_folder, "SCZ2022_sig_hg38.txt"), + header=TRUE +) + +# filter out mappings to alternative references, eg "HSCHR3_3_CTG3" +snp_locations_filt <- snp_locations %>% + filter(!str_detect(chr_name, "^H")) # filter out mappings to alternative references, eg "HSCHR3_3_CTG3" + +# read official chromosome names as used in the reference genome FASTA file +official_chroms <- read.table( + paste0(gwas_folder, "sequence_report.tsv"), + sep="\t", + header=TRUE +) + +# change chromsome column in sumstats to character to prepare for merge +sumstats <- sumstats %>% + mutate(CHROM = as.character(CHROM)) + +# merge hg38 locations with original sumstats +vv <- snp_locations_filt %>% + inner_join(sumstats, + by = c("refsnp_id"="ID", "chr_name"="CHROM")) +# besides join on rsID, additionally based on +# chromosome, as rsIDs partly map to multiple locations in the genome, due to the +# fact that the sequence of the surrounding region maps to multiple locations +# --> chrom_start is now hg38 position and POS is hg19 position + +# join with chromosome names from reference genome FASTA file +vv <- vv %>% + mutate(chr_name = paste0("chr", chr_name)) %>% + left_join(official_chroms[c("UCSC.style.name", "RefSeq.seq.accession")], + by = c("chr_name"="UCSC.style.name")) +max(vv$PVAL) + + +# ## 5. Switch reference and risk allele correctly for negative effect sizes +# # --> we decided to not switch alleles but disentangle protective and risk SNPs +# # later/run the analyses separately +# +# # copy risk allele to a new column +# vv$A2_copy <- vv$A2 +# +# # switch the alleles for those SNPs with negative BETA value +# # additionally adapt the allele frequency and BETA itself +# vv_switch <- vv %>% +# mutate(A2 = case_when(BETA < 0 ~ A1, +# BETA >= 0 ~ A2), +# A1 = case_when(BETA < 0 ~ A2_copy, +# BETA >= 0 ~ A1), +# FCAS = case_when(BETA < 0 ~ 1-FCAS, +# BETA >= 0 ~ FCAS), +# FCON = case_when(BETA < 0 ~ 1-FCON, +# BETA >= 0 ~ FCON), +# BETA = case_when(BETA < 0 ~ -BETA, +# BETA >= 0 ~ BETA) +# ) %>% +# select(-A2_copy) + + + + +## 6. Save GWAS results with hg38 genome as vcf file + +# notes on alleles: +# in original GWAS summary statistics A1 is the allele that BETA, PVAL and so on +# refer to, it is also the allele from the "normal" human reference genome +# as GATK FastaAlternateReferenceMaker inserts "ALT" from vcf files into genome, +# I am switching the alleles here and change BETA to -BETA + +# with special chromosome names +out <- vv %$% # function from magrittr that allows to call columns within function directly + create_vcf( + chrom = RefSeq.seq.accession, + pos = chrom_start, + nea = A1, + ea = A2, # ALT allele that is inserted into synthetic alternate reference + snp = refsnp_id, + ea_af = FCAS, # FCAS is allele frequency of A1 in cases, there is another column for controls + effect = -BETA, + se = SE, + pval = PVAL, #10 ^ -LP, + n = NCAS+NCON, + name = "SCZ2022" + ) +head(out) + +writeVcf(out, file=paste0(gwas_folder, "SCZ2022_sig_hg38.vcf")) + +# with normal chromosome names +out <- vv %$% # function from magrittr that allows to call columns within function directly + create_vcf( + chrom = chr_name, + pos = chrom_start, + nea = A1, + ea = A2, # ALT allele that is inserted into synthetic alternate reference + snp = refsnp_id, + ea_af = FCAS, # FCAS is allele frequency of A1 in cases, there is another column for controls + effect = -BETA, + se = SE, + pval = PVAL, #10 ^ -LP, + n = NCAS+NCON, + name = "SCZ2022" + ) +head(out) + +writeVcf(out, file=paste0(gwas_folder, "SCZ2022_sig_hg38_normChr.vcf")) + +# also write as tsv +write.table(vv, paste0(gwas_folder, "SCZ2022_sig_hg38.tsv"), + quote = FALSE, row.names = FALSE, sep="\t") + + +## 7. Plot of significant SNPs, similar to Manhattan plot + +vv$chr_name <- factor(vv$chr_name, levels = paste0("chr", 1:22)) +vv_plot <- vv %>% + + # Compute chromosome size + group_by(chr_name) %>% + summarise(chr_len = max(chrom_start)) %>% + + # Calculate cumulative position of each chromosome + mutate(tot=cumsum(as.numeric(chr_len))-chr_len) %>% + select(-chr_len) %>% + + # Add this to initial dataset + left_join(vv, ., by=c("chr_name" = "chr_name")) %>% + + # Cumulative position of each SNP + arrange(chr_name, chrom_start) %>% + mutate(BPcum=chrom_start+tot) + +axisdf = vv_plot %>% + group_by(chr_name) %>% + summarize(center=( max(BPcum) + min(BPcum) ) / 2 ) + + +ggplot(vv_plot, aes(x=BPcum, y=BETA, col=-log10(PVAL))) + + + # Show all points + #geom_point( aes(color=as.factor(chr_name)), alpha=0.8, size=1.3) + + geom_point( alpha=0.8, size=1) + + #scale_color_manual(values = rep(c("grey", "skyblue"), 22 )) + + scale_color_gradient(name = "-log10(P)", + low="lightgrey", high="darkred", + limits=c(0, max(-log10(vv_plot$PVAL))), + oob = scales::squish) + + # scale_color_viridis(option="D", direction=1) + + + # custom X axis: + scale_x_continuous( label = axisdf$chr_name, breaks= axisdf$center, + guide = guide_axis(angle = 45)) + + scale_y_continuous(expand = c(0, 0) ) + # remove space between plot area and x axis + xlab("Chromosome") + + ylab("Beta/Effect Size") + + + # Custom the theme: + theme_bw() + + theme( + #legend.position="none", + panel.border = element_blank(), + panel.grid.major.x = element_blank(), + panel.grid.minor.x = element_blank() + ) +ggsave(paste0(basedir, "scripts/ATAC/03_TFbinding/plots/Dataset/04_GWAS-SNPs.pdf"), + height = 7, width = 10) +ggsave(paste0(basedir, "scripts/ATAC/03_TFbinding/plots/Dataset/04_GWAS-SNPs.png"), + height = 7, width = 10) diff --git a/TF_binding/05_CutNumber_FastaHeader.py b/TF_binding/05_CutNumber_FastaHeader.py new file mode 100644 index 0000000..39e4c87 --- /dev/null +++ b/TF_binding/05_CutNumber_FastaHeader.py @@ -0,0 +1,21 @@ +# Cut the number and space at beginning of FASTA header +# in alternative reference genome with GWAS SNPs + +import os +import re + +ref_dir = os.chdir("/Users/nathalie_gerstner/Documents/PostmortemBrain/genotype/GWAS_sumstats/SCZ2022") + +fasta= open('GRCh38_SCZ2022_altRef.fasta') +newfasta= open('GRCh38_SCZ2022_altRef_fixHead.fasta', 'w') + +for line in fasta: + if line.startswith('>'): + newname= re.sub('\d+\s', '', line, 1) + newname= re.sub(':', ' ', newname, 1) + newfasta.write(newname) + else: + newfasta.write(line) + +fasta.close() +newfasta.close() \ No newline at end of file diff --git a/TF_binding/06_RunFIMO_SCZ.R b/TF_binding/06_RunFIMO_SCZ.R new file mode 100644 index 0000000..0fb4155 --- /dev/null +++ b/TF_binding/06_RunFIMO_SCZ.R @@ -0,0 +1,171 @@ +# run FIMO (Finding Individual Motif Occurrences) on reference +# genome sequences and SNP-alternated sequences of peaks + +# run with conda environment sc-atac-tf + +library(dplyr) +library(GenomicRanges) +library(VariantAnnotation) +library(rtracklayer) +library(plyranges) + +# global parameter +setwd("/psycl/g/mpsagbinder/mgp/workspace/SingleNuc_PostmortemBrain/scripts/ATAC/03_TFbinding") +jaspar_meme_dir <- "data/JASPAR2020_CORE_vertebrates_non-redundant_pfms_meme/" +fimo_dir <- "/psycl/g/mpsngs/software1/meme/bin/" +p_thresh <- 0.99 #pvalue threshold for FIMO --> 0.99 as we want to get all results + +# celltype of interest +args = commandArgs(TRUE) +ct = args[1] +# ct <- "Microglia" + + +# 1. Read file with motifs enriched + accessible + expressed +motifs <- read.csv("data/02_Motifs_Expression5perc.csv") + +motifs_ct <- motifs %>% + filter(celltype == ct) + +# 1a. Read file with JASPAR matrix name --> gene name mapping +motif_map <- read.table("data/table_JASPAR2020_CORE_vertebrates_non-redundant_pfms_transfac_genename.tsv", + sep = "\t", header=TRUE) %>% + dplyr::select("jaspar_matrix", "gene_name") +motifs_ct <- motifs_ct %>% + left_join(motif_map, by=c("gene_name")) + +# 1b. Read GWAS SNPs (all significant ones regardless of LD) +vcf <- readVcf("data/SCZ2022_sig_hg38_normChr.vcf") +gwas_granges <- rowRanges(vcf) # GRanges object + +# read official chromosome names as used in the reference genome FASTA file +official_chroms <- read.table( + "data/sequence_report.tsv", + sep="\t", + header=TRUE +) + +# join the chromosomes with official names from NCBI FASTA +# --> necessary to extract peak sequences from FASTA file +change_chr <- data.frame("chrom" = as.vector(gwas_granges@seqnames)) %>% + left_join(official_chroms, by=c("chrom" = "UCSC.style.name")) + +# expand chromosome levels to the official names and later delete them after changing +seqlevels(gwas_granges) <- c(seqlevels(gwas_granges), unique(change_chr$RefSeq.seq.accession)) +gwas_granges <- gwas_granges %>% + mutate(seqnames = factor(change_chr$RefSeq.seq.accession, levels = seqlevels(gwas_granges))) +seqlevels(gwas_granges) <- unique(change_chr$RefSeq.seq.accession) + + +# 2. Create folder to save FIMO output files in +dir.create(paste0("data/FIMO_out/", ct, ".ref")) +dir.create(paste0("data/FIMO_out/", ct, ".alt")) +dir.create(paste0("data/FIMO_out/", ct, ".diff")) + +# 3. Run FIMO on reference and altered FASTA sequences +for (i in 1:nrow(motifs_ct)){ + m <- motifs_ct$jaspar_matrix[i] + + system(paste0(fimo_dir, "fimo --thresh ", p_thresh, + " --o data/FIMO_out/", ct, ".ref/", m, " ", + jaspar_meme_dir, m, ".meme ", + "data/03_GWAS-SNPcontainingPeaks_ref.fasta") ) + + system(paste0(fimo_dir, "fimo --thresh ", p_thresh, + " --o data/FIMO_out/", ct, ".alt/", m, " ", + jaspar_meme_dir, m, ".meme ", + "data/03_GWAS-SNPcontainingPeaks_alt.fasta") ) +} + +# 4. Intersect between matched motifs and GWAS SNPs +for (i in 1:nrow(motifs_ct)){ + m <- motifs_ct$jaspar_matrix[i] + + for (mode in c("ref", "alt")){ + + fimo_out <- read.table( + paste0("data/FIMO_out/", ct, ".", mode, "/", m, "/fimo.tsv"), + header=TRUE) + + granges_out <- makeGRangesFromDataFrame(fimo_out, + seqnames.field = "sequence_name", + keep.extra.columns = TRUE) + + # subset peaks from Peak Set to those containing a GWAS SNP + fimo_snp <- subsetByOverlaps(granges_out, gwas_granges) # 2377 peaks with GWAS SNP in it + + # save GWAS SNP containing FIMO output ranges to rds file + saveRDS(fimo_snp, paste0("data/FIMO_out/", ct, ".", mode, "/", m, "/fimo_snp.rds")) #~3000 in one case + + } +} + + +# 5. Aftermath: differential binding score + +# PSEUDOCODE: +# for each cell type: get complete peak set tested with FIMO + +# for each target TF/motif in this celltype: + +# for ref and alt sequence: + +# open respective fimo-SNP file (fimo hits with SNP overlap) +# for each peak from complete peak set: sum up -log10-transformed pvalue within each peak +# peaks without motif hit get score of 0 + +# for each peak: + +# calculate diff binding score (ref binding score - alt binding score) +# save [tf, motif, peakID, ref binding score, alt binding score, diff binding score] + +#m <- motifs_ct$jaspar_matrix[1] + +diff_scores <- function(m){ + + # read file with peaks tested (for this celltype?) + peaks_tested <- import("data/03_GWAS-SNPcontainingPeaks.bed") + + sumDict <- list() + for (mode in c("ref", "alt")){ + + # read file with GWAS-SNP containing motif hits + motif_hits <- readRDS(paste0("data/FIMO_out/", ct, ".", mode, "/", m, "/fimo_snp.rds")) + + # for each peak get overlapping motif hits + sumDict[[mode]] <- list() + for (i in 1:length(peaks_tested)){ # i -> eg 1124 + + hits_peak <- subsetByOverlaps(motif_hits, peaks_tested[i,]) + score <- ifelse(length(hits_peak) >= 1, sum(-log10(hits_peak$p.value)), 0) + + sumDict[[mode]][[i]] <- score + } + } + + res_df <- data.frame("motif" = character(), + "peakNum" = character(), + "ref_score" = integer(), + "alt_score" = integer(), + "diff_score" = integer()) + for (i in 1:length(peaks_tested)){ + + diff_score <- sumDict[["ref"]][[i]]-sumDict[["alt"]][[i]] + + res_df <- rbind(res_df, + list("motif" = m, + "peakNum" = i, + "ref_score" = sumDict[["ref"]][[i]], + "alt_score" = sumDict[["alt"]][[i]], + "diff_score" = diff_score)) + } + + # add column with gene_name + res_df <- res_df %>% + left_join(motif_map, by=c("motif"="jaspar_matrix")) + write.table(res_df, paste0("data/FIMO_out/", ct, ".diff/", m, "_fimo_scores.tsv"), + sep="\t", row.names = FALSE) + +} + +mclapply(motifs_ct$jaspar_matrix, diff_scores, mc.cores = 10) \ No newline at end of file diff --git a/TF_binding/06_runRunFIMO_SCZ.sh b/TF_binding/06_runRunFIMO_SCZ.sh new file mode 100644 index 0000000..505d96c --- /dev/null +++ b/TF_binding/06_runRunFIMO_SCZ.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +#SBATCH --partition=p.psycl +#SBATCH --cpus-per-task=10 +#SBATCH --mem=200G +#SBATCH --time=11-00:00:00 +#SBATCH --output=slurm_06_%j.out +#SBATCH --error=slurm_06_%j.err + +source ~/.bashrc + +conda activate sc-atac-tf + +# Run script on all celltypes +ct=("Astro_FB" "Astro_PP" "Endothelial" "Exc_L2-3" "Exc_L3-5" "Exc_L4-6_1" "Exc_L4-6_2" "In_PVALB_Ba" "In_PVALB_Ch" "In_SST" "In_VIP" "Microglia" "Oligodendrocyte" "OPC") +#ct=("Astro_FB") + +for i in "${ct[@]}"; do + Rscript /psycl/g/mpsagbinder/mgp/workspace/SingleNuc_PostmortemBrain/scripts/ATAC/03_TFbinding/06_RunFIMO_SCZ.R "$i" +done + +conda deactivate \ No newline at end of file diff --git a/TF_binding/06a_ExtractFASTA_GWAS-SNPcontainingCelltypePeaks_SCZ.sh b/TF_binding/06a_ExtractFASTA_GWAS-SNPcontainingCelltypePeaks_SCZ.sh new file mode 100644 index 0000000..3a936cc --- /dev/null +++ b/TF_binding/06a_ExtractFASTA_GWAS-SNPcontainingCelltypePeaks_SCZ.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +#SBATCH --partition=p.psycl +#SBATCH --cpus-per-task=10 +#SBATCH --mem=200G +#SBATCH --time=11-00:00:00 +#SBATCH --output=slurm_06_%j.out +#SBATCH --error=slurm_06_%j.err + +source ~/.bashrc + +# Iterate through all celltypes +ct=("Astro_FB" "Astro_PP" "Endothelial" "Exc_L2-3" "Exc_L3-5" "Exc_L4-6_1" "Exc_L4-6_2" "In_PVALB_Ba" "In_PVALB_Ch" "In_SST" "In_VIP" "Microglia" "Oligodendrocyte" "OPC") +#ct=("Microglia") + +for i in "${ct[@]}"; do + # reference FASTA file + /psycl/g/mpsngs/software1/bedtools2/bin/bedtools getfasta \ + -fi "/psycl/g/mpsagbinder/mgp/workspace/SingleNuc_PostmortemBrain/genotype/GWAS_sumstats/SCZ2022/GRCh38_latest_genomic.fna" \ + -bed "/psycl/g/mpsagbinder/mgp/workspace/SingleNuc_PostmortemBrain/scripts/ATAC/03_TFbinding/data/03_OverlapSNPsAndPeaks/03_GWAS-SNPcontainingPeaks_$i.bed" \ + -fo "/psycl/g/mpsagbinder/mgp/workspace/SingleNuc_PostmortemBrain/scripts/ATAC/03_TFbinding/data/03_OverlapSNPsAndPeaks/03_GWAS-SNPcontainingPeaks_ref_$i.fasta" + # alternate (GWAS-SNP containing) FASTA file + /psycl/g/mpsngs/software1/bedtools2/bin/bedtools getfasta \ + -fi "/psycl/g/mpsagbinder/mgp/workspace/SingleNuc_PostmortemBrain/genotype/GWAS_sumstats/SCZ2022/GRCh38_SCZ2022_altRef_fixHead.fasta" \ + -bed "/psycl/g/mpsagbinder/mgp/workspace/SingleNuc_PostmortemBrain/scripts/ATAC/03_TFbinding/data/03_OverlapSNPsAndPeaks/03_GWAS-SNPcontainingPeaks_$i.bed" \ + -fo "/psycl/g/mpsagbinder/mgp/workspace/SingleNuc_PostmortemBrain/scripts/ATAC/03_TFbinding/data/03_OverlapSNPsAndPeaks/03_GWAS-SNPcontainingPeaks_alt_$i.fasta" +done + +# /psycl/g/mpsngs/software1/bedtools2/bin/bedtools getfasta \ +# -fi "/psycl/g/mpsagbinder/mgp/workspace/SingleNuc_PostmortemBrain/genotype/GWAS_sumstats/SCZ2022/GRCh38_latest_genomic.fna" \ +# -bed "/psycl/g/mpsagbinder/mgp/workspace/SingleNuc_PostmortemBrain/scripts/ATAC/03_TFbinding/data/03_OverlapSNPsAndPeaks/03_GWAS-SNPcontainingPeaks_Astro_FB.bed" + +# /psycl/g/mpsngs/software1/bedtools2/bin/bedtools getfasta \ +# -fi "/psycl/g/mpsagbinder/mgp/workspace/SingleNuc_PostmortemBrain/genotype/GWAS_sumstats/SCZ2022/GRCh38_SCZ2022_altRef_fixHead.fasta" \ +# -bed "/psycl/g/mpsagbinder/mgp/workspace/SingleNuc_PostmortemBrain/scripts/ATAC/03_TFbinding/data/03_OverlapSNPsAndPeaks/03_GWAS-SNPcontainingPeaks_Astro_FB.bed" \ No newline at end of file diff --git a/TF_binding/06b_RunFIMO_celltypePeaks_effectDir_SCZ.R b/TF_binding/06b_RunFIMO_celltypePeaks_effectDir_SCZ.R new file mode 100644 index 0000000..e0774fe --- /dev/null +++ b/TF_binding/06b_RunFIMO_celltypePeaks_effectDir_SCZ.R @@ -0,0 +1,199 @@ +# run FIMO (Finding Individual Motif Occurrences) on reference +# genome sequences and SNP-alternated sequences of peaks + +# run with conda environment sc-atac-tf + +library(dplyr) +library(stringr) +library(GenomicRanges) +library(VariantAnnotation) +library(rtracklayer) +library(plyranges) + +# global parameter +basedir <- "/psycl/g/mpsagbinder/mgp/workspace/SingleNuc_PostmortemBrain/" +setwd(paste0(basedir,"scripts/ATAC/03_TFbinding")) +jaspar_meme_dir <- "data/JASPAR2020_CORE_vertebrates_non-redundant_pfms_meme/" +fimo_dir <- "/psycl/g/mpsngs/software1/meme/bin/" +p_thresh <- 0.99 #pvalue threshold for FIMO --> 0.99 as we want to get all results + +# celltype of interest +args = commandArgs(TRUE) +ct = args[1] +# ct <- "Astro_FB" +# ct <- "OPC" + + +# 1. Read file with motifs enriched + accessible + expressed +motifs <- read.csv("data/02_Motifs_Expression5perc.csv") +motifs$gene_name <- str_replace_all(motifs$gene_name, '\\.\\.', '::') + +motifs_ct <- motifs %>% + filter(celltype == ct) +head(motifs_ct) + +# 1a. Read file with JASPAR matrix name --> gene name mapping +motif_map <- read.table("data/table_JASPAR2020_CORE_vertebrates_non-redundant_pfms_transfac_genename.tsv", + sep = "\t", header=TRUE) %>% + dplyr::select("jaspar_matrix", "jaspar_id") %>% + distinct() +motifs_ct <- motifs_ct %>% + left_join(motif_map, by=c("gene_name"="jaspar_id")) + +# 1b. Read GWAS SNPs (all significant ones regardless of LD) +vcf <- readVcf(paste0(basedir,"genotype/GWAS_sumstats/SCZ2022/SCZ2022_sig_hg38_normChr.vcf")) +gwas_granges <- rowRanges(vcf) # GRanges object +effect_sizes <- geno(vcf)$ES + +# read official chromosome names as used in the reference genome FASTA file +official_chroms <- read.table( + "data/sequence_report.tsv", + sep="\t", + header=TRUE +) + +# join the chromosomes with official names from NCBI FASTA +# --> necessary to extract peak sequences from FASTA file +change_chr <- data.frame("chrom" = as.vector(gwas_granges@seqnames)) %>% + left_join(official_chroms, by=c("chrom" = "UCSC.style.name")) + +# expand chromosome levels to the official names and later delete them after changing +seqlevels(gwas_granges) <- c(seqlevels(gwas_granges), unique(change_chr$RefSeq.seq.accession)) +gwas_granges <- gwas_granges %>% + mutate(seqnames = factor(change_chr$RefSeq.seq.accession, levels = seqlevels(gwas_granges))) +seqlevels(gwas_granges) <- unique(change_chr$RefSeq.seq.accession) + +# split into protective and risk SNPs +snps_risk <- row.names(effect_sizes)[which(effect_sizes > 0)] +snps_protect <- row.names(effect_sizes)[which(effect_sizes < 0)] +risk_granges <- gwas_granges[snps_risk,] +protect_granges <- gwas_granges[snps_protect,] + + +# 2. Create folder to save FIMO output files in +dir.create(paste0("data/FIMO_out/", ct, ".ref.celltypePeaks")) +dir.create(paste0("data/FIMO_out/", ct, ".alt.celltypePeaks")) +dir.create(paste0("data/FIMO_out/", ct, ".diff.celltypePeaks")) + +# 3. Run FIMO on reference and altered FASTA sequences +for (i in 1:nrow(motifs_ct)){ + m <- motifs_ct$jaspar_matrix[i] + + # --oc overwrites folder if it exists already --> --o for not overwriting + system(paste0(fimo_dir, "fimo --thresh ", p_thresh, + " --oc data/FIMO_out/", ct, ".ref.celltypePeaks/", m, " ", + jaspar_meme_dir, m, ".meme ", + "data/03_OverlapSNPsAndPeaks/03_GWAS-SNPcontainingPeaks_ref_", ct, ".fasta") ) + + system(paste0(fimo_dir, "fimo --thresh ", p_thresh, + " --oc data/FIMO_out/", ct, ".alt.celltypePeaks/", m, " ", + jaspar_meme_dir, m, ".meme ", + "data/03_OverlapSNPsAndPeaks/03_GWAS-SNPcontainingPeaks_alt_", ct, ".fasta") ) +} + +# 4. Intersect between matched motifs and GWAS SNPs +for (i in 1:nrow(motifs_ct)){ + m <- motifs_ct$jaspar_matrix[i] + + for (mode in c("ref", "alt")){ + + fimo_out <- read.table( + paste0("data/FIMO_out/", ct, ".", mode, ".celltypePeaks/", m, "/fimo.tsv"), + header=TRUE) + + granges_out <- makeGRangesFromDataFrame(fimo_out, + seqnames.field = "sequence_name", + keep.extra.columns = TRUE) # Microglia ref: 84020 motif occurences + + # Risk and protective SNPs combined + # subset motif matches to those containing a GWAS SNP + fimo_snp <- subsetByOverlaps(granges_out, gwas_granges) # Microglia ref: 4236 motif occurences + # save GWAS SNP containing FIMO matches to rds file + saveRDS(fimo_snp, paste0("data/FIMO_out/", ct, ".", mode, ".celltypePeaks/", m, "/fimo_snp_riskAndProtect.rds")) + + # Only risk SNPs + # subset motif matches to those containing a GWAS SNP + fimo_snp_risk <- subsetByOverlaps(granges_out, risk_granges) + # save GWAS SNP containing FIMO matches to rds file + saveRDS(fimo_snp_risk, paste0("data/FIMO_out/", ct, ".", mode, ".celltypePeaks/", m, "/fimo_snp_risk.rds")) + + # Only protective SNPs + # subset motif matches to those containing a GWAS SNP + fimo_snp_protect <- subsetByOverlaps(granges_out, protect_granges) + # save GWAS SNP containing FIMO matches to rds file + saveRDS(fimo_snp_protect, paste0("data/FIMO_out/", ct, ".", mode, ".celltypePeaks/", m, "/fimo_snp_protect.rds")) + + } +} + + +# 5. Aftermath: differential binding score + +# PSEUDOCODE: +# for each cell type: get complete peak set tested with FIMO + +# for each target TF/motif in this celltype: + +# for ref and alt sequence: + +# open respective fimo-SNP file (fimo hits with SNP overlap) +# for each peak from complete peak set: sum up -log10-transformed pvalue within each peak +# peaks without motif hit get score of 0 + +# for each peak: + +# calculate diff binding score (ref binding score - alt binding score) +# save [tf, motif, peakID, ref binding score, alt binding score, diff binding score] + +#m <- motifs_ct$jaspar_matrix[1] + +diff_scores <- function(m, effDir = c("riskAndProtect", "risk", "protect")){ + + # read file with peaks tested (for this celltype?) + peaks_tested <- import(paste0("data/03_OverlapSNPsAndPeaks/03_GWAS-SNPcontainingPeaks_", ct, "_", effDir, ".bed")) + + sumDict <- list() + for (mode in c("ref", "alt")){ + + # read file with GWAS-SNP containing motif hits + motif_hits <- readRDS(paste0("data/FIMO_out/", ct, ".", mode, ".celltypePeaks/", m, "/fimo_snp_", effDir, ".rds")) + + # for each peak get overlapping motif hits + sumDict[[mode]] <- list() + for (i in 1:length(peaks_tested)){ # i -> eg 1124 + + hits_peak <- subsetByOverlaps(motif_hits, peaks_tested[i,]) + score <- ifelse(length(hits_peak) >= 1, sum(-log10(hits_peak$p.value)), 0) + + sumDict[[mode]][[i]] <- score + } + } + + res_df <- data.frame("motif" = character(), + "peakNum" = character(), + "ref_score" = integer(), + "alt_score" = integer(), + "diff_score" = integer()) + for (i in 1:length(peaks_tested)){ + + diff_score <- sumDict[["ref"]][[i]]-sumDict[["alt"]][[i]] + + res_df <- rbind(res_df, + list("motif" = m, + "peakNum" = i, + "ref_score" = sumDict[["ref"]][[i]], + "alt_score" = sumDict[["alt"]][[i]], + "diff_score" = diff_score)) + } + + # add column with gene_name + res_df <- res_df %>% + left_join(motif_map, by=c("motif"="jaspar_matrix")) + write.table(res_df, paste0("data/FIMO_out/", ct, ".diff.celltypePeaks/", m, "_fimo_scores_", effDir, ".tsv"), + sep="\t", row.names = FALSE) + +} + +mclapply(motifs_ct$jaspar_matrix, diff_scores, "riskAndProtect", mc.cores = 10) +mclapply(motifs_ct$jaspar_matrix, diff_scores, "risk", mc.cores = 10) +mclapply(motifs_ct$jaspar_matrix, diff_scores, "protect", mc.cores = 10) \ No newline at end of file diff --git a/TF_binding/06b_runRunFIMO_celltypePeaks_SCZ.sh b/TF_binding/06b_runRunFIMO_celltypePeaks_SCZ.sh new file mode 100644 index 0000000..6a4871c --- /dev/null +++ b/TF_binding/06b_runRunFIMO_celltypePeaks_SCZ.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +#SBATCH --partition=p.psycl +#SBATCH --cpus-per-task=10 +#SBATCH --mem=200G +#SBATCH --time=11-00:00:00 +#SBATCH --output=slurm_06_%j.out +#SBATCH --error=slurm_06_%j.err + +source ~/.bashrc + +conda activate sc-atac-tf + +# Run script on all celltypes +ct=("Astro_FB" "Astro_PP" "Endothelial" "Exc_L2-3" "Exc_L3-5" "Exc_L4-6_1" "Exc_L4-6_2" "In_PVALB_Ba" "In_PVALB_Ch" "In_SST" "In_VIP" "Microglia" "Oligodendrocyte" "OPC") +#ct=("Astro_FB") + +for i in "${ct[@]}"; do + Rscript /psycl/g/mpsagbinder/mgp/workspace/SingleNuc_PostmortemBrain/scripts/ATAC/03_TFbinding/06b_RunFIMO_celltypePeaks_effectDir_SCZ.R "$i" +done + +conda deactivate \ No newline at end of file diff --git a/TF_binding/07_Plots_DiffScores_celltypePeaks_effectDir.R b/TF_binding/07_Plots_DiffScores_celltypePeaks_effectDir.R new file mode 100644 index 0000000..3edb49b --- /dev/null +++ b/TF_binding/07_Plots_DiffScores_celltypePeaks_effectDir.R @@ -0,0 +1,303 @@ +# Make plots visualizing the differential binding scores + +# run with conda environment sc-atac-tf + +library(ggplot2) +library(ggrepel) +library(stringr) +library(dplyr) +library(ComplexHeatmap) +library(RColorBrewer) +library(gridExtra) +library(cowplot) +library(circlize) + +# risk or protective SNPs? or both? +effDir <- "risk" +#effDir <- "protect" +# effDir <- "riskAndProtect" + +min_diff_binding_score <- 3 +min_diff <- 5 +celltypes <- c("Astro_FB", "Astro_PP", "Endothelial", "Exc_L2-3", "Exc_L3-5", + "Exc_L4-6_1", "Exc_L4-6_2", "In_PVALB_Ba", "In_PVALB_Ch", + "In_RELN", "In_SST", "In_VIP", "Microglia", "Oligodendrocyte", "OPC") + +# 1. Read file with JASPAR matrix name --> gene name mapping +motif_map <- read.table("data/table_JASPAR2020_CORE_vertebrates_non-redundant_pfms_transfac_genename.tsv", + sep = "\t", header=TRUE) %>% + dplyr::select("jaspar_matrix", "jaspar_id") %>% + distinct() + +# 1a. read official chromosome names as used in the reference genome FASTA file +official_chroms <- read.table( + "data/sequence_report.tsv", + sep="\t", + header=TRUE +) + +# 2. Read each diff binding score file and generate plots +list_diff <- list() +list_var_df <- list() +for (ct in celltypes){ + + dir.create(paste0("plots/DiffBinding.celltypePeaks/", ct)) + diff_files <- list.files(path = paste0("data/FIMO_out/", ct, ".diff.celltypePeaks"), + pattern = paste0("*_fimo_scores_", effDir, ".tsv"), + full.names = TRUE) + + list_diff[[ct]] <- list() + var_df <- data.frame("motif" = character(), + "high" = integer(), + "low" = integer()) + for (f in diff_files){ + # extract motif from file name + motif <- str_extract(f, "MA[0-9]+\\.[0-9]") + + # prepare data for plotting (identify variable (up/down) motif occurences) + diff_sco <- read.table(f, sep="\t", header=TRUE) %>% + mutate(label=ifelse(abs(diff_score) > min_diff_binding_score, 'variable', 'stable')) + diff_var <- diff_sco %>% + filter(label == "variable") %>% + mutate(diff_type = ifelse(diff_score < 0, "low", "high")) + meta_var <- diff_var %>% + group_by(diff_type) %>% + count() + + # scatterplot of reference score vs alternate score, colored by "stability" + ggplot(diff_sco, aes(x=ref_score, y=alt_score, color=label)) + + geom_point() + + scale_color_manual(name = "Motif stability", + breaks = c("stable", "variable"), + values = c("grey", "orange")) + + xlab("FIMO reference binding score") + + ylab("FIMO alternate binding score") + + theme_light() + + theme( + strip.text.x = element_text(size = 15), + axis.title.x = element_text(size = 15), + axis.title.y = element_text(size = 15), + axis.text.x = element_text(size = 12), + axis.text.y = element_text(size = 12), + legend.text = element_text(size = 12), + legend.title = element_text(size = 12) + ) + ggsave(paste0("plots/DiffBinding.celltypePeaks/", ct, "/Scatterplot_", motif, "_", effDir, ".pdf"), + width=8, height=6) + + list_diff[[ct]][[motif]] <- diff_sco + var_df <- rbind(var_df, + list("motif" = motif, "high" = meta_var$n[1], "low" = meta_var$n[2])) + } + + # scatterplot with variable motif occurrences that gain vs loose + var_df <- var_df %>% + mutate(high = ifelse(is.na(high), 0, high)) %>% + mutate(low = ifelse(is.na(low), 0, low)) %>% + mutate(data_label = ifelse(abs(high-low)>=min_diff, "variable", "stable")) %>% + left_join(motif_map, by=c("motif"="jaspar_matrix")) + list_var_df[[ct]] <- var_df + ggplot(var_df, aes(x=high, y=low, color=data_label)) + + geom_point() + + #geom_text(aes(label=gene_name), size=3, hjust=1, vjust=1) + + geom_text_repel(aes(label=jaspar_id), size = 3)+ + geom_abline(slope = 1, intercept = 5, color = "grey") + + geom_abline(slope = 1, intercept = -5, color = "grey") + + scale_color_manual(name = "Motif stability", + breaks = c("stable", "variable"), + values = c("grey", "red")) + + xlab("Number of ctREs with lost TF binding by GWAS-SNPs") + + ylab("Number of ctREs with gained TF binding by GWAS-SNPs") + + xlim(0, max(c(var_df$high, var_df$low), na.rm=T)+0.5) + + ylim(0, max(c(var_df$high, var_df$low), na.rm=T)+0.5) + + theme_light() + + theme( + strip.text.x = element_text(size = 15), + axis.title.x = element_text(size = 15), + axis.title.y = element_text(size = 15), + axis.text.x = element_text(size = 12), + axis.text.y = element_text(size = 12), + legend.text = element_text(size = 12), + legend.title = element_text(size = 12) + ) + ggsave(paste0("plots/DiffBinding.celltypePeaks/", ct, "/Scatterplot_HighLow_", effDir, ".pdf"), + width=8, height=6) + +} + +# concatenate var_df across celltypes +new_list <- Filter(function(x) {nrow(x) > 0}, list_var_df) +all_var_df <- bind_rows(new_list, .id="celltype") %>% + mutate(diff = high - low) %>% + mutate(plot_label = paste0(jaspar_id, " (", celltype, ")")) +write.csv(all_var_df, paste0("data/FIMO_out/07_DiffScores_AllMotifs_AllCelltypes_", effDir, ".csv")) +filt_var_df <- all_var_df %>% + filter(data_label == "variable") +write.csv(filt_var_df, paste0("data/FIMO_out/07_DiffScores_DisruptedMotifs_AllCelltypes_", effDir, ".csv")) + +# 3. Histogram and scatterplot across all celltypes +# plot histogram of diff scores +ggplot(all_var_df, aes(x=diff)) + + geom_histogram( ill="#69b3a2", color="#e9ecef", alpha=0.9) + + xlab("Difference between number of ctREs with gained and lost TF binding by GWAS-SNPs") + + theme_light() + + theme( + strip.text.x = element_text(size = 15), + axis.title.x = element_text(size = 15), + axis.title.y = element_text(size = 15), + axis.text.x = element_text(size = 12), + axis.text.y = element_text(size = 12), + legend.text = element_text(size = 12), + legend.title = element_text(size = 12) + ) +ggsave(paste0("plots/DiffBinding.celltypePeaks/Histogram_DifferenceHighLow_", effDir, ".pdf"), + width=8, height=6) + +# plot high vs low all celltypes +ggplot(all_var_df, aes(x=high, y=low, color=data_label)) + + geom_point() + + #geom_text(aes(label=gene_name), size=3, hjust=1, vjust=1) + + geom_text_repel(aes(label=plot_label), size = 3)+ + geom_abline(slope = 1, intercept = 5, color = "grey") + + geom_abline(slope = 1, intercept = -5, color = "grey") + + scale_color_manual(name = "Motif stability", + breaks = c("stable", "variable"), + values = c("grey", "red")) + + xlab("Number of ctREs with lost TF binding by GWAS-SNPs") + + ylab("Number of ctREs with gained TF binding by GWAS-SNPs") + + xlim(0, max(c(all_var_df$high, all_var_df$low), na.rm=T)+0.5) + + ylim(0, max(c(all_var_df$high, all_var_df$low), na.rm=T)+0.5) + + theme_light() + + theme( + strip.text.x = element_text(size = 15), + axis.title.x = element_text(size = 15), + axis.title.y = element_text(size = 15), + axis.text.x = element_text(size = 12), + axis.text.y = element_text(size = 12), + legend.text = element_text(size = 12), + legend.title = element_text(size = 12) + ) + ggsave(paste0("plots/DiffBinding.celltypePeaks/Scatterplot_HighLow_", effDir, ".pdf"), + width=8, height=6) + +# 4. Plot heatmaps for the frequently disrupted motifs + +anno_colors <- list( + Chromosome=c( + "chr1"=brewer.pal(9, 'Set1')[1], + "chr2"=brewer.pal(9, 'Set1')[2], + "chr3"=brewer.pal(9, 'Set1')[3], + "chr4"=brewer.pal(9, 'Set1')[4], + "chr5"=brewer.pal(9, 'Set1')[5], + "chr6"=brewer.pal(9, 'Set1')[6], + "chr7"=brewer.pal(9, 'Set1')[7], + "chr8"=brewer.pal(9, 'Set1')[8], + "chr9"=brewer.pal(9, 'Set1')[9], + "chr10"=brewer.pal(8, 'Set2')[1], + "chr11"=brewer.pal(8, 'Set2')[2], + "chr12"=brewer.pal(8, 'Set2')[3], + "chr13"=brewer.pal(8, 'Set2')[4], + "chr14"=brewer.pal(8, 'Set2')[5], + "chr15"=brewer.pal(8, 'Set2')[6], + "chr16"=brewer.pal(8, 'Set2')[7], + "chr17"=brewer.pal(8, 'Set3')[2], + "chr18"=brewer.pal(8, 'Set3')[3], + "chr19"=brewer.pal(8, 'Set3')[4], + "chr20"=brewer.pal(8, 'Set3')[5], + "chr21"=brewer.pal(8, 'Set3')[6], + "chr22"=brewer.pal(8, 'Set3')[7] + )) +# Create a color palette +colors <- brewer.pal(11, 'RdBu') + +# Create a custom color mapping centered around zero +col_heatmap <- colorRamp2(c(-5, 0, 5), c(colors[1], "white", colors[11])) + +#col_heatmap <- colorRampPalette(rev(brewer.pal(11, 'RdBu')))(100) + +heatmap_list <- list() +for (ct in unique(filt_var_df$celltype)){ + + print(ct) + + # read delta binding scores + motifs_ct <- filt_var_df$motif[filt_var_df$celltype == ct] + diffscores <- lapply(motifs_ct, function(x){ + read.table(paste0("data/FIMO_out/", ct, ".diff.celltypePeaks/", x, "_fimo_scores_", effDir, ".tsv"), + sep = "\t", header=TRUE) + }) + #diffscores <- bind_rows(diffscores) + + # read coordinates of respective peaks + motif_peaks <- read.table(paste0("data/03_OverlapSNPsAndPeaks/03_GWAS-SNPcontainingPeaks_", ct, "_", effDir, ".bed")) %>% + mutate(peak_id = seq(1:nrow(diffscores[[1]]))) + motif_peaks <- motif_peaks %>% + left_join(official_chroms[c("RefSeq.seq.accession", "UCSC.style.name")], by=c("V1"="RefSeq.seq.accession")) + + # generate matrix for heatmap with chrom annotation + heatmap_mat <- lapply(diffscores, function(x) x %>% select(diff_score)) + heatmap_mat <- t(as.matrix(bind_cols(heatmap_mat))) + row.names(heatmap_mat) <- lapply(diffscores, function(x) x$jaspar_id[1]) + + column_ha = HeatmapAnnotation( + Chromosome = factor(motif_peaks$UCSC.style.name, levels = names(anno_colors$Chromosome)), + col = anno_colors, + #show_annotation_name = FALSE, + #annotation_legend_title = "", + annotation_legend_param = list(ncol = 3, + title_position = "topcenter", + #legend_direction = "horizontal", + legend_position = "bottom")) + heatmap <- Heatmap(heatmap_mat, name = "Delta Binding Score", + top_annotation = column_ha, + col = col_heatmap, + cluster_columns=FALSE, + cluster_rows = FALSE, + heatmap_legend_param = list( + direction = "horizontal", + title_position = "topcenter", + #legend_direction = "horizontal", + legend_position = "bottom" + #at = c(-10,10), + #labels = c("Strong binding\nat risk allele", "Strong binding\nat nonrisk allele") + ) + ) + pdf(paste0("plots/DiffBinding.celltypePeaks/", ct, "/07_Heatmap_DisruptedTFs_", effDir, "_", ct, ".pdf"), + height = 3, width= 8) + draw( + heatmap, + heatmap_legend_side = "right", + annotation_legend_side = "right" + ) + dev.off() + + heatmap_list[[ct]] <- heatmap + +} + +# Function to capture the grob of each heatmap +capture_heatmap_grob <- function(ht) { + #grid.grabExpr(draw(ht, heatmap_legend_side = "right", annotation_legend_side = "right")) + grid.grabExpr(draw(ht, show_heatmap_legend=FALSE, show_annotation_legend=FALSE)) +} + +# Capture heatmap grobs +heatmap_grobs <- lapply(heatmap_list, capture_heatmap_grob) + +# Capture the combined legend using the first heatmap +# lg1 <- color_mapping_legend(draw(heatmap_list[[1]])@ht_list[[1]]@matrix_color_mapping, plot = FALSE) +# lg2 <- color_mapping_legend(draw(heatmap_list[[1]])@top_annotation[[1]]@col, plot = FALSE) +# lg2 <- Legend(labels=names(anno_colors$Chromosome), title = "Chromosome", legend_gp = anno_colors$Chromosome, +# ncol = 4, title_position="topcenter") +# pd = packLegend(lg1, lg2, direction="horizontal") +# combined_legend <- grid.grabExpr(draw(pd)) +combined_legend <- grid.grabExpr(draw(heatmap_list[[1]], merge_legends = TRUE, heatmap_legend_side = "bottom", annotation_legend_side = "bottom")) + +# right size for each heatmap +size_vec_df <- filt_var_df %>% group_by(tolower(celltype)) %>% count() +size_vec <- size_vec_df$n + 0.8 +# Arrange the heatmaps and the combined legend +pdf(paste0("plots/DiffBinding.celltypePeaks/07_Heatmap_DisruptedTFs_", effDir, ".pdf"), + height = nrow(filt_var_df)/3+5, width= 10) +grid.arrange(grobs = c(heatmap_grobs, list(combined_legend)), ncol = 1, heights = c(size_vec, 5)) +dev.off() \ No newline at end of file diff --git a/TF_binding/08_MappingSNP_GenotypesImputed_celltypePeaks_effectDir_SCZ.R b/TF_binding/08_MappingSNP_GenotypesImputed_celltypePeaks_effectDir_SCZ.R new file mode 100644 index 0000000..1fec2cb --- /dev/null +++ b/TF_binding/08_MappingSNP_GenotypesImputed_celltypePeaks_effectDir_SCZ.R @@ -0,0 +1,229 @@ +# Get allele frequencies for SCZ GWAS-SNPs located in motifs of respective TFs + +# run with conda environment sc-atac-tf + +library(ArchR) +library(ggplot2) +library(ggrepel) +#library(ggpubr) +library(stringr) +library(dplyr) +library(tidyr) +library(GenomicRanges) +library(VariantAnnotation) +library(rtracklayer) +library(plyranges) + +# function to change chr names from "chr1" to "NC_000..." +change_chr_name <- function(peak_set){ + + # join the chromosomes with official names from NCBI FASTA + # --> necessary to extract peak sequences from FASTA file + change_chr <- data.frame("chrom" = as.vector(peak_set@seqnames)) %>% + left_join(official_chroms, by=c("chrom" = "UCSC.style.name")) + + # expand chromosome levels to the official names and later delete them after changing + seqlevels(peak_set) <- c(seqlevels(peak_set), unique(change_chr$RefSeq.seq.accession)) + peak_set <- peak_set %>% + mutate(seqnames = factor(change_chr$RefSeq.seq.accession, levels = seqlevels(peak_set))) + seqlevels(peak_set) <- unique(change_chr$RefSeq.seq.accession) + + return(peak_set) +} + +# risk or protective SNPs? or both? +effDir <- "risk" +#effDir <- "protect" +#effDir <- "riskAndProtect" + +# folder of ArchR project +basedir <- "/psycl/g/mpsagbinder/mgp/workspace/SingleNuc_PostmortemBrain/" +archr_folder <- "../ArchRSubset_SCZ" + +# 1a. Read TFs +tf <- read.csv(paste0("data/FIMO_out/07_DiffScores_DisruptedMotifs_AllCelltypes_", effDir, ".csv")) + +# 1b. Read ArchR project to save sample "SU" IDs +projPM <- loadArchRProject(path = paste0(archr_folder,"/")) +# family_ids <- data.frame(id=rep("*", nrow(projPM@sampleColData)), +# family_id = paste0("SU", projPM@sampleColData$SU.Number)) +family_ids <- data.frame("IID" = paste0("SU", projPM@sampleColData$SU.Number)) +# subset the original fam-file, as to the respective samples to keep the right format +famfile <- read.table(paste0(basedir, "genotype/genotype_data/raw_imputations/M01019/M01019_brainbank.fam")) %>% + filter(V2 %in% family_ids$IID) +write.table(famfile, paste0("data/SchizophreniaCohort_IDs.txt"), + row.names=FALSE, col.names=FALSE, quote=FALSE, sep="\t") + +# 1c. Read GWAS SNPs (all significant ones regardless of LD) +vcf <- readVcf(paste0(basedir, "genotype/GWAS_sumstats/SCZ2022/SCZ2022_sig_hg38_normChr.vcf")) +gwas_granges <- rowRanges(vcf) # GRanges object +effect_sizes <- geno(vcf)$ES + +# subset to protective or risk SNPs depending on "effDir" setting +if(effDir == "risk"){ + snps_risk <- row.names(effect_sizes)[which(effect_sizes > 0)] + gwas_granges <- gwas_granges[snps_risk,] +} else if(effDir == "protective"){ + snps_protect <- row.names(effect_sizes)[which(effect_sizes < 0)] + gwas_granges <- gwas_granges[snps_protect,] +} + +# Read official chromosome names as used in the reference genome FASTA file +official_chroms <- read.table( + "data/sequence_report.tsv", + sep="\t", + header=TRUE +) + +gwas_granges <- change_chr_name(gwas_granges) + + +# 1d. Read phenotype data for disease status +phenotype <- read.csv(paste0(basedir, "phenotype/phenotype_clean.csv")) %>% + mutate("ID" = paste0("SU", SU.Number)) + +# 2. Subset Genotype data with PLINK2 to samples with SCZ or controls +# and set MAF to 0.05 +system("/psycl/g/mpsngs/software1/plink2 --bfile /psycl/g/mpsagbinder/mgp/workspace/SingleNuc_PostmortemBrain/genotype/genotype_data/imputed/qc_bestguess/lmu_brainbank_become_bggt_all --maf 0.05 --keep /psycl/g/mpsagbinder/mgp/workspace/SingleNuc_PostmortemBrain/scripts/ATAC/03_TFbinding/data/SchizophreniaCohort_IDs.txt --make-bed --out /psycl/g/mpsagbinder/mgp/workspace/SingleNuc_PostmortemBrain/genotype/SchizophreniaCohort/cohort_subset_imputed") + +# 3. Get SNPs that are located in TF motifs with altered binding scores +# read mapping of rsIDs to Illumina GSA IDs +# use file from Illumina: https://support.illumina.com/content/dam/illumina-support/documents/downloads/productfiles/global-screening-array-24/v2-0/infinium-global-screening-array-24-v2-0-a1-b150-rsids.zip +illumina_ids <- read.table("data/GSA-24v2-0_A1_b150_rsids.txt", header=TRUE) +risk_allele_counts <- data.frame() +risk_allele_frequency <- data.frame() +for (ct in unique(tf$celltype)){ + tf_celltype <- tf %>% filter(celltype == ct) + dir.create(paste0(basedir, "genotype/SchizophreniaCohort/", ct)) + + for (m in tf_celltype$motif){ + fimo_snp <- readRDS(paste0("data/FIMO_out/",ct,".alt.celltypePeaks/", m, "/fimo_snp_", effDir, ".rds")) + + # subset GWAS SNPs to those that are in a disrupted motif + dis_snp <- subsetByOverlaps(gwas_granges, fimo_snp) + df_dis_snp <- data.frame("RSID" = names(dis_snp), + "risk_allele" = sapply(dis_snp$ALT, as.character)) + #df_dis_snp <- data.frame("RSID" = names(dis_snp)) + write.table(df_dis_snp, paste0("data/FIMO_out/",ct,".diff.celltypePeaks/", m, "_gwas_snps_", effDir, ".txt"), + col.names=FALSE, row.names=FALSE, quote=FALSE) + + # map to Illumina GSA IDs and save in another file + df_illumina <- df_dis_snp %>% + left_join(illumina_ids, by=c("RSID"="RsID")) %>% + mutate(SNP = ifelse(is.na(Name), RSID, Name)) %>% + dplyr::select(SNP, risk_allele) + write.table(df_illumina, paste0("data/FIMO_out/",ct,".diff.celltypePeaks/", m, "_gwas_snps_illumina_", effDir, ".txt"), + col.names=FALSE, row.names=FALSE, quote=FALSE) + + + # subset genotype dataset to these SNPs + system(paste0("/psycl/g/mpsngs/software1/plink2 --bfile ", basedir, "genotype/SchizophreniaCohort/cohort_subset_imputed --extract data/FIMO_out/",ct,".diff.celltypePeaks/", m, "_gwas_snps_illumina_", effDir, ".txt --export A --export-allele data/FIMO_out/",ct,".diff.celltypePeaks/", m, "_gwas_snps_illumina_", effDir, ".txt --out ", basedir, "genotype/SchizophreniaCohort/", ct, "/Subset_", m, "_", effDir)) + + # Load the cohort genotype data + genotypes <- fread(paste0(basedir, "genotype/SchizophreniaCohort/", ct, "/Subset_", m, "_", effDir, ".raw")) + + risk_allele_sums <- rowSums(genotypes[,7:ncol(genotypes)], na.rm=T) + + + # Prepare results data frame + results <- data.frame(IID = genotypes$IID, Risk_Allele_Matches = risk_allele_sums, celltype = ct, motif = m) + + # join with disease status + results <- results %>% + left_join(phenotype[, c("ID", "Status")], by = c("IID"="ID")) + + # boxplot across all celltypes and motifs + ggplot(results, aes(x=as.factor(Status), y=Risk_Allele_Matches)) + + geom_boxplot() + + geom_jitter() + + xlab("Disease Status") + + ylab("Matched allele with SCZ GWAS-SNP risk allele") + ggsave(paste0("plots/DiffBinding.celltypePeaks/08_MatchedRiskAlleles_", ct, "_", m , "_", effDir, ".pdf"), + height = 8, width = 8) + + risk_allele_counts <- rbind(risk_allele_counts, results) + + + # Transform genotypes into long format for another plot + genotypes_long <- genotypes %>% + pivot_longer(!FID:PHENOTYPE, names_to="SNP_ID", values_to="risk_allele_count") %>% + left_join(phenotype[, c("ID", "Status")], by = c("IID"="ID")) %>% + mutate(risk_allele_count = as.factor(risk_allele_count), + Status = as.factor(Status)) + + # count allele frequency per disease status + genotype_count <- genotypes_long %>% + group_by(risk_allele_count,Status) %>% + count() %>% + mutate(celltype = ct, + motif = m) + + # the difference between cases and controls is just 1 (35 controls vs 36 cases) + genotype_count %>% + group_by(Status) %>% + summarise(summedSNPs = sum(n)) + + # barplot + ggplot(genotype_count, aes(x=risk_allele_count, y=n, fill=Status)) + + geom_bar(stat="identity", position="dodge") + + xlab("Number of Risk Alleles") + + ylab("Frequency") + + ggtitle(paste0("Risk Allele Frequency across ", ncol(genotypes)-6, " GWAS SNPs")) + + scale_fill_manual(name = "Disease Status", + breaks = c(0, 1), + labels = c("Control", "Case"), + values = c("#AA3939", "#226666")) + + theme_light() + + theme( + strip.text.x = element_text(size = 15), + axis.title.x = element_text(size = 15), + axis.title.y = element_text(size = 15), + axis.text.x = element_text(size = 12), + axis.text.y = element_text(size = 12), + legend.text = element_text(size = 12), + legend.title = element_text(size = 12) + ) + ggsave(paste0("plots/DiffBinding.celltypePeaks/08_GenotypeFrequency-DiseaseStatus_", ct, "_", m ,"_", effDir,".pdf"), + height = 8, width = 8) + + risk_allele_frequency <- rbind(risk_allele_frequency, genotype_count) + } + +} + +# boxplot across all celltypes and motifs +ggplot(risk_allele_counts, aes(x=as.factor(Status), y=Risk_Allele_Matches)) + + geom_boxplot() + + geom_jitter() + + xlab("Disease Status") + + ylab("Matched allele with SCZ GWAS-SNP risk allele") +ggsave(paste0("plots/DiffBinding.celltypePeaks/08_MatchedRiskAlleles_AllCelltypes_AllDisruptedMotifs_", effDir, ".pdf"), + height = 8, width = 8) + +# barplot across all celltypes and motifs +risk_allele_frequency <- risk_allele_frequency %>% + group_by(risk_allele_count, Status) %>% + summarise(summedSNPs = sum(n)) + +ggplot(risk_allele_frequency, aes(x=risk_allele_count, y=summedSNPs, fill=Status)) + + geom_bar(stat="identity", position="dodge") + + xlab("Number of Risk Alleles") + + ylab("Frequency") + + #ggtitle(paste0("Risk Allele Frequency across ", ncol(genotypes)-6, " GWAS SNPs")) + + scale_fill_manual(name = "Disease Status", + breaks = c(0, 1), + labels = c("Control", "Case"), + values = c("#AA3939", "#226666")) + + theme_light() + + theme( + strip.text.x = element_text(size = 15), + axis.title.x = element_text(size = 15), + axis.title.y = element_text(size = 15), + axis.text.x = element_text(size = 12), + axis.text.y = element_text(size = 12), + legend.text = element_text(size = 12), + legend.title = element_text(size = 12) + ) +ggsave(paste0("plots/DiffBinding.celltypePeaks/08_GenotypeFrequency-DiseaseStatus_", effDir, ".pdf"), + height = 8, width = 8) + diff --git a/TF_binding/09_TargetGeneMapping_H-MAGMA_SCZ.R b/TF_binding/09_TargetGeneMapping_H-MAGMA_SCZ.R new file mode 100644 index 0000000..1430554 --- /dev/null +++ b/TF_binding/09_TargetGeneMapping_H-MAGMA_SCZ.R @@ -0,0 +1,165 @@ +# Map GWAS-SNPs in disrupted motifs to target genes using H-MAGMA (including Hi-C data from PsychENCODE) + +# run with conda environment sc-atac-tf + +library(ArchR) +library(ggplot2) +library(ggrepel) +#library(ggpubr) +library(stringr) +library(dplyr) +library(tidyr) +library(data.table) +library(GenomicRanges) +library(VariantAnnotation) +library(rtracklayer) +library(plyranges) +library(biomaRt) + +# function to change chr names from "chr1" to "NC_000..." +change_chr_name <- function(peak_set){ + + # join the chromosomes with official names from NCBI FASTA + # --> necessary to extract peak sequences from FASTA file + change_chr <- data.frame("chrom" = as.vector(peak_set@seqnames)) %>% + left_join(official_chroms, by=c("chrom" = "UCSC.style.name")) + + # expand chromosome levels to the official names and later delete them after changing + seqlevels(peak_set) <- c(seqlevels(peak_set), unique(change_chr$RefSeq.seq.accession)) + peak_set <- peak_set %>% + mutate(seqnames = factor(change_chr$RefSeq.seq.accession, levels = seqlevels(peak_set))) + seqlevels(peak_set) <- unique(change_chr$RefSeq.seq.accession) + + return(peak_set) +} + +# folder of ArchR project +basedir <- "/psycl/g/mpsagbinder/mgp/workspace/SingleNuc_PostmortemBrain/" +archr_folder <- "../ArchRSubset_SCZ" + +# risk or protective SNPs? or both? +#effDir <- "risk" +effDir <- "protect" +# effDir <- "riskAndProtect" + +mart <- useDataset("hsapiens_gene_ensembl", useMart("ensembl")) + +# 1a. Read TFs +tf <- read.csv(paste0("data/FIMO_out/07_DiffScores_DisruptedMotifs_AllCelltypes_", effDir, ".csv")) + +# 1b. Read target gene mapping from H-MAGMA +# tg_mapping <- read.delim(paste0(basedir, "scripts/RNA/08_DEanalysis/DESeq2_RELN/tables/external/H-MAGMA/", +# "Adult_brain.genes.annot"), +# sep="\t", fill=TRUE, header=FALSE) +data <- readLines(paste0(basedir, "scripts/RNA/08_DEanalysis/DESeq2_RELN/tables/external/H-MAGMA/", + "Adult_brain.genes.annot")) +read_different <- function(data) { + list_of_rowdata <- lapply(data, function(x) data.frame(matrix(unlist(strsplit(x, "\t")), nrow = 1))) + my_table <- data.frame(rbindlist(list_of_rowdata, fill = T)) + return(my_table) +} +tg_mapping <- read_different(data) +# give column names to the data frame +colnames(tg_mapping) <- c("GeneID", "GenomicCoordinates", paste0("SNP", 1:(ncol(tg_mapping)-2))) +# reshape the data from wide to long format +long_mapping <- tg_mapping %>% + gather(key="SNP_column", value = "SNP_ID", -GeneID, -GenomicCoordinates) %>% + filter(!is.na(SNP_ID)) %>% + dplyr::select(GeneID, SNP_ID) + +# 1c. Read official chromosome names as used in the reference genome FASTA file +official_chroms <- read.table( + "data/sequence_report.tsv", + sep="\t", + header=TRUE +) + + + +# 2. Get overlapping enhancer for each disrupted motif from PsychENCODE data +number_tg <- data.frame("celltype" = character(), + "motif" = character(), + "n_target" = integer(), + "n_noTarget" = integer()) +for (ct in unique(tf$celltype)){ + + # SNP-containing peaks + ct_snp_peaks <- import(paste0("data/03_OverlapSNPsAndPeaks/03_GWAS-SNPcontainingPeaks_", ct, "_", effDir, ".bed")) + + # read SNPs that overlap cell type peaks to subset SNP set already to those + ct_snps <- import(paste0("data/03_OverlapSNPsAndPeaks/03_GWAS-SNPsInPeaks_", ct, "_", effDir, ".bed")) + ct_snps_chrom <- change_chr_name(ct_snps) + + # filter to SNPs in peaks + filt_mapping <- long_mapping %>% + filter(SNP_ID %in% ct_snps$name) + # join with gene symbols + gene_names <- getBM(filters= "ensembl_gene_id", attributes= c("ensembl_gene_id","hgnc_symbol"), + values = filt_mapping$GeneID, mart= mart) + filt_mapping <- filt_mapping %>% + left_join(gene_names, by=c("GeneID"="ensembl_gene_id")) + + tf_celltype <- tf %>% filter(celltype == ct) + #dir.create(paste0(basedir, "genotype/SchizophreniaCohort/", ct)) + dir.create(paste0("data/09_TargetGenes/", ct)) + + for (m in tf_celltype$motif){ + + # diff binding peaks with SNP-containing motifs + diff_snp <- read.table(paste0("data/FIMO_out/", ct, ".diff.celltypePeaks/", m, "_fimo_scores_", effDir, ".tsv"), + header=TRUE) %>% + filter(abs(diff_score) > 3) + diff_snp_loc <- ct_snp_peaks[diff_snp$peakNum,] + # # locations of SNP-containing motifs + # fimo_snp <- readRDS(paste0("data/FIMO_out/",ct,".alt.celltypePeaks/", m, "/fimo_snp.rds")) + # fimo_snp <- fimo_snp[diff_snp$peakNum] + + # get snps that are located within these peaks + ct_snps_peaks <- subsetByOverlaps(ct_snps_chrom, diff_snp_loc) + # # get snps that are also located withing motifs within these peaks + # ct_snps_motifs <- subsetByOverlaps(ct_snps_peaks, fimo_snp) # --> all of them for MA1643.1 in OPC + + # set of genes mapped to these SNPs + mapped_genes <- filt_mapping %>% + filter(SNP_ID %in% ct_snps_peaks$name) + write.csv(mapped_genes, paste0("data/09_TargetGenes/", ct, "/09_TargetGenes_", ct, "_", m, "_", effDir, ".csv")) + + # take care of this + number_tg <- rbind(number_tg, + list("celltype" = ct, + "motif" = m, + "n_snpsWTarget" = length(unique(mapped_genes$SNP_ID)), + "n_snpsWOTarget" = length(ct_snps_peaks)-length(unique(mapped_genes$SNP_ID)), + "n_targetGenes" = length(unique(mapped_genes$GeneID)))) + } +} + +# add motif/celltype label to dataframe +number_tg <- number_tg %>% + mutate(label = paste0(motif, " (", celltype, ")")) %>% + mutate(perc_mapped = n_snpsWTarget/(n_snpsWTarget+n_snpsWOTarget)) +write.csv(number_tg, paste0("data/09_TargetGenes/", "09_MappedSnpsAndGenes_", effDir, ".csv")) +# transform to long dataframe +number_tg_long <- number_tg %>% + pivot_longer(c('n_snpsWTarget', 'n_targetGenes'), names_to="type", values_to="number") +ggplot(number_tg_long, aes(x=label, y=number, fill=type)) + + geom_bar(stat="identity", position="dodge") + + xlab("Motif") + + ylab("Number of mapped SNPs/genes") + + scale_fill_manual(name = "", + breaks = c("n_snpsWTarget", "n_targetGenes"), + labels = c("SNPs mapped to target", "TargetGenes"), + values = c("#023047", "#ffb703")) + + scale_x_discrete(guide = guide_axis(angle = 45)) + + theme_light() + + theme( + strip.text.x = element_text(size = 15), + axis.title.x = element_text(size = 15), + axis.title.y = element_text(size = 15), + axis.text.x = element_text(size = 12), + axis.text.y = element_text(size = 12), + legend.text = element_text(size = 12), + legend.title = element_text(size = 12) + ) +ggsave(paste0("plots/DiffBinding.celltypePeaks/09_NumberOfSNPsMappedAndTargetGenes_", effDir, ".pdf"), + height = 8, width = 10) \ No newline at end of file diff --git a/TF_binding/10_TargetGenes_DiffExp_Pseudobulk_SCZ.R b/TF_binding/10_TargetGenes_DiffExp_Pseudobulk_SCZ.R new file mode 100644 index 0000000..49ad18c --- /dev/null +++ b/TF_binding/10_TargetGenes_DiffExp_Pseudobulk_SCZ.R @@ -0,0 +1,335 @@ +# Test target genes for differential expression between risk allele carrier and non-carrier + +# run with conda environment sc-atac-tf + +library(ArchR) +library(ggplot2) +library(ggrepel) +library(stringr) +library(dplyr) +library(tidyr) +library(tibble) +library(data.table) +library(GenomicRanges) +library(VariantAnnotation) +library(rtracklayer) +library(plyranges) +library(biomaRt) +library(DESeq2) +library(edgeR) + + +# folder of ArchR project +basedir <- "/psycl/g/mpsagbinder/mgp/workspace/SingleNuc_PostmortemBrain/" +archr_folder <- "../ArchRSubset_SCZ" + +# risk or protective SNPs? or both? +#effDir <- "risk" +effDir <- "protect" +# effDir <- "riskAndProtect" + +des_init <- "~ Status + Sex + Age + Brain.pH + RIN + PMI" +des <- "~ Carrier + Sex + Age + Brain.pH + RIN + PMI" +des_null <- "~ Sex + Age + Brain.pH + RIN + PMI" + +### FUNCTIONS + +# read pseudobulk count matrix including all cell types and metadata +read_geneExpData <- function(){ + + # Read count and metadata + counts <- readRDS(paste0(basedir, "scripts/RNA/08_DEanalysis/DESeq2_RELN/data/counts.rds")) + metadata <- readRDS(paste0(basedir, "scripts/RNA/08_DEanalysis/DESeq2_RELN/data/metadata.rds")) + metadata$Mode.of.death <- as.factor(str_replace_all(metadata$Mode.of.death, " ", "")) + + # 1a. Split count data to list with entry per celltype + # Not every cluster is present in all samples; create a vector that represents how to split samples + splitf <- sapply(stringr::str_split(rownames(counts), + pattern = "__", + n = 2), + `[`, 1) + # Turn into a list and split the list into components for each cluster and transform, so rows are genes and columns are samples and make rownames as the sample IDs + counts_split <- split.data.frame(counts, + factor(splitf)) %>% + lapply(function(u) + set_colnames(t(u), + str_split(rownames(u), '__', simplify = TRUE)[,2])) + # Explore the different components of list + str(counts_split) + + return(list("counts" = counts_split, "metadata"=metadata)) +} + +# prepare DESeq object from count matrix and metadata for a specific celltype +prepare_DESeqObject <- function(geneExpData, ct, cohort_sub, cutoff_sample=0.75){ + + metadata <- geneExpData$metadata + counts_split <- geneExpData$counts + + # Scale and center numeric variables with very different range + metadata$cell_count_sample <- scale(metadata$cell_count_sample) + metadata$cell_count_pseudosample <- scale(metadata$cell_count_pseudosample) + # Subset the metadata to only the cluster of interest + cluster_metadata <- metadata[which(metadata$cluster_id == ct), ] + head(cluster_metadata, n = 20) + + # Assign the rownames of the metadata to be the sample IDs + rownames(cluster_metadata) <- cluster_metadata$sample_id + cluster_metadata_filt <- cluster_metadata %>% + filter(sample_id %in% cohort_sub$sample_id) + + # Subset the counts to only the specific celltype + counts <- counts_split[[ct]] + cluster_counts_filt <- data.frame(counts[, which(colnames(counts) %in% rownames(cluster_metadata_filt))]) + + dds <- DESeqDataSetFromMatrix(cluster_counts_filt, + colData = cluster_metadata_filt, + design = as.formula(des_init)) + + # Keep only genes expressed in more than specific percentage of samples + dds <- dds[rowSums(counts(dds) >= 10) >= cutoff_sample*ncol(dds), ] + + return(dds) +} + +# function to test for DE between risk allele carriers and non-carriers +test_DE <- function(target_df, genotype_df, deObject, ct, m, m_gene){ + # target_df <- target_exp + # genotypes_df <- genotypes + # deObject <- de_obj + + list_res <- list() + for(index in 1:nrow(target_df)){ + snp <- target_df$SNP_ID[index] + target_gene <- target_df$GeneID[index] + # define carrier and non-carrier group + carrier_status <- genotypes %>% + dplyr::select(IID, {{ snp }}) %>% + as.data.frame() + carrier_status <- carrier_status[!is.na(carrier_status[[snp]]),] + carrier_status$carrier <- as.factor(ifelse(carrier_status[[snp]] == 0, 0, 1)) + rownames(carrier_status) <- paste0(carrier_status$IID, "_PFC_RNA") + # extract carrier status for samples from DESeq object in correct order + col_carrier <- carrier_status[rownames(colData(deObject)), "carrier"] + colData(deObject)$Carrier <- col_carrier + # remove samples with NA in Carrier column + idx <- which(is.na(colData(deObject)$Carrier)) + if(length(idx) > 0){ + deObject_filt <- deObject[,-idx] + }else{ + deObject_filt <- deObject + } + + # adapt design to correct formula + design(deObject_filt) <- as.formula(des) + + if(length(levels(colData(deObject_filt)$Carrier)) > 1){ + # run Wald test + deObject_filt <- DESeq(deObject_filt) + # get results with lfc shrinkage + res <- results(deObject_filt, + contrast = c("Carrier", 1, 0), + alpha = 0.1) + # res <- lfcShrink(deObject_filt, + # contrast = c("Carrier", 1, 0), + # res=res, + # type = "normal") + # Turn the results object into a tibble for use with tidyverse functions + res_tbl <- res %>% + data.frame() %>% + rownames_to_column(var="gene") %>% + as_tibble() + + gene_res <- res_tbl %>% filter(gene == target_gene) + print(gene_res) + list_res[[snp]] <- gene_res + + # Save plotcounts to a data frame object + d <- plotCounts(deObject_filt, gene=target_gene, intgroup="Carrier", returnData=TRUE) + + # Plotting the MOV10 normalized counts, using the samplenames (rownames of d as labels) + plot_title <- paste0(m, " (", m_gene, ") disrupted by ", snp, " targeting ", target_gene, " (", target_df$hgnc_symbol[index], ")\n", + "log2FC: ", round(gene_res$log2FoldChange,2), ", P: ", round(gene_res$pvalue,2)) + ggplot(d, aes(x = Carrier, y = count, color = Carrier)) + + geom_boxplot() + + geom_point(position=position_jitter(w = 0.1,h = 0)) + + ggtitle(plot_title) + + theme_light() + + theme( + strip.text.x = element_text(size = 15), + axis.title.x = element_text(size = 15), + axis.title.y = element_text(size = 15), + axis.text.x = element_text(size = 12), + axis.text.y = element_text(size = 12), + legend.text = element_text(size = 12), + legend.title = element_text(size = 12) + ) + ggsave(paste0("plots/10_TargetGenes_DiffExp_Pseudobulk/", effDir, "/10_TargetGene_DiffExp_Pseudobulk_", ct, "_", m ,"_", snp, "_", target_gene,".pdf"), + height = 8, width = 8) + + # plot corrected gene expression data + plot_corrExpLevels(deObject_filt, snp, target_gene, plot_title) + + } else { + return(NULL) + } + } + + df_results <- bind_rows(list_res, .id="snp") + + return(df_results) +} + +plot_corrExpLevels <- function(deObject_filt, snp, target_gene, plot_title){ + + # correct for batch effects with limma + # Create DGEList object + d0 <- DGEList(counts(deObject_filt), samples = as.data.frame(colData(deObject_filt))) + dim(d0) + + # Normalization factor + d <- calcNormFactors(d0) + d + + # specify model to fitted + mm <- model.matrix(as.formula(des), data = d$samples) + # voom transformation + y <- voom(d, mm, plot = T) + + # remove for batch effects + y3 <- removeBatchEffect(y, batch = d$samples$Sex, # batch2 = d$samples$X6.Batch, + covariates = colData(deObject_filt)[,c("Age", "RIN", "Brain.pH", "PMI")]) + counts_tophit <- as.data.frame(y3[target_gene,]) + colnames(counts_tophit) <- c(target_gene) + + counts_tophit <- mutate(counts_tophit, sample = rownames(counts_tophit)) + # counts_tophit <- left_join(counts_tophit, cluster_metadata, by = c("sample" = "sample_id")) + counts_tophit <- left_join(counts_tophit, as.data.frame(colData(deObject_filt)), by = c("sample" = "sample_id")) + + # boxplot carrier + ggplot(counts_tophit, aes_string(x="Carrier", y=target_gene, fill = "Carrier")) + + geom_boxplot(outlier.shape = NA) + + geom_jitter() + + scale_fill_brewer(palette="Dark2") + + ylab("Expression level") + + ggtitle(plot_title) + + theme_light() + + theme( + axis.title.x = element_text(size = 15), + axis.title.y = element_text(size = 15), + axis.text.x = element_text(size = 12), + axis.text.y = element_text(size = 12), + legend.text = element_text(size = 12), + strip.text.x = element_text(size = 12) + ) + ggsave(filename = paste0("plots/10_TargetGenes_DiffExp_Pseudobulk/", effDir, "/10_TargetGene_DiffExp-BatchCorr_Pseudobulk_", ct, "_", m ,"_", snp, "_", target_gene,".pdf"), + width = 8, + height = 8) + +} + + +### ANALYSIS + +# 1a. Read TFs +tf <- read.csv(paste0("data/FIMO_out/07_DiffScores_DisruptedMotifs_AllCelltypes_", effDir, ".csv")) + +# 1b. Read phenotype data for disease status and "schizophrenia cohort" +phenotype <- read.csv(paste0(basedir, "phenotype/phenotype_clean.csv")) %>% + mutate("ID" = paste0("SU", SU.Number)) + +scz_cohort <- read.table(paste0("data/SchizophreniaCohort_IDs.txt")) %>% + mutate(sample_id = paste0(V2, "_PFC_RNA")) + +# 1c. Read gene expression data +geneExp <- read_geneExpData() + +# 2. Run DE analysis between carrier and non-carrier +illumina_ids <- read.table("data/GSA-24v2-0_A1_b150_rsids.txt", header=TRUE) +list_celltype <- list() +stats_tested <- data.frame() +for (ct in unique(tf$celltype)){ + + #ct <- "OPC" + print(ct) + tf_ct <- tf %>% filter(celltype == ct) + + # create DESeq object for specified cell type + de_obj <- prepare_DESeqObject(geneExp, ct, scz_cohort) + + list_motifs <- list() + for (m in tf_ct$motif){ + + #m <- "MA1643.1" + m_gene <- tf_ct$jaspar_id[tf_ct$motif == m] + + # Read target genes + target_genes <- read.csv(paste0("data/09_TargetGenes/", ct, "/09_TargetGenes_", ct, "_", m, "_", effDir, ".csv")) + + # Load the cohort genotype data + genotypes <- fread(paste0(basedir, "genotype/SchizophreniaCohort/", ct, "/Subset_", m, "_", effDir, ".raw")) + + # SNPs present in genotype data with respective target genes + colnames(genotypes)[7:ncol(genotypes)] <- str_remove(colnames(genotypes)[7:ncol(genotypes)], "_[ATGC]") + snps_present <- colnames(genotypes)[7:ncol(genotypes)] + snps_mapped <- illumina_ids %>% filter(Name %in% snps_present) + snps_rs <- union(snps_mapped$RsID, snps_present) + + # subset target gene list to those with respective genotype information + target_sub <- target_genes %>% + filter(SNP_ID %in% snps_rs) %>% + mutate(ExpFilt = ifelse(GeneID %in% rownames(counts(de_obj)), "present", "absent")) + target_exp <- target_sub %>% + filter(ExpFilt == "present") + + # run DE analysis between carriers and non-carriers + if(nrow(target_exp) > 0){ + res_df <- test_DE(target_exp, genotypes, de_obj, ct, m, m_gene) + + list_motifs[[m]] <- res_df + } + + stats_tested <- rbind(stats_tested, + list("celltype" = ct, + "motif" = m, + "n_disSNPs" = nrow(target_sub), + "n_disSNPs_present" = length(which(target_sub$ExpFilt == "present")), + "n_disSNPs_testDE" = ifelse(!is.null(nrow(list_motifs[[m]])), + nrow(list_motifs[[m]]), 0))) + + } + + df_motifs <- bind_rows(list_motifs, .id="motif") + list_celltype[[ct]] <- df_motifs +} +write.csv(stats_tested, paste0("data/10_TargetGenes_DiffExp_Pseudobulk/Stats_SnpsAndGenesTested_", effDir, ".csv")) + +df_celltype <- bind_rows(list_celltype, .id="celltype") %>% + left_join(tf[c("celltype", "motif", "jaspar_id")], by=c("celltype", "motif")) +df_celltype$FDR <- p.adjust(df_celltype$pvalue, method="fdr") +write.csv(df_celltype, paste0("data/10_TargetGenes_DiffExp_Pseudobulk/DiffExp_Pseudobulk_", effDir, ".csv")) + +# 3. Plot all results +df_celltype <- read.csv(paste0("data/10_TargetGenes_DiffExp_Pseudobulk/DiffExp_Pseudobulk_", effDir, ".csv")) +df_celltype$DE <- ifelse(df_celltype$FDR <= 0.1, TRUE, FALSE) # 0.05 or 0.1? + +ggplot(df_celltype, aes(x=celltype, y=log2FoldChange, col=DE, size=-log10(FDR))) + + geom_point() + + scale_color_manual(breaks = c(TRUE, FALSE), + labels = c("DE", "not DE"), + values = c("darkorange", "darkgrey")) + + scale_x_discrete(guide = guide_axis(angle = 45)) + + ylab("log2(Fold Change)") + + xlab("Cell type") + + theme_light() + + theme( + axis.title.x = element_text(size = 15), + axis.title.y = element_text(size = 15), + axis.text.x = element_text(size = 12), + axis.text.y = element_text(size = 12), + legend.text = element_text(size = 12), + strip.text.x = element_text(size = 12) + ) +ggsave(filename = paste0("plots/10_TargetGenes_DiffExp_Pseudobulk/", effDir, "/10_Dotplot_TargetGene_DiffExp-BatchCorr_Pseudobulk.pdf"), + width = 10, + height = 7)