add TF binding code

mpip · Feb 3, 2025 · ca70146 · ca70146
commit ca70146
Show file tree

Hide file tree

Showing 16 changed files with 2,652 additions and 0 deletions.
diff --git a/.DS_Store b/.DS_Store
diff --git a/README.md b/README.md
@@ -0,0 +1,5 @@
+# Schizophrenia risk variants modulate transcription factor binding and gene expression in cortical cell types
+
+Schizophrenia is a complex neuropsychiatric disorder with a strong genetic component. Genome-wide association studies (GWAS) have identified numerous risk variants, but their functional impact on gene regulation remains largely unknown. We investigate the disruption and enhancement of transcription factor (TF) binding motifs by schizophrenia-associated GWAS SNPs in 15 cortical cell types of the human brain. By integrating single-nucleus ATAC-seq and RNA-seq data from 71 donors (36 affected by schizophrenia) with GWAS summary statistics, we identify specific TF motifs whose binding affinities are altered by risk and protective SNPs. We find that risk SNPs  predominantly disrupt TF binding, while protective SNPs show a more balanced effect. Furthermore, we demonstrate that disrupted TF motifs can lead to altered expression of target genes, including NAGA in excitatory neurons and HLA-B in oligodendrocyte precursor cells. These genes have been previously implicated in schizophrenia and our study provides a mechanism for their dysregulation through altered TF binding. Our findings highlight the importance of considering cell type-specific effects and provide a genome-wide map of TF motif disruptions in schizophrenia, offering insights into the regulatory mechanisms underlying disease risk.
+
+This repository contains all scripts used for the analysis of TF binding disruption and enhancement through schizophrenia GWAS SNPs and the differential expression analysis of respective target genes between carriers and non-carriers of the respective alternative allele. 
diff --git a/TF_binding/01_CellTypeSpecificPeaks_SCZ.R b/TF_binding/01_CellTypeSpecificPeaks_SCZ.R
@@ -0,0 +1,171 @@
+## Identify marker features based on the peak matrix
+## only on SCZ cases and controls
+
+# run with conda environment sc-atac
+
+library(ArchR)
+library(dplyr)
+library(tibble)
+library(tidyr)
+library(RColorBrewer)
+library(BSgenome.Hsapiens.UCSC.hg38)
+
+# folder of ArchR project
+archr_folder <- "../ArchRSubset_SCZ"
+
+# 1. Read data
+
+projPM <- loadArchRProject(path = paste0(archr_folder,"/"))
+
+# 2. Check out available matrices, peak matrix was added to ArchR project in 14_1_markerFeatures.R, but marker features were not saved to object or file
+#projPM_peak <- addPeakMatrix(projPM)
+getAvailableMatrices(projPM)
+# [1] "GeneIntegrationMatrix" "GeneScoreMatrix"       "MotifMatrix"          
+# [4] "MotifMatrix_SCZ"       "PeakMatrix"            "TileMatrix" 
+peak_mat <- getMatrixFromProject(projPM, useMatrix="PeakMatrix")
+min(peak_mat@colData$ReadsInPeaks) # 482
+
+# 3. Add marker features for each cell type compared to all others
+
+markersPeaks <- getMarkerFeatures(
+    ArchRProj = projPM, 
+    useMatrix = "PeakMatrix", 
+    groupBy = "celltypes_refined",
+    bias = c("TSSEnrichment", "log10(nFrags)"),
+    testMethod = "wilcoxon"
+) 
+
+saveRDS(markersPeaks, "data/markersPeaks.rds")
+markersPeaks <- readRDS("data/markersPeaks.rds")
+
+markerList <- getMarkers(markersPeaks, 
+                cutOff = "FDR <= 0.05 & Log2FC >= 1")
+markerList
+markerList_GRanges <- getMarkers(markersPeaks,
+                 cutOff = "FDR <= 0.05 & Log2FC >= 1", 
+                 returnGR = TRUE)
+markerList_GRanges
+
+# 3a. Plot number of marker peaks per cell type
+n_markers <- sapply(markerList, nrow)
+df_n_markers <- data.frame(celltype=names(n_markers), 
+                        value=n_markers, row.names=NULL)
+write.csv(df_n_markers, "data/01_Number-MarkerPeaks.csv")
+
+# Barplot with number of marker peaks per celltype
+ggplot(df_n_markers, aes(x=celltype, y=value)) +
+    geom_bar(stat="identity", fill="steelblue")+
+    geom_text(aes(label=value), vjust=1.6, color="white", size=3.5)+
+    xlab("") +
+    ylab("Number of marker peaks") +
+    scale_x_discrete(guide = guide_axis(angle = 45)) +
+    theme_minimal() +
+    theme(
+            strip.text.x = element_text(size = 15),
+            axis.title.x = element_text(size = 15),
+            axis.title.y = element_text(size = 15),
+            axis.text.x = element_text(size = 12),
+            axis.text.y = element_text(size = 12),
+            legend.text = element_text(size = 12),
+            legend.title = element_text(size = 12)
+        )
+ggsave(paste0(projPM@projectMetadata$outputDirectory, "/Plots/TFbinding/01_Barplot_MarkerPeaks.pdf"),
+    height = 8,
+    width = 10)
+
+# 4. Add motif annotations to each peak
+
+projPM <- addMotifAnnotations(ArchRProj = projPM,
+            motifSet = "JASPAR2020", 
+            #collection = "CORE", 
+            annoName = "Motif")
+            #force = TRUE) # force was necessary after subsetting to SCZ samples
+
+
+# 5. Motif Enrichment in Marker Peaks
+
+enrichMotifs <- peakAnnoEnrichment(
+    seMarker = markersPeaks,
+    ArchRProj = projPM,
+    peakAnnotation = "Motif",
+    cutOff = "FDR <= 0.05 & Log2FC >= 1"
+)
+enrichMotifs
+head(enrichMotifs@assays@data$mlog10p)
+
+saveRDS(enrichMotifs, "data/enrichMotifs.rds")
+enrichMotifs <- readRDS("data/enrichMotifs.rds")
+
+# 5.1 Plot heatmap of enriched motifs per cell type
+
+heatmapEM <- plotEnrichHeatmap(enrichMotifs, 
+                                n = 7, # limit of motifs shown per cell type
+                                transpose = TRUE)
+ComplexHeatmap::draw(heatmapEM, 
+                    heatmap_legend_side = "bot", 
+                    annotation_legend_side = "bot")
+plotPDF(heatmapEM, 
+        name = "TFbinding/01_Motifs-Enriched-Marker-Heatmap", 
+        width = 8, 
+        height = 6, 
+        ArchRProj = projPM, 
+        addDOC = FALSE)
+
+# 5.2 TF binding enrichments with P < 1*10^-10
+
+# extract motif-celltype pairs with respective P value
+tfenr <- as.data.frame(enrichMotifs@assays@data$mlog10p) %>%
+        rownames_to_column(var = "motif") %>%
+        pivot_longer(!motif, names_to="celltype", values_to="mlog10p") %>%
+        filter(mlog10p >= 10)
+write.csv(tfenr, "data/01_MotifEnrichments_threshold.csv")
+
+# count enriched motifs per celltype
+tf_enr_count <- tfenr %>%
+    group_by(celltype) %>%
+    count(celltype)
+write.csv(tf_enr_count, "data/01_MotifEnrichments_count.csv")
+
+# # A tibble: 15 × 2
+# # Groups:   celltype [15]
+#    celltype            n
+#    <chr>           <int>
+#  1 Astro_FB          127
+#  2 Astro_PP          128
+#  3 Endothelial        86
+#  4 Exc_L2-3          111
+#  5 Exc_L3-5          143
+#  6 Exc_L4-6_1        127
+#  7 Exc_L4-6_2         98
+#  8 In_LAMP5           57
+#  9 In_PVALB_Ba        78
+# 10 In_PVALB_Ch       150
+# 11 In_SST             79
+# 12 In_VIP            122
+# 13 Microglia         153
+# 14 OPC               113
+# 15 Oligodendrocyte   111
+
+# Barplot with number of motif enrichments with P < 1*10^-10 per celltype
+ggplot(tf_enr_count, aes(x=celltype, y=n)) +
+    geom_bar(stat="identity", fill="steelblue")+
+    geom_text(aes(label=n), vjust=1.6, color="white", size=3.5)+
+    xlab("") +
+    ylab("Number of enriched motifs") +
+    scale_x_discrete(guide = guide_axis(angle = 45)) +
+    theme_minimal() +
+    theme(
+            strip.text.x = element_text(size = 15),
+            axis.title.x = element_text(size = 15),
+            axis.title.y = element_text(size = 15),
+            axis.text.x = element_text(size = 12),
+            axis.text.y = element_text(size = 12),
+            legend.text = element_text(size = 12),
+            legend.title = element_text(size = 12)
+        )
+ggsave(paste0(projPM@projectMetadata$outputDirectory, "/Plots/TFbinding/01_Barplot_Enriched_Motifs.pdf"),
+    height = 8,
+    width = 10)
+
+# 6. Save ArchR project with motif annotations
+saveArchRProject(ArchRProj = projPM)