Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
LabelTransfer_SingleNuclei/05_labeltransfer_atac_cicero_geneactivity.R
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
340 lines (289 sloc)
14.1 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
################################################## | |
## Project: scATACseq label transfer | |
## Date: 22.07.2020 | |
## Author: Nathalie | |
################################################## | |
setwd("~/Documents/PostmortemBrain/analysis/markerGeneDefinition") | |
#setwd("/net/PE1/raid1/LAURA/SC_ANALYSIS/markerGeneDefinition") | |
library(Seurat) | |
#library(Signac) | |
library(dplyr) | |
library(reshape2) | |
library(RColorBrewer) | |
library(ggplot2) | |
library(org.Hs.eg.db) | |
library(Matrix) | |
library(readr) | |
### DATA LAURA ################################### | |
dataset <- "dataLaura" | |
#Read ATAC data ----------------------- | |
#genome regions | |
mat<- readMM(paste0(dataset,"/raw_atac/matrix.mtx")) | |
peaks<- read_tsv(paste0(dataset,"/raw_atac/peaks.bed"), col_names = F) | |
peaks<- peaks %>% | |
mutate(id1 = paste(X2, X3, sep = "-")) %>% | |
mutate(id = paste(X1, id1, sep = ":")) | |
barcodes<- read_tsv(paste0(dataset,"/raw_atac/barcodes.tsv"), col_names =F) | |
rownames(mat)<- peaks$id1 | |
colnames(mat)<- barcodes$X1 | |
data_atac <- CreateSeuratObject(counts = mat, | |
assay = "ATAC", | |
project = dataset, | |
min.features = 500) | |
data_atac <- subset(data_atac, subset = nCount_ATAC > 500 & nCount_ATAC < 10000) | |
rm(mat) | |
rm(peaks) | |
rm(barcodes) | |
#Read binarized object and add meta data to raw data | |
data_binarized <- readRDS(paste0(dataset, "/scATAC_PMaggr_clusterd_annotated.rds")) | |
data_atac <- AddMetaData(object = data_atac, | |
metadata = data_binarized@meta.data) | |
rm(data_binarized) | |
#Read Cicero gene activity matrix | |
gene_atac <- readRDS(paste0(dataset,"/PM_geneActivities_dgMat.rds")) | |
#Get gene symbols for ENSEMBL IDs | |
symbols <- unname(mapIds(org.Hs.eg.db, keys = dimnames(gene_atac)[[1]], keytype = "ENSEMBL", column="SYMBOL")) | |
dimnames(gene_atac)[[1]] <- symbols | |
gene_atac <- as(gene_atac, "dgTMatrix") | |
#Remove duplicated gene symbols from matrix | |
OP2 <- function(x) { | |
nms <- rownames(x) | |
uniquenms <- unique(nms) | |
# build the sparseMatrix again: x's with same index values are automatically | |
# added together, keeping in mind that indexes stored from 0 but built from 1 | |
sparseMatrix(i = match(nms, uniquenms)[x@i + 1], | |
j = x@j + 1, | |
x = x@x, | |
dimnames = list(uniquenms, colnames(x)), | |
giveCsparse = FALSE) | |
} | |
gene_atac <- OP2(gene_atac) | |
gene_atac <- gene_atac[-which(is.na(rownames(gene_atac))),] | |
#Use only cells that are present in both (gene activity matrix and feature matrix) | |
cells_intersect <- intersect(colnames(data_atac), colnames(gene_atac)) | |
data_atac <- subset(data_atac, cells = cells_intersect) | |
data_atac[["ACTIVITY"]] <- CreateAssayObject(counts = gene_atac[, cells_intersect]) | |
rm(gene_atac) | |
rm(cells_intersect) | |
rm(symbols) | |
data_atac$tech <- "atac" | |
#Preprocessing of ATAC data ------------------- | |
DefaultAssay(data_atac) <- "ACTIVITY" | |
data_atac <- FindVariableFeatures(data_atac) | |
#Normalization already done with Cicero by Michael | |
#data_atac <- NormalizeData(data_atac) | |
#data_atac <- ScaleData(data_atac) | |
DefaultAssay(data_atac) <- "ATAC" | |
VariableFeatures(data_atac) <- names(which(Matrix::rowSums(data_atac) > 100)) | |
data_atac <- RunLSI(data_atac, n = 50, scale.max = NULL) | |
#Sequencing depth correlation, remove 1st component of downstream analysis if strong correlation | |
DepthCor(data_atac) # --> 1st component can remain | |
DepthCor(data_atac, n = 50) | |
#Non-linear dimension reduction and clustering | |
data_atac <- RunUMAP(data_atac, reduction = "lsi", dims = 1:50) | |
data_atac <- FindNeighbors(data_atac, reduction = "lsi", dims = 1:50) | |
data_atac$seurat_clusters_michael <- data_atac$seurat_clusters #copy clusters, otherwise overwritten | |
data_atac <- FindClusters(data_atac, resolution = 0.1) | |
DimPlot(object = data_atac, group = "seurat_clusters", label = TRUE) + NoLegend() | |
#Read RNA data with cell type labels | |
data_rna <- readRDS(paste0(dataset,"/data_object_sct_integrated.rds")) | |
data_rna$tech <- "rna" | |
#Plot ATAC and RNA data next to each other (with Michael's labels) | |
p1 <- DimPlot(data_atac, reduction = "umap", group.by = "jointId", label = TRUE, repel = TRUE) + NoLegend() + ggtitle("scATAC-seq") | |
p2 <- DimPlot(data_rna, group.by = "sub_cluster", label = TRUE, repel = TRUE) + NoLegend() + ggtitle("scRNA-seq") | |
p1 + p2 | |
#Label transfer ------------------ | |
#Remove genes from RNA data that are not in ATAC data (for transfer) | |
intersect_features <- intersect(dimnames(data_rna@assays$RNA)[[1]], | |
dimnames(data_atac@assays$ACTIVITY)[[1]]) | |
data_rna <- subset(x = data_rna, features = intersect_features) | |
#Find Transfer Anchors and Transfer labels | |
transfer.anchors <- FindTransferAnchors(reference = data_rna, query = data_atac, | |
features = VariableFeatures(object = data_rna), | |
reference.assay = "RNA", query.assay = "ACTIVITY", | |
reduction = "cca") | |
save.image(paste0(dataset,"/preprocessed_atac_data_precalculated_geneactivity.RData")) | |
load(paste0(dataset,"/preprocessed_atac_data_precalculated_geneactivity.RData")) | |
celltype.predictions <- TransferData(anchorset = transfer.anchors, refdata = data_rna$sub_cluster, | |
weight.reduction = data_atac[["lsi"]]) | |
data_atac <- AddMetaData(data_atac, metadata = celltype.predictions) | |
#Prediction scores (quality of prediction) | |
hist(data_atac$prediction.score.max) | |
abline(v = 0.5, col = "red") | |
#How many cell have prediction score > 0.5 --> about 50% | |
table(data_atac$prediction.score.max > 0.5) | |
#Plot UMAPs with transferred cell type labels (ATAC and RNA) | |
data_atac$predicted.id <- factor(data_atac$predicted.id, levels = levels(as.factor(data_rna$sub_cluster))) # to make the colors match | |
p1 <- DimPlot(data_atac, group.by = "predicted.id", label = TRUE, repel = TRUE) + ggtitle("scATAC-seq cells") + | |
NoLegend() + scale_colour_hue(drop = FALSE) | |
p2 <- DimPlot(data_rna, group.by = "sub_cluster", label = TRUE, repel = TRUE) + ggtitle("scRNA-seq cells") + | |
NoLegend() | |
png(paste0(dataset,"/atac_UMAP_labeltransfer.png"), width = 900, height = 600) | |
p1 + p2 | |
dev.off() | |
#Plot UMAPs with transferred labels and seurat clusters | |
p1 <- DimPlot(data_atac, group.by = "predicted.id", label = TRUE, repel = TRUE) + ggtitle("Transferred Labels") + | |
NoLegend() + scale_colour_hue(drop = FALSE) | |
p2 <- DimPlot(data_atac, group.by = "seurat_clusters", label = TRUE, repel = TRUE) + ggtitle("Seurat Clusters") + | |
NoLegend() | |
png(paste0(dataset,"/atac_UMAP_labeltransfer_seuratClusters.png"), width = 900, height = 600) | |
p1 + p2 | |
dev.off() | |
### Assign cell type clusters to labels ------------ | |
x <- as.numeric(data_atac$seurat_clusters) | |
y <- as.character(data_atac$predicted.id) | |
overlap <- table(x, y) | |
# Plot relative values from each louvain cluster | |
rel_ol <- function(m) m / rep(colSums(m), each = nrow(m)) | |
overlap_perc <- rel_ol(t(overlap)) | |
overlap_perc_long <- melt(overlap_perc) | |
overlap_perc_long$x <- as.factor(overlap_perc_long$x) | |
ggplot(overlap_perc_long, aes(x,y))+ | |
geom_tile(aes(fill=value))+ | |
geom_text(aes(label=round(x = value, digits = 2)))+ | |
scale_fill_gradient(low="white", | |
high="darkred")+ | |
theme(panel.grid.major.x=element_blank(), #no gridlines | |
panel.grid.minor.x=element_blank(), | |
panel.grid.major.y=element_blank(), | |
panel.grid.minor.y=element_blank(), | |
panel.background=element_rect(fill="white"), # background=white | |
axis.text.x = element_text(angle=90, hjust = 1,vjust=1,size = 12,face = "bold"), | |
plot.title = element_text(size=15,face="bold"), | |
axis.text.y = element_text(size = 12,face = "bold")) + | |
ggtitle("Overlap Louvain Cluster and Transferred Labels")+ | |
theme(legend.title=element_text(face="bold", size=14)) + | |
scale_x_discrete(name="Louvain cluster")+ | |
scale_y_discrete(name="Transferred Label")+ | |
labs(fill="Overlap") | |
ggsave(paste0(dataset,"/atac_overlap_louvain_transferlabel.png")) | |
# Assign labels to louvain clusters according to highest overlap | |
ass_cluster <- apply(overlap_perc, 2, function(t) | |
rownames(overlap_perc)[which.max(t)]) | |
ass_vec <- ass_cluster[as.character(as.numeric(data_atac$seurat_clusters))] | |
data_atac <- AddMetaData( | |
object = data_atac, | |
metadata = ass_vec, | |
col.name = "seurat_transfer_label" | |
) | |
p1 <- DimPlot(data_atac, group.by = "seurat_transfer_label", label = TRUE, repel = TRUE) + ggtitle("scATACseq") + | |
NoLegend() + scale_colour_hue(drop = FALSE) | |
p2 <- DimPlot(data_rna, group.by = "sub_cluster", label = TRUE, repel = TRUE) + ggtitle("scRNAseq") + | |
NoLegend() | |
png(paste0(dataset,"/atac_UMAP_labeltransfer_clusters.png"), width = 900, height = 600) | |
p1 + p2 | |
dev.off() | |
#Subset cells from seurat cluster 0 and refine assignment | |
data_0 <- subset(data_atac, subset = seurat_clusters == 0) | |
data_0 <- FindNeighbors(data_0, reduction = "lsi", dims = 1:50) | |
data_0 <- FindClusters(data_0, resolution = 0.1) | |
#Overlap | |
x <- as.numeric(data_0$seurat_clusters) | |
y <- as.character(data_0$predicted.id) | |
overlap <- table(x, y) | |
overlap_perc <- rel_ol(t(overlap)) | |
overlap_perc_long <- melt(overlap_perc) | |
overlap_perc_long$x <- as.factor(overlap_perc_long$x) | |
ggplot(overlap_perc_long, aes(x,y))+ | |
geom_tile(aes(fill=value))+ | |
geom_text(aes(label=round(x = value, digits = 2)))+ | |
scale_fill_gradient(low="white", | |
high="darkred")+ | |
theme(panel.grid.major.x=element_blank(), #no gridlines | |
panel.grid.minor.x=element_blank(), | |
panel.grid.major.y=element_blank(), | |
panel.grid.minor.y=element_blank(), | |
panel.background=element_rect(fill="white"), # background=white | |
axis.text.x = element_text(angle=90, hjust = 1,vjust=1,size = 12,face = "bold"), | |
plot.title = element_text(size=15,face="bold"), | |
axis.text.y = element_text(size = 12,face = "bold")) + | |
ggtitle("Overlap Louvain Subcluster (Cluster 0) and Transferred Labels")+ | |
theme(legend.title=element_text(face="bold", size=14)) + | |
scale_x_discrete(name="Louvain cluster")+ | |
scale_y_discrete(name="Transferred Label")+ | |
labs(fill="Overlap") | |
ggsave(paste0(dataset,"/atac_overlap_louvain_cluster0.png"), width = 12, height = 18) | |
#Assign labels to louvain clusters according to highest overlap | |
ass_cluster <- apply(overlap_perc, 2, function(t) | |
rownames(overlap_perc)[which.max(t)]) | |
ass_vec <- ass_cluster[as.character(as.numeric(data_0$seurat_clusters))] | |
data_0 <- AddMetaData( | |
object = data_0, | |
metadata = ass_vec, | |
col.name = "seurat_transfer_cluster0" | |
) | |
#Merge cluster assignments of Cluster 0 to data object | |
data_atac$seurat_transfer_label[Cells(data_0)] <- data_0$seurat_transfer_cluster0 | |
#Set Cluster 1 to "Unknown" since these cells are most likely not excitatory neurons | |
data_atac$seurat_transfer_label[data_atac$seurat_clusters == 1] <- "Unknown" | |
#Plot UMAP with refined assignment | |
p1 <- DimPlot(data_atac, group.by = "seurat_transfer_label", label = TRUE, repel = TRUE) + ggtitle("scATACseq") + | |
NoLegend() + scale_colour_hue(drop = FALSE) | |
p2 <- DimPlot(data_rna, group.by = "sub_cluster", label = TRUE, repel = TRUE) + ggtitle("scRNAseq") + | |
NoLegend() | |
png(paste0(dataset,"/atac_UMAP_labeltransfer_clustersRefined.png"), width = 900, height = 600) | |
p1 + p2 | |
dev.off() | |
### Overlap between transferred labels and Michael's annotations ------------ | |
x <- as.character(data_atac$jointId) | |
y <- as.character(data_atac$predicted.id) | |
overlap <- table(x, y) | |
# Plot relative values from each louvain cluster | |
rel_ol <- function(m) m / rep(colSums(m), each = nrow(m)) | |
overlap_perc <- rel_ol(t(overlap)) | |
overlap_perc_long <- melt(overlap_perc) | |
overlap_perc_long$x <- as.factor(overlap_perc_long$x) | |
ggplot(overlap_perc_long, aes(x,y))+ | |
geom_tile(aes(fill=value))+ | |
geom_text(aes(label=round(x = value, digits = 2)))+ | |
scale_fill_gradient(low="white", | |
high="darkred")+ | |
theme(panel.grid.major.x=element_blank(), #no gridlines | |
panel.grid.minor.x=element_blank(), | |
panel.grid.major.y=element_blank(), | |
panel.grid.minor.y=element_blank(), | |
panel.background=element_rect(fill="white"), # background=white | |
axis.text.x = element_text(angle=90, hjust = 1,vjust=1,size = 12,face = "bold"), | |
plot.title = element_text(size=15,face="bold"), | |
axis.text.y = element_text(size = 12,face = "bold")) + | |
ggtitle("Overlap Michael's annotations and Transferred Labels")+ | |
theme(legend.title=element_text(face="bold", size=14)) + | |
scale_x_discrete(name="Michael's annotations")+ | |
scale_y_discrete(name="Transferred Label")+ | |
labs(fill="Overlap") | |
ggsave(paste0(dataset,"/atac_overlap_michaelsannotations.png")) | |
p1 <- DimPlot(data_atac, group.by = "jointId", label = TRUE, repel = TRUE) + ggtitle("Michael's annotations") + | |
NoLegend() + scale_colour_hue(drop = FALSE) | |
p2 <- DimPlot(data_atac, group.by = "seurat_transfer_label", label = TRUE, repel = TRUE) + ggtitle("Transferred Labels") + | |
NoLegend() | |
png(paste0(dataset,"/atac_UMAP_labeltransfer_comparisonMichaelsAnnotations.png"), width = 900, height = 600) | |
p1 + p2 | |
dev.off() | |
#Save Seurat object | |
saveRDS(data_atac, file = paste0(dataset,"/scATAC_labeltransfer.rds")) | |
# # COEMBEDDING | |
# # restrict the imputation to variable genes from scRNA-seq | |
# genes.use <- VariableFeatures(data_rna) | |
# refdata <- GetAssayData(data_rna, assay = "RNA", slot = "data")[genes.use, ] | |
# | |
# # refdata (input) contains a scRNA-seq expression matrix for the scRNA-seq cells. imputation | |
# # (output) will contain an imputed scRNA-seq matrix for each of the ATAC cells | |
# imputation <- TransferData(anchorset = transfer.anchors, refdata = refdata, weight.reduction = data_atac[["lsi"]]) | |
# | |
# # add the imputed data matrix to the atac object | |
# data_atac[["ATAC"]] <- imputation | |
# coembed <- merge(x = data_rna, y = data_atac) | |
# | |
# # run PCA and UMAP on combined object, to visualize the co-embedding of both datasets | |
# coembed <- ScaleData(coembed, features = genes.use, do.scale = FALSE) | |
# coembed <- RunPCA(coembed, features = genes.use, verbose = FALSE) | |
# coembed <- RunUMAP(coembed, dims = 1:30) | |
# coembed$celltype <- ifelse(!is.na(coembed$sub_cluster), coembed$sub_cluster, coembed$predicted.id) | |
# | |
# # plot co-embedding | |
# p1 <- DimPlot(coembed, group.by = "tech") | |
# p2 <- DimPlot(coembed, group.by = "celltype", label = TRUE, repel = TRUE) | |
# png(paste0(dataset,"/atac_UMAP_coembedding.png"), width = 900, height = 600) | |
# p1 + p2 | |
# dev.off() | |
# |