Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
LabelTransfer_SingleNuclei/05d_labeltransfer_atac_nonbinarized.R
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
179 lines (149 sloc)
7.18 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
################################################## | |
## Project: scATACseq label transfer, own gene activity matrix | |
## Date: 22.06.2020 | |
## Author: Nathalie | |
################################################## | |
setwd("~/Documents/PostmortemBrain/analysis/markerGeneDefinition") | |
#setwd("/net/PE1/raid1/LAURA/SC_ANALYSIS/markerGeneDefinition") | |
library(Seurat) | |
library(Signac) | |
library(dplyr) | |
library(reshape2) | |
library(RColorBrewer) | |
library(ggplot2) | |
library(org.Hs.eg.db) | |
library(Matrix) | |
library(readr) | |
memory.size(max=TRUE) | |
### DATA LAURA ################################### | |
dataset <- "dataLaura" | |
# ATAC data | |
# genome regions | |
mat<- readMM(paste0(dataset,"/raw_atac/matrix.mtx")) | |
peaks<- read_tsv(paste0(dataset,"/raw_atac/peaks.bed"), col_names = F) | |
peaks<- peaks %>% | |
mutate(id1 = paste(X2, X3, sep = "-")) %>% | |
mutate(id = paste(X1, id1, sep = ":")) | |
barcodes<- read_tsv(paste0(dataset,"/raw_atac/barcodes.tsv"), col_names =F) | |
rownames(mat)<- peaks$id | |
colnames(mat)<- barcodes$X1 | |
data_atac <- CreateSeuratObject(counts = mat, | |
assay = "ATAC", | |
project = dataset, | |
min.features = 500) | |
data_atac <- subset(data_atac, subset = nCount_ATAC > 1000 & nCount_ATAC < 10000) | |
rm(mat) | |
rm(peaks) | |
rm(barcodes) | |
data_binarized <- readRDS(paste0(dataset, "/scATAC_PMaggr_clusterd_annotated.rds")) | |
intersect_cells <- intersect(colnames(data_atac), colnames(data_binarized)) | |
data_atac <- subset(data_atac, cells = intersect_cells) | |
data_atac <- AddMetaData(object = data_atac, | |
metadata = data_binarized@meta.data) | |
rm(data_binarized) | |
# plot distribution of counts | |
VlnPlot(data_atac, features = c("nFeature_ATAC", "nCount_ATAC"), ncol = 2, | |
group.by = "orig.ident", | |
pt.size = 0) | |
# create gene activity matrix | |
mat <- data_atac@assays$ATAC@counts | |
activity.matrix <- CreateGeneActivityMatrix(peak.matrix = mat, | |
annotation.file = "integration_rna_atac/Homo_sapiens.GRCh37.87.gtf", | |
seq.levels = c(1:22, "X", "Y"), | |
upstream = 2000, | |
verbose = TRUE, | |
keep.sparse = TRUE) | |
# add gene activity matrix to Seurat object in own assay | |
data_atac[["ACTIVITY"]] <- CreateAssayObject(counts = activity.matrix) | |
data_atac$tech <- "atac" | |
# 16784 genes, 17274 cells | |
# preprocess gene activity data | |
DefaultAssay(data_atac) <- "ACTIVITY" | |
data_atac <- FindVariableFeatures(data_atac) | |
data_atac <- NormalizeData(data_atac) | |
data_atac <- ScaleData(data_atac) | |
# preprocess peak data | |
DefaultAssay(data_atac) <- "ATAC" | |
VariableFeatures(data_atac) <- names(which(Matrix::rowSums(data_atac) > 100)) | |
data_atac <- RunLSI(data_atac, n = 50, scale.max = NULL) | |
data_atac <- RunUMAP(data_atac, reduction = "lsi", dims = 1:50) | |
# RNA data | |
data_rna <- readRDS(paste0(dataset,"/data_object_sct_integrated.rds")) | |
data_rna$tech <- "rna" | |
#DefaultAssay(data_rna) <- "RNA" | |
# 25789 genes, 24501 cells | |
# plot ATAC and RNA data in UMAP | |
p1 <- DimPlot(data_atac, reduction = "umap") + NoLegend() + ggtitle("scATAC-seq") | |
p2 <- DimPlot(data_rna, group.by = "sub_cluster", label = TRUE, repel = TRUE) + NoLegend() + ggtitle("scRNA-seq") | |
p1 + p2 | |
save.image(paste0(dataset,"/preprocessed_atac_data.RData")) | |
# remove genes from RNA data that are not in ATAC data (for transfer) | |
intersect_features <- intersect(dimnames(data_rna@assays$RNA)[[1]], | |
dimnames(data_atac@assays$ACTIVITY)[[1]]) | |
data_rna <- subset(x = data_rna, features = intersect_features) | |
# identify transfer anchors and predict cell types in ATAC data | |
# transfer.anchors <- FindTransferAnchors(reference = data_rna, query = data_atac, | |
# reference.assay = "integrated", query.assay = "ACTIVITY") | |
transfer.anchors <- FindTransferAnchors(reference = data_rna, query = data_atac, | |
reference.assay = 'RNA', query.assay = 'ACTIVITY', | |
reduction = 'cca') | |
celltype.predictions <- TransferData(anchorset = transfer.anchors, refdata = data_rna$sub_cluster, | |
weight.reduction = data_atac[["lsi"]]) | |
data_atac <- AddMetaData(data_atac, metadata = celltype.predictions) | |
# prediction scores (quality of prediction) | |
hist(data_atac$prediction.score.max) | |
abline(v = 0.5, col = "red") | |
# --> relatively low quality | |
# how many cell have prediction score > 0.5 --> less than 10% | |
table(data_atac$prediction.score.max > 0.5) | |
# plot UMAPs with transferred cell type labels | |
data_atac$predicted.id <- factor(data_atac$predicted.id, levels = levels(as.factor(data_rna$sub_cluster))) # to make the colors match | |
p1 <- DimPlot(data_atac, group.by = "predicted.id", label = TRUE, repel = TRUE) + ggtitle("scATAC-seq cells") + | |
NoLegend() + scale_colour_hue(drop = FALSE) | |
p2 <- DimPlot(data_rna, group.by = "sub_cluster", label = TRUE, repel = TRUE) + ggtitle("scRNA-seq cells") + | |
NoLegend() | |
png(paste0(dataset,"/atac_UMAP_labeltransfer.png"), width = 900, height = 600) | |
p1 + p2 | |
dev.off() | |
# COEMBEDDING | |
# restrict the imputation to variable genes from scRNA-seq | |
genes.use <- VariableFeatures(data_rna) | |
refdata <- GetAssayData(data_rna, assay = "RNA", slot = "data")[genes.use, ] | |
# refdata (input) contains a scRNA-seq expression matrix for the scRNA-seq cells. imputation | |
# (output) will contain an imputed scRNA-seq matrix for each of the ATAC cells | |
imputation <- TransferData(anchorset = transfer.anchors, refdata = refdata, weight.reduction = data_atac[["lsi"]]) | |
# add the imputed data matrix to the atac object | |
data_atac[["RNA"]] <- imputation | |
coembed <- merge(x = data_rna, y = data_atac) | |
# run PCA and UMAP on combined object, to visualize the co-embedding of both datasets | |
coembed <- ScaleData(coembed, features = genes.use, do.scale = FALSE) | |
coembed <- RunPCA(coembed, features = genes.use, verbose = FALSE) | |
coembed <- RunUMAP(coembed, dims = 1:30) | |
coembed$celltype <- ifelse(!is.na(coembed$sub_cluster), coembed$sub_cluster, coembed$predicted.id) | |
# plot co-embedding | |
p1 <- DimPlot(coembed, group.by = "tech") | |
p2 <- DimPlot(coembed, group.by = "celltype", label = TRUE, repel = TRUE) | |
png(paste0(dataset,"/atac_UMAP_coembedding.png"), width = 900, height = 600) | |
p1 + p2 | |
dev.off() | |
# aggregate counts to pseudo bulk and correlate | |
rna_pseudobulk <- apply(GetAssayData(object = data_rna, | |
assay = "RNA", | |
slot = "data"), | |
MARGIN = 1, | |
FUN = sum) | |
rna_pseudobulk <- rna_pseudobulk[intersect_features] | |
atac_pseudobulk <- apply(GetAssayData(object = data_atac, | |
assay = "ACTIVITY", | |
slot = "data"), | |
MARGIN = 1, | |
FUN = sum) | |
atac_pseudobulk <- atac_pseudobulk[intersect_features] | |
png(paste0(dataset,"/atac_rna_correlation.png"), width = 900, height = 600) | |
plot(log(rna_pseudobulk), | |
log(atac_pseudobulk), | |
xlab="Collapsed counts RNA", | |
ylab="Collapsed counts ATAC", | |
main="Correlation of scRNAseq and scATACseq") | |
dev.off() | |
cor(rna_pseudobulk, atac_pseudobulk) |