Skip to content
Permalink
main
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
##################################################
## Project: scATACseq label transfer, own gene activity matrix
## Date: 22.06.2020
## Author: Nathalie
##################################################
setwd("~/Documents/PostmortemBrain/analysis/markerGeneDefinition")
#setwd("/net/PE1/raid1/LAURA/SC_ANALYSIS/markerGeneDefinition")
library(Seurat)
library(Signac)
library(dplyr)
library(reshape2)
library(RColorBrewer)
library(ggplot2)
library(org.Hs.eg.db)
library(Matrix)
library(readr)
memory.size(max=TRUE)
### DATA LAURA ###################################
dataset <- "dataLaura"
# ATAC data
# genome regions
mat<- readMM(paste0(dataset,"/raw_atac/matrix.mtx"))
peaks<- read_tsv(paste0(dataset,"/raw_atac/peaks.bed"), col_names = F)
peaks<- peaks %>%
mutate(id1 = paste(X2, X3, sep = "-")) %>%
mutate(id = paste(X1, id1, sep = ":"))
barcodes<- read_tsv(paste0(dataset,"/raw_atac/barcodes.tsv"), col_names =F)
rownames(mat)<- peaks$id
colnames(mat)<- barcodes$X1
data_atac <- CreateSeuratObject(counts = mat,
assay = "ATAC",
project = dataset,
min.features = 500)
data_atac <- subset(data_atac, subset = nCount_ATAC > 1000 & nCount_ATAC < 10000)
rm(mat)
rm(peaks)
rm(barcodes)
data_binarized <- readRDS(paste0(dataset, "/scATAC_PMaggr_clusterd_annotated.rds"))
intersect_cells <- intersect(colnames(data_atac), colnames(data_binarized))
data_atac <- subset(data_atac, cells = intersect_cells)
data_atac <- AddMetaData(object = data_atac,
metadata = data_binarized@meta.data)
rm(data_binarized)
# plot distribution of counts
VlnPlot(data_atac, features = c("nFeature_ATAC", "nCount_ATAC"), ncol = 2,
group.by = "orig.ident",
pt.size = 0)
# create gene activity matrix
mat <- data_atac@assays$ATAC@counts
activity.matrix <- CreateGeneActivityMatrix(peak.matrix = mat,
annotation.file = "integration_rna_atac/Homo_sapiens.GRCh37.87.gtf",
seq.levels = c(1:22, "X", "Y"),
upstream = 2000,
verbose = TRUE,
keep.sparse = TRUE)
# add gene activity matrix to Seurat object in own assay
data_atac[["ACTIVITY"]] <- CreateAssayObject(counts = activity.matrix)
data_atac$tech <- "atac"
# 16784 genes, 17274 cells
# preprocess gene activity data
DefaultAssay(data_atac) <- "ACTIVITY"
data_atac <- FindVariableFeatures(data_atac)
data_atac <- NormalizeData(data_atac)
data_atac <- ScaleData(data_atac)
# preprocess peak data
DefaultAssay(data_atac) <- "ATAC"
VariableFeatures(data_atac) <- names(which(Matrix::rowSums(data_atac) > 100))
data_atac <- RunLSI(data_atac, n = 50, scale.max = NULL)
data_atac <- RunUMAP(data_atac, reduction = "lsi", dims = 1:50)
# RNA data
data_rna <- readRDS(paste0(dataset,"/data_object_sct_integrated.rds"))
data_rna$tech <- "rna"
#DefaultAssay(data_rna) <- "RNA"
# 25789 genes, 24501 cells
# plot ATAC and RNA data in UMAP
p1 <- DimPlot(data_atac, reduction = "umap") + NoLegend() + ggtitle("scATAC-seq")
p2 <- DimPlot(data_rna, group.by = "sub_cluster", label = TRUE, repel = TRUE) + NoLegend() + ggtitle("scRNA-seq")
p1 + p2
save.image(paste0(dataset,"/preprocessed_atac_data.RData"))
# remove genes from RNA data that are not in ATAC data (for transfer)
intersect_features <- intersect(dimnames(data_rna@assays$RNA)[[1]],
dimnames(data_atac@assays$ACTIVITY)[[1]])
data_rna <- subset(x = data_rna, features = intersect_features)
# identify transfer anchors and predict cell types in ATAC data
# transfer.anchors <- FindTransferAnchors(reference = data_rna, query = data_atac,
# reference.assay = "integrated", query.assay = "ACTIVITY")
transfer.anchors <- FindTransferAnchors(reference = data_rna, query = data_atac,
reference.assay = 'RNA', query.assay = 'ACTIVITY',
reduction = 'cca')
celltype.predictions <- TransferData(anchorset = transfer.anchors, refdata = data_rna$sub_cluster,
weight.reduction = data_atac[["lsi"]])
data_atac <- AddMetaData(data_atac, metadata = celltype.predictions)
# prediction scores (quality of prediction)
hist(data_atac$prediction.score.max)
abline(v = 0.5, col = "red")
# --> relatively low quality
# how many cell have prediction score > 0.5 --> less than 10%
table(data_atac$prediction.score.max > 0.5)
# plot UMAPs with transferred cell type labels
data_atac$predicted.id <- factor(data_atac$predicted.id, levels = levels(as.factor(data_rna$sub_cluster))) # to make the colors match
p1 <- DimPlot(data_atac, group.by = "predicted.id", label = TRUE, repel = TRUE) + ggtitle("scATAC-seq cells") +
NoLegend() + scale_colour_hue(drop = FALSE)
p2 <- DimPlot(data_rna, group.by = "sub_cluster", label = TRUE, repel = TRUE) + ggtitle("scRNA-seq cells") +
NoLegend()
png(paste0(dataset,"/atac_UMAP_labeltransfer.png"), width = 900, height = 600)
p1 + p2
dev.off()
# COEMBEDDING
# restrict the imputation to variable genes from scRNA-seq
genes.use <- VariableFeatures(data_rna)
refdata <- GetAssayData(data_rna, assay = "RNA", slot = "data")[genes.use, ]
# refdata (input) contains a scRNA-seq expression matrix for the scRNA-seq cells. imputation
# (output) will contain an imputed scRNA-seq matrix for each of the ATAC cells
imputation <- TransferData(anchorset = transfer.anchors, refdata = refdata, weight.reduction = data_atac[["lsi"]])
# add the imputed data matrix to the atac object
data_atac[["RNA"]] <- imputation
coembed <- merge(x = data_rna, y = data_atac)
# run PCA and UMAP on combined object, to visualize the co-embedding of both datasets
coembed <- ScaleData(coembed, features = genes.use, do.scale = FALSE)
coembed <- RunPCA(coembed, features = genes.use, verbose = FALSE)
coembed <- RunUMAP(coembed, dims = 1:30)
coembed$celltype <- ifelse(!is.na(coembed$sub_cluster), coembed$sub_cluster, coembed$predicted.id)
# plot co-embedding
p1 <- DimPlot(coembed, group.by = "tech")
p2 <- DimPlot(coembed, group.by = "celltype", label = TRUE, repel = TRUE)
png(paste0(dataset,"/atac_UMAP_coembedding.png"), width = 900, height = 600)
p1 + p2
dev.off()
# aggregate counts to pseudo bulk and correlate
rna_pseudobulk <- apply(GetAssayData(object = data_rna,
assay = "RNA",
slot = "data"),
MARGIN = 1,
FUN = sum)
rna_pseudobulk <- rna_pseudobulk[intersect_features]
atac_pseudobulk <- apply(GetAssayData(object = data_atac,
assay = "ACTIVITY",
slot = "data"),
MARGIN = 1,
FUN = sum)
atac_pseudobulk <- atac_pseudobulk[intersect_features]
png(paste0(dataset,"/atac_rna_correlation.png"), width = 900, height = 600)
plot(log(rna_pseudobulk),
log(atac_pseudobulk),
xlab="Collapsed counts RNA",
ylab="Collapsed counts ATAC",
main="Correlation of scRNAseq and scATACseq")
dev.off()
cor(rna_pseudobulk, atac_pseudobulk)