Skip to content
Permalink
4a40c91ccf
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
169 lines (142 sloc) 7.05 KB
##################################################
## Project: scATACseq label transfer
## Date: 22.07.2020
## Author: Nathalie
##################################################
## WORSE THAN IF GENE ACTIVITY MATRIX IS NOT NORMALIZED AGAIN
setwd("~/Documents/PostmortemBrain/analysis/markerGeneDefinition")
#setwd("/net/PE1/raid1/LAURA/SC_ANALYSIS/markerGeneDefinition")
library(Seurat)
library(Signac)
library(dplyr)
library(reshape2)
library(RColorBrewer)
library(ggplot2)
library(org.Hs.eg.db)
library(Matrix)
library(readr)
### DATA LAURA ###################################
dataset <- "dataLaura"
# ATAC data
# genome regions
mat<- readMM(paste0(dataset,"/raw_atac/matrix.mtx"))
peaks<- read_tsv(paste0(dataset,"/raw_atac/peaks.bed"), col_names = F)
peaks<- peaks %>%
mutate(id1 = paste(X2, X3, sep = "-")) %>%
mutate(id = paste(X1, id1, sep = ":"))
barcodes<- read_tsv(paste0(dataset,"/raw_atac/barcodes.tsv"), col_names =F)
rownames(mat)<- peaks$id1
colnames(mat)<- barcodes$X1
data_atac <- CreateSeuratObject(counts = mat,
assay = "ATAC",
project = dataset,
min.features = 500)
data_atac <- subset(data_atac, subset = nCount_ATAC > 500 & nCount_ATAC < 10000)
rm(mat)
rm(peaks)
rm(barcodes)
data_binarized <- readRDS(paste0(dataset, "/scATAC_PMaggr_clusterd_annotated.rds"))
data_atac <- AddMetaData(object = data_atac,
metadata = data_binarized@meta.data)
rm(data_binarized)
# gene activity matrix
gene_atac <- readRDS(paste0(dataset,"/PM_geneActivities_dgMat.rds"))
#mat <- data_atac@assays$RNA@counts
#activity.matrix <- CreateGeneActivityMatrix(peak.matrix = mat, annotation.file = "integration_rna_atac/Homo_sapiens.GRCh37.87.gtf",
# seq.levels = c(1:22, "X", "Y"), upstream = 2000, verbose = TRUE)
# get gene symbols for ENSEMBL IDs
symbols <- unname(mapIds(org.Hs.eg.db, keys = dimnames(gene_atac)[[1]], keytype = "ENSEMBL", column="SYMBOL"))
dimnames(gene_atac)[[1]] <- symbols
gene_atac <- as(gene_atac, "dgTMatrix")
# remove duplicated gene symbols from matrix
OP2 <- function(x) {
nms <- rownames(x)
uniquenms <- unique(nms)
# build the sparseMatrix again: x's with same index values are automatically
# added together, keeping in mind that indexes stored from 0 but built from 1
sparseMatrix(i = match(nms, uniquenms)[x@i + 1],
j = x@j + 1,
x = x@x,
dimnames = list(uniquenms, colnames(x)),
giveCsparse = FALSE)
}
gene_atac <- OP2(gene_atac)
gene_atac <- gene_atac[-which(is.na(rownames(gene_atac))),]
#Use only cells that are present in both (gene activity matrix and feature matrix)
cells_intersect <- intersect(colnames(data_atac), colnames(gene_atac))
data_atac <- subset(data_atac, cells = cells_intersect)
data_atac[["ACTIVITY"]] <- CreateAssayObject(counts = gene_atac[, cells_intersect])
rm(gene_atac)
rm(cells_intersect)
rm(symbols)
data_atac$tech <- "atac"
DefaultAssay(data_atac) <- "ACTIVITY"
data_atac <- FindVariableFeatures(data_atac)
#Normalization already done with Cicero by Michael?
data_atac <- NormalizeData(data_atac)
data_atac <- ScaleData(data_atac)
DefaultAssay(data_atac) <- "ATAC"
VariableFeatures(data_atac) <- names(which(Matrix::rowSums(data_atac) > 100))
data_atac <- RunLSI(data_atac, n = 50, scale.max = NULL)
data_atac <- RunUMAP(data_atac, reduction = "lsi", dims = 1:50)
#Read RNA data
data_rna <- readRDS(paste0(dataset,"/data_object_sct_integrated.rds"))
data_rna$tech <- "rna"
# symbols <- unname(mapIds(org.Hs.eg.db,
# keys = dimnames(data_rna@data)[[1]],
# keytype = "SYMBOL",
# column="ENSEMBL"))
p1 <- DimPlot(data_atac, reduction = "umap") + NoLegend() + ggtitle("scATAC-seq")
p2 <- DimPlot(data_rna, group.by = "sub_cluster", label = TRUE, repel = TRUE) + NoLegend() + ggtitle("scRNA-seq")
p1 + p2
p1 <- DimPlot(data_atac, reduction = "umap", group.by = "jointId", label = TRUE, repel = TRUE) + NoLegend() + ggtitle("scATAC-seq")
p2 <- DimPlot(data_rna, group.by = "sub_cluster", label = TRUE, repel = TRUE) + NoLegend() + ggtitle("scRNA-seq")
p1 + p2
#Label transfer
# remove genes from RNA data that are not in ATAC data (for transfer)
intersect_features <- intersect(dimnames(data_rna@assays$RNA)[[1]],
dimnames(data_atac@assays$ACTIVITY)[[1]])
data_rna <- subset(x = data_rna, features = intersect_features)
transfer.anchors <- FindTransferAnchors(reference = data_rna, query = data_atac,
features = VariableFeatures(object = data_rna),
reference.assay = "RNA", query.assay = "ACTIVITY",
reduction = "cca")
save.image(paste0(dataset,"/preprocessed_atac_data_precalculated_geneactivity_normalization.RData"))
celltype.predictions <- TransferData(anchorset = transfer.anchors, refdata = data_rna$sub_cluster,
weight.reduction = data_atac[["lsi"]])
data_atac <- AddMetaData(data_atac, metadata = celltype.predictions)
# prediction scores (quality of prediction)
hist(data_atac$prediction.score.max)
abline(v = 0.5, col = "red")
# how many cell have prediction score > 0.5 --> about 50%
table(data_atac$prediction.score.max > 0.5)
# plot UMAPs with transferred cell type labels
data_atac$predicted.id <- factor(data_atac$predicted.id, levels = levels(as.factor(data_rna$sub_cluster))) # to make the colors match
p1 <- DimPlot(data_atac, group.by = "predicted.id", label = TRUE, repel = TRUE) + ggtitle("scATAC-seq cells") +
NoLegend() + scale_colour_hue(drop = FALSE)
p2 <- DimPlot(data_rna, group.by = "sub_cluster", label = TRUE, repel = TRUE) + ggtitle("scRNA-seq cells") +
NoLegend()
png(paste0(dataset,"/atac_UMAP_labeltransfer_precalculated_geneactivity.png"), width = 900, height = 600)
p1 + p2
dev.off()
# COEMBEDDING
# restrict the imputation to variable genes from scRNA-seq
genes.use <- VariableFeatures(data_rna)
refdata <- GetAssayData(data_rna, assay = "RNA", slot = "data")[genes.use, ]
# refdata (input) contains a scRNA-seq expression matrix for the scRNA-seq cells. imputation
# (output) will contain an imputed scRNA-seq matrix for each of the ATAC cells
imputation <- TransferData(anchorset = transfer.anchors, refdata = refdata, weight.reduction = data_atac[["lsi"]])
# add the imputed data matrix to the atac object
data_atac[["ATAC"]] <- imputation
coembed <- merge(x = data_rna, y = data_atac)
# run PCA and UMAP on combined object, to visualize the co-embedding of both datasets
coembed <- ScaleData(coembed, features = genes.use, do.scale = FALSE)
coembed <- RunPCA(coembed, features = genes.use, verbose = FALSE)
coembed <- RunUMAP(coembed, dims = 1:30)
coembed$celltype <- ifelse(!is.na(coembed$sub_cluster), coembed$sub_cluster, coembed$predicted.id)
# plot co-embedding
p1 <- DimPlot(coembed, group.by = "tech")
p2 <- DimPlot(coembed, group.by = "celltype", label = TRUE, repel = TRUE)
png(paste0(dataset,"/atac_UMAP_coembedding.png"), width = 900, height = 600)
p1 + p2
dev.off()