Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
LabelTransfer_SingleNuclei/05b_labeltransfer_atac_precalculated_geneactivity_normalization.R
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
169 lines (142 sloc)
7.05 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
################################################## | |
## Project: scATACseq label transfer | |
## Date: 22.07.2020 | |
## Author: Nathalie | |
################################################## | |
## WORSE THAN IF GENE ACTIVITY MATRIX IS NOT NORMALIZED AGAIN | |
setwd("~/Documents/PostmortemBrain/analysis/markerGeneDefinition") | |
#setwd("/net/PE1/raid1/LAURA/SC_ANALYSIS/markerGeneDefinition") | |
library(Seurat) | |
library(Signac) | |
library(dplyr) | |
library(reshape2) | |
library(RColorBrewer) | |
library(ggplot2) | |
library(org.Hs.eg.db) | |
library(Matrix) | |
library(readr) | |
### DATA LAURA ################################### | |
dataset <- "dataLaura" | |
# ATAC data | |
# genome regions | |
mat<- readMM(paste0(dataset,"/raw_atac/matrix.mtx")) | |
peaks<- read_tsv(paste0(dataset,"/raw_atac/peaks.bed"), col_names = F) | |
peaks<- peaks %>% | |
mutate(id1 = paste(X2, X3, sep = "-")) %>% | |
mutate(id = paste(X1, id1, sep = ":")) | |
barcodes<- read_tsv(paste0(dataset,"/raw_atac/barcodes.tsv"), col_names =F) | |
rownames(mat)<- peaks$id1 | |
colnames(mat)<- barcodes$X1 | |
data_atac <- CreateSeuratObject(counts = mat, | |
assay = "ATAC", | |
project = dataset, | |
min.features = 500) | |
data_atac <- subset(data_atac, subset = nCount_ATAC > 500 & nCount_ATAC < 10000) | |
rm(mat) | |
rm(peaks) | |
rm(barcodes) | |
data_binarized <- readRDS(paste0(dataset, "/scATAC_PMaggr_clusterd_annotated.rds")) | |
data_atac <- AddMetaData(object = data_atac, | |
metadata = data_binarized@meta.data) | |
rm(data_binarized) | |
# gene activity matrix | |
gene_atac <- readRDS(paste0(dataset,"/PM_geneActivities_dgMat.rds")) | |
#mat <- data_atac@assays$RNA@counts | |
#activity.matrix <- CreateGeneActivityMatrix(peak.matrix = mat, annotation.file = "integration_rna_atac/Homo_sapiens.GRCh37.87.gtf", | |
# seq.levels = c(1:22, "X", "Y"), upstream = 2000, verbose = TRUE) | |
# get gene symbols for ENSEMBL IDs | |
symbols <- unname(mapIds(org.Hs.eg.db, keys = dimnames(gene_atac)[[1]], keytype = "ENSEMBL", column="SYMBOL")) | |
dimnames(gene_atac)[[1]] <- symbols | |
gene_atac <- as(gene_atac, "dgTMatrix") | |
# remove duplicated gene symbols from matrix | |
OP2 <- function(x) { | |
nms <- rownames(x) | |
uniquenms <- unique(nms) | |
# build the sparseMatrix again: x's with same index values are automatically | |
# added together, keeping in mind that indexes stored from 0 but built from 1 | |
sparseMatrix(i = match(nms, uniquenms)[x@i + 1], | |
j = x@j + 1, | |
x = x@x, | |
dimnames = list(uniquenms, colnames(x)), | |
giveCsparse = FALSE) | |
} | |
gene_atac <- OP2(gene_atac) | |
gene_atac <- gene_atac[-which(is.na(rownames(gene_atac))),] | |
#Use only cells that are present in both (gene activity matrix and feature matrix) | |
cells_intersect <- intersect(colnames(data_atac), colnames(gene_atac)) | |
data_atac <- subset(data_atac, cells = cells_intersect) | |
data_atac[["ACTIVITY"]] <- CreateAssayObject(counts = gene_atac[, cells_intersect]) | |
rm(gene_atac) | |
rm(cells_intersect) | |
rm(symbols) | |
data_atac$tech <- "atac" | |
DefaultAssay(data_atac) <- "ACTIVITY" | |
data_atac <- FindVariableFeatures(data_atac) | |
#Normalization already done with Cicero by Michael? | |
data_atac <- NormalizeData(data_atac) | |
data_atac <- ScaleData(data_atac) | |
DefaultAssay(data_atac) <- "ATAC" | |
VariableFeatures(data_atac) <- names(which(Matrix::rowSums(data_atac) > 100)) | |
data_atac <- RunLSI(data_atac, n = 50, scale.max = NULL) | |
data_atac <- RunUMAP(data_atac, reduction = "lsi", dims = 1:50) | |
#Read RNA data | |
data_rna <- readRDS(paste0(dataset,"/data_object_sct_integrated.rds")) | |
data_rna$tech <- "rna" | |
# symbols <- unname(mapIds(org.Hs.eg.db, | |
# keys = dimnames(data_rna@data)[[1]], | |
# keytype = "SYMBOL", | |
# column="ENSEMBL")) | |
p1 <- DimPlot(data_atac, reduction = "umap") + NoLegend() + ggtitle("scATAC-seq") | |
p2 <- DimPlot(data_rna, group.by = "sub_cluster", label = TRUE, repel = TRUE) + NoLegend() + ggtitle("scRNA-seq") | |
p1 + p2 | |
p1 <- DimPlot(data_atac, reduction = "umap", group.by = "jointId", label = TRUE, repel = TRUE) + NoLegend() + ggtitle("scATAC-seq") | |
p2 <- DimPlot(data_rna, group.by = "sub_cluster", label = TRUE, repel = TRUE) + NoLegend() + ggtitle("scRNA-seq") | |
p1 + p2 | |
#Label transfer | |
# remove genes from RNA data that are not in ATAC data (for transfer) | |
intersect_features <- intersect(dimnames(data_rna@assays$RNA)[[1]], | |
dimnames(data_atac@assays$ACTIVITY)[[1]]) | |
data_rna <- subset(x = data_rna, features = intersect_features) | |
transfer.anchors <- FindTransferAnchors(reference = data_rna, query = data_atac, | |
features = VariableFeatures(object = data_rna), | |
reference.assay = "RNA", query.assay = "ACTIVITY", | |
reduction = "cca") | |
save.image(paste0(dataset,"/preprocessed_atac_data_precalculated_geneactivity_normalization.RData")) | |
celltype.predictions <- TransferData(anchorset = transfer.anchors, refdata = data_rna$sub_cluster, | |
weight.reduction = data_atac[["lsi"]]) | |
data_atac <- AddMetaData(data_atac, metadata = celltype.predictions) | |
# prediction scores (quality of prediction) | |
hist(data_atac$prediction.score.max) | |
abline(v = 0.5, col = "red") | |
# how many cell have prediction score > 0.5 --> about 50% | |
table(data_atac$prediction.score.max > 0.5) | |
# plot UMAPs with transferred cell type labels | |
data_atac$predicted.id <- factor(data_atac$predicted.id, levels = levels(as.factor(data_rna$sub_cluster))) # to make the colors match | |
p1 <- DimPlot(data_atac, group.by = "predicted.id", label = TRUE, repel = TRUE) + ggtitle("scATAC-seq cells") + | |
NoLegend() + scale_colour_hue(drop = FALSE) | |
p2 <- DimPlot(data_rna, group.by = "sub_cluster", label = TRUE, repel = TRUE) + ggtitle("scRNA-seq cells") + | |
NoLegend() | |
png(paste0(dataset,"/atac_UMAP_labeltransfer_precalculated_geneactivity.png"), width = 900, height = 600) | |
p1 + p2 | |
dev.off() | |
# COEMBEDDING | |
# restrict the imputation to variable genes from scRNA-seq | |
genes.use <- VariableFeatures(data_rna) | |
refdata <- GetAssayData(data_rna, assay = "RNA", slot = "data")[genes.use, ] | |
# refdata (input) contains a scRNA-seq expression matrix for the scRNA-seq cells. imputation | |
# (output) will contain an imputed scRNA-seq matrix for each of the ATAC cells | |
imputation <- TransferData(anchorset = transfer.anchors, refdata = refdata, weight.reduction = data_atac[["lsi"]]) | |
# add the imputed data matrix to the atac object | |
data_atac[["ATAC"]] <- imputation | |
coembed <- merge(x = data_rna, y = data_atac) | |
# run PCA and UMAP on combined object, to visualize the co-embedding of both datasets | |
coembed <- ScaleData(coembed, features = genes.use, do.scale = FALSE) | |
coembed <- RunPCA(coembed, features = genes.use, verbose = FALSE) | |
coembed <- RunUMAP(coembed, dims = 1:30) | |
coembed$celltype <- ifelse(!is.na(coembed$sub_cluster), coembed$sub_cluster, coembed$predicted.id) | |
# plot co-embedding | |
p1 <- DimPlot(coembed, group.by = "tech") | |
p2 <- DimPlot(coembed, group.by = "celltype", label = TRUE, repel = TRUE) | |
png(paste0(dataset,"/atac_UMAP_coembedding.png"), width = 900, height = 600) | |
p1 + p2 | |
dev.off() | |