Skip to content
This repository has been archived by the owner. It is now read-only.

Commit

Permalink
Added container logic for kallisto
Browse files Browse the repository at this point in the history
jenzopr committed Apr 27, 2018
1 parent dea6443 commit 1f380e1
Showing 4 changed files with 125 additions and 13 deletions.
3 changes: 3 additions & 0 deletions environment.yml
Original file line number Diff line number Diff line change
@@ -10,8 +10,11 @@ dependencies:
- kallisto
#- htstream
#- umis
- r-magrittr
- r-data.table
- r-readr
- r-dplyr
- r-matrix
- bioconductor-tximport
- bioconductor-singlecellexperiment
- bioconductor-biomart
43 changes: 30 additions & 13 deletions src/container.snake
Original file line number Diff line number Diff line change
@@ -7,16 +7,33 @@
Provides rules for creation of a SingleCellExperiment RData file
'''

rule create_SingleCellExpression_container:
input:
tx2gene = rules.tx2gene_from_fasta.output,
metadata = find_metadata_files,
quant_files = expand('{o}/{s}/quant.sf', o = config['dirs']['quant'], s = SAMPLE_NAMES)
output:
rdata = join(config['dirs']['R'], 'scData.Rdata')
params:
samples = SAMPLE_NAMES,
colData = samplesheet.ix[SAMPLE_NAMES].to_dict(orient = 'list')
message: 'Create SingleCellExperiment container for experiment.'
script:
'createSCE.R'
if config['action']['quantification'] == 'salmon':
rule create_SingleCellExpression_container_salmon:
input:
tx2gene = rules.tx2gene_from_fasta.output,
metadata = find_metadata_files,
quant_files = expand('{o}/{s}/quant.sf', o = config['dirs']['quant'], s = SAMPLE_NAMES)
output:
rdata = join(config['dirs']['R'], 'scData.Rdata')
params:
samples = SAMPLE_NAMES,
colData = samplesheet.ix[SAMPLE_NAMES].to_dict(orient = 'list')
message: 'Create SingleCellExperiment container for experiment from salmon output.'
script:
'createSCE_salmon.R'
elif config['action']['quantification'] == 'kallisto':
rule create_SingleCellExpression_container_kallisto:
input:
abundance = config['dirs']['quant'] + '/matrix.tsv',
ec = config['dirs']['quant'] + '/matrix.ec',
cells = config['dirs']['quant'] + '/matrix.cells',
tx2gene = join(config['dirs']['ref'], 'tx2gene', basename(REFERENCE_FASTA).rstrip(".fa")),
metadata = find_metadata_files
output:
rdata = join(config['dirs']['R'], 'scData.Rdata')
params:
samples = SAMPLE_NAMES,
colData = samplesheet.ix[SAMPLE_NAMES].to_dict(orient = 'list')
message: 'Create SingleCellExperiment container for experiment from kallisto output.'
script:
'createSCE_kallisto.R'
92 changes: 92 additions & 0 deletions src/createSCE_kallisto.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
#
# Load libraries
#
library(magrittr)
library(data.table)
library(Matrix)
library(biomaRt)
library(SingleCellExperiment)

#
# Read tx2gene file
#
tx2gene <- read.table(snakemake@input$tx2gene, sep="\t", col.names = c("transcript", "gene"))

#
# Read quant file from Kallisto and construct expression matrix
#
sparse_data <- read.table(snakemake@input$abundance, header = F, sep = "\t")
umi_counts <- Matrix::sparseMatrix(i = sparse_data$V1+1, j = sparse_data$V2+1, x = sparse_data$V3)

cells <- readLines(snakemake@input$cells)
colnames(umi_counts) <- cells
rownames(umi_counts) <- paste0("EC", 1:nrow(umi_counts))

#
# Provide mapping between ECs and transcripts/genes
#
data.table::fread(snakemake@input$ec, data.table = F) %>%
magrittr::extract2(2) %>%
strsplit(",", fixed = T) %>%
lapply(as.numeric) %>%
lapply(magrittr::add, 1) -> ec_members

#
# Assemble list of expression matrices and adjust colnames
#
expression <- list(umi = umi_counts)

#
# Annotate gene names
#
if (snakemake@config$container$annotate) {
ec_members %>%
lapply(function(m) {unique(tx2gene[m, "gene"])}) -> ec2gene

bm <- biomaRt::getBM(c(snakemake@config$container$biomart$filter, 'external_gene_name'),
filters = snakemake@config$container$biomart$filter,
values = ec2gene %>% unlist %>% unique,
mart = biomaRt::useMart("ensembl", dataset = snakemake@config$container$biomart$dataset))

gene2symbol <- bm[, 'external_gene_name']
names(gene2symbol) <- bm[, snakemake@config$container$biomart$filter]

rowData_ <- data.frame(transcript_ids = sapply(ec_members, function(m) paste0(tx2gene[m, "transcript"], collapse = ",")),
gene_ids = sapply(ec2gene, function(m) paste0(m, collapse = ",")),
symbols = sapply(ec2gene, function(m) { paste0(unname(gene2symbol[m]), collapse = ",") }))
} else {
rowData_ <- NULL
}

#
# Assemble metadata
#
metadata_files <- snakemake@input$metadata
if (!is.null(metadata_files)) {
metadata_ <- lapply(metadata_files, function(p) metadata <- data.table::fread(input = p, sep = "\t", header = T, stringsAsFactors = T, na.strings = "NA"))
metadata <- Reduce(function(a, b) dplyr::left_join(a, b, by = snakemake@config$samplesheet$index), metadata_)

m <- match(colnames(expression[[1]]), make.names(metadata[, snakemake@config$samplesheet$index]))
colData_ <- as.data.frame(metadata[na.omit(m), ])
rownames(colData_) <- make.names(colData_[, snakemake@config$samplesheet$index])
} else {
colData_ <- NULL
}

if (!is.null(colData_) & !is.null(rowData_)) {
scData <- SingleCellExperiment::SingleCellExperiment(assays = expression, colData = colData_, rowData = rowData_)
}
if (is.null(colData_) & is.null(rowData_)) {
scData <- SingleCellExperiment::SingleCellExperiment(assays = expression)
}
if (is.null(colData_) & !is.null(rowData_)) {
scData <- SingleCellExperiment::SingleCellExperiment(assays = expression, rowData = rowData_)
}
if (is.null(rowData_) & !is.null(colData_)) {
scData <- SingleCellExperiment::SingleCellExperiment(assays = expression, colData = colData_)
}

#
# Save generated object
#
save(scData, file = snakemake@output$rdata)
File renamed without changes.

0 comments on commit 1f380e1

Please sign in to comment.