From 1f380e18a3ec7e98946d2e15a7a855eaeabdd855 Mon Sep 17 00:00:00 2001 From: Jens Preussner Date: Fri, 27 Apr 2018 08:25:19 +0200 Subject: [PATCH] Added container logic for kallisto --- environment.yml | 3 + src/container.snake | 43 ++++++++---- src/createSCE_kallisto.R | 92 +++++++++++++++++++++++++ src/{createSCE.R => createSCE_salmon.R} | 0 4 files changed, 125 insertions(+), 13 deletions(-) create mode 100644 src/createSCE_kallisto.R rename src/{createSCE.R => createSCE_salmon.R} (100%) diff --git a/environment.yml b/environment.yml index 2c145a6..7e7bf89 100644 --- a/environment.yml +++ b/environment.yml @@ -10,8 +10,11 @@ dependencies: - kallisto #- htstream #- umis + - r-magrittr + - r-data.table - r-readr - r-dplyr + - r-matrix - bioconductor-tximport - bioconductor-singlecellexperiment - bioconductor-biomart diff --git a/src/container.snake b/src/container.snake index c6b19bf..75bf483 100644 --- a/src/container.snake +++ b/src/container.snake @@ -7,16 +7,33 @@ Provides rules for creation of a SingleCellExperiment RData file ''' -rule create_SingleCellExpression_container: - input: - tx2gene = rules.tx2gene_from_fasta.output, - metadata = find_metadata_files, - quant_files = expand('{o}/{s}/quant.sf', o = config['dirs']['quant'], s = SAMPLE_NAMES) - output: - rdata = join(config['dirs']['R'], 'scData.Rdata') - params: - samples = SAMPLE_NAMES, - colData = samplesheet.ix[SAMPLE_NAMES].to_dict(orient = 'list') - message: 'Create SingleCellExperiment container for experiment.' - script: - 'createSCE.R' +if config['action']['quantification'] == 'salmon': + rule create_SingleCellExpression_container_salmon: + input: + tx2gene = rules.tx2gene_from_fasta.output, + metadata = find_metadata_files, + quant_files = expand('{o}/{s}/quant.sf', o = config['dirs']['quant'], s = SAMPLE_NAMES) + output: + rdata = join(config['dirs']['R'], 'scData.Rdata') + params: + samples = SAMPLE_NAMES, + colData = samplesheet.ix[SAMPLE_NAMES].to_dict(orient = 'list') + message: 'Create SingleCellExperiment container for experiment from salmon output.' + script: + 'createSCE_salmon.R' +elif config['action']['quantification'] == 'kallisto': + rule create_SingleCellExpression_container_kallisto: + input: + abundance = config['dirs']['quant'] + '/matrix.tsv', + ec = config['dirs']['quant'] + '/matrix.ec', + cells = config['dirs']['quant'] + '/matrix.cells', + tx2gene = join(config['dirs']['ref'], 'tx2gene', basename(REFERENCE_FASTA).rstrip(".fa")), + metadata = find_metadata_files + output: + rdata = join(config['dirs']['R'], 'scData.Rdata') + params: + samples = SAMPLE_NAMES, + colData = samplesheet.ix[SAMPLE_NAMES].to_dict(orient = 'list') + message: 'Create SingleCellExperiment container for experiment from kallisto output.' + script: + 'createSCE_kallisto.R' diff --git a/src/createSCE_kallisto.R b/src/createSCE_kallisto.R new file mode 100644 index 0000000..3246c75 --- /dev/null +++ b/src/createSCE_kallisto.R @@ -0,0 +1,92 @@ +# +# Load libraries +# +library(magrittr) +library(data.table) +library(Matrix) +library(biomaRt) +library(SingleCellExperiment) + +# +# Read tx2gene file +# +tx2gene <- read.table(snakemake@input$tx2gene, sep="\t", col.names = c("transcript", "gene")) + +# +# Read quant file from Kallisto and construct expression matrix +# +sparse_data <- read.table(snakemake@input$abundance, header = F, sep = "\t") +umi_counts <- Matrix::sparseMatrix(i = sparse_data$V1+1, j = sparse_data$V2+1, x = sparse_data$V3) + +cells <- readLines(snakemake@input$cells) +colnames(umi_counts) <- cells +rownames(umi_counts) <- paste0("EC", 1:nrow(umi_counts)) + +# +# Provide mapping between ECs and transcripts/genes +# +data.table::fread(snakemake@input$ec, data.table = F) %>% + magrittr::extract2(2) %>% + strsplit(",", fixed = T) %>% + lapply(as.numeric) %>% + lapply(magrittr::add, 1) -> ec_members + +# +# Assemble list of expression matrices and adjust colnames +# +expression <- list(umi = umi_counts) + +# +# Annotate gene names +# +if (snakemake@config$container$annotate) { + ec_members %>% + lapply(function(m) {unique(tx2gene[m, "gene"])}) -> ec2gene + + bm <- biomaRt::getBM(c(snakemake@config$container$biomart$filter, 'external_gene_name'), + filters = snakemake@config$container$biomart$filter, + values = ec2gene %>% unlist %>% unique, + mart = biomaRt::useMart("ensembl", dataset = snakemake@config$container$biomart$dataset)) + + gene2symbol <- bm[, 'external_gene_name'] + names(gene2symbol) <- bm[, snakemake@config$container$biomart$filter] + + rowData_ <- data.frame(transcript_ids = sapply(ec_members, function(m) paste0(tx2gene[m, "transcript"], collapse = ",")), + gene_ids = sapply(ec2gene, function(m) paste0(m, collapse = ",")), + symbols = sapply(ec2gene, function(m) { paste0(unname(gene2symbol[m]), collapse = ",") })) +} else { + rowData_ <- NULL +} + +# +# Assemble metadata +# +metadata_files <- snakemake@input$metadata +if (!is.null(metadata_files)) { + metadata_ <- lapply(metadata_files, function(p) metadata <- data.table::fread(input = p, sep = "\t", header = T, stringsAsFactors = T, na.strings = "NA")) + metadata <- Reduce(function(a, b) dplyr::left_join(a, b, by = snakemake@config$samplesheet$index), metadata_) + + m <- match(colnames(expression[[1]]), make.names(metadata[, snakemake@config$samplesheet$index])) + colData_ <- as.data.frame(metadata[na.omit(m), ]) + rownames(colData_) <- make.names(colData_[, snakemake@config$samplesheet$index]) +} else { + colData_ <- NULL +} + +if (!is.null(colData_) & !is.null(rowData_)) { + scData <- SingleCellExperiment::SingleCellExperiment(assays = expression, colData = colData_, rowData = rowData_) +} +if (is.null(colData_) & is.null(rowData_)) { + scData <- SingleCellExperiment::SingleCellExperiment(assays = expression) +} +if (is.null(colData_) & !is.null(rowData_)) { + scData <- SingleCellExperiment::SingleCellExperiment(assays = expression, rowData = rowData_) +} +if (is.null(rowData_) & !is.null(colData_)) { + scData <- SingleCellExperiment::SingleCellExperiment(assays = expression, colData = colData_) +} + +# +# Save generated object +# +save(scData, file = snakemake@output$rdata) diff --git a/src/createSCE.R b/src/createSCE_salmon.R similarity index 100% rename from src/createSCE.R rename to src/createSCE_salmon.R