08-differential-expression-A-MAST.Rmd

```{r parameters-and-defaults, include = FALSE}
module <- "scRNAseq"
section <- "differential_expression"
```

```{r parameter-merge, include = FALSE}
local_params <- module %>%
  options() %>%
  magrittr::extract2(module) %>%
  magrittr::extract2(section) %>%
  ReporteR.base::validate_params(parameters_and_defaults)
```

Assessment of differences in RNA expression faces some challenges that are unique for measurements from single cells. First, single-cell expression has been shown to exhibit a characteristic bimodal expression pattern, wherein the expression of otherwise abundant genes is either strongly positive or undetected within individual cells. This is partly due to low starting quantities of RNA (technical noise), but may be also due to biological variation between heterogeneous subpopulations. Second, cells scale their transcript copy number with their cell volume to maintain a constant mRNA concentration and thus constant biochemical reaction rates [@padovan_compensation_2015]. Hence, in single cell analysis, cells of varying volumne - and varying mRNA copy number are diluted to an approximately fixed reaction volume, leading to differences in detection rate of various mRNA species that are driven by the initial cell volumes. Technical assay variability (e.g. amplification efficiency) and biological factors (e.g. differences in cell size) affect transcription globally and can significantly influence single cell expression level measurements.
**M**odel-based **A**nalysis of **S**ingle-cell **T**ranscriptomes (MAST) [@finak_mast_2015] estimates and controls for the *cellular detection rate*, i.e. the fraction of genes that are detectably expressed in each cell, as a proxy for both global technical and biological factors, while simultaneously estimating treatment effects between groups of cells. The underlying hurdle model leverages theory of generalized linear models and therefore permits complex experimental designs (e.g. controlling for technical or biological covariates).

```{r scRNAseq-differential-expression-A-MAST-contrasts, include=FALSE, echo = FALSE}
contrast_list <- lapply(local_params$groups, function(f) {
  l <- levels(droplevels(factor(SummarizedExperiment::colData(object_filtered)[, f])))
  r <- combn(x = l, m = 2, simplify = FALSE)
  names(r) <- rep(f, length(r))
  return(r)
})

contrasts <- lapply(contrast_list, function(l) {
  labels <- sapply(l, function(i) {
    paste0(i, collapse = "_vs_")
  })

  column <- unique(names(l))
  labels <- paste0(column, "_", labels)
  items <- lapply(l, function(i) {
    il <- list(
      which(SummarizedExperiment::colData(object_filtered)[, column] == i[1]),
      which(SummarizedExperiment::colData(object_filtered)[, column] == i[2]))
    names(il) <- i
    return(il)
  })
  names(items) <- labels
  return(items)
})

contrasts <- unlist(contrasts, recursive = FALSE)

# Make sure all contrasts are exactly of length 2
assertive.numbers::assert_all_are_equal_to(sapply(contrasts, length), 2)
```

```{r scRNAseq-differential-expression-A-MAST-processing, include=FALSE, echo = FALSE}

mast_output_dir <- tempdir(check = TRUE)

mast_de_stats <- lapply(names(contrasts), function(n) {
  contrast <- contrasts[[n]]

  data <- cbind(SummarizedExperiment::assay(object_filtered, local_params$assay)[, contrast[[1]]],
                SummarizedExperiment::assay(object_filtered, local_params$assay)[, contrast[[2]]])
  condition <- c(rep(names(contrast)[1], length(contrast[[1]])), rep(names(contrast)[2], length(contrast[[2]])))

  cdat <- data.frame(wellKey = colnames(data), condition = factor(condition), stringsAsFactors = F)
  formulae <- "~ condition"

  # Add batch
  if(length(local_params$batch) > 0) {
    cdat$batch <- SummarizedExperiment::colData(object_filtered)[c(contrast[[1]], contrast[[2]]), local_params$batch]
    formulae <- paste0(formulae, " + batch")
  }

  fdat <- data.frame(primerid = rownames(data), stringsAsFactors = F)
  sca <- MAST::FromMatrix(exprsArray = data,
                          cData = cdat,
                          fData = fdat,
                          check_logged = FALSE)

  zlm <- MAST::zlm(as.formula(formulae), sca, method = "bayesglm", ebayes = TRUE, ebayesControl = list(method = "MLE", model = "H1"))
  s <- MAST::summary(zlm, doLRT = paste0('condition', names(contrast)[2]))$datatable

  # Merge hurdle P-values and logFC coefficients from MAST results
  columns_left <- c("primerid", "Pr(>Chisq)")
  columns_right <- c("primerid", "coef", "ci.hi", "ci.lo")
  contrast_query <- paste0("condition", names(contrast)[2])
  res <- data.table:::merge.data.table(s[contrast == contrast_query & component == "H", ..columns_left],
                                       s[contrast == contrast_query & component == "logFC", ..columns_right], by = "primerid")
  colnames(res) <- c("geneID", "pval", "lfc", "lfc.hi", "lfc.lo")

  # Multiple testing correction
  res$p.adjust <- stats::p.adjust(res$pval, method = local_params$adjust_method)

  # Add additional columns
  if(length(local_params$add_columns) > 0) {
    fdt <- data.table::as.data.table(SummarizedExperiment::rowData(object_filtered)[, local_params$add_columns])
    fdt$geneID <- rownames(object_filtered)
    res <- data.table:::merge.data.table(fdt, res, by = "geneID")
  }

  # Write table
  mast_contrast_filename <- file.path(mast_output_dir, paste0(n %>% stringi::stri_replace_first_regex("^\\.", ""), ".txt"))
  write.table(x = res,
              file = mast_contrast_filename,
              quote = FALSE,
              row.names = FALSE,
              col.names = TRUE,
              sep = "\t")

  # Report back number of differentially expressed genes
  data.frame(contrast = paste(names(contrast), collapse = " vs. "),
             filename = mast_contrast_filename,
             upregulated = sum(res$p.adjust < local_params$max_p_adjust & res$lfc > local_params$min_abs_lfc, na.rm = T),
             downregulated = sum(res$p.adjust < local_params$max_p_adjust & res$lfc < -local_params$min_abs_lfc, na.rm = T),
             upregulated_strict = sum(res$p.adjust < local_params$max_p_adjust & res$lfc.lo > local_params$min_abs_lfc, na.rm = T),
             downregulated_strict = sum(res$p.adjust < local_params$max_p_adjust & res$lfc.hi < -local_params$min_abs_lfc, na.rm = T))
})

data_mast_de_stats <- lapply(split(mast_de_stats, rep(local_params$groups, times = sapply(contrast_list, length))), function(l) {
  do.call("rbind", l) %>%
    dplyr::select(contrast, upregulated, downregulated, upregulated_strict, downregulated_strict) %>%
    dplyr::transmute(contrast = contrast, upreg = paste0(upregulated, " (", upregulated_strict, ")"), downreg = paste0(downregulated, " (", downregulated_strict, ")")) -> df
  colnames(df) <- c("Contrast", "Upregulated (strict)", "Downregulated (strict)")
  return(df)
})
attr(data_mast_de_stats, "subheadings") <- paste("from", gsub(pattern = "_", replacement = "-", x = names(data_mast_de_stats)))
```

Table \@ref(tab:scRNAseq-differential-expression-A-MAST-table) shows the number of differentially expressed features across the experimental factors `r ReporteR.base::itemize(names(data_mast_de_stats))`, stratified by different selection criteria.

```{r scRNAseq-differential-expression-A-MAST-table, results="asis"}
table_mast_de_stats <- xtable::xtableList(data_mast_de_stats, caption = paste0("(\\#tab:scRNAseq-differential-expression-A-MAST-table)Number of features with significantly differential expression (MAST) across experimental factors ", ReporteR.base::itemize(gsub(pattern = "_", replacement = "-", x = names(data_mast_de_stats))), ". Up- and downregulation were assesed at an adjusted p-value of $", local_params$max_p_adjust, "$ and a minimum $log_{2}$ fold change of $", local_params$min_abs_lfc, "$. Numbers in parentheses indicate up- and downregulated features whose 95\\% confidence interval lies well above (below) the minimum $log_{2}$ fold change of $", local_params$min_abs_lfc,"$."))

xtable::print.xtableList(table_mast_de_stats, booktabs = TRUE, comment = FALSE, timestamp = NULL, include.rownames = FALSE)
```

```{r scRNAseq-differential-expression-A-MAST-results, echo = FALSE, include = FALSE}
# Zip temporary result files and write out
mast_abs_paths <- as.character(do.call("rbind", mast_de_stats)$filename)

mast_file_base <- "08-A-MAST"
target_path <- file.path(managed_objects$paths$differential_expression_results_dir, mast_file_base)

# Create dir, fill with target files
if(!dir.exists(target_path)) {
  dir.create(target_path, showWarnings = FALSE)
}

for (i in seq_along(mast_abs_paths))
{
    mast_abs_paths %>%
      magrittr::extract2(i) %>%
      file.copy(
        target_path
      )
}

# cwd <- getwd()
# target_path %>% setwd()
#
# mast_rel_paths <- dir(full.names = T)
#
# stop()
# if (local_params$compression == "zip") {
#   zip(zipfile = paste0(target_path, ".zip"),
#       files = mast_rel_paths)
# }

```
	```{r parameters-and-defaults, include = FALSE}
	module <- "scRNAseq"
	section <- "differential_expression"
	```

	```{r parameter-merge, include = FALSE}
	local_params <- module %>%
	options() %>%
	magrittr::extract2(module) %>%
	magrittr::extract2(section) %>%
	ReporteR.base::validate_params(parameters_and_defaults)
	```

	Assessment of differences in RNA expression faces some challenges that are unique for measurements from single cells. First, single-cell expression has been shown to exhibit a characteristic bimodal expression pattern, wherein the expression of otherwise abundant genes is either strongly positive or undetected within individual cells. This is partly due to low starting quantities of RNA (technical noise), but may be also due to biological variation between heterogeneous subpopulations. Second, cells scale their transcript copy number with their cell volume to maintain a constant mRNA concentration and thus constant biochemical reaction rates [@padovan_compensation_2015]. Hence, in single cell analysis, cells of varying volumne - and varying mRNA copy number are diluted to an approximately fixed reaction volume, leading to differences in detection rate of various mRNA species that are driven by the initial cell volumes. Technical assay variability (e.g. amplification efficiency) and biological factors (e.g. differences in cell size) affect transcription globally and can significantly influence single cell expression level measurements.
	Model-based Analysis of Single-cell Transcriptomes (MAST) [@finak_mast_2015] estimates and controls for the cellular detection rate, i.e. the fraction of genes that are detectably expressed in each cell, as a proxy for both global technical and biological factors, while simultaneously estimating treatment effects between groups of cells. The underlying hurdle model leverages theory of generalized linear models and therefore permits complex experimental designs (e.g. controlling for technical or biological covariates).

	```{r scRNAseq-differential-expression-A-MAST-contrasts, include=FALSE, echo = FALSE}
	contrast_list <- lapply(local_params$groups, function(f) {
	l <- levels(droplevels(factor(SummarizedExperiment::colData(object_filtered)[, f])))
	r <- combn(x = l, m = 2, simplify = FALSE)
	names(r) <- rep(f, length(r))
	return(r)
	})

	contrasts <- lapply(contrast_list, function(l) {
	labels <- sapply(l, function(i) {
	paste0(i, collapse = "_vs_")
	})

	column <- unique(names(l))
	labels <- paste0(column, "_", labels)
	items <- lapply(l, function(i) {
	il <- list(
	which(SummarizedExperiment::colData(object_filtered)[, column] == i[1]),
	which(SummarizedExperiment::colData(object_filtered)[, column] == i[2]))
	names(il) <- i
	return(il)
	})
	names(items) <- labels
	return(items)
	})

	contrasts <- unlist(contrasts, recursive = FALSE)

	# Make sure all contrasts are exactly of length 2
	assertive.numbers::assert_all_are_equal_to(sapply(contrasts, length), 2)
	```

	```{r scRNAseq-differential-expression-A-MAST-processing, include=FALSE, echo = FALSE}

	mast_output_dir <- tempdir(check = TRUE)

	mast_de_stats <- lapply(names(contrasts), function(n) {
	contrast <- contrasts[[n]]

	data <- cbind(SummarizedExperiment::assay(object_filtered, local_params$assay)[, contrast[[1]]],
	SummarizedExperiment::assay(object_filtered, local_params$assay)[, contrast[[2]]])
	condition <- c(rep(names(contrast)[1], length(contrast[[1]])), rep(names(contrast)[2], length(contrast[[2]])))

	cdat <- data.frame(wellKey = colnames(data), condition = factor(condition), stringsAsFactors = F)
	formulae <- "~ condition"

	# Add batch
	if(length(local_params$batch) > 0) {
	cdat$batch <- SummarizedExperiment::colData(object_filtered)[c(contrast[[1]], contrast[[2]]), local_params$batch]
	formulae <- paste0(formulae, " + batch")
	}

	fdat <- data.frame(primerid = rownames(data), stringsAsFactors = F)
	sca <- MAST::FromMatrix(exprsArray = data,
	cData = cdat,
	fData = fdat,
	check_logged = FALSE)

	zlm <- MAST::zlm(as.formula(formulae), sca, method = "bayesglm", ebayes = TRUE, ebayesControl = list(method = "MLE", model = "H1"))
	s <- MAST::summary(zlm, doLRT = paste0('condition', names(contrast)[2]))$datatable

	# Merge hurdle P-values and logFC coefficients from MAST results
	columns_left <- c("primerid", "Pr(>Chisq)")
	columns_right <- c("primerid", "coef", "ci.hi", "ci.lo")
	contrast_query <- paste0("condition", names(contrast)[2])
	res <- data.table:::merge.data.table(s[contrast == contrast_query & component == "H", ..columns_left],
	s[contrast == contrast_query & component == "logFC", ..columns_right], by = "primerid")
	colnames(res) <- c("geneID", "pval", "lfc", "lfc.hi", "lfc.lo")

	# Multiple testing correction
	res$p.adjust <- stats::p.adjust(res$pval, method = local_params$adjust_method)

	# Add additional columns
	if(length(local_params$add_columns) > 0) {
	fdt <- data.table::as.data.table(SummarizedExperiment::rowData(object_filtered)[, local_params$add_columns])
	fdt$geneID <- rownames(object_filtered)
	res <- data.table:::merge.data.table(fdt, res, by = "geneID")
	}

	# Write table
	mast_contrast_filename <- file.path(mast_output_dir, paste0(n %>% stringi::stri_replace_first_regex("^\\.", ""), ".txt"))
	write.table(x = res,
	file = mast_contrast_filename,
	quote = FALSE,
	row.names = FALSE,
	col.names = TRUE,
	sep = "\t")

	# Report back number of differentially expressed genes
	data.frame(contrast = paste(names(contrast), collapse = " vs. "),
	filename = mast_contrast_filename,
	upregulated = sum(res$p.adjust < local_params$max_p_adjust & res$lfc > local_params$min_abs_lfc, na.rm = T),
	downregulated = sum(res$p.adjust < local_params$max_p_adjust & res$lfc < -local_params$min_abs_lfc, na.rm = T),
	upregulated_strict = sum(res$p.adjust < local_params$max_p_adjust & res$lfc.lo > local_params$min_abs_lfc, na.rm = T),
	downregulated_strict = sum(res$p.adjust < local_params$max_p_adjust & res$lfc.hi < -local_params$min_abs_lfc, na.rm = T))
	})

	data_mast_de_stats <- lapply(split(mast_de_stats, rep(local_params$groups, times = sapply(contrast_list, length))), function(l) {
	do.call("rbind", l) %>%
	dplyr::select(contrast, upregulated, downregulated, upregulated_strict, downregulated_strict) %>%
	dplyr::transmute(contrast = contrast, upreg = paste0(upregulated, " (", upregulated_strict, ")"), downreg = paste0(downregulated, " (", downregulated_strict, ")")) -> df
	colnames(df) <- c("Contrast", "Upregulated (strict)", "Downregulated (strict)")
	return(df)
	})
	attr(data_mast_de_stats, "subheadings") <- paste("from", gsub(pattern = "_", replacement = "-", x = names(data_mast_de_stats)))
	```

	Table \@ref(tab:scRNAseq-differential-expression-A-MAST-table) shows the number of differentially expressed features across the experimental factors `r ReporteR.base::itemize(names(data_mast_de_stats))`, stratified by different selection criteria.

	```{r scRNAseq-differential-expression-A-MAST-table, results="asis"}
	table_mast_de_stats <- xtable::xtableList(data_mast_de_stats, caption = paste0("(\\#tab:scRNAseq-differential-expression-A-MAST-table)Number of features with significantly differential expression (MAST) across experimental factors ", ReporteR.base::itemize(gsub(pattern = "_", replacement = "-", x = names(data_mast_de_stats))), ". Up- and downregulation were assesed at an adjusted p-value of $", local_params$max_p_adjust, "$ and a minimum $log_{2}$ fold change of $", local_params$min_abs_lfc, "$. Numbers in parentheses indicate up- and downregulated features whose 95\\% confidence interval lies well above (below) the minimum $log_{2}$ fold change of $", local_params$min_abs_lfc,"$."))

	xtable::print.xtableList(table_mast_de_stats, booktabs = TRUE, comment = FALSE, timestamp = NULL, include.rownames = FALSE)
	```

	```{r scRNAseq-differential-expression-A-MAST-results, echo = FALSE, include = FALSE}
	# Zip temporary result files and write out
	mast_abs_paths <- as.character(do.call("rbind", mast_de_stats)$filename)

	mast_file_base <- "08-A-MAST"
	target_path <- file.path(managed_objects$paths$differential_expression_results_dir, mast_file_base)

	# Create dir, fill with target files
	if(!dir.exists(target_path)) {
	dir.create(target_path, showWarnings = FALSE)
	}

	for (i in seq_along(mast_abs_paths))
	{
	mast_abs_paths %>%
	magrittr::extract2(i) %>%
	file.copy(
	target_path
	)
	}

	# cwd <- getwd()
	# target_path %>% setwd()
	#
	# mast_rel_paths <- dir(full.names = T)
	#
	# stop()
	# if (local_params$compression == "zip") {
	# zip(zipfile = paste0(target_path, ".zip"),
	# files = mast_rel_paths)
	# }

	```