This repository has been archived by the owner. It is now read-only.
Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
ReporteR.scRNAseq/inst/content/02-quality-control-D-cellfiltering.Rmd
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
187 lines (146 sloc)
8.56 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
```{r parameters-and-defaults, include = FALSE} | |
module <- "scRNAseq" | |
section <- "quality_control" | |
``` | |
```{r parameter-merge, include = FALSE} | |
local_params <- module %>% | |
options() %>% | |
magrittr::extract2(module) %>% | |
magrittr::extract2(section) %>% | |
ReporteR.base::validate_params(parameters_and_defaults) | |
``` | |
```{r scRNAseq-quality-control-D-cellfiltering-checks, include = FALSE, echo = FALSE} | |
assertive.files::assert_all_are_readable_files(local_params$cell_filter_yml) | |
cell_filter <- yaml::read_yaml(local_params$cell_filter_yml) | |
# Check filter names | |
assertive.properties::assert_has_names(cell_filter) | |
assertive.sets::assert_is_subset(names(cell_filter), colnames(SummarizedExperiment::colData(object))) | |
# Check filter structure | |
sapply(names(cell_filter), function(n) { | |
assertive.sets::assert_is_subset(c("nmads", "type", "log"), names(cell_filter[[n]])) | |
}) | |
``` | |
### Cell filtering | |
Cell filtering strives for the identification of samples that show typical features of low quality cells (see above), by determining *outliers* based on the **m**edian **a**bsolute **d**eviation (MAD) among the quality features `r ReporteR.base::itemize(names(cell_filter))`. | |
The median absolute deviation is a measure of statistical dispersion (variability). Moreover, the MAD is a robust statistic, being more resilient to outliers in a data set than the standard deviation. Mathematically, the MAD of a metric $X$ is defined as the median of the absolute deviations from the data's median | |
$MAD(X) = median(|X_{i} - median(X)|)$ | |
```{r scRNAseq-quality-control-D-cellfiltering-processing, echo = FALSE, include = FALSE} | |
outlier_list <- lapply(names(cell_filter), function(c) { | |
filter <- cell_filter[[c]] | |
# Could be hard filtering | |
filter$type <- gsub(pattern = "-hard", replacement = "", x = filter$type) | |
is_outlier <- scater::isOutlier(metric = SummarizedExperiment::colData(object)[, c], | |
nmads = as.numeric(filter$nmads), | |
type = filter$type, | |
log = filter$log) | |
return(is_outlier) | |
}) | |
outlier <- do.call("cbind", outlier_list) | |
colnames(outlier) <- names(cell_filter) | |
rownames(outlier) <- colnames(object) | |
failed_cells <- (rowSums(outlier) > local_params$cell_filter_allow_n_failed) | |
# Second pass: Enable hard cutoffs below the lower/upper bound | |
for(c in names(cell_filter)) { | |
filter <- cell_filter[[c]] | |
if (endsWith(filter$type, "-hard")) { | |
failed_cells[outlier[, c]] <- TRUE | |
} | |
} | |
SummarizedExperiment::colData(object)$qc_status <- ifelse(failed_cells, "fail", "pass") | |
``` | |
In total, **`r sum(failed_cells)`** low quality samples (`r round(sum(failed_cells)/length(failed_cells)*100)`%) have been identified. Evaluation of individual filters is shown in Table \@ref(tab:scRNAseq-quality-control-D-cellfiltering-table) and Figure \@ref(fig:scRNAseq-quality-control-D-cellfiltering-figure), as well as Figure \@ref(fig:scRNAseq-quality-control-D-cellfiltering-figure2). | |
```{r scRNAseq-quality-control-D-cellfiltering-table, results = "asis"} | |
filter_names <- gsub("_", "-", names(cell_filter)) | |
tbl_data <- data.frame(Result = as.vector(ifelse(outlier, "fail", "pass")), | |
Filter = rep(filter_names, each = nrow(outlier))) | |
col_vars <- c("Result") | |
extra_caption <- "by filter." | |
summary_row_data <- data.frame(Result = SummarizedExperiment::colData(object)$qc_status, Filter = "combined filters") | |
if(assertive.properties::is_non_empty(local_params$tabulate_samples)) { | |
# Sanitize names | |
tabulate_names <- gsub("_", "-", SummarizedExperiment::colData(object)[, local_params$tabulate_samples]) | |
tbl_data <- cbind(tbl_data, rep(tabulate_names, length(cell_filter))) | |
colnames(tbl_data) <- c("Result", "Filter", local_params$tabulate_sample) | |
col_vars <- c(local_params$tabulate_sample, col_vars) | |
extra_caption <- paste0("by filter and ", local_params$tabulate_sample, ".") | |
# Add summary row | |
summary_row_data <- data.frame(Result = SummarizedExperiment::colData(object)$qc_status, Filter = "combined filters", tabulate_names) | |
colnames(summary_row_data) <- c("Result", "Filter", local_params$tabulate_sample) | |
} | |
tbl_data <- rbind(tbl_data, summary_row_data) | |
ftab <- stats::ftable(tbl_data, col.vars = col_vars, row.vars = c("Filter")) | |
xftbl <- xtable::xtableFtable(ftab, | |
caption = paste("(\\#tab:scRNAseq-quality-control-D-cellfiltering-table)Number of samples passing/failing quality control", extra_caption), | |
method = "compact") | |
xtable::print.xtableFtable(xftbl, | |
booktabs = TRUE, | |
comment = FALSE, | |
timestamp = NULL) | |
``` | |
```{r scRNAseq-quality-control-D-cellfiltering-figure-params, include = FALSE, echo = FALSE} | |
fig_height <- ReporteR.base::estimate_figure_height( | |
height_in_panels = ceiling(length(cell_filter)/2), | |
panel_height_in_in = params$formatting_defaults$figures$panel_height_in, | |
axis_space_in_in = params$formatting_defaults$figures$axis_space_in, | |
mpf_row_space = as.numeric(grid::convertUnit(grid::unit(5, 'mm'), 'in')), | |
max_height_in_in = params$formatting_defaults$figures$max_height_in) | |
``` | |
```{r scRNAseq-quality-control-D-cellfiltering-figure, message = FALSE, warning = FALSE, echo = FALSE, fig.height = fig_height$global, fig.cap = "Evaluation of individual cell filters. The range of the metric (X) is shown on the X-axis, vertical bars in blue and red indicate lower and upper limits respectively in terms of MAD. Outliers are located to the left of the blue and to the right of the red vertical bars."} | |
figure_cell_filtering <- multipanelfigure::multi_panel_figure(height = fig_height$sub, columns = 2, rows = ceiling(length(cell_filter)/2), unit = "in") | |
for(c in names(cell_filter)) { | |
g <- scater::plotColData(object, y = c, show_median = T) + | |
ggplot2::coord_flip() + | |
ggplot2::theme(plot.title = ggplot2::element_text(face="bold", color="black", size=6)) + | |
ggplot2::ylab("") + | |
ggplot2::ggtitle(c) | |
metric <- SummarizedExperiment::colData(object)[, c] | |
cur.med <- median(metric, na.rm = TRUE) | |
cur.mad <- mad(metric, center = cur.med, na.rm = TRUE) | |
diff.val <- max(NA, cell_filter[[c]]$nmads * cur.mad, na.rm = TRUE) | |
upper.limit <- cur.med + diff.val | |
lower.limit <- cur.med - diff.val | |
cell_filter[[c]]$type <- gsub("-hard", "", cell_filter[[c]]$type) | |
if (cell_filter[[c]]$type == "lower") { | |
upper.limit <- Inf | |
g <- g + ggplot2::geom_hline(yintercept=lower.limit, color = "#2166ac") | |
} else if (cell_filter[[c]]$type == "higher") { | |
lower.limit <- -Inf | |
g <- g + ggplot2::geom_hline(yintercept=upper.limit, color = "#67001f") | |
} else { | |
g <- g + | |
ggplot2::geom_hline(yintercept=lower.limit, color = "#2166ac") + | |
ggplot2::geom_hline(yintercept=upper.limit, color = "#67001f") | |
} | |
figure_cell_filtering <- multipanelfigure::fill_panel(figure_cell_filtering, g) | |
} | |
figure_cell_filtering | |
``` | |
```{r scRNAseq-quality-control-D-cellfiltering-figure2-params, include = FALSE, echo = FALSE} | |
fig_height <- ReporteR.base::estimate_figure_height( | |
height_in_panels = 1, | |
panel_height_in_in = params$formatting_defaults$figures$panel_height_in, | |
axis_space_in_in = params$formatting_defaults$figures$axis_space_in, | |
mpf_row_space = as.numeric(grid::convertUnit(grid::unit(5, 'mm'), 'in')), | |
max_height_in_in = params$formatting_defaults$figures$max_height_in) | |
sup_fig_cap <- "." | |
if (length(local_params$features) > 0) { | |
tmp <- sapply(1:length(na.omit(local_params$features[1:2])), function(i) { | |
paste0("(", LETTERS[i+1], ") ", local_params$features[i]) | |
}) | |
sup_fig_cap <- paste0(", ", ReporteR.base::itemize(tmp, sort = FALSE), sup_fig_cap) | |
} | |
fig_cap <- paste0("Biplot of components 1 (\'Dimension 1\') vs. 2 (\'Dimension 2\') from *principal component analysis* [@ringner_2008] of the sample quality subspace. Cells are colored by (A) QC status of individual cells", sup_fig_cap) | |
``` | |
```{r scRNAseq-quality-control-D-cellfiltering-figure2, message = FALSE, warning = FALSE, echo = FALSE, fig.height = fig_height$global, fig.cap = fig_cap} | |
figure_cell_filtering_summary <- multipanelfigure::multi_panel_figure(height = fig_height$sub, columns = 3, rows = 1, unit = "in") | |
for(c in c("qc_status", na.omit(local_params$features[1:2]))) { | |
tmp <- object %>% | |
scater::plotReducedDim(use_dimred = "PCA_coldata", colour_by = c) + | |
ggplot2::guides(colour = FALSE) + | |
theme_qc_pca | |
if(c == "qc_status") | |
tmp <- tmp + ggplot2::scale_fill_manual(values = c("fail" = "#FF0000", "pass" = "#000000")) | |
figure_cell_filtering_summary <- multipanelfigure::fill_panel(figure_cell_filtering_summary, tmp) | |
} | |
figure_cell_filtering_summary | |
``` |