Skip to content
This repository has been archived by the owner. It is now read-only.
Permalink
64fce7794e
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
187 lines (146 sloc) 8.56 KB
```{r parameters-and-defaults, include = FALSE}
module <- "scRNAseq"
section <- "quality_control"
```
```{r parameter-merge, include = FALSE}
local_params <- module %>%
options() %>%
magrittr::extract2(module) %>%
magrittr::extract2(section) %>%
ReporteR.base::validate_params(parameters_and_defaults)
```
```{r scRNAseq-quality-control-D-cellfiltering-checks, include = FALSE, echo = FALSE}
assertive.files::assert_all_are_readable_files(local_params$cell_filter_yml)
cell_filter <- yaml::read_yaml(local_params$cell_filter_yml)
# Check filter names
assertive.properties::assert_has_names(cell_filter)
assertive.sets::assert_is_subset(names(cell_filter), colnames(SummarizedExperiment::colData(object)))
# Check filter structure
sapply(names(cell_filter), function(n) {
assertive.sets::assert_is_subset(c("nmads", "type", "log"), names(cell_filter[[n]]))
})
```
### Cell filtering
Cell filtering strives for the identification of samples that show typical features of low quality cells (see above), by determining *outliers* based on the **m**edian **a**bsolute **d**eviation (MAD) among the quality features `r ReporteR.base::itemize(names(cell_filter))`.
The median absolute deviation is a measure of statistical dispersion (variability). Moreover, the MAD is a robust statistic, being more resilient to outliers in a data set than the standard deviation. Mathematically, the MAD of a metric $X$ is defined as the median of the absolute deviations from the data's median
$MAD(X) = median(|X_{i} - median(X)|)$
```{r scRNAseq-quality-control-D-cellfiltering-processing, echo = FALSE, include = FALSE}
outlier_list <- lapply(names(cell_filter), function(c) {
filter <- cell_filter[[c]]
# Could be hard filtering
filter$type <- gsub(pattern = "-hard", replacement = "", x = filter$type)
is_outlier <- scater::isOutlier(metric = SummarizedExperiment::colData(object)[, c],
nmads = as.numeric(filter$nmads),
type = filter$type,
log = filter$log)
return(is_outlier)
})
outlier <- do.call("cbind", outlier_list)
colnames(outlier) <- names(cell_filter)
rownames(outlier) <- colnames(object)
failed_cells <- (rowSums(outlier) > local_params$cell_filter_allow_n_failed)
# Second pass: Enable hard cutoffs below the lower/upper bound
for(c in names(cell_filter)) {
filter <- cell_filter[[c]]
if (endsWith(filter$type, "-hard")) {
failed_cells[outlier[, c]] <- TRUE
}
}
SummarizedExperiment::colData(object)$qc_status <- ifelse(failed_cells, "fail", "pass")
```
In total, **`r sum(failed_cells)`** low quality samples (`r round(sum(failed_cells)/length(failed_cells)*100)`%) have been identified. Evaluation of individual filters is shown in Table \@ref(tab:scRNAseq-quality-control-D-cellfiltering-table) and Figure \@ref(fig:scRNAseq-quality-control-D-cellfiltering-figure), as well as Figure \@ref(fig:scRNAseq-quality-control-D-cellfiltering-figure2).
```{r scRNAseq-quality-control-D-cellfiltering-table, results = "asis"}
filter_names <- gsub("_", "-", names(cell_filter))
tbl_data <- data.frame(Result = as.vector(ifelse(outlier, "fail", "pass")),
Filter = rep(filter_names, each = nrow(outlier)))
col_vars <- c("Result")
extra_caption <- "by filter."
summary_row_data <- data.frame(Result = SummarizedExperiment::colData(object)$qc_status, Filter = "combined filters")
if(assertive.properties::is_non_empty(local_params$tabulate_samples)) {
# Sanitize names
tabulate_names <- gsub("_", "-", SummarizedExperiment::colData(object)[, local_params$tabulate_samples])
tbl_data <- cbind(tbl_data, rep(tabulate_names, length(cell_filter)))
colnames(tbl_data) <- c("Result", "Filter", local_params$tabulate_sample)
col_vars <- c(local_params$tabulate_sample, col_vars)
extra_caption <- paste0("by filter and ", local_params$tabulate_sample, ".")
# Add summary row
summary_row_data <- data.frame(Result = SummarizedExperiment::colData(object)$qc_status, Filter = "combined filters", tabulate_names)
colnames(summary_row_data) <- c("Result", "Filter", local_params$tabulate_sample)
}
tbl_data <- rbind(tbl_data, summary_row_data)
ftab <- stats::ftable(tbl_data, col.vars = col_vars, row.vars = c("Filter"))
xftbl <- xtable::xtableFtable(ftab,
caption = paste("(\\#tab:scRNAseq-quality-control-D-cellfiltering-table)Number of samples passing/failing quality control", extra_caption),
method = "compact")
xtable::print.xtableFtable(xftbl,
booktabs = TRUE,
comment = FALSE,
timestamp = NULL)
```
```{r scRNAseq-quality-control-D-cellfiltering-figure-params, include = FALSE, echo = FALSE}
fig_height <- ReporteR.base::estimate_figure_height(
height_in_panels = ceiling(length(cell_filter)/2),
panel_height_in_in = params$formatting_defaults$figures$panel_height_in,
axis_space_in_in = params$formatting_defaults$figures$axis_space_in,
mpf_row_space = as.numeric(grid::convertUnit(grid::unit(5, 'mm'), 'in')),
max_height_in_in = params$formatting_defaults$figures$max_height_in)
```
```{r scRNAseq-quality-control-D-cellfiltering-figure, message = FALSE, warning = FALSE, echo = FALSE, fig.height = fig_height$global, fig.cap = "Evaluation of individual cell filters. The range of the metric (X) is shown on the X-axis, vertical bars in blue and red indicate lower and upper limits respectively in terms of MAD. Outliers are located to the left of the blue and to the right of the red vertical bars."}
figure_cell_filtering <- multipanelfigure::multi_panel_figure(height = fig_height$sub, columns = 2, rows = ceiling(length(cell_filter)/2), unit = "in")
for(c in names(cell_filter)) {
g <- scater::plotColData(object, y = c, show_median = T) +
ggplot2::coord_flip() +
ggplot2::theme(plot.title = ggplot2::element_text(face="bold", color="black", size=6)) +
ggplot2::ylab("") +
ggplot2::ggtitle(c)
metric <- SummarizedExperiment::colData(object)[, c]
cur.med <- median(metric, na.rm = TRUE)
cur.mad <- mad(metric, center = cur.med, na.rm = TRUE)
diff.val <- max(NA, cell_filter[[c]]$nmads * cur.mad, na.rm = TRUE)
upper.limit <- cur.med + diff.val
lower.limit <- cur.med - diff.val
cell_filter[[c]]$type <- gsub("-hard", "", cell_filter[[c]]$type)
if (cell_filter[[c]]$type == "lower") {
upper.limit <- Inf
g <- g + ggplot2::geom_hline(yintercept=lower.limit, color = "#2166ac")
} else if (cell_filter[[c]]$type == "higher") {
lower.limit <- -Inf
g <- g + ggplot2::geom_hline(yintercept=upper.limit, color = "#67001f")
} else {
g <- g +
ggplot2::geom_hline(yintercept=lower.limit, color = "#2166ac") +
ggplot2::geom_hline(yintercept=upper.limit, color = "#67001f")
}
figure_cell_filtering <- multipanelfigure::fill_panel(figure_cell_filtering, g)
}
figure_cell_filtering
```
```{r scRNAseq-quality-control-D-cellfiltering-figure2-params, include = FALSE, echo = FALSE}
fig_height <- ReporteR.base::estimate_figure_height(
height_in_panels = 1,
panel_height_in_in = params$formatting_defaults$figures$panel_height_in,
axis_space_in_in = params$formatting_defaults$figures$axis_space_in,
mpf_row_space = as.numeric(grid::convertUnit(grid::unit(5, 'mm'), 'in')),
max_height_in_in = params$formatting_defaults$figures$max_height_in)
sup_fig_cap <- "."
if (length(local_params$features) > 0) {
tmp <- sapply(1:length(na.omit(local_params$features[1:2])), function(i) {
paste0("(", LETTERS[i+1], ") ", local_params$features[i])
})
sup_fig_cap <- paste0(", ", ReporteR.base::itemize(tmp, sort = FALSE), sup_fig_cap)
}
fig_cap <- paste0("Biplot of components 1 (\'Dimension 1\') vs. 2 (\'Dimension 2\') from *principal component analysis* [@ringner_2008] of the sample quality subspace. Cells are colored by (A) QC status of individual cells", sup_fig_cap)
```
```{r scRNAseq-quality-control-D-cellfiltering-figure2, message = FALSE, warning = FALSE, echo = FALSE, fig.height = fig_height$global, fig.cap = fig_cap}
figure_cell_filtering_summary <- multipanelfigure::multi_panel_figure(height = fig_height$sub, columns = 3, rows = 1, unit = "in")
for(c in c("qc_status", na.omit(local_params$features[1:2]))) {
tmp <- object %>%
scater::plotReducedDim(use_dimred = "PCA_coldata", colour_by = c) +
ggplot2::guides(colour = FALSE) +
theme_qc_pca
if(c == "qc_status")
tmp <- tmp + ggplot2::scale_fill_manual(values = c("fail" = "#FF0000", "pass" = "#000000"))
figure_cell_filtering_summary <- multipanelfigure::fill_panel(figure_cell_filtering_summary, tmp)
}
figure_cell_filtering_summary
```