03-normalization.Rmd

```{r, parameters-and-defaults, include = FALSE}
# Module and section keys used to look up this document's options.
module <- "scRNAseq"
section <- "normalization"

# Defaults for every parameter this section accepts. Each entry is the default
# value carrying `type`, `choices` and `several.ok` attributes.
# NOTE(review): the three character parameters default to NULL (what `c()`
# evaluates to); recent R versions warn on `structure(NULL, *)` -- confirm the
# resulting defaults are what the parameter validation expects.
parameters_and_defaults <- local({
  # Attach the annotation attributes to a single default value.
  describe <- function(default, type, several_ok) {
    structure(default, type = type, choices = NA, several.ok = several_ok)
  }

  list(
    batch = describe(NULL, "character", FALSE),
    features = describe(NULL, "character", TRUE),
    include_methods = describe(NULL, "character", TRUE),
    compare = describe(TRUE, "logical", FALSE)
  )
})
```

```{r parameter-merge, include = FALSE}
# Read the options stored under the module name, pick this section's entry,
# and validate it against the declared defaults.
local_params <- ReporteR.base::validate_params(
  getOption(module)[[section]],
  parameters_and_defaults
)
```

```{r scRNAseq-normalization-load, include=FALSE, eval = !exists("object_filtered")}
# Load the filtered object from disk (this chunk only runs when
# `object_filtered` does not exist yet, see the chunk's `eval` option).
#
# Use the asserting variant: `is_existing_file()` merely returns a logical
# that was discarded here, so a missing file was never reported by the check.
assertive.files::assert_all_are_existing_files(
  managed_objects$paths$object_filtered$path
)

# Read the serialized object and pass it through flag_persistent().
object_filtered <- readRDS(managed_objects$paths$object_filtered$path) %>%
  ReporteR.base::flag_persistent()
```

## Data normalization

The data normalization step strives to ensure that comparisons of relative expression between cells are valid and that systematic technical effects have minimal impact on downstream analyses. It is therefore a critical and essential step, as it determines the validity of downstream quantitative analyses.

Normalization of count data from scRNA-seq experiments seeks to correct for cell-specific biases, e.g. cell-to-cell differences in capture efficiency, sequencing depth, and other technical confounders.
A simple approach would be the scaling of counts to remove differences in library sizes between cells, i.e., library size normalization. Current RNA-seq analysis methods typically standardize data between samples by scaling the number of reads in a given sample to a common value across all sequenced samples in the experiment. However, this is not straightforward for noisy single-cell data where many counts are zero. The underlying assumption is that most genes are not differentially expressed across the sampled cells. Counts are scaled so that there is, on average, no fold-difference in expression between cells for the majority of genes. This is the underlying concept of commonly used methods such as trimmed mean of M values [@robinson_tmm_2010] normalization.

```{r scRNAseq-normalization-A-params, echo = FALSE, include = FALSE, R.options = params}
# ggplot2 theme for the normalization PCA figures: blank backgrounds, thin
# grey grid lines, small text, and a horizontal legend on a white background.
theme_norm_pca <- ggplot2::theme(
  plot.background = ggplot2::element_blank(),
  panel.grid.major = ggplot2::element_line(size = 0.2, colour = "grey"),
  panel.grid.minor = ggplot2::element_line(size = 0.1, colour = "grey"),
  panel.border = ggplot2::element_blank(),
  panel.background = ggplot2::element_blank(),
  axis.line.x = ggplot2::element_line(size = 0.3),
  axis.line.y = ggplot2::element_line(size = 0.3),
  axis.text = ggplot2::element_text(size = 4),
  axis.title.y = ggplot2::element_text(
    size = 5,
    margin = ggplot2::margin(0, -2, 0, 0)
  ),
  axis.title.x = ggplot2::element_text(
    size = 5,
    margin = ggplot2::margin(-2, 0, 0, 0)
  ),
  legend.text = ggplot2::element_text(size = 5),
  legend.title = ggplot2::element_text(size = 5),
  plot.title = ggplot2::element_text(face = "bold", color = "black", size = 5),
  legend.key.size = grid::unit(2, "mm"),
  # An earlier variant used grid::unit(-50, "mm") for the legend margin.
  legend.margin = ggplot2::margin(b = 2),
  legend.position = c(0, 1),
  legend.justification = c(0, 0),
  legend.background = ggplot2::element_rect(fill = "white"),
  legend.direction = "horizontal"
)

# Figure captions for the PCA panels (A)-(D) and the extra panels (E)-(G).
caption_norm_pca <- "(A) Biplot of components 1 (\'Dimension 1\') vs. 2 (\'Dimension 2\') of *principal component analysis* [@ringner_2008] using non-normalized count values, along with (B) the percentage of variance explained by several factors. (C) Biplot of components 1 (\'Dimension 1\') vs. 2 (\'Dimension 2\') of *principal component analysis* using normalized expression values, along with (D) the percentage of variance explained by several factors."
caption_norm_pca_extra <- "(E) - (G) Visual representation of several factors of interest onto the biplot from (C)."
```

```{r scRNAseq-normalization-A-sumfactors, eval = TRUE, echo = FALSE, include = FALSE, R.options = params}
# Locate the packaged sum-factors child document and knit it to the markdown
# path derived from it.
rmd_path <- system.file(
  "content", "03-normalization-A-sumfactors.Rmd",
  package = "ReporteR.scRNAseq",
  mustWork = TRUE
)
md_path <- ReporteR.base::make_md_path(rmd_path)
knitr::knit_child(rmd_path, output = md_path)
```

```{r scRNAseq-normalization-A-sumfactors-include, echo = FALSE, eval = ifelse(exists('local_params'), 'norm_sumfactor' %in% local_params$include_methods, FALSE), results="asis"}
# Re-derive the markdown path that belongs to the sum-factors child document.
rmd_path <- system.file(
  "content", "03-normalization-A-sumfactors.Rmd",
  package = "ReporteR.scRNAseq",
  mustWork = TRUE
)
md_path <- ReporteR.base::make_md_path(rmd_path)

# Require a readable markdown file, then emit its lines verbatim (the chunk
# is declared with results = "asis").
assertive.files::assert_all_are_readable_files(md_path)
cat(readLines(md_path), sep = "\n")
```

```{r scRNAseq-normalization-B-TMM, echo = FALSE, include = FALSE, R.options = params}
# Locate the packaged TMM child document and knit it to the markdown path
# derived from it.
rmd_path <- system.file(
  "content", "03-normalization-B-TMM.Rmd",
  package = "ReporteR.scRNAseq",
  mustWork = TRUE
)
md_path <- ReporteR.base::make_md_path(rmd_path)
knitr::knit_child(rmd_path, output = md_path)
```

```{r scRNAseq-normalization-B-TMM-include, echo = FALSE, eval = ifelse(exists('local_params'), 'norm_TMM' %in% local_params$include_methods, FALSE), results="asis"}
# Re-derive the markdown path that belongs to the TMM child document.
rmd_path <- system.file(
  "content", "03-normalization-B-TMM.Rmd",
  package = "ReporteR.scRNAseq",
  mustWork = TRUE
)
md_path <- ReporteR.base::make_md_path(rmd_path)

# Require a readable markdown file, then emit its lines verbatim (the chunk
# is declared with results = "asis").
assertive.files::assert_all_are_readable_files(md_path)
cat(readLines(md_path), sep = "\n")
```

```{r scRNAseq-normalization-C-scnorm, echo = FALSE, include = FALSE, R.options = params, eval = exists('local_params') && length(local_params[['batch']]) > 0}
# Knit the scnorm child document only when a `batch` parameter was supplied.
# The former condition, `'batch' %in% local_params`, searched the parameter
# *values* for the string "batch" instead of inspecting the `batch` entry.
# NOTE(review): assumes an unset `batch` is NULL or zero-length after
# validate_params() -- confirm against ReporteR.base.
rmd_path <- system.file(
  "content", "03-normalization-C-scnorm.Rmd",
  package = "ReporteR.scRNAseq",
  mustWork = TRUE
)
md_path <- ReporteR.base::make_md_path(rmd_path)
knitr::knit_child(rmd_path, output = md_path)
```

```{r scRNAseq-normalization-C-scnorm-include, echo = FALSE, eval = ifelse(exists('local_params'), 'norm_scnorm' %in% local_params$include_methods, FALSE), results="asis"}
# Re-derive the markdown path that belongs to the scnorm child document.
rmd_path <- system.file(
  "content", "03-normalization-C-scnorm.Rmd",
  package = "ReporteR.scRNAseq",
  mustWork = TRUE
)
md_path <- ReporteR.base::make_md_path(rmd_path)

# Require a readable markdown file, then emit its lines verbatim (the chunk
# is declared with results = "asis").
assertive.files::assert_all_are_readable_files(md_path)
cat(readLines(md_path), sep = "\n")
```

```{r scRNAseq-normalization-Z-compare, echo = FALSE, include = FALSE, R.options = params}
# Locate the packaged comparison child document and knit it to the markdown
# path derived from it.
rmd_path <- system.file(
  "content", "03-normalization-Z-compare.Rmd",
  package = "ReporteR.scRNAseq",
  mustWork = TRUE
)
md_path <- ReporteR.base::make_md_path(rmd_path)
knitr::knit_child(rmd_path, output = md_path)
```

```{r scRNAseq-normalization-Z-compare-include, echo = FALSE, eval = ifelse(exists('local_params'), local_params$compare, FALSE), results="asis"}
# Re-derive the markdown path that belongs to the comparison child document.
rmd_path <- system.file(
  "content", "03-normalization-Z-compare.Rmd",
  package = "ReporteR.scRNAseq",
  mustWork = TRUE
)
md_path <- ReporteR.base::make_md_path(rmd_path)

# Require a readable markdown file, then emit its lines verbatim (the chunk
# is declared with results = "asis").
assertive.files::assert_all_are_readable_files(md_path)
cat(readLines(md_path), sep = "\n")
```

```{r scRNAseq-normalization-terminal-cleanup, include = FALSE}
# Write the (possibly updated) filtered object back to its managed path.
saveRDS(
  object = object_filtered,
  file = managed_objects$paths$object_filtered$path
)

# Hand cleanup of the session over to ReporteR.base.
ReporteR.base::purge_nonpersistent()
```
	```{r, parameters-and-defaults, include = FALSE}
	# Module and section keys used to look up this document's options.
	module <- "scRNAseq"
	section <- "normalization"

	# Defaults for every parameter this section accepts. Each entry is the default
	# value carrying `type`, `choices` and `several.ok` attributes.
	# NOTE(review): the three character parameters default to NULL (what `c()`
	# evaluates to); recent R versions warn on `structure(NULL, *)` -- confirm the
	# resulting defaults are what the parameter validation expects.
	parameters_and_defaults <- local({
	  # Attach the annotation attributes to a single default value.
	  describe <- function(default, type, several_ok) {
	    structure(default, type = type, choices = NA, several.ok = several_ok)
	  }

	  list(
	    batch = describe(NULL, "character", FALSE),
	    features = describe(NULL, "character", TRUE),
	    include_methods = describe(NULL, "character", TRUE),
	    compare = describe(TRUE, "logical", FALSE)
	  )
	})
	```

	```{r parameter-merge, include = FALSE}
	# Read the options stored under the module name, pick this section's entry,
	# and validate it against the declared defaults.
	local_params <- ReporteR.base::validate_params(
	  getOption(module)[[section]],
	  parameters_and_defaults
	)
	```

	```{r scRNAseq-normalization-load, include=FALSE, eval = !exists("object_filtered")}
	# Load the filtered object from disk (this chunk only runs when
	# `object_filtered` does not exist yet, see the chunk's `eval` option).
	#
	# Use the asserting variant: `is_existing_file()` merely returns a logical
	# that was discarded here, so a missing file was never reported by the check.
	assertive.files::assert_all_are_existing_files(
	  managed_objects$paths$object_filtered$path
	)

	# Read the serialized object and pass it through flag_persistent().
	object_filtered <- readRDS(managed_objects$paths$object_filtered$path) %>%
	  ReporteR.base::flag_persistent()
	```

	## Data normalization

	The data normalization step strives to ensure that comparisons of relative expression between cells are valid and that systematic technical effects have minimal impact on downstream analyses. It is therefore a critical and essential step, as it determines the validity of downstream quantitative analyses.

	Normalization of count data from scRNA-seq experiments seeks to correct for cell-specific biases, e.g. cell-to-cell differences in capture efficiency, sequencing depth, and other technical confounders.
	A simple approach would be the scaling of counts to remove differences in library sizes between cells, i.e., library size normalization. Current RNA-seq analysis methods typically standardize data between samples by scaling the number of reads in a given sample to a common value across all sequenced samples in the experiment. However, this is not straightforward for noisy single-cell data where many counts are zero. The underlying assumption is that most genes are not differentially expressed across the sampled cells. Counts are scaled so that there is, on average, no fold-difference in expression between cells for the majority of genes. This is the underlying concept of commonly used methods such as trimmed mean of M values [@robinson_tmm_2010] normalization.

	```{r scRNAseq-normalization-A-params, echo = FALSE, include = FALSE, R.options = params}
	# ggplot2 theme for the normalization PCA figures: blank backgrounds, thin
	# grey grid lines, small text, and a horizontal legend on a white background.
	theme_norm_pca <- ggplot2::theme(
	  plot.background = ggplot2::element_blank(),
	  panel.grid.major = ggplot2::element_line(size = 0.2, colour = "grey"),
	  panel.grid.minor = ggplot2::element_line(size = 0.1, colour = "grey"),
	  panel.border = ggplot2::element_blank(),
	  panel.background = ggplot2::element_blank(),
	  axis.line.x = ggplot2::element_line(size = 0.3),
	  axis.line.y = ggplot2::element_line(size = 0.3),
	  axis.text = ggplot2::element_text(size = 4),
	  axis.title.y = ggplot2::element_text(
	    size = 5,
	    margin = ggplot2::margin(0, -2, 0, 0)
	  ),
	  axis.title.x = ggplot2::element_text(
	    size = 5,
	    margin = ggplot2::margin(-2, 0, 0, 0)
	  ),
	  legend.text = ggplot2::element_text(size = 5),
	  legend.title = ggplot2::element_text(size = 5),
	  plot.title = ggplot2::element_text(face = "bold", color = "black", size = 5),
	  legend.key.size = grid::unit(2, "mm"),
	  # An earlier variant used grid::unit(-50, "mm") for the legend margin.
	  legend.margin = ggplot2::margin(b = 2),
	  legend.position = c(0, 1),
	  legend.justification = c(0, 0),
	  legend.background = ggplot2::element_rect(fill = "white"),
	  legend.direction = "horizontal"
	)

	# Figure captions for the PCA panels (A)-(D) and the extra panels (E)-(G).
	caption_norm_pca <- "(A) Biplot of components 1 (\'Dimension 1\') vs. 2 (\'Dimension 2\') of principal component analysis [@ringner_2008] using non-normalized count values, along with (B) the percentage of variance explained by several factors. (C) Biplot of components 1 (\'Dimension 1\') vs. 2 (\'Dimension 2\') of principal component analysis using normalized expression values, along with (D) the percentage of variance explained by several factors."
	caption_norm_pca_extra <- "(E) - (G) Visual representation of several factors of interest onto the biplot from (C)."
	```

	```{r scRNAseq-normalization-A-sumfactors, eval = TRUE, echo = FALSE, include = FALSE, R.options = params}
	# Locate the packaged sum-factors child document and knit it to the markdown
	# path derived from it.
	rmd_path <- system.file(
	  "content", "03-normalization-A-sumfactors.Rmd",
	  package = "ReporteR.scRNAseq",
	  mustWork = TRUE
	)
	md_path <- ReporteR.base::make_md_path(rmd_path)
	knitr::knit_child(rmd_path, output = md_path)
	```

	```{r scRNAseq-normalization-A-sumfactors-include, echo = FALSE, eval = ifelse(exists('local_params'), 'norm_sumfactor' %in% local_params$include_methods, FALSE), results="asis"}
	# Re-derive the markdown path that belongs to the sum-factors child document.
	rmd_path <- system.file(
	  "content", "03-normalization-A-sumfactors.Rmd",
	  package = "ReporteR.scRNAseq",
	  mustWork = TRUE
	)
	md_path <- ReporteR.base::make_md_path(rmd_path)

	# Require a readable markdown file, then emit its lines verbatim (the chunk
	# is declared with results = "asis").
	assertive.files::assert_all_are_readable_files(md_path)
	cat(readLines(md_path), sep = "\n")
	```

	```{r scRNAseq-normalization-B-TMM, echo = FALSE, include = FALSE, R.options = params}
	# Locate the packaged TMM child document and knit it to the markdown path
	# derived from it.
	rmd_path <- system.file(
	  "content", "03-normalization-B-TMM.Rmd",
	  package = "ReporteR.scRNAseq",
	  mustWork = TRUE
	)
	md_path <- ReporteR.base::make_md_path(rmd_path)
	knitr::knit_child(rmd_path, output = md_path)
	```

	```{r scRNAseq-normalization-B-TMM-include, echo = FALSE, eval = ifelse(exists('local_params'), 'norm_TMM' %in% local_params$include_methods, FALSE), results="asis"}
	# Re-derive the markdown path that belongs to the TMM child document.
	rmd_path <- system.file(
	  "content", "03-normalization-B-TMM.Rmd",
	  package = "ReporteR.scRNAseq",
	  mustWork = TRUE
	)
	md_path <- ReporteR.base::make_md_path(rmd_path)

	# Require a readable markdown file, then emit its lines verbatim (the chunk
	# is declared with results = "asis").
	assertive.files::assert_all_are_readable_files(md_path)
	cat(readLines(md_path), sep = "\n")
	```

	```{r scRNAseq-normalization-C-scnorm, echo = FALSE, include = FALSE, R.options = params, eval = exists('local_params') && length(local_params[['batch']]) > 0}
	# Knit the scnorm child document only when a `batch` parameter was supplied.
	# The former condition, `'batch' %in% local_params`, searched the parameter
	# *values* for the string "batch" instead of inspecting the `batch` entry.
	# NOTE(review): assumes an unset `batch` is NULL or zero-length after
	# validate_params() -- confirm against ReporteR.base.
	rmd_path <- system.file(
	  "content", "03-normalization-C-scnorm.Rmd",
	  package = "ReporteR.scRNAseq",
	  mustWork = TRUE
	)
	md_path <- ReporteR.base::make_md_path(rmd_path)
	knitr::knit_child(rmd_path, output = md_path)
	```

	```{r scRNAseq-normalization-C-scnorm-include, echo = FALSE, eval = ifelse(exists('local_params'), 'norm_scnorm' %in% local_params$include_methods, FALSE), results="asis"}
	# Re-derive the markdown path that belongs to the scnorm child document.
	rmd_path <- system.file(
	  "content", "03-normalization-C-scnorm.Rmd",
	  package = "ReporteR.scRNAseq",
	  mustWork = TRUE
	)
	md_path <- ReporteR.base::make_md_path(rmd_path)

	# Require a readable markdown file, then emit its lines verbatim (the chunk
	# is declared with results = "asis").
	assertive.files::assert_all_are_readable_files(md_path)
	cat(readLines(md_path), sep = "\n")
	```

	```{r scRNAseq-normalization-Z-compare, echo = FALSE, include = FALSE, R.options = params}
	# Locate the packaged comparison child document and knit it to the markdown
	# path derived from it.
	rmd_path <- system.file(
	  "content", "03-normalization-Z-compare.Rmd",
	  package = "ReporteR.scRNAseq",
	  mustWork = TRUE
	)
	md_path <- ReporteR.base::make_md_path(rmd_path)
	knitr::knit_child(rmd_path, output = md_path)
	```

	```{r scRNAseq-normalization-Z-compare-include, echo = FALSE, eval = ifelse(exists('local_params'), local_params$compare, FALSE), results="asis"}
	# Re-derive the markdown path that belongs to the comparison child document.
	rmd_path <- system.file(
	  "content", "03-normalization-Z-compare.Rmd",
	  package = "ReporteR.scRNAseq",
	  mustWork = TRUE
	)
	md_path <- ReporteR.base::make_md_path(rmd_path)

	# Require a readable markdown file, then emit its lines verbatim (the chunk
	# is declared with results = "asis").
	assertive.files::assert_all_are_readable_files(md_path)
	cat(readLines(md_path), sep = "\n")
	```

	```{r scRNAseq-normalization-terminal-cleanup, include = FALSE}
	# Write the (possibly updated) filtered object back to its managed path.
	saveRDS(
	  object = object_filtered,
	  file = managed_objects$paths$object_filtered$path
	)

	# Hand cleanup of the session over to ReporteR.base.
	ReporteR.base::purge_nonpersistent()
	```