Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
mmRmeta/R/preprocess_assay.R
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
283 lines (265 sloc)
14.9 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#' Preprocessing SE Data | |
#' @description This function uses multiple filtering functions on expression data in SE. | |
#' @param summarized_experiment SummarizedExperiment Object with gene expression | |
#' @param assay_index Integer - index of assay in SE on which the filtering is applied to | |
#' @import SummarizedExperiment | |
#' @examples | |
#' filteredBreastBRCA <- preprocess_assay(breastBRCA, assay_index = 1) #filtering is applied to first assay | |
#' @return SummarizedExperiment with probably less genes than before. | |
#' @export | |
preprocess_assay <- function(summarized_experiment, assay_index = 1, ... ) { | |
#option to exclude filter functions by index or by name | |
summarized_experiment <- filter_duplicates(summarized_experiment) | |
summarized_experiment <- filter_rare(summarized_experiment, assay_index = assay_index, ... ) | |
return(summarized_experiment) | |
} | |
#' Filter Duplicated Genes | |
#' @description This function filters out duplicated gene/gene names. | |
#' @param summarized_experiment Matrix, data frame or SE, must contain rownames | |
#' @import SummarizedExperiment | |
#' @examples | |
#' filteredBreastBRCA <- filter_duplicates(breastBRCA, assay_index = 1) #filtering is applied to first assay | |
#' @return SummarizedExperiment with probably less genes than before. | |
#' @export | |
filter_duplicates <- function(summarized_experiment) { | |
summarized_experiment <- summarized_experiment[unique(rownames(summarized_experiment))] | |
return(summarized_experiment) | |
} | |
#' Filter 0 Sum Rows | |
#' @description This function filters rows with row sums of zero (no expression) | |
#' @param summarized_experiment Matrix like object (data frame etc.) or SummarizedExperiment | |
#' @param assay_index Integer - index of assay of gene expression if input object is a SummarizedExperiment. | |
#' @details This function is somewhat redundant, when you use \code{\link{filter_zero}} rows with zero sums are filtered out as well. | |
#' @import SummarizedExperiment | |
#' @examples | |
#' filteredBreastBRCA <- filter_zero(breastBRCA, assay_index = 1) #filtering is applied to first assay | |
#' @return SummarizedExperiment with probably less genes than before. | |
#' @export | |
#further ideas = instead of sum use any | |
filter_zero <- function(summarized_experiment, assay_index = 1) { | |
if (class(summarized_experiment) %in% c("RangedSummarizedExperiment", "SummarizedExperiment")) { | |
expressionMatrix <- (assay(summarized_experiment, assay_index)) #assay to matrix | |
rowSum <- apply(X = expressionMatrix, MARGIN = 1, sum) #sum of every row | |
notZero <- which(rowSum > 0) | |
summarized_experiment <- summarized_experiment[names(notZero),] | |
} else { | |
rowSum <- apply(X = summarized_experiment, MARGIN = 1, sum) | |
notZero <- which(rowSum > 0) | |
summarized_experiment <- summarized_experiment[names(notZero),] | |
} | |
return(summarized_experiment) | |
} | |
#' Filter Infrequently Expressed Genes | |
#' @description This function filters genes that are expressed distinguished by a given percentage. Less expressed than minimum percentage | |
#' @param summarized_experiment SummarizedExperiment or any matrix like object with rownames | |
#' @param assay_index Integer - index of assay of gene expression if input object is a SummarizedExperiment | |
#' @param threshold Double - minimum percentage of expression values of a gene that are higher than 0 | |
#' @param min_counts Integer - minimum value of gene counts | |
#' @param min_percentage Double - percentage of samples with counts greater than 0 that need to have gene counts equal or above min_value | |
#' @details The Arguments min_counts and min_percentage allow for a more strict filtering. One is able to exclude genes with small expression values. | |
#' @import SummarizedExperiment | |
#' @examples | |
#' filteredBreastBRCA <- filter_rare(breastBRCA, assay_index = 1, threshold = 0.10, min_counts = 5, min_percentage = 0.20) | |
#' #All genes with more than 90% of samples having zero counts are excluded. Moreover, only genes with 20% of their samples having expression counts greater than 5 are kept. | |
#' @return SummarizedExperiment with probably less genes than before | |
#' @export | |
filter_rare <- function(summarized_experiment, assay_index = 1, threshold = 0.05, min_counts = 5, min_percentage = 0.10) { | |
if (class(summarized_experiment) %in% c("RangedSummarizedExperiment", "SummarizedExperiment")) { | |
expressionMatrix <- assay(summarized_experiment, assay_index) #assay to matrix | |
} else { | |
expressionMatrix <- summarized_experiment #assay to matrix | |
} | |
zeroPercentage <- apply(X = expressionMatrix, MARGIN = 1, mmRmeta::not_zero) #return percentage of counts > 0 | |
aboveThreshold <- zeroPercentage >= threshold #is percentage of samples with expression greater than 0 >= the threshold | |
isAbove <- aboveThreshold[aboveThreshold == TRUE] #subset if true | |
expressionMatrix <- expressionMatrix[names(isAbove),] | |
abovePercentage <- apply(X = expressionMatrix, MARGIN = 1, function(x) mmRmeta::above_percentage(x, min_counts = min_counts, min_percentage = min_percentage)) #logical vector of how much % of counts greater than 0 are also greater than min_counts? | |
isAbovePercentage <- abovePercentage[abovePercentage == TRUE] | |
#union | |
#different subset for matrix/df or RangedSummarizedExperiment | |
if (class(summarized_experiment) %in% c("RangedSummarizedExperiment", "SummarizedExperiment")) { | |
summarized_experiment <- summarized_experiment[names(isAbovePercentage),] | |
return(summarized_experiment) | |
} else { | |
expressionMatrix <- expressionMatrix[names(isAbovePercentage),] | |
return(expressionMatrix) | |
} | |
} | |
#' Percentage of Zero in Vector | |
#' @description This function filters genes that are expressed distinguished by a given percentage. Less expressed than minimum percentage | |
#' @param vector Numeric - numeric vector | |
#' @return Returns double/percentage of zeros in the vector | |
#' @export | |
not_zero <- function(vector) { | |
percentage <- sum(vector != 0)/length(vector) | |
return(percentage) | |
} | |
#' Percentage of Zero in Vector | |
#' @description This function filters genes that are expressed distinguished by a given percentage. Less expressed than minimum percentage | |
#' @param vector Numeric - numeric vector | |
#' @return Returns double/percentage of zeros in the vector | |
#' @export | |
above_percentage <- function(vector, min_counts, min_percentage) { | |
percentage <- sum(vector[vector > 0] >= min_counts) / length(vector[vector > 0]) #how much % of counts greater than 0 are also greater than min_counts? | |
isAbove <- percentage >= min_percentage | |
return(isAbove) | |
} | |
#' Filter Specific Genes on Chromosome | |
#' @description This function filters genes that are located on specific chromosomes. | |
#' @param summarized_experiment SummarizedExperiment or matrix like object. | |
#' @param chromosome Name of chromosomes to be filtered out | 1-24, X, Y | |
#' @param df_biomart Data frame containing biomaRt results | |
#' @details Gene names have to be in rows and as ensembl gene IDs. e.g ENSG00000091831. If there is already a data frame with genes and their location on the chromosome, you can use that df instead. | |
#' That data frame needs to have the columns "ensembl_gene_id" and "chromosome_name". | |
#' @return | |
#' @import SummarizedExperiment | |
#' @importFrom biomaRt | |
#' @export | |
filter_chromosome <- function(summarized_experiment, chromosome = NULL, df_biomart) { | |
} | |
#' Silverman Test | |
#' @description This function applies the \code{\link[silvermantest]{silverman.test}} on every row (gene) and if wanted already filters statistical signifcant genes. | |
#' @param summarized_experiment SummarizedExperiment or any matrix like object with rownames#' | |
#' @param assay_index Integer - index of assay of gene expression if input object is a SummarizedExperiment | |
#' @param filter Logical - filter out statistically not significant genes, threshold set by alpha | |
#' @param alpha Double - significance level | |
#' @param p.adjust Double - percentage of samples with counts greater than 0 that need to have gene counts equal or above min_value | |
#' @param method Character - methods do adjust p-values | Default = "holm" | |
#' @param ignore_zero Logical - ignore vectors with only zero values, test will throw an error otherwise | |
#' @param parallel Logical - calculation in parallel? | |
#' @param no_cluster Integer - number of clusters for parallel computation | |
#' @param ... further arguments for silverman.test() | |
#' @details p.adjust.methods: | |
# c("holm", "hochberg", "hommel", "bonferroni", "BH", "BY", "fdr", "none") | |
#' @import SummarizedExperiment | |
#' @importFrom silvermantest silverman.test | |
#' @examples | |
#' @return If filter == F then a vector of p-values is returned, otherwise the filtered input object | |
#' @export | |
use_silvermantest <- function(summarized_experiment, assay_index = 1, filter = TRUE, alpha = 0.05, p.adjust = FALSE, method = NULL, ignore_zero = TRUE, parallel = TRUE, no_cluster = NULL, ...) { | |
#extract assay/matrix | |
if (class(summarized_experiment) == "RangedSummarizedExperiment") { | |
expressionMatrix <- assay(summarized_experiment, assay_index) #assay to matrix | |
} else { | |
expressionMatrix <- summarized_experiment | |
} | |
systemInfo <- Sys.info() | |
#compute diptest in parallel or seriell | |
if (parallel) { | |
cl <- mmRmeta::get_no_cores(no_cluster = no_cluster) | |
doSNOW::registerDoSNOW(cl) #start Cluster | |
on.exit(snow::stopCluster(cl)) #stop cluster on exit | |
if (systemInfo["sysname"] == "Windows") { | |
#start of parallel computation of silvermantest | |
pValues <- foreach(i = seq_len(nrow(expressionMatrix)), .combine = "c") %dopar% { | |
#all zero values are ignored from computation | |
if (ignore_zero) { | |
vector <- Filter(function(x) x != 0, x = expressionMatrix[i,]) | |
#security measure if all values are equal, silverman.test will otherwise throw an error | |
if (min(vector) == max(vector)) { | |
silverP <- 1 | |
} else { | |
silverP <- tryCatch({silvermantest::silverman.test(x = Filter(function(x) x != 0, x = expressionMatrix[i, ]), ... )@p_value}) | |
} | |
} else { | |
silverP <- tryCatch({silvermantest::silverman.test(x = expressionMatrix[i, ], ... )@p_value}) | |
} | |
return(silverP) #result of parallel task | |
} | |
} else { | |
#linux stuff | |
} | |
} else { | |
pValues <- vector(mode = "numeric", length = nrow(expressionMatrix)) | |
for (gene in seq_len(nrow(expressionMatrix))) { | |
if (ignore_zero) { | |
silverTest <- silvermantest::silverman.test(x = Filter(function(x) x != 0, x = expressionMatrix[gene, ]), ... )@p_value | |
pValues[gene] <- silverTest | |
} else { | |
silverTest <- silvermantest::silverman.test(as.numeric(x = expressionMatrix[gene, ]), ... )@p_value | |
pValues[gene] <- silverTest | |
} | |
} | |
} | |
#adjust pValues if TRUE | |
if (p.adjust) { | |
if (is.null(method)) { | |
method <- "holm" | |
} | |
pValues <- tryCatch({use_p_adjust(p_values = pValues, method = method)}) | |
} | |
#filter input data directly if filter == TRUE by threshold of alpha | |
if (filter) { | |
significant <- pValues <= alpha #logic vector if the pvalues <= threshhold(alpha) | |
summarized_experiment <- summarized_experiment[significant,] | |
return(summarized_experiment) | |
} else { | |
return(pValues) | |
} | |
} #thrwows error when x = all the same values | |
#' DipTest | |
#' @description This function applies the silvermantest on every row (gene) | |
#' @param summarized_experiment SummarizedExperiment or any data frame like object. | |
#' @param assay_index Integer - index of assay of gene expression if input object is a SummarizedExperiment | |
#' @param ignore_zero Logical - ignore zero values | |
#' @param filter Logical - filter input data by alpha | |
#' @param alpha Double - significance level | p-values equal or below alpha are kept | |
#' @param p.adjust Logical - use p.adjust | |
#' @param method Character - method used in p.adjust | |
#' @param parallel Logical - use parallel computation of test | |
#' @param no_cluster Integer - number of clusters used in parallelization | Default (number of cores - 1) | |
#' @param ... further arguments for diptest(B) | |
#' @import SummarizedExperiment | |
#' @import snow | |
#' @import foreach | |
#' @import doParallel | |
#' @details | |
#' When applying the test on data with many zero values, the test most likely will be significant anyway. Those zero values are excluded from the diptest. But you can set set an expection with "..:". Meaning that when there are zero values but a percentage of samples | |
#' is above a certain value ":::" then the diptest is applied on the whole data set. | |
#' If filter is set to TRUE the function returns an object with the same class as summarized_experiment. Otherwise a vector of p-values is returned. | |
#' @examples | |
#' @return Returns either a vector of pValues or a filtered input object. | |
use_diptest <- function(summarized_experiment, assay_index = 1, filter = TRUE, alpha = 0.05, ignore_zero = TRUE, ignore_percentage = 0.2, ignore_threshold = NULL, p.adjust = FALSE, method = NULL, parallel = TRUE, no_cluster = NULL, ...) { | |
if (class(summarized_experiment) %in% c("SummarizedExperiment", "RangedSummarizedExperiment")) { | |
expressionMatrix <- assay(summarized_experiment, assay_index) #assay to matrix | |
} else { | |
expressionMatrix <- summarized_experiment | |
} | |
systemInfo <- Sys.info() | |
#compute diptest in parallel or seriell | |
if (parallel ) { | |
cl <- mmRmeta::get_no_cores(no_cluster = no_cluster) | |
doSNOW::registerDoSNOW(cl) #start Cluster | |
on.exit(snow::stopCluster(cl)) #stop cluster on exit | |
if (systemInfo["sysname"] == "Windows") { | |
pValues <- foreach(i = seq_len(nrow(expressionMatrix)), .combine = "c", .packages = c("foreach", "diptest")) %dopar% | |
if (ignore_zero) { tryCatch({diptest::dip.test(x = Filter(function(x) x != 0, x = expressionMatrix[i, ]), simulate.p.value = TRUE, ... )$p.value}) #filter out all zero | |
} else { tryCatch({diptest::dip.test(expressionMatrix[i, ], simulate.p.value = TRUE, ... )$p.value}) | |
} | |
} #else { | |
# pValues <- foreach(i = seq_len(nrow(expressionMatrix)), .combine = "c", .packages = c("foreach", "diptest")) %dopar% | |
# diptest::dip.test(x = expressionMatrix[i, ], simulate.p.value = TRUE, ... )$p.value | |
#} | |
} else { | |
pValues <- vector(mode = "numeric", length = nrow(expressionMatrix)) | |
for (gene in seq_len(nrow(expressionMatrix))) { | |
diptest <- diptest::dip.test(as.numeric(x = expressionMatrix[gene, ]), simulate.p.value = T, ... )$p.value | |
pValues[gene] <- diptest | |
} | |
} | |
#adjust pValues if TRUE | |
if (p.adjust) { | |
if (is.null(method)) { | |
method <- "holm" | |
} | |
pValues <- mmRmeta::use_p_adjust(p_values = pValues, method = method) | |
} | |
#filter input data directly if filter == TRUE by threshold of alpha | |
if (filter) { | |
significant <- pValues <= alpha #logic vector if the pvalues <= threshhold(alpha) | |
summarized_experiment <- summarized_experiment[significant,] | |
return(summarized_experiment) | |
} else { | |
return(pValues) | |
} | |
} |