diff --git a/.gitignore b/.gitignore index 5b6a065..c833a2c 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,4 @@ .Rhistory .RData .Ruserdata +inst/doc diff --git a/DESCRIPTION b/DESCRIPTION index 7cfce80..3897b52 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -18,4 +18,7 @@ Imports: LaplacesDemon, plyr, mclust, diptest -Suggests: genefilter +Suggests: genefilter, + knitr, + rmarkdown +VignetteBuilder: knitr diff --git a/NAMESPACE b/NAMESPACE index e35a324..f85f09b 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -2,7 +2,6 @@ export(DipTestOverP) export(SilvermanOverP) -export(createValidationData) export(evaluateAlgorithmOutput) export(getClassifications) export(getParam) @@ -16,9 +15,26 @@ export(useFlexmix) export(useHdbscan) export(useMclust) exportClasses(Params) +import(dbscan) +import(diptest) +import(flexmix) +import(mclust) +import(silvermantest) importFrom(LaplacesDemon,rinvgaussian) importFrom(checkmate,assertCharacter) importFrom(checkmate,assertClass) importFrom(checkmate,assertList) importFrom(checkmate,assertString) +importFrom(methods,"slot<-") +importFrom(methods,new) +importFrom(methods,rbind2) +importFrom(methods,slot) +importFrom(methods,slotNames) +importFrom(methods,validObject) +importFrom(plyr,compact) +importFrom(stats,dnorm) +importFrom(stats,rgamma) +importFrom(stats,rnorm) +importFrom(stats,runif) +importFrom(stats,sd) importFrom(utils,head) diff --git a/R/algorithmSpecificFunctions.R b/R/algorithmSpecificFunctions.R index 27880fc..3c4bc5d 100644 --- a/R/algorithmSpecificFunctions.R +++ b/R/algorithmSpecificFunctions.R @@ -3,31 +3,45 @@ #' Use the mclust algorithm for generating output to evaluate the #' classification (unimodal,bimodal or other) of genes. #' -#' -#' #' @param expression (Simulated) Gene Expression data frame which genes shall #' be classified as unimodal, bimodal or other +#' @param clusterNumbers span of the clusters to be calculated at maximum #' @param verbose logical. Whether to print progress messages #' @param parallel logical. Whether to calculate the mclust models in parallel #' @return The formatted output of the mclust algorithm #' #' @examples -#' params <- newParams(nGenes = list("1"=100,"2"=list("gauss"= 100,"gamma"= 100))) +#' params <- newParams(nGenes = list("1"=10,"2"=list("gauss"= 10,"gamma"= 10))) #' simulation <- simulateExpression(params= params,verbose=FALSE) #' expression <- simulation$expressionData #' #with messages printed to console #' mclustOutput <- useMclust(expression = expression) -#' #without messages printed to console -#' mclustOutput <- useMclust(expression = expression, verbose = FALSE) +#' #without messages printed to console, at maximum 3 clusters +#' mclustOutput <- useMclust(expression = expression, clusterNumbers = c(1:3), +#' verbose = FALSE, parallel = FALSE) #' #' @import mclust #' @export -useMclust <- function(expression,verbose = TRUE,parallel = FALSE) { +#' +useMclust <- function(expression,clusterNumbers = NULL,verbose = TRUE,parallel = FALSE) { if (verbose) {message("mclust-models are calculated...")} - - if(parallel==FALSE){ + if(is.null(clusterNumbers)){ + if(parallel==FALSE){ + models <- lapply(1:nrow(expression), function(i) { + mclust::Mclust((expression[i,]),modelNames=c("V"),verbose = FALSE) }) + } + if(parallel==TRUE){ + n.cores = parallel::detectCores()-1 + cl = parallel::makeCluster(n.cores) + parallel::clusterExport(cl, "expression") + parallel::clusterEvalQ(cl, library(mclust)) + models <- parallel::parLapply(cl, 1:nrow(expression), function(i) { + mclust::Mclust((expression[i,]), modelNames=c("V")) }) + } + }else{ + if(parallel==FALSE){ models <- lapply(1:nrow(expression), function(i) { - mclust::Mclust((expression[i,]),modelNames=c("V"),verbose = FALSE) }) + mclust::Mclust((expression[i,]),modelNames=c("V"),verbose = FALSE,G = clusterNumbers) }) } if(parallel==TRUE){ n.cores = parallel::detectCores()-1 @@ -35,8 +49,10 @@ useMclust <- function(expression,verbose = TRUE,parallel = FALSE) { parallel::clusterExport(cl, "expression") parallel::clusterEvalQ(cl, library(mclust)) models <- parallel::parLapply(cl, 1:nrow(expression), function(i) { - mclust::Mclust((expression[i,]), modelNames=c("V")) }) + mclust::Mclust((expression[i,]), modelNames=c("V"),G = clusterNumbers) }) } + } + if(parallel==TRUE){parallel::stopCluster(cl)} names(models) = rownames(expression) patients <- names(expression) @@ -51,15 +67,12 @@ useMclust <- function(expression,verbose = TRUE,parallel = FALSE) { #' Using the output of the mclust algorithm for generating output in a unified #' format to evaluate the classification (unimodal,bimodal or other) of genes. #' -#' #' @param models Unformatted output of the Mclust function from the mclust #' package of the genes from the gene expression data frame #' @param patients List of the patients #' #' @return The formatted output of the mclust algorithm #' -#' -#' mclustClassification <- function(models,patients){ genes <- names(models) @@ -94,8 +107,6 @@ mclustClassification <- function(models,patients){ #' Use the HDBSCAN algorithm for generating output to evaluate the #' classification (unimodal,bimodal or other) of genes. #' -#' -#' #' @param expression (Simulated) Gene Expression data frame which genes shall #' be classified as unimodal, bimodal or other #' @param minPts Integer; Minimum size of clusters. See details of ?hdbscan @@ -104,7 +115,7 @@ mclustClassification <- function(models,patients){ #' @return hdbscanOutput The formatted output of the HDBSCAN clustering #' #' @examples -#' params <- newParams(nGenes = list("1"=100,"2"=list("gauss"= 100,"gamma"= 100))) +#' params <- newParams(nGenes = list("1"=10,"2"=list("gauss"= 10,"gamma"= 10))) #' simulation <- simulateExpression(params= params,verbose=FALSE) #' expression <- simulation$expressionData #' #with messages printed to console @@ -116,6 +127,7 @@ mclustClassification <- function(models,patients){ #' #' @import dbscan #' @export +#' useHdbscan <- function(expression,minPts = 10,verbose = TRUE){ if(verbose){message("HDBSCAN-clusters are calculated...")} genes <- rownames(expression) @@ -187,7 +199,6 @@ useHdbscan <- function(expression,minPts = 10,verbose = TRUE){ #' @param error The difference of a value predicted by a model and an actually #' observed value #' -#' rmse <- function(error){ error <- unlist(error) error2 <- unlist(lapply(1:length(error),function(i){if(!is.na(error[[i]])){c(error[[i]])}})) @@ -208,7 +219,7 @@ rmse <- function(error){ #' @return flexmixOutput The formatted output of the best fitting models from flexmix #' #' @examples -#' params <- newParams(nGenes = list("1"=100,"2"=list("gauss"= 100,"gamma"= 100))) +#' params <- newParams(nGenes = list("1"=75,"2"=list("gauss"= 75,"gamma"= 75))) #' simulation <- simulateExpression(params= params,verbose=FALSE) #' expression <- simulation$expressionData #' #with messages printed to console @@ -219,6 +230,7 @@ rmse <- function(error){ #' @import flexmix #' @importFrom plyr compact #' @export +#' useFlexmix <- function(expression,maxModality, verbose = TRUE,parallel=FALSE){ modelGauss <- c() modelGamma <- c() diff --git a/R/evaluate.R b/R/evaluate.R index 2366b11..e56ca4b 100644 --- a/R/evaluate.R +++ b/R/evaluate.R @@ -66,7 +66,7 @@ isApproxSame<- function(validationVal,classVal,maxDistance){ #' value and the validation data value with which the estimated is classified #' as TRUE (correctly estimated) #' @examples -#' params <- newParams(nGenes = list("1"=100,"2"=list("gauss"= 100,"gamma"= 100))) +#' params <- newParams(nGenes = list("1"=10,"2"=list("gauss"= 10,"gamma"= 10))) #' simulation <- simulateExpression(params= params,verbose=FALSE) #' expression <- simulation$expressionData #' validationData <- simulation$validationData @@ -137,7 +137,7 @@ evaluateAlgorithmOutput <- function(modality,algorithmOutput,validationData,maxD #' @param validationData The validationData behind the simulated gene #' expression data frame #' @examples -#' params <- newParams(nGenes = list("1"=100,"2"=list("gauss"= 100,"gamma"= 100))) +#' params <- newParams(nGenes = list("1"=10,"2"=list("gauss"= 10,"gamma"= 10))) #' simulation <- simulateExpression(params= params,verbose=FALSE) #' expression <- simulation$expressionData #' validationData <- simulation$validationData @@ -187,7 +187,7 @@ getClassification <- function(modality,algorithmOutput,algorithmName="mclust",va #' @param algorithmName The name of the algorithm which created the output #' @return A data frame with the percentual proportion of as TRUE or FALSE evaluated values #' @examples -#' params <- newParams(nGenes = list("1"=100,"2"=list("gauss"= 100,"gamma"= 100))) +#' params <- newParams(nGenes = list("1"=10,"2"=list("gauss"= 10,"gamma"= 10))) #' simulation <- simulateExpression(params= params,verbose=FALSE) #' expression <- simulation$expressionData #' validationData <- simulation$validationData @@ -213,24 +213,21 @@ getStatistic <- function(evaluation,algorithmName ="mclust"){ #' @param evaluations list of named evaluations #' @return A data frame with the percentual proportion of as TRUE or FALSE evaluated values #' @examples -#' params <- newParams(nGenes = list("1"=100,"2"=list("gauss"= 100,"gamma"= 100))) +#' params <- newParams(nGenes = list("1"=10,"2"=list("gauss"= 10,"gamma"= 10))) #' simulation <- simulateExpression(params= params,verbose=FALSE) #' expression <- simulation$expressionData #' validationData <- simulation$validationData #' mclustOutput <- useMclust(expression = expression,verbose = FALSE) #' hdbscanOutput <- useHdbscan(expression = expression,verbose = FALSE) -#' flexmixOutput <- useFlexmix(expression = expression,maxModality = 2,verbose = FALSE) #' mclustEvaluation <- evaluateAlgorithmOutput(modality = 2,algorithmOutput = mclustOutput, #' validationData = validationData) #' hdbscanEvaluation <- evaluateAlgorithmOutput(modality = 2,algorithmOutput = hdbscanOutput, #' validationData = validationData) -#' flexmixEvaluation <- evaluateAlgorithmOutput(modality = 2,algorithmOutput = flexmixOutput, -#' validationData = validationData) #' #' #getStatistics of one algorithm -#' getStatistics(evaluations = list("mclust"=mclustEval)) +#' getStatistics(evaluations = list("mclust"=mclustEvaluation)) #' #getStatistics of two or more algorithms -#' getStatistics(evaluations = list("mclust"=mclustEval,"HDBSCAN"=hdbscanEval)) +#' getStatistics(evaluations = list("mclust"=mclustEvaluation,"HDBSCAN"=hdbscanEvaluation)) #' @export getStatistics <- function(evaluations) { numEvaluations <- length(evaluations) @@ -254,13 +251,12 @@ getStatistics <- function(evaluations) { #' @return a classification data frame that contains the number of #' FN, FP, TN and TP #' @examples -#' params <- newParams(nGenes = list("1"=100,"2"=list("gauss"= 100,"gamma"= 100))) +#' params <- newParams(nGenes = list("1"=10,"2"=list("gauss"= 10,"gamma"= 10))) #' simulation <- simulateExpression(params= params,verbose=FALSE) #' expression <- simulation$expressionData #' validationData <- simulation$validationData #' mclustOutput <- useMclust(expression = expression,verbose = FALSE) #' hdbscanOutput <- useHdbscan(expression = expression,verbose = FALSE) -#' flexmixOutput <- useFlexmix(expression = expression,maxModality = 2,verbose = FALSE) #' #' #getClassifications of one algorithm #' getClassifications(modality=2,validationData, diff --git a/R/filter.R b/R/filter.R index c50508f..48cbfca 100644 --- a/R/filter.R +++ b/R/filter.R @@ -6,13 +6,14 @@ #' @param na.rm Whether NAs should be removed. #' #' @return A function with bindings for p and R. +#' @import silvermantest #' #' @export SilvermanOverP <- function(p, R = 100, na.rm = T) { function(x) { if (na.rm) x <- x[!is.na(x)] - s1 <- silvermantest::silverman.test(x, k = 1, m = 1, R = R, adjust = T) + s1 <- silvermantest::silverman.test(x, k = 1, R = R, adjust = T) s1@p_value < p } } @@ -20,12 +21,12 @@ SilvermanOverP <- function(p, R = 100, na.rm = T) { #' DipTestOverP returns a filter function with bindings for P. #' This function evaluates to TRUE if the Hartigans Dip Test for unimodality can be rejected at the significance level P. #' -#' @param p -#' @param R -#' @param na.rm +#' @param p The P-value that is used to reject Hartigans Dip Test of unimodality ( +#' @param R The number of bootstrap replicates for each test. +#' @param na.rm Whether NAs should be removed. #' #' @return A function with bindings for p and R. -#' +#' @import diptest #' @export DipTestOverP <- function(p, R = 100, na.rm = T) { function(x) { diff --git a/R/params.R b/R/params.R index 91ba846..f68df39 100644 --- a/R/params.R +++ b/R/params.R @@ -47,34 +47,24 @@ setGeneric("setParam", #' The Params class defines the following parameters: #' #' \describe{ -#' \item{\code{nUniGenes}}{The number of genes to simulate with the -#' Unimodal scenario.} -#' \item{\code{nBiGenes}}{The number of genes to simulate with the -#' bimodal scenario.} -#' \item{\code{nGuGenes}}{The number of genes to simulate with the -#' gamma and unimodal scenario.} -#' \item{\code{nPatients}}{The number of patients to simulate.} +#' \item{\code{nGenes}}{A list containing the numbers of genes to simulate for +#' each scenario and modality.} +#' \item{\code{means}}{A list containing the parameter range for the mean for +#' each scenario and modality.} +#' \item{\code{foldChanges}}A list containing the parameter range for the log2 +#' foldChanges between distributions for multimodal genes. Has to contain one +#' entry less than the modality that is simulated.} +#' \item{\code{sd}}{The parameters mu and lambda for the generation of the +#' standard deviation of each distribution via an Inverse Gaussian distribution.} +#' \item{\code{gamma}}{The shape and rate parameter for generating the gamma +#' distributions.} +#' \item{\code{proportions}}{Parameters for the allowed proportions of +#' multimodal distributions.} +#' \item{\code{nPatients}}{The number of patients to simulate and how +#' many columns the simulated gene expression should have per gene.} #' \item{\code{seed}}{Seed to use for generating random numbers.} -#' \item{\code{sdParms}}{The parameters mu and lambda for the generation of the -#' distribution of the standard deviation for each szenario.} -#' \item{\code{uniMean}}{The mean span for the gaussian distribution of the -#' unimodal simulation scenario.} -#' \item{\code{biMean}}{The mean span for the gaussian distribution of the -#' bimodal simulation scenario.} -#' \item{\code{foldChange}}{The log2 foldChange between the modes in the -#' bimodal simuation scenario.} -#' \item{\code{proportions}}{The proporties of the different distributions of the -#' bimodal and gamma+unimodal simulation scenario.} -#' \item{\code{guMean}}{The mean span for the gaussian distribution of the -#' gamma+unimodal simulation scenario.} -#' \item{\code{gammaShape}}{The shape parameter for the mean gamma+unimodal -#' distribution.} -#' \item{\code{gammaRate}}{The rate parameter for the mean gamma+unimodal -#' distribution.} #' } #' -#' The parameters shown in brackets can be estimated from real data. -#' #' @name Params #' @rdname Params #' @aliases Params-class @@ -140,7 +130,8 @@ setMethod("show", "Params", function(object) { #' params <- newParams("nGenes"=list("1"=100,"2"=list("gauss"=100,"gamma"=100)), #' "proportions"=c(30,70)) #' params <- newParams( -#' nGenes=list("1"=c(700),"2"=list(gauss = 150, gamma = 150),"3"=list(gauss = 150, gamma = 150)), +#' nGenes=list("1"=c(700),"2"=list(gauss = 150, gamma = 150), +#' "3"=list(gauss = 150, gamma = 150)), #' means=list("1"= c(2, 4),"2"= c(2, 4),"3"= c(2, 4)), #' foldChanges = list("1"=NA,"2"= list(gauss = c(2, 4), gamma = c(2, 4)), #' "3"= list(gauss = list(c(2, 4), c(2, 4)), gamma = list(c(2, 4), c(2, 4))))) @@ -197,12 +188,13 @@ getParams <- function(params, names) { #' params <- newParams() #' params #' # Set individually -#' params <- params <- setParams(params,proportions = c(80,20),foldChanges = list("1"=c(2,4), -#' "2"= list("gauss"=c(2,5),"gamma"=c(3,5)))) +#' params <- params <- setParams(params,proportions = c(80,20), +#' foldChanges = list("1"=c(2,4),"2"= list("gauss"=c(2,5),"gamma"=c(3,5)))) #' params #' # Set via update list #' params <- setParams(params,update = list("nGenes"=list("1"=100, -#' "2"=list("gauss"=200,"gamma"=100)),"means"=list("1"=c(1,3),"2"=c(2,4)),"nPatients"=700)) +#' "2"=list("gauss"=200,"gamma"=100)),"means"=list("1"=c(1,3),"2"=c(2,4)), +#' "nPatients"=700)) #' params #' @export setParams <- function(params, update = NULL, ...) { diff --git a/R/simulation.R b/R/simulation.R index ef822e7..fdcbaa5 100644 --- a/R/simulation.R +++ b/R/simulation.R @@ -1,61 +1,38 @@ #' Simulation #' #' Simulate gene expression for the different simulation scenarios: Unimodal, -#' bimodal,gamma+unimodal +#' multimodal with gaussian distributions, multimodal starting with a gamma distribution and followed by gaussian distributions #' #' @param params Params object containing simulation parameters for all scenarios. #' @param verbose logical. Whether to print progress messages #' @param ... any additional parameter settings to override what is provided in \code{params}. #' #' @details -#' Scenario 1 - Unimodal gaussian distribution -#' Gene expression for the unimodal gaussian distribution (scenario 1) is +#' The "modus" is the number of occurring distributions within one gene. +#' Modus "1" - Unimodal gaussian distribution +#' Gene expression for the unimodal gaussian distribution (scenario "1") is #' simulated from the mean and the standard deviation taken from the validation data #' for this gene. The number of values that are simulated for each #' distribution equals the number of Patients {"nPatients"}: -#' \code{expression <- stats::rnorm(nPatients,as.numeric(valData[i,3]), as.numeric(valData[i,4]))} #' -#' Scenario 2 - Bimodal with two unimodal gaussian distributions -#' Gene expression for the bimodal scenario 2 (two gaussian distributions) is -#' simulated individually for each distribution. -#' The first distribution is simulated with the mean and the first standard -#' deviation taken from the validation data for this gene. The number of values that are -#' simulated ("nBi1") are calculated by multiplicating the number of Patients with the -#' first proportion and rounding this number off: -#' \code{data1 <- stats::rnorm(nBi1,as.numeric(valData[i,3]),as.numeric(valData[i,4]))} -#' The second distribution is simulated with the mean calculated by -#' multiplying the mean of the first distribution with the foldChange -#' from the validation data of this gene. The standard deviation is also taken from the validation data of this gene. The number of values that are -#' simulated ("nBi2") are calculated by multiplicating the number of Patients with the -#' second proportion and rounding this number up. -#' the mean and the standard deviation taken from the validation data for this gene. -#' The number of values that are simulated for each distribution equals the -#' number of Patients {"nPatients"}: -#' \code{data2 <- stats::rnorm(nBi2,foldBi,as.numeric(valData[i,5]))} -#' At the end both of the simulated distributions are put into one expression. -#' \code{expression <- c(data1,data2)} +#' All other modi consist of two different scenarios: +#' In the "gauss" scenario all occurring distributions are gaussian distributions. +#' In the "gamma" scenario the first occuring distribution is a gamma distribution. +#' All other distributions are gaussian distributions. #' -#' Scenario 3 - Bimodal with one Gamma Distribution and one unimodal gaussian -#' distribution -#' Gene expression for the bimodal scenario 3 (1 Gamma and 1 unimodal gaussian -#' distribution) is simulated individually for each distribution. -#' The first distribution, which is the Gamma distribution, is simulated with -#' the shape and rate parameter of the validation data for this gene. -#' The number of values that are simulated ("nGu1") are calculated by -#' multiplying the number of Patients with the first proportion and rounding -#' this number off. -#' \code{data1 <- stats::rgamma(n = nGu1,shape = as.numeric(valData[i,9]), -#' rate = as.numeric(valData[i,10]))} -#' The second distribution, which is the unimodal gaussian distribution is -#' simulated with the calculated mean, which is calculated by multiplying the -#' mean with the foldChange of the validation data of this gene. The standard -#' deviation is taken from the validation data of this gene. The number of values that -#' are simulated ("nGu2") are calculated by multiplying the number of Patients -#' with the second proportion and rounding this number up. -#' \code{data2 <- stats::rnorm(nGu2,foldGamma,as.numeric(valData[i,4]))} -#' At the end both of the simulated distributions are put into one expression. -#' \code{expression <- c(data1,data2)} +#' Modus "2" or higher - Bimodal distributions or higher modalities #' +#' Gene expressions of the multimodal genes are +#' simulated automatically for each distribution. +#' Scenario "gauss" +#' The different gaussian distributions are simulated with the corresponding mean +#' and a standard deviation and number of values taken from the validation data +#' for this gene. +#' +#' Scenario "gamma" +#' The gamma distribution is simulated with the gamma shape and rate values from +#' the params object. The other gaussian distributions are simulated with the values +#' from the validation data. #' #' @return A named list with two components: 'validationData' and 'expressionData' #' @@ -63,8 +40,8 @@ #' simulation <- simulateExpression(params = newParams()) #' # Override default parameters #' # Not recommended, better use setParams; See \code{\link{Params}} -#' simulation <- simulateExpression(nGenes = list("1"=100, -#' "2"=list("gauss"= 100,"gamma"= 100)), nPatients = 50) +#' simulation <- simulateExpression(nGenes = list("1"=10, +#' "2"=list("gauss"= 10,"gamma"= 10)), nPatients = 50) #' # without messages printed to the console #' params <- newParams() #' simulation <- simulateExpression(params = params,verbose = FALSE) @@ -140,9 +117,9 @@ simulateExpression <- function(params = newParams(), verbose = TRUE,...) { #' Creating the validation data for algorithms. #' -#' The validation data is generated by using the `Params` object and the parameters that -#' are stored in it. It is generated automatically as and saved as an Rdata -#' object when the simulate() function is executed +#' The validation data is generated by using the `Params` object and the parameters +#' that are stored in it. It is generated automatically when the +#' simulateExpression() function is executed #' @param params Params object containing simulation parameters for all scenarios. #' @param verbose logical. Whether to print progress messages #' @@ -156,34 +133,16 @@ simulateExpression <- function(params = newParams(), verbose = TRUE,...) { #' Default values describes the default values used to generate a `Params` #' object. #' -#' Parameter Type Usage Default values -#' -#' nUniGenes: numeric, Default value: 300 -#' -#' nBiGenes: numeric, Default value: 500 -#' -#' nGuGenes: numeric, Default value: 200 -#' -#' nPatients: numeric, Default value: 100 -#' -#' seed: numeric, Default value: sample(1:1e6,1) -#' -#' sdParms: vector, Default value:c(0.61,2.21), use values as c(µ,lambda) -#' -#' uniMean: vector, Default value: c(2,4), randomize from span of vector -#' -#' biMean: vector, Default value: c(2,5), randomize from span of vector -#' -#' foldChange: vector, Default value: c(2,4), randomize from span of vector -#' -#' proportions: vector, Default value: c(10,20,30,40,50,60,70,80,90), pick random value from vector -#' -#' guMean: vector, Default value: c(0.5,3), randomize from span of vector -#' -#' gammaShape: numeric, Default value: 2 -#' -#' gammaRate: numeric, Default value: 2 +#' Parameter Type Default values #' +#' nGenes: list, Default value: list("1"= 700, "2" = list("gauss"=150,"gamma"=150)), +#' means: list, Default value: list("1"=c(2,4),"2" = c(2,4)), +#' foldChanges: list, Default value: list("1" = c(2,4),"2" = list("gauss"= c(2,4),"gamma" = c(3,5))), +#' sd: list, Default value: list("mu"= 0.61,"lambda"=2.21), +#' gamma: list, Default value: list("shape" = 2, "rate" = 2), +#' proportions: vector, Default value: c(10,20,30,40,50,60,70,80,90), +#' nPatients: numeric, Default value: 200, +#' seed: numeric, Default value: sample(1:1e6, 1) #' #' @importFrom LaplacesDemon rinvgaussian #' diff --git a/man/DipTestOverP.Rd b/man/DipTestOverP.Rd index 0a5fa67..6c27416 100644 --- a/man/DipTestOverP.Rd +++ b/man/DipTestOverP.Rd @@ -8,7 +8,11 @@ This function evaluates to TRUE if the Hartigans Dip Test for unimodality can be DipTestOverP(p, R = 100, na.rm = T) } \arguments{ -\item{na.rm}{} +\item{p}{The P-value that is used to reject Hartigans Dip Test of unimodality (} + +\item{R}{The number of bootstrap replicates for each test.} + +\item{na.rm}{Whether NAs should be removed.} } \value{ A function with bindings for p and R. diff --git a/man/Params.Rd b/man/Params.Rd index f3d1569..0c9675f 100644 --- a/man/Params.Rd +++ b/man/Params.Rd @@ -8,38 +8,3 @@ \description{ Virtual S4 class that all other Params classes inherit from. } -\section{Parameters}{ - - -The Params class defines the following parameters: - -\describe{ - \item{\code{nUniGenes}}{The number of genes to simulate with the - Unimodal scenario.} - \item{\code{nBiGenes}}{The number of genes to simulate with the - bimodal scenario.} - \item{\code{nGuGenes}}{The number of genes to simulate with the - gamma and unimodal scenario.} - \item{\code{nPatients}}{The number of patients to simulate.} - \item{\code{seed}}{Seed to use for generating random numbers.} - \item{\code{sdParms}}{The parameters mu and lambda for the generation of the - distribution of the standard deviation for each szenario.} - \item{\code{uniMean}}{The mean span for the gaussian distribution of the - unimodal simulation scenario.} - \item{\code{biMean}}{The mean span for the gaussian distribution of the - bimodal simulation scenario.} - \item{\code{foldChange}}{The log2 foldChange between the modes in the - bimodal simuation scenario.} - \item{\code{proportions}}{The proporties of the different distributions of the - bimodal and gamma+unimodal simulation scenario.} - \item{\code{guMean}}{The mean span for the gaussian distribution of the - gamma+unimodal simulation scenario.} - \item{\code{gammaShape}}{The shape parameter for the mean gamma+unimodal - distribution.} - \item{\code{gammaRate}}{The rate parameter for the mean gamma+unimodal - distribution.} -} - -The parameters shown in brackets can be estimated from real data. -} - diff --git a/man/compareScenarios.Rd b/man/compareScenarios.Rd index 6479824..5a12280 100644 --- a/man/compareScenarios.Rd +++ b/man/compareScenarios.Rd @@ -4,9 +4,12 @@ \alias{compareScenarios} \title{compare Scenarios} \usage{ -compareScenarios(algorithmOutput, validationData) +compareScenarios(modality, algorithmOutput, validationData) } \arguments{ +\item{modality}{numerical, for which modality the scenarioComparing is calculated, 1 for unimodal, +2 for bimodal,...} + \item{algorithmOutput}{The output to be compared} \item{validationData}{The validationData behind the simulated gene expression data frame} @@ -20,14 +23,11 @@ Compare the scenario of each gene that was assigned by the algorithm with the scenario that is written in the validation Data. } \details{ -TN (TRUE Negative): validationData: unimodal, assigned scenario: unimodal +TN (TRUE Negative): validationData: not modality, assigned scenario: not modality -TP (TRUE Positive): validationData: bimodal, assigned scenario: bimodal +TP (TRUE Positive): validationData: modality, assigned scenario: modality -FN (FALSE Negative): validationData: bimodal, assigned scenario: unimodal -or -FN (FALSE Negative): validationData: unimodal/bimodal, - assigned scenario: more than bimodal +FN (FALSE Negative): validationData: modality, assigned scenario: not modality -FP (FALSE Positive): validationData: unimodal, assigned scenario: bimodal +FP (FALSE Positive): validationData: not modality, assigned scenario: modality } diff --git a/man/createValidationData.Rd b/man/createValidationData.Rd index c1482d0..0ecdff7 100644 --- a/man/createValidationData.Rd +++ b/man/createValidationData.Rd @@ -15,9 +15,9 @@ createValidationData(params = newParams(), verbose = TRUE) A data.frame with validation data, from which expression values will be simulated. } \description{ -The validation data is generated by using the `Params` object and the parameters that -are stored in it. It is generated automatically as and saved as an Rdata -object when the simulate() function is executed +The validation data is generated by using the `Params` object and the parameters +that are stored in it. It is generated automatically when the +simulateExpression() function is executed } \details{ For changing the parameters, please consider the following: @@ -27,31 +27,14 @@ Usage shows further information for the usage of the values, Default values describes the default values used to generate a `Params` object. -Parameter Type Usage Default values +Parameter Type Default values -nUniGenes: numeric, Default value: 300 - -nBiGenes: numeric, Default value: 500 - -nGuGenes: numeric, Default value: 200 - -nPatients: numeric, Default value: 100 - -seed: numeric, Default value: sample(1:1e6,1) - -sdParms: vector, Default value:c(0.61,2.21), use values as c(µ,lambda) - -uniMean: vector, Default value: c(2,4), randomize from span of vector - -biMean: vector, Default value: c(2,5), randomize from span of vector - -foldChange: vector, Default value: c(2,4), randomize from span of vector - -proportions: vector, Default value: c(10,20,30,40,50,60,70,80,90), pick random value from vector - -guMean: vector, Default value: c(0.5,3), randomize from span of vector - -gammaShape: numeric, Default value: 2 - -gammaRate: numeric, Default value: 2 +nGenes: list, Default value: list("1"= 700, "2" = list("gauss"=150,"gamma"=150)), +means: list, Default value: list("1"=c(2,4),"2" = c(2,4)), +foldChanges: list, Default value: list("1" = c(2,4),"2" = list("gauss"= c(2,4),"gamma" = c(3,5))), +sd: list, Default value: list("mu"= 0.61,"lambda"=2.21), +gamma: list, Default value: list("shape" = 2, "rate" = 2), +proportions: vector, Default value: c(10,20,30,40,50,60,70,80,90), +nPatients: numeric, Default value: 200, +seed: numeric, Default value: sample(1:1e6, 1) } diff --git a/man/evaluateAlgorithmOutput.Rd b/man/evaluateAlgorithmOutput.Rd index 0fe1bac..35d1918 100644 --- a/man/evaluateAlgorithmOutput.Rd +++ b/man/evaluateAlgorithmOutput.Rd @@ -4,9 +4,13 @@ \alias{evaluateAlgorithmOutput} \title{Evaluate AlgorithmOutput} \usage{ -evaluateAlgorithmOutput(algorithmOutput, validationData, maxDifference = 10) +evaluateAlgorithmOutput(modality, algorithmOutput, validationData, + maxDifference = 10) } \arguments{ +\item{modality}{numerical, for which modality output is evaluated, 1 for unimodal, +2 for bimodal,...} + \item{algorithmOutput}{The formatted output of the algorithm that is now evaluated} @@ -32,15 +36,15 @@ FALSE (not correctly estimated) for this value. If the difference is less or equal to 10%, the estimation is classified as TRUE (correctly estimated) } \examples{ -params <- newParams(nUniGenes = 100, nBiGenes = 100, nGuGenes = 100) +params <- newParams(nGenes = list("1"=10,"2"=list("gauss"= 10,"gamma"= 10))) simulation <- simulateExpression(params= params,verbose=FALSE) expression <- simulation$expressionData validationData <- simulation$validationData output <- useMclust(expression, verbose=FALSE) # TRUE if less than +- 10 \% difference to value of validationData -evaluation <- evaluateAlgorithmOutput(algorithmOutput = output, +evaluation <- evaluateAlgorithmOutput(modality = 2,algorithmOutput = output, validationData = validationData) # TRUE if less than +- 20 \% difference to value of validationData -evaluation <- evaluateAlgorithmOutput(algorithmOutput = output, +evaluation <- evaluateAlgorithmOutput(modality = 2,algorithmOutput = output, validationData = validationData,maxDifference = 20) } diff --git a/man/expandValidationData.Rd b/man/expandValidationData.Rd deleted file mode 100644 index d2b4d60..0000000 --- a/man/expandValidationData.Rd +++ /dev/null @@ -1,25 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/evaluate.R -\name{expandValidationData} -\alias{expandValidationData} -\title{Expand the validationData} -\usage{ -expandValidationData(validationData) -} -\arguments{ -\item{validationData}{not expanded validationData data frame} -} -\value{ -returns validationData with the two added columns -} -\description{ -Adds two columns to the validationData: mean2 and sdGamma -} -\details{ -For scenario 1, no mean2 and no sdGamma is added (NA added) -For scenario 2 the mean2 (calculated out of mean1 and foldChange) is added, -but no sdGamma is added (NA added) -For scenario 3, the mean 2 is not added(NA added, foldChange was already -used), but sdGamma is calculated with sqrt(gammaShape*(gammaScale^2)) -gammaScale is 1/gammaRate -} diff --git a/man/getClassification.Rd b/man/getClassification.Rd index f2441ba..97bb04a 100644 --- a/man/getClassification.Rd +++ b/man/getClassification.Rd @@ -4,9 +4,13 @@ \alias{getClassification} \title{getClassification} \usage{ -getClassification(algorithmOutput, algorithmName = "mclust", validationData) +getClassification(modality, algorithmOutput, algorithmName = "mclust", + validationData) } \arguments{ +\item{modality}{numerical, for which modality the classification is calculated, 1 for unimodal, +2 for bimodal,...} + \item{algorithmOutput}{The formatted output of the algorithm} \item{algorithmName}{The name of the evaluated algorithm} @@ -36,13 +40,11 @@ FN (FALSE Negative): validationData: unimodal/bimodal, FP (FALSE Positive): validationData: unimodal, assigned scenario: bimodal } \examples{ -params <- newParams(nUniGenes = 100, nBiGenes = 100, nGuGenes = 100) +params <- newParams(nGenes = list("1"=10,"2"=list("gauss"= 10,"gamma"= 10))) simulation <- simulateExpression(params= params,verbose=FALSE) expression <- simulation$expressionData validationData <- simulation$validationData mclustOutput <- useMclust(expression, verbose=FALSE) -evaluation <- evaluateAlgorithmOutput(algorithmOutput = output, -validationData = validationData) -getClassification(algorithmOutput = mclustOutput,algorithmName = "mclust", -validationData = validationData) +getClassifications(modality = 2,validationData = validationData, +outputs = list("mclust" = mclustOutput)) } diff --git a/man/getClassifications.Rd b/man/getClassifications.Rd index 6191864..c761c84 100644 --- a/man/getClassifications.Rd +++ b/man/getClassifications.Rd @@ -4,9 +4,12 @@ \alias{getClassifications} \title{getClassifications} \usage{ -getClassifications(validationData, outputs) +getClassifications(modality, validationData, outputs) } \arguments{ +\item{modality}{numerical, for which modality the classification is calculated, 1 for unimodal, +2 for bimodal,...} + \item{validationData}{The validationData behind the simulated gene expression data frame} \item{outputs}{list of named outputs} @@ -33,17 +36,18 @@ FN (FALSE Negative): validationData: unimodal/bimodal, FP (FALSE Positive): validationData: unimodal, assigned scenario: bimodal } \examples{ -params <- newParams(nUniGenes = 100, nBiGenes = 100, nGuGenes = 100) +params <- newParams(nGenes = list("1"=10,"2"=list("gauss"= 10,"gamma"= 10))) simulation <- simulateExpression(params= params,verbose=FALSE) expression <- simulation$expressionData validationData <- simulation$validationData mclustOutput <- useMclust(expression = expression,verbose = FALSE) hdbscanOutput <- useHdbscan(expression = expression,verbose = FALSE) -flexmixOutput <- useFlexmix(expression = expression,verbose = FALSE) #getClassifications of one algorithm -getClassifications(validationData,outputs = list("mclust"=mclustOutput)) +getClassifications(modality=2,validationData, +outputs = list("mclust"=mclustOutput)) #getClassifications of two or more algorithms -getClassifications(validationData,outputs = list("mclust"=mclustOutput,"HDBSCAN"=hdbscanOutput)) +getClassifications(modality=2,validationData, +outputs = list("mclust"=mclustOutput,"HDBSCAN"=hdbscanOutput)) } diff --git a/man/getParam.Rd b/man/getParam.Rd index 92d9bda..a37f076 100644 --- a/man/getParam.Rd +++ b/man/getParam.Rd @@ -23,6 +23,6 @@ Accessor function for getting parameter values. } \examples{ params <- newParams() -getParam(params, "nUniGenes") +getParam(object = params,name = "nGenes") } diff --git a/man/getParams.Rd b/man/getParams.Rd index 7c6c3be..0c54fda 100644 --- a/man/getParams.Rd +++ b/man/getParams.Rd @@ -19,5 +19,5 @@ Get multiple parameter values from a Params object. } \examples{ params <- newParams() -getParams(params, c("nUniGenes", "nPatients", "uniMean")) +getParams(params = params,names = c("nGenes","foldChanges","proportions","means")) } diff --git a/man/getStatistic.Rd b/man/getStatistic.Rd index 7bd582f..36bdedd 100644 --- a/man/getStatistic.Rd +++ b/man/getStatistic.Rd @@ -20,12 +20,12 @@ algorithms. The result is a data frame where the percentage of as TRUE and as FALSE evaluated values of the genes } \examples{ -params <- newParams(nUniGenes = 100, nBiGenes = 100, nGuGenes = 100) +params <- newParams(nGenes = list("1"=10,"2"=list("gauss"= 10,"gamma"= 10))) simulation <- simulateExpression(params= params,verbose=FALSE) expression <- simulation$expressionData validationData <- simulation$validationData mclustOutput <- useMclust(expression, verbose=FALSE) -mclustEvaluation <- evaluateAlgorithmOutput(algorithmOutput = mclustOutput,validationData = validationData) -getStatistic(evaluation = mclustEvaluation,algorithmName = "mclust") - +mclustEvaluation <- evaluateAlgorithmOutput(modality = 2, algorithmOutput = mclustOutput, +validationData = validationData) +getStatistics(evaluations = list("mclust" = mclustEvaluation)) } diff --git a/man/getStatistics.Rd b/man/getStatistics.Rd index 150444e..6fccb48 100644 --- a/man/getStatistics.Rd +++ b/man/getStatistics.Rd @@ -18,23 +18,19 @@ algorithms. The result is a data frame where the percentage of as TRUE and as FALSE evaluated values of the genes } \examples{ -params <- newParams(nUniGenes = 100, nBiGenes = 100, nGuGenes = 100) +params <- newParams(nGenes = list("1"=10,"2"=list("gauss"= 10,"gamma"= 10))) simulation <- simulateExpression(params= params,verbose=FALSE) expression <- simulation$expressionData validationData <- simulation$validationData mclustOutput <- useMclust(expression = expression,verbose = FALSE) hdbscanOutput <- useHdbscan(expression = expression,verbose = FALSE) -flexmixOutput <- useFlexmix(expression = expression,verbose = FALSE) -mclustEvaluation <- evaluateAlgorithmOutput(algorithmOutput = mclustOutput, +mclustEvaluation <- evaluateAlgorithmOutput(modality = 2,algorithmOutput = mclustOutput, validationData = validationData) -hdbscanEvaluation <- evaluateAlgorithmOutput(algorithmOutput = hdbscanOutput, -validationData = validationData) -flexmixEvaluation <- evaluateAlgorithmOutput(algorithmOutput = flexmixOutput, +hdbscanEvaluation <- evaluateAlgorithmOutput(modality = 2,algorithmOutput = hdbscanOutput, validationData = validationData) #getStatistics of one algorithm -getStatistics(evaluations = list("mclust"=mclustEval)) +getStatistics(evaluations = list("mclust"=mclustEvaluation)) #getStatistics of two or more algorithms -getStatistics(evaluations = list("mclust"=mclustEval,"HDBSCAN"=hdbscanEval)) - +getStatistics(evaluations = list("mclust"=mclustEvaluation,"HDBSCAN"=hdbscanEvaluation)) } diff --git a/man/newParams.Rd b/man/newParams.Rd index 08250e6..ee485c9 100644 --- a/man/newParams.Rd +++ b/man/newParams.Rd @@ -18,6 +18,13 @@ Params subtypes. } \examples{ params <- newParams() -params <- newParams(nUniGenes = 200, nPatients = 10) +params <- newParams("nGenes"=list("1"=100,"2"=list("gauss"=100,"gamma"=100)), +"proportions"=c(30,70)) +params <- newParams( +nGenes=list("1"=c(700),"2"=list(gauss = 150, gamma = 150), +"3"=list(gauss = 150, gamma = 150)), +means=list("1"= c(2, 4),"2"= c(2, 4),"3"= c(2, 4)), +foldChanges = list("1"=NA,"2"= list(gauss = c(2, 4), gamma = c(2, 4)), +"3"= list(gauss = list(c(2, 4), c(2, 4)), gamma = list(c(2, 4), c(2, 4))))) } diff --git a/man/rmse.Rd b/man/rmse.Rd index 166f58d..3718ed0 100644 --- a/man/rmse.Rd +++ b/man/rmse.Rd @@ -7,10 +7,8 @@ rmse(error) } \arguments{ -\item{error}{The difference of a value predicted by a model and an actually observed value} -} -\value{ -The RMSE value +\item{error}{The difference of a value predicted by a model and an actually +observed value} } \description{ The RMSE is frquently used as a measure of the differences between values diff --git a/man/setParam.Rd b/man/setParam.Rd index 3129576..d79c5a1 100644 --- a/man/setParam.Rd +++ b/man/setParam.Rd @@ -25,6 +25,6 @@ Accessor function for setting parameter values. } \examples{ params <- newParams() -params = setParam(params, "nBiGenes", 100) +params <- setParam(object = params,name = "nPatients",value = 250) } diff --git a/man/setParams.Rd b/man/setParams.Rd index ec9be5b..6a69fb6 100644 --- a/man/setParams.Rd +++ b/man/setParams.Rd @@ -32,9 +32,12 @@ them manually), see examples. params <- newParams() params # Set individually -params <- setParams(params, nUniGenes = 1000, nPatients = 50) +params <- params <- setParams(params,proportions = c(80,20), +foldChanges = list("1"=c(2,4),"2"= list("gauss"=c(2,5),"gamma"=c(3,5)))) params # Set via update list -params <- setParams(params, list(nUniGenes = 1000, nPatients = 50)) +params <- setParams(params,update = list("nGenes"=list("1"=100, +"2"=list("gauss"=200,"gamma"=100)),"means"=list("1"=c(1,3),"2"=c(2,4)), +"nPatients"=700)) params } diff --git a/man/simulateExpression.Rd b/man/simulateExpression.Rd index c77b40f..0da292d 100644 --- a/man/simulateExpression.Rd +++ b/man/simulateExpression.Rd @@ -18,62 +18,42 @@ A named list with two components: 'validationData' and 'expressionData' } \description{ Simulate gene expression for the different simulation scenarios: Unimodal, -bimodal,gamma+unimodal +multimodal with gaussian distributions, multimodal starting with a gamma distribution and followed by gaussian distributions } \details{ -Scenario 1 - Unimodal gaussian distribution -Gene expression for the unimodal gaussian distribution (scenario 1) is +The "modus" is the number of occurring distributions within one gene. +Modus "1" - Unimodal gaussian distribution +Gene expression for the unimodal gaussian distribution (scenario "1") is simulated from the mean and the standard deviation taken from the validation data for this gene. The number of values that are simulated for each distribution equals the number of Patients {"nPatients"}: -\code{expression <- rnorm(nPatients,as.numeric(valData[i,3]), as.numeric(valData[i,4]))} -Scenario 2 - Bimodal with two unimodal gaussian distributions -Gene expression for the bimodal scenario 2 (two gaussian distributions) is -simulated individually for each distribution. -The first distribution is simulated with the mean and the first standard -deviation taken from the validation data for this gene. The number of values that are -simulated ("nBi1") are calculated by multiplicating the number of Patients with the -first proportion and rounding this number off: -\code{data1 <- rnorm(nBi1,as.numeric(valData[i,3]),as.numeric(valData[i,4]))} -The second distribution is simulated with the mean calculated by -multiplying the mean of the first distribution with the foldChange -from the validation data of this gene. The standard deviation is also taken from the validation data of this gene. The number of values that are -simulated ("nBi2") are calculated by multiplicating the number of Patients with the -second proportion and rounding this number up. -the mean and the standard deviation taken from the validation data for this gene. -The number of values that are simulated for each distribution equals the -number of Patients {"nPatients"}: -\code{data2 <- rnorm(nBi2,foldBi,as.numeric(valData[i,5]))} -At the end both of the simulated distributions are put into one expression. -\code{expression <- c(data1,data2)} +All other modi consist of two different scenarios: +In the "gauss" scenario all occurring distributions are gaussian distributions. +In the "gamma" scenario the first occuring distribution is a gamma distribution. +All other distributions are gaussian distributions. -Scenario 3 - Bimodal with one Gamma Distribution and one unimodal gaussian -distribution -Gene expression for the bimodal scenario 3 (1 Gamma and 1 unimodal gaussian -distribution) is simulated individually for each distribution. -The first distribution, which is the Gamma distribution, is simulated with -the shape and rate parameter of the validation data for this gene. -The number of values that are simulated ("nGu1") are calculated by -multiplying the number of Patients with the first proportion and rounding -this number off. -\code{data1 <- rgamma(n = nGu1,shape = as.numeric(valData[i,9]), -rate = as.numeric(valData[i,10]))} -The second distribution, which is the unimodal gaussian distribution is -simulated with the calculated mean, which is calculated by multiplying the -mean with the foldChange of the validation data of this gene. The standard -deviation is taken from the validation data of this gene. The number of values that -are simulated ("nGu2") are calculated by multiplying the number of Patients -with the second proportion and rounding this number up. -\code{data2 <- rnorm(nGu2,foldGamma,as.numeric(valData[i,4]))} -At the end both of the simulated distributions are put into one expression. -\code{expression <- c(data1,data2)} +Modus "2" or higher - Bimodal distributions or higher modalities + +Gene expressions of the multimodal genes are +simulated automatically for each distribution. +Scenario "gauss" +The different gaussian distributions are simulated with the corresponding mean +and a standard deviation and number of values taken from the validation data +for this gene. + +Scenario "gamma" +The gamma distribution is simulated with the gamma shape and rate values from +the params object. The other gaussian distributions are simulated with the values +from the validation data. } \examples{ simulation <- simulateExpression(params = newParams()) # Override default parameters # Not recommended, better use setParams; See \\code{\\link{Params}} -simulation <- simulateExpression(nUniGenes = 1000, nPatients = 50) +simulation <- simulateExpression(nGenes = list("1"=10, +"2"=list("gauss"= 10,"gamma"= 10)), nPatients = 50) # without messages printed to the console +params <- newParams() simulation <- simulateExpression(params = params,verbose = FALSE) } diff --git a/man/useFlexmix.Rd b/man/useFlexmix.Rd index af1d060..61a27ac 100644 --- a/man/useFlexmix.Rd +++ b/man/useFlexmix.Rd @@ -4,13 +4,17 @@ \alias{useFlexmix} \title{UseFlexmix} \usage{ -useFlexmix(expression, verbose = TRUE) +useFlexmix(expression, maxModality, verbose = TRUE, parallel = FALSE) } \arguments{ \item{expression}{(Simulated) Gene Expression data frame which genes shall be classified as unimodal, bimodal or other} +\item{maxModality}{numerical, maximum modality for that models will be calculated} + \item{verbose}{logical. Whether to print progress messages} + +\item{parallel}{logical. Whether to calculate the models in parallel} } \value{ flexmixOutput The formatted output of the best fitting models from flexmix @@ -20,12 +24,12 @@ Use the FlexMix algorithm for creating a model to find the best fitting model to evaluate the classification (unimodal,bimodal or other) of genes. } \examples{ -params <- newParams(nUniGenes = 100, nBiGenes = 100, nGuGenes = 100) +params <- newParams(nGenes = list("1"=75,"2"=list("gauss"= 75,"gamma"= 75))) simulation <- simulateExpression(params= params,verbose=FALSE) expression <- simulation$expressionData #with messages printed to console -flexmixOutput <- useFlexmix(expression = expression) +flexmixOutput <- useFlexmix(expression = expression, maxModality = 2) #without messages printed to console -flexmixOutput <- useFlexmix(expression = expression, verbose = FALSE) +flexmixOutput <- useFlexmix(expression = expression, maxModality = 2, verbose = FALSE) } diff --git a/man/useHdbscan.Rd b/man/useHdbscan.Rd index ad5f28a..1ae762a 100644 --- a/man/useHdbscan.Rd +++ b/man/useHdbscan.Rd @@ -22,7 +22,7 @@ Use the HDBSCAN algorithm for generating output to evaluate the classification (unimodal,bimodal or other) of genes. } \examples{ -params <- newParams(nUniGenes = 100, nBiGenes = 100, nGuGenes = 100) +params <- newParams(nGenes = list("1"=10,"2"=list("gauss"= 10,"gamma"= 10))) simulation <- simulateExpression(params= params,verbose=FALSE) expression <- simulation$expressionData #with messages printed to console diff --git a/man/useMclust.Rd b/man/useMclust.Rd index 1f03c56..980e47a 100644 --- a/man/useMclust.Rd +++ b/man/useMclust.Rd @@ -4,12 +4,15 @@ \alias{useMclust} \title{UseMclust} \usage{ -useMclust(expression, verbose = TRUE, parallel = FALSE) +useMclust(expression, clusterNumbers = NULL, verbose = TRUE, + parallel = FALSE) } \arguments{ \item{expression}{(Simulated) Gene Expression data frame which genes shall be classified as unimodal, bimodal or other} +\item{clusterNumbers}{span of the clusters to be calculated at maximum} + \item{verbose}{logical. Whether to print progress messages} \item{parallel}{logical. Whether to calculate the mclust models in parallel} @@ -22,12 +25,13 @@ Use the mclust algorithm for generating output to evaluate the classification (unimodal,bimodal or other) of genes. } \examples{ -params <- newParams(nUniGenes = 100, nBiGenes = 100, nGuGenes = 100) +params <- newParams(nGenes = list("1"=10,"2"=list("gauss"= 10,"gamma"= 10))) simulation <- simulateExpression(params= params,verbose=FALSE) expression <- simulation$expressionData #with messages printed to console mclustOutput <- useMclust(expression = expression) -#without messages printed to console -mclustOutput <- useMclust(expression = expression, verbose = FALSE) +#without messages printed to console, at maximum 3 clusters +mclustOutput <- useMclust(expression = expression, clusterNumbers = c(1:3), +verbose = FALSE, parallel = FALSE) } diff --git a/vignettes/PlottingExample.PNG b/vignettes/PlottingExample.PNG new file mode 100644 index 0000000..e2631de Binary files /dev/null and b/vignettes/PlottingExample.PNG differ diff --git a/vignettes/Using-BimodalR-on-real-data.Rmd b/vignettes/Using-BimodalR-on-real-data.Rmd new file mode 100644 index 0000000..e7a6ca5 --- /dev/null +++ b/vignettes/Using-BimodalR-on-real-data.Rmd @@ -0,0 +1,121 @@ +--- +title: "Using BimodalR on real data" +author: "Svenja K. Beck" +date: "`r Sys.Date()`" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Vignette Title} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +# Introduction +Welcome to bimodalR! bimodalR is an R package for the simulation of +multimodal gene expression data, for evaluating different algorithms +detecting multimodality and for detecting bimodality or multimodality +in data sets. +This vignette gives an overview to bimodalR's usage on real data. For an overview and introduction to bimodalR's functionalities, use the other vignette. +It was written in R Markdown, using the knitr package for production. +See help(package="bimodalR") for further details (and references provided by citation("bimodalR").) + +# Installation +To install the most recent development version from Github use: (not possible yet) + +```{r install-github, eval = FALSE} +# biocLite("loosolab/bimodalR", dependencies = TRUE,build_vignettes = TRUE) +``` + +# Quickstart + +1. Import your data +1. Clean up your data + 1. Log-transform your data + 1. Exclude 0-sum rows + 1. Sort into groups + +**Processing single groups:** + +1. Collect data as data.frame +1. Exlude 0-sum rows +1. Filter your genes (with Silverman Test's for unimodality) +1. Use algorithm on data to find potential multimodal genes + +## Import your data + +Import your data with +```{r loadData} +#load(file="") #add path to your Rdata object +``` +or create your own simulated data with 'bimodalR' + +## Clean up your data + +**Log-transform your data** + +**Exclude 0-sum rows** + +**Sort into groups** + +```{r loadData-2} +#load(file="") #add path to your Rdata object +``` + +## Processing single groups + +###Collect data as data.frame + +###Exlude 0-sum rows + +###Filter your genes (with Silverman Test's for unimodality) + +###Use algorithm on data to find potential multimodal genes + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/vignettes/Workflow_small.png b/vignettes/Workflow_small.png new file mode 100644 index 0000000..ca65b83 Binary files /dev/null and b/vignettes/Workflow_small.png differ diff --git a/vignettes/bimodalR.Rmd b/vignettes/bimodalR.Rmd new file mode 100644 index 0000000..4814219 --- /dev/null +++ b/vignettes/bimodalR.Rmd @@ -0,0 +1,775 @@ +--- +title: "An introduction to the bimodalR package" +author: "Svenja Kristina Beck" +date: "`r Sys.Date()`" +output: rmarkdown::html_vignette +#output: pdf_document +vignette: > + %\VignetteIndexEntry{An introduction to the bimodalR package} + %\VignetteEngine{knitr::rmarkdown} + \usepackage[utf8]{inputenc} +--- + + + + + + + + + + + + + + +# Introduction +Welcome to bimodalR! bimodalR is an R package for the simulation of +multimodal gene expression data, for evaluating different algorithms +detecting multimodality and for detecting bimodality or multimodality +in data sets. +This vignette gives an overview and introduction to bimodalR's functionalities. +It was written in R Markdown, using the knitr package for production. +See help(package="bimodalR") for further details (and references provided by citation("bimodalR").) + +# Installation +To install the most recent development version from Github use: (not possible yet) + +```{r install-github, eval = FALSE} +# biocLite("loosolab/bimodalR", dependencies = TRUE,build_vignettes = TRUE) +``` + +# Quickstart + +Assuming you want to simulate gene expression data from scratch +there are two simple steps to creating a simulated data set with bimodalR. +Here is an example: + +```{r quickstart} +# Load package +library(bimodalR) + +# Create parameter object +params <- newParams() + +# Simulate data and generating the validationData behind it +simulation <- simulateExpression(params = params) +validationData <- simulation$validationData +expression <- simulation$expressionData + +``` + +These steps will be explained in detail in the following sections but briefly +the first step generates a Params object containing the parameters for +the simulation. The second step takes those parameters and simulates a new data set +containing the simulated gene expressions. + +# Getting started + +## The general background +When observing gene expression data, mostly 3 different scenarios can be observed: + + 1. a unimodal distribution of the gene expression + 2. a multimodal (often bimodal) distribution of the gene expression, + where lesser and greater expression are observed + 3. a multimodal (often bimodal) distribution of the gene expression, + where a non-existent expression (0, or nearly 0) + and higher expressions are observed + +For our simulation the first scenario is generated with a Gaussian distribution. +The second scenario is generated with multiple Gaussian distributions. +The third scenario is generated with a Gamma distribution and multiple Gaussian distributions. + +Examples for bimodal gene expression: + +* The gene expression of Plexin-B1 is bimodal in ErbB-2 over-expressing cells + +## Parameters +The parameters required for the gene expression simulation are briefly described here: + +* **Global parameters** + * `nGenes` - A list containing the numbers of genes to simulate for each scenario and modality. + * `nPatients` - The number of patients to simulate and + how many columns the simulated gene expression should have per gene. + * `seed` - Seed to use for generating random numbers. +* **Distribution parameters** + * `means` - A list containing the parameter range for the mean for each scenario and modality. + * `foldChanges` - A list containing the parameter range for the foldChanges between distributions for multimodal genes. Has to be one less than the modality that is simulated. + * `sd` - Parameters for generating the standard deviations via an Inverse Gaussian distribution + * `gamma` - Shape and rate parameter for generating the gamma distributions. + * `proportions` - Parameters for the allowed proportions of multimodal distributions. + +# The `Params` Object +All the parameters for the simulation of gene expression data +are stored in a `Params` object. +Let's create a new one and see what it looks like. + +```{r Params} +params <- newParams() +params +``` + +As well as telling us what type of object we have ("An object of class +`Params`") and showing us the values of the parameters +this output gives us some extra information. +We can see which parameters have been changed from their +default values (those in ALL CAPS). + +## `Params` object explained & Template +The names "1","2","3",... stand for the modality of the scenario. If you want to simulate gene expression with a modality > 3, it is very helpful to use the following template: + +```{r params-Template} +#template for highest modality >= 3 +params <- newParams( + nGenes=list( + "1"=c(300), + "2"=list(gauss = 150, gamma = 150), + "3"=list(gauss = 150, gamma = 150)#add more rows + ), + means=list( + "1"= c(2, 4), + "2"= c(2, 4), + "3"= c(2, 4) + ), + foldChanges = list( + "1"=NA, + "2"= list(gauss = c(2, 4), gamma = c(2, 4)), + "3"= list(gauss = list(c(2, 4), c(2, 4)), gamma = list(c(2, 4), c(2, 4))) + ), + sd = list( + "mu" = 0.61, + "lambda" = 2.21 + ), + gamma = list( + shape =2, + rate = 2 + ), + proportions = c(10,20,30,40,50,60,70,80,90), + nPatients = c(200) + ) +``` +CAUTION: if you want to simulate only certain modalities (e.g. 2,4), you have to use blanks for the others: +```{r params-TemplateOnlyCertainModailities} +params <- newParams( + nGenes=list( + "1"=c(0), + "2"=list(gauss = 10, gamma = 10), + "3"=list(gauss = 0, gamma = 0), + "4"=list(gauss = 10, gamma = 10) + ), + means=list( + "1"= 0, + "2"= c(2, 4), + "3"= 0, + "4"=c(2,4) + ), + foldChanges = list( + "1"=NA, + "2"= list(gauss = c(2, 4), gamma = c(2, 4)), + "3"= NA, + "4"= list(gauss = list(c(2, 4),c(2,4),c(2,4)), gamma = list(c(2, 4),c(2,4),c(2,4))) + ), + sd = list( + "mu" = 0.61, + "lambda" = 2.21 + ), + gamma = list( + shape =2, + rate = 2 + ), + proportions = c(10,20,30,40,50,60,70,80,90), + nPatients = c(200) + ) +``` + +## Getting and setting +If we want to look at a particular parameter, for example the number of genes to +simulate, we can extract it using the `getParam` function: + +```{r getParam} +getParam(params, "nGenes") +``` + +Alternatively, to give a parameter a new value we can use the `setParam` +function: + +```{r setParam} +params <- setParam(params, "nGenes", list("1" = 100,"2"=list("gauss"=150,"gamma"=200))) +getParam(params, "nGenes") +``` + +If we want to extract multiple parameters (as a list) or set multiple parameters +we can use the `getParams` or `setParams` functions: + +```{r getParams-setParams} +# Set multiple parameters at once (using a list) +params <- setParams(params, update = list("nGenes" = list("1" = 100,"2"=list("gauss"=150,"gamma"=200)), "nPatients" = 300)) +# Extract multiple parameters as a list +getParams(params, c("nGenes", "gamma")) +# Set multiple parameters at once (using additional arguments) +params <- setParams(params, "nGenes" = list("1" = 100,"2"=list("gauss"=150,"gamma"=200)), "nPatients" = 300) +params +``` + +The parameters which have changed are now shown in ALL CAPS to indicate that they have +been changed from the default. + +We can also set parameters directly when we call `newParams`: + +```{r newParams-set} +# # A basic example for simulating unimodal, bimodal and trimodal distributions +params <- newParams( +nGenes=list("1"=c(300),"2"=list(gauss = 150, gamma = 150),"3"=list(gauss = 150, gamma = 150)), +means=list("1"= c(2, 4),"2"= c(2, 4),"3"= c(2, 4)), +foldChanges = list("1"=NA,"2"= list(gauss = c(2, 4), gamma = c(2, 4)), +"3"= list(gauss = list(c(2, 4), c(2, 4)), gamma = list(c(2, 4), c(2, 4))))) +params +``` + +For changing the parameters, please consider the following table. +*Type* shows with which type the slot has to be filled, +*Usage* shows further information for the usage of the values, +*Default values* describes the default values used to generate a `Params` +object. +```{r params-table, eval=FALSE} + Type Usage Default values +nGenes list values with unimodal,bimodal(gauss,gamma),... 700, list(gauss = 150, gamma = 150) +means list values with unimodal,bimodal(gauss),... c(2, 4), c(2, 4) +foldChanges list values with unimodal,bimodal(gauss,gamma),... c(2, 4), list(gauss = c(2, 4), gamma = c(3, 5)) +sd list use values as µ,lambda 0.61, 2.21 +gamma list use values as shape,rate 2, 2 +proportions vector pick random value from vector c(10,20,30,40,50,60,70,80,90) +nPatients numeric - 100 +seed numeric - sample(1:1e6,1) + +``` + + +# Workflow +![Workflow to simulate gene expression and evaluate which algorithm works best](workflow_small.png) + +# The gene expression simulation +Now that we looked at the Params Objects and saw which parameters it contains, +let's look at how bimodalR simulates gene expression data. + +The core of the simulation is the validationData, which is created in the simulation +process with the parameters from the created Params object. +As the result of the simulation process, the simulated gene expression matrix +is returned with the validationData in a list object. + +The simulation of a gene expression data set requires the parameters (which are +stored in the Params object) and the validationData behind the simulation (which was +not created yet). +The Params object was already created and if needed, default values were +changed. The validationData is automatically created when the simulate() command is +executed. + +## validationData +The structure of the validationData is the following: + + * each simulated gene has its own entry in the validationData and can be accessed with validationData$[GeneName] + * each entry contains the following informations: + * modus - modality of this gene shown as a number (1 = unimodal, 2 = bimodal, ...) + * scenario - scenario of this gene, either ("unimodal-gaussian","multimodal-gaussian" or "multimodal-gamma+gaussian") + * means - means of this gene + * foldChanges - foldChanges between the means (NA for unimodal genes) + * sds - standard deviation of each mean + * proportions - proportions of the different distributions + * sizes - sizes of the different groups + * groups - patientNames are listed in the corresponding group + +To generate the validationData, a `Params` object is needed. The parameters from this +object are used for generating the validationData by randomizing values starting with a +generated `seed`. This is done by drawing values from the `Params` object. +Mean values are generated from a span of values, standard +deviations are generated by using an inversed Gaussian distribution [LaplacesDemon].That is, +because over a large data set a inverseGaussian-like distribution of all +standard deviations was noticed. + +Because of those needed parameters the validationData looks for example like that: +```{r validationData-example,eval=FALSE} +validationData$Gene0001 +$modus +[1] 1 + +$scenario +[1] "unimodal-gaussian" + +$means +[1] 2.173944 + +$foldChanges +[1] NA + +$sds +[1] 0.4449741 + +$proportions +[1] 100 + +$sizes +[1] 200 + +$groups +$groups[[1]] + [1] "Patient1" "Patient2" "Patient3" "Patient4" "Patient5" "Patient6" + [7] "Patient7" "Patient8" "Patient9" "Patient10" "Patient11" "Patient12" + [13] "Patient13" "Patient14" "Patient15" "Patient16" "Patient17" "Patient18" + [19] "Patient19" "Patient20" "Patient21" "Patient22" "Patient23" "Patient24" + [25] "Patient25" "Patient26" "Patient27" "Patient28" "Patient29" "Patient30" + [31] "Patient31" "Patient32" "Patient33" "Patient34" "Patient35" "Patient36" + [37] "Patient37" "Patient38" "Patient39" "Patient40" "Patient41" "Patient42" + [43] "Patient43" "Patient44" "Patient45" "Patient46" "Patient47" "Patient48" + [49] "Patient49" "Patient50" "Patient51" "Patient52" "Patient53" "Patient54" + [55] "Patient55" "Patient56" "Patient57" "Patient58" "Patient59" "Patient60" + [61] "Patient61" "Patient62" "Patient63" "Patient64" "Patient65" "Patient66" + [67] "Patient67" "Patient68" "Patient69" "Patient70" "Patient71" "Patient72" + [73] "Patient73" "Patient74" "Patient75" "Patient76" "Patient77" "Patient78" + [79] "Patient79" "Patient80" "Patient81" "Patient82" "Patient83" "Patient84" + [85] "Patient85" "Patient86" "Patient87" "Patient88" "Patient89" "Patient90" + [91] "Patient91" "Patient92" "Patient93" "Patient94" "Patient95" "Patient96" + [97] "Patient97" "Patient98" "Patient99" "Patient100" "Patient101" "Patient102" +[103] "Patient103" "Patient104" "Patient105" "Patient106" "Patient107" "Patient108" +[109] "Patient109" "Patient110" "Patient111" "Patient112" "Patient113" "Patient114" +[115] "Patient115" "Patient116" "Patient117" "Patient118" "Patient119" "Patient120" +[121] "Patient121" "Patient122" "Patient123" "Patient124" "Patient125" "Patient126" +[127] "Patient127" "Patient128" "Patient129" "Patient130" "Patient131" "Patient132" +[133] "Patient133" "Patient134" "Patient135" "Patient136" "Patient137" "Patient138" +[139] "Patient139" "Patient140" "Patient141" "Patient142" "Patient143" "Patient144" +[145] "Patient145" "Patient146" "Patient147" "Patient148" "Patient149" "Patient150" +[151] "Patient151" "Patient152" "Patient153" "Patient154" "Patient155" "Patient156" +[157] "Patient157" "Patient158" "Patient159" "Patient160" "Patient161" "Patient162" +[163] "Patient163" "Patient164" "Patient165" "Patient166" "Patient167" "Patient168" +[169] "Patient169" "Patient170" "Patient171" "Patient172" "Patient173" "Patient174" +[175] "Patient175" "Patient176" "Patient177" "Patient178" "Patient179" "Patient180" +[181] "Patient181" "Patient182" "Patient183" "Patient184" "Patient185" "Patient186" +[187] "Patient187" "Patient188" "Patient189" "Patient190" "Patient191" "Patient192" +[193] "Patient193" "Patient194" "Patient195" "Patient196" "Patient197" "Patient198" +[199] "Patient199" "Patient200" +``` + +To understand why all those parameters are needed, it is essential to +understand how the validationData is used for the simulation of the gene expression. + +## Simulation +With the validationData the gene expression matrix is simulated. +For each gene (each entry of the validationData) the parameters in the validationData are used for +simulating the gene expression. +How many columns the gene expression matrix will have depends on the number of +patients. Per patient one gene expression is simulated. +If a negative value was simulated, it is changed to be 0 in the simulation process, +as normally no negative gene expressions can be observed. +How the gene expression is simulated depends on the modus. + +The "modus" is the number of occurring distributions within one gene. + +###Modus "1" - Unimodal gaussian distribution +Gene expression for the unimodal gaussian distribution (modus "1") is +simulated from the mean and the standard deviation taken from the +validation data for this gene. The Gaussian distribution is calculated +with [rnorm]. The number of values that are simulated for each distribution +equals the number of Patients {"nPatients"}. + +All other modi consist of two different scenarios: +In the "gauss" scenario all occurring distributions are Gaussian distributions +and calculated with [rnorm]. +In the "gamma" scenario the first occuring distribution is a Gamma distribution +and is calculated with [rgamma]. +All other distributions are gaussian distributions and are calculated with [rnorm]. + +###Modus "2" or higher - Bimodal distributions or higher modalities + +Gene expressions of the multimodal genes are simulated automatically for each +distribution. + +####Scenario "gauss" +The different gaussian distributions are simulated with the corresponding mean +and a standard deviation and number of values taken from the validation data +for this gene. + +####Scenario "gamma" +The gamma distribution is simulated with the gamma shape and rate values from +the params object. The other gaussian distributions are simulated with the values +from the validation data. + +###Exemplary Gene Expression Matrix +After using the `simulateExpression()` command, the gene expression matrix looks like +this: +```{r expression-sample, eval=FALSE} + Patient0001 Patient0002 Patient0003 Patient0004 Patient0005 Patient0006 Patient0007 +Gene0001 3.9674492 2.80396473 3.78980664 4.095237721 4.00587345 3.60867245 4.11567495 +Gene0002 4.1045561 2.26303081 3.75889335 3.522737760 3.02592372 4.57802545 3.12049260 +Gene0003 2.4582632 2.22295497 3.17911283 2.947560965 2.74938887 2.13212747 2.37177796 +Gene0004 5.9032577 5.95497476 4.75760279 5.161985499 3.70910216 4.11950180 3.58240309 +Gene0005 3.7431061 3.68041039 3.62509463 3.740929320 3.66194851 3.64731856 3.69041368 +Gene0006 3.0784338 1.79963109 2.16929818 1.160848158 3.49503635 2.82339493 3.28019391 +Gene0007 0.7993267 2.21868266 2.62367680 0.152856110 1.58145211 2.08062664 2.38610452 +Gene0008 2.0018416 2.22588333 2.90464907 3.049822819 2.28450055 3.04354666 3.61796463 +Gene0009 2.2986108 2.08720943 2.02138626 2.407369792 2.00220259 2.20031061 2.13336985 +Gene0010 2.1793609 2.85387502 2.23943177 2.423610586 2.35072765 2.31400934 2.58579871 + +``` + +## Executing the simulation +As the `Params`object can be generated before starting the +simulation, the `simulateExpression()` command can be used in two different ways: + 1. with a created params object (recommended) + 2. without a created params object (not recommended) + +The validationData is automatically generated when using the `simulateExpression()` command. +The `simulateExpression()` command returns a list object, which contains the created +validationData and the simulated expressionData. + +##Simulating the gene expression + +### 1. With a created params object(recommended) +It is recommended to create a `Params` object before using the `simulateExpression()` +command, because then you can change the parameters and have the params object +saved in your global environment. +```{r simulateExpression-with-params} +#with messages printed to console +params <- newParams() +simulation <- simulateExpression(params = params) + +#without messages printed to console +simulation <- simulateExpression(params = params,verbose = FALSE) + +#storing the validation data and the simulated gene expression data as +#separate data.frames + +validationData <- simulation$validationData +expression <- simulation$expressionData +``` + +### 2. Without a created params object (not recommended) +If the simulation should be executed with default parameters only (or with +only certain parameters changed) and you do not mind, having no separate +`Params`object in your global environment to be able to look up the different +parameters (especially in the case that you changed them), you can use this +commands, though it is not recommended, as it is useful to have a `Params` +object to look up the parameters behind the validationData. +In this case, without a created `Params`object the validationData is not saved as an +Rdata object, so there is no way of keeping track of changes to your params object. +```{r simulateExpression-without-params} +#with default parameters and messages printed to console +params <- newParams() +simulation <- simulateExpression(params = newParams()) + +#with one changed parameter and messages not printed to console +simulation <- simulateExpression(params = newParams("seed"=23),verbose = FALSE) + +#with some changed parameters and messages printed to console +simulation <- simulateExpression(params = newParams("seed"=23,"nPatients"=200)) + + +#storing the validation data and the simulated gene expression data as +#a list and a data.frame + +validationData <- simulation$validationData +expression <- simulation$expressionData + +``` + +# The unified output data format +The output data format is a unified data format, that is needed to work with +and use all of the evaluation functions. The output of all implemented +algorithms matches the unified data format. +The unified data format has the following structure: + +The data format is structured in three parts: + +* the Genes section, +* the Clinical Data section and +* expression matrix section + + +The different parts are accessible with: +```{r accessing-the-different-sections, eval=FALSE} +mclustOutput$Genes[1:10] #mclustOutput$Genes for all genes +mclustOutput$ClinicalData +mclustOutput$Expressionmatrix + + +``` + +The Genes section has many parts. For each gene a separate section is generated. +Each gene section contains the following informations, which were calculated +by using one of the algorithms: + +* $modus - the modus is the number that equals the found number of occurring distributions +within one gene (the modality) +* $means - the means section contains the calculated mean values for each modus +* $sds - the sds section contains the calculated standard deviation values for each modus +* $sizes - the sizes section contains the calculated size of each group +* $groups - the groups section contains the patients that are assigned to the different groups + +The Genes can be accessed by using: +```{r accessing-a-specific-gene-section, eval=FALSE} +mclustOutput$Genes$Gene0001 + +#Accessing a specific section within the genes is possible by using the $ tags + +mclustOutput$Genes$Gene0001$modus +mclustOutput$Genes$Gene0001$means +mclustOutput$Genes$Gene0001$sds +mclustOutput$Genes$Gene0001$sizes +mclustOutput$Genes$Gene0001$groups +``` + +If this package is used with an algorithm which was not implemented in this package, +the output has to be changed to match this unified data format. +Then, all the evaluation functions will work with the output of not implemented algorithms. + +# Usage of the algorithms +In this package three algorithms that can be used for detecting multimodality, +are implemented. + +## mclust +Mclust uses a finite Gaussian mixture modeling fitted via EM algorithm for +model-based clustering, classification, and density estimation, including +Bayesian regularization and dimension reduction. +For more information look at the vignette of mclust "A quick tour of mclust" +[mclust] + +### mclust References +C. Fraley, A. E. Raftery, T. B. Murphy and L. Scrucca (2012). +mclust Version 4 for R: Normal Mixture Modeling for Model-Based Clustering, +Classification, and Density Estimation. Technical Report No. 597, Department +of Statistics, University of Washington. + +C. Fraley and A. E. Raftery (2002). Model-based clustering, discriminant +analysis, and density estimation. Journal of the American Statistical +Association 97:611:631. + +## HDBSCAN +This is a fast reimplementation of the HDBSCAN (Hierarchical Density-based +spatial clustering of applications with noise) clustering algorithm using a +kd-tree and its related algorithms using Rcpp. +HDBSCAN computes the hierarchical cluster tree representing density estimates +along with the stability-based flat cluster extraction proposed by Campello et +al. (2013). HDBSCAN essentially computes the hierarchy of all DBSCAN* +clusterings, and then uses a stability-based extraction method to find optimal +cuts in the hierarchy, thus producing a flat solution. +For more information look at the vignette of HDBSCAN[hdbscan] or use +```{r dbscan-information} +library(dbscan) +?hdbscan +``` + +### HDBSCAN References +Martin Ester, Hans-Peter Kriegel, Joerg Sander, Xiaowei Xu (1996). +A Density-Based Algorithm for Discovering Clusters in Large Spatial Databases +with Noise. Institute for Computer Science, University of Munich. +Proceedings of 2nd International Conference on Knowledge Discovery and Data +Mining (KDD-96). + +Campello, R. J. G. B.; Moulavi, D.; Sander, J. (2013). +Density-Based Clustering Based on Hierarchical Density Estimates. +Proceedings of the 17th Pacific-Asia Conference on Knowledge Discovery in +Databases, PAKDD 2013, Lecture Notes in Computer Science 7819, p. 160. + +Campello, Ricardo JGB, et al. "Hierarchical density estimates for data +clustering, visualization, and outlier detection." ACM Transactions on +Knowledge Discovery from Data (TKDD) 10.1 (2015): 5. + +## flexmix +FlexMix implements a general framework for finite mixtures of regression +models. Parameter estimation is performed using the EM algorithm: the E-step +is implemented by flexmix, while the user can specify the M-step. +For more information look at the vignette of flexmix "flexmix-intro"[flexmix] + +### flexmix References +Friedrich Leisch. FlexMix: A general framework for finite mixture models and +latent class regression in R. Journal of Statistical Software, 11(8), 2004. +doi:10.18637/jss.v011.i08 + +Bettina Gruen and Friedrich Leisch. Fitting finite mixtures of generalized +linear regressions in R. Computational Statistics & Data Analysis, 51(11), +5247-5252, 2007. doi:10.1016/j.csda.2006.08.014 + +Bettina Gruen and Friedrich Leisch. FlexMix Version 2: Finite mixtures with +concomitant variables and varying and constant parameters Journal of +Statistical Software, 28(4), 1-35, 2008. doi:10.18637/jss.v028.i04 + +## Using the different algorithms +To use the different algorithms, it is only needed to have created a simulated +expression data frame (which can be done with the `simulateExpression()` command). +Of course, it is also possible to use the different algorithms with a data-set +that was created elsewhere, if it has the same structure used in this package. +To use this package for detecting bimodality in your data set, we recommend +using the **not clear yet** command, because our tests with a stratified +data set (included as the test-data set (**not yet**)) have shown that this +algorithm is the most accurate. +The output of all implemented algorithms was unified and overall shows the +same structure. +In the following it is shown, how to use the different implemented algorithms +and how the output should roughly look like. + +###Using mclust +To use the mclust algorithm on your expression data frame (simulated or not) +and gain information which genes are bimodal use the useMclust() command: +```{r useMclust} +#use mclust with the expression data frame expression and messages printed to the console +mclustOutput <- useMclust(expression = expression) + +#use mclust with the expression data frame expression and messages not printed to the console +mclustOutput <- useMclust(expression = expression,verbose = FALSE) + +``` + +###Using HDBSCAN +To use the HDBSCAN algorithm on your expression data frame (simulated or not) +and gain information which genes are bimodal use the useHdbscan() command. +Using HDBSCAN also needs the component `minPts`, which not only acts as a +minimum cluster size to detect, but also as a "smoothing" factor of the +density estimates implicitly computed from HDBSCAN: +```{r useHdbscan} +#use HDBSCAN with the expression data frame expression +hdbscanOutput <- useHdbscan(expression = expression,minPts = 10) + +``` + +###Using FlexMix: +To use the FlexMix algorithm on your expression data frame (simulated or not) +and gain information which genes are bimodal use the useFlexmix() command: +```{r useFlexmix} +#use FlexMix with the expression data frame expression +flexmixOutput <- useFlexmix(expression = expression,maxModality = 2) + +``` + +# Evaluating the algorithms output +After using one or more of the implemented algorithms on the gene expression +matrix, the packages function for evaluating which genes are bimodal can be +used. All outputs can be used with the same evaluation functions, but when +using the `evaluateAlgorithmOutput()` command, a type component will be needed. +When using the recommended algorithm, the component is not needed, as it is +the default value. +```{r evaluateAlgorithmOutput} +#evaluating for bimodal genes after using useMclust() +mclustEvaluation <- evaluateAlgorithmOutput(modality= 2,algorithmOutput = mclustOutput,validationData = validationData, maxDifference = 10) + +#evaluating for bimodal genes after using useHdbscan() +hdbscanEvaluation <- evaluateAlgorithmOutput(modality= 2,algorithmOutput = hdbscanOutput,validationData = validationData, maxDifference = 10) + +#evaluating for bimodal genes after using useFlexmix() +flexmixEvaluation <- evaluateAlgorithmOutput(modality= 2,algorithmOutput = flexmixOutput,validationData = validationData, maxDifference = 10) + +#the evaluation output looks like this: +head(mclustEvaluation) +``` + +## Statistics and classifications +After using the algorithms and using the `evaluateAlgorithmOutput` command, all +requirements for creating statistics and classifications were met. + +### Statistics +The `getStatistics()` function calculates the percentage of values that were +correctly (*TRUE*) and not correctly (*FALSE*) estimated by one or more of the +algorithms. To get this statistic, execute the following code: +```{r getStatistic()} +#getStatistic of one evaluation +getStatistics(evaluations = list("mclust"=mclustEvaluation)) + +#getStatistic of two or more evaluations +getStatistics(evaluations = list("mclust"=mclustEvaluation,"HDBSCAN"=hdbscanEvaluation)) + +``` + + +### Classifications +To get the classification of correctly and not correctly estimated scenarios, +the `getClassifications()` function can be used. + +#### Reading the output + * FN (FALSE Negatives) - validationData: modality, estimated scenario: not bimodal + (unimodal or more than bimodal) + * FP (FALSE Positives) - validationData: unimodal, estimated scenario: bimodal + * TN (TRUE Negatives) - validationData: unimodal, estimated scenario: unimodal + * TP (TRUE Positives) - validationData: bimodal, estimated scenario: bimodal + +#### Using the `getClassifications()` command +```{r getClassifications()} +#get the classification of one algorithmOutput +getClassifications(modality = 2,validationData = validationData,outputs = list("mclust"=mclustOutput)) + +#get the classification of two or more algorithms +getClassifications(modality = 2,validationData = validationData, +outputs = list("mclust"=mclustOutput,"HDBSCAN"=hdbscanOutput)) + +``` + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +# Session information {-} + +```{r sessionInfo} +sessionInfo() +``` + + + + +[rnorm]: https://stat.ethz.ch/R-manual/R-devel/library/stats/html/Normal.html +[rgamma]: https://stat.ethz.ch/R-manual/R-devel/library/stats/html/GammaDist.html +[mclust]: http://127.0.0.1:40225/help/library/mclust/doc/mclust.html +[hdbscan]: https://cran.r-project.org/web/packages/dbscan/vignettes/hdbscan.html +[flexmix]: https://cran.r-project.org/web/packages/flexmix/vignettes/flexmix-intro.pdf +[LaplacesDemon]: https://www.rdocumentation.org/packages/LaplacesDemon/versions/16.0.1/topics/dist.Inverse.Gamma + + + + + +