From 5d0f9cfcc9f93e0d9b91ecc5c2ab5a2c937adaba Mon Sep 17 00:00:00 2001 From: sbeck Date: Wed, 21 Feb 2018 11:20:52 +0100 Subject: [PATCH 1/6] Added possibility of simulating and evaluating multimodality (higher than bimodal) The params object was changed. Especially, the slots were updated and renamed. The params object can now hold the parameters for unlimited high modality simulation. The creation of the validationData() was updated to work with the new params object structure. Also, the validationData now has a new structure which resembles approximately the output data format. The validationData now has is a list with one entry per gene. Each entry contains information about the modus (modality as numbers: 1=unimodal, 2=bimodal,...), the scenario(unimodal gaussian, multimodal gaussian or multimodal gamma+gaussian), the means, the foldChanges between the means, the standard deviations of the means, the proportions of the different distributions, the sizes of the groups of the different distributions and the groups with the patients that belong to each group. With the new validationData format the simulation of the expression was also changed. The simulation of unlimited modality gene expression was added. The three possible scenarios with which gene expression can be simulated are now: unimodal gaussian, multimodal gaussian and multimodal gamma+gaussian. The "modus" output of the useXXX() functions is now shown as a number instead of a string. All evaluation functions that work with the validationData are now updated to work with the new structure. The expandValidationData() function was deleted, because it is no longer necessary. The following functions have a new parameter "modality" which is used to evaluate the output of the useXXX functions for a certain modality: - compareScenarios(), - evaluateAlgorithmOutput(), -getClassification(), -getClassifications() The examples in the roxygen documentation of the functions were updated, as well as the documentation itself --- R/algorithmSpecificFunctions.R | 84 ++++--------- R/evaluate.R | 200 ++++++++++-------------------- R/params.R | 87 ++++++------- R/simulation.R | 218 +++++++++++++++++++++++---------- 4 files changed, 289 insertions(+), 300 deletions(-) diff --git a/R/algorithmSpecificFunctions.R b/R/algorithmSpecificFunctions.R index f587b26..604c8e5 100644 --- a/R/algorithmSpecificFunctions.R +++ b/R/algorithmSpecificFunctions.R @@ -3,6 +3,8 @@ #' Use the mclust algorithm for generating output to evaluate the #' classification (unimodal,bimodal or other) of genes. #' +#' +#' #' @param expression (Simulated) Gene Expression data frame which genes shall #' be classified as unimodal, bimodal or other #' @param verbose logical. Whether to print progress messages @@ -10,7 +12,7 @@ #' @return The formatted output of the mclust algorithm #' #' @examples -#' params <- newParams(nUniGenes = 100, nBiGenes = 100, nGuGenes = 100) +#' params <- newParams(nGenes = list("1"=100,"2"=list("gauss"= 100,"gamma"= 100))) #' simulation <- simulateExpression(params= params,verbose=FALSE) #' expression <- simulation$expressionData #' #with messages printed to console @@ -18,18 +20,18 @@ #' #without messages printed to console #' mclustOutput <- useMclust(expression = expression, verbose = FALSE) #' +#' @import mclust #' @export useMclust <- function(expression,verbose = TRUE,parallel = FALSE) { + if (verbose) {message("mclust-models are calculated...")} if(parallel==FALSE){ - # Calculate mclust models sequentially models <- lapply(1:nrow(expression), function(i) { mclust::Mclust((expression[i,]),modelNames=c("V"),verbose = FALSE) }) } if(parallel==TRUE){ - # Calculate mclust models in parallel n.cores = parallel::detectCores()-1 cl = parallel::makeCluster(n.cores) parallel::clusterExport(cl, "expression") @@ -37,11 +39,9 @@ useMclust <- function(expression,verbose = TRUE,parallel = FALSE) { models <- parallel::parLapply(cl, 1:nrow(expression), function(i) { mclust::Mclust((expression[i,]), modelNames=c("V")) }) } - #unser Model zeigt schon log2 Werte if(parallel==TRUE){parallel::stopCluster(cl)} names(models) = rownames(expression) patients <- names(expression) - #save(models, file = "MClustModels.Rdata") mclustOutput <- mclustClassification(models = models,patients = patients) return(mclustOutput) @@ -59,11 +59,12 @@ useMclust <- function(expression,verbose = TRUE,parallel = FALSE) { #' @param patients List of the patients #' #' @return The formatted output of the mclust algorithm +#' +#' +#' mclustClassification <- function(models,patients){ - - ##get values from mclust into a data.frame - genes <- names(models) #row 'header' + genes <- names(models) outputs <- lapply(1:length(genes),function(i){ @@ -75,16 +76,7 @@ mclustClassification <- function(models,patients){ }})) }) proportions <- models[[i]]$parameters$pro - modus <- "modal" - if(length(models[[i]]$parameters$mean)==1){ - modus <- "unimodal" - } - if(length(models[[i]]$parameters$mean)==2){ - modus <- "bimodal" - } - if(length(models[[i]]$parameters$mean)>2){ - modus <- "multimodal" - } + modus <- length(models[[i]]$parameters$mean) output <- list( "modus" = modus, "means" = unname(models[[i]]$parameters$mean), @@ -114,7 +106,7 @@ mclustClassification <- function(models,patients){ #' @return hdbscanOutput The formatted output of the HDBSCAN clustering #' #' @examples -#' params <- newParams(nUniGenes = 100, nBiGenes = 100, nGuGenes = 100) +#' params <- newParams(nGenes = list("1"=100,"2"=list("gauss"= 100,"gamma"= 100))) #' simulation <- simulateExpression(params= params,verbose=FALSE) #' expression <- simulation$expressionData #' #with messages printed to console @@ -124,14 +116,11 @@ mclustClassification <- function(models,patients){ #' #without messages printed to console #' hdbscanOutput <- useHdbscan(expression = expression, verbose = FALSE) #' +#' @import dbscan #' @export useHdbscan <- function(expression,minPts = 10,verbose = TRUE){ if(verbose){message("HDBSCAN-clusters are calculated...")} - #wichtige Fragen: - #wenn wir die NoisePunkte ebenfalls zu den Gruppen zuordnen, sollen - #die sizes ebenfalls angepasst werden? Oder sollen die sizes die Originalzahlen darstellen - #müssen ebenso means und sds neu berechnet werden oder nicht? - genes <- rownames(expression) #row 'header' + genes <- rownames(expression) patients <- names(expression) outputs <- lapply(1:nrow(expression),function(i){ @@ -171,13 +160,9 @@ useHdbscan <- function(expression,minPts = 10,verbose = TRUE){ } modus <- "modal" if((max(classification))==0){ - modus <- "unimodal" - } - if(max(classification)==2){ - modus <- "bimodal" - } - if(max(classification)>2){ - modus <- "multimodal" + modus <- 1 + }else{ + modus <- max(classification) } output <- list( @@ -191,20 +176,6 @@ useHdbscan <- function(expression,minPts = 10,verbose = TRUE){ names(outputs)<- genes hdbscanOutput <- list(Genes = outputs,"ClinicalData"="mnt/agnerds/xyz","Expressionmatrix"="mnt/agnerds/xyzyy") - - # hdbscanClusters <- lapply(1:nrow(expression), function(i) { - # - # - # proportions <- unlist(lapply(0:(max(clustering$cluster)),function(n){ - # length((which(clustering$cluster == n))) / length(clustering$cluster) - # })) - # c(means,sds,proportions) - # }) - # names(hdbscanClusters) = rownames(expression) - # save(hdbscanClusters, file = "hdbscanClusters.Rdata") - # if(verbose){message("Output of clustering is processed and formatted...")} - # hdbscanOutput <- hdbscanClassification(hdbscanClusters = hdbscanClusters) - return(hdbscanOutput) } @@ -215,9 +186,10 @@ useHdbscan <- function(expression,minPts = 10,verbose = TRUE){ #' predicted by a model or an estimator and the values actually observed. #' NAs are filtered out before calculating the RMSE #' -#' @param error The difference of a value predicted by a model and an actually observed value +#' @param error The difference of a value predicted by a model and an actually +#' observed value +#' #' -#' @return The RMSE value rmse <- function(error){ error <- unlist(error) error2 <- unlist(lapply(1:length(error),function(i){if(!is.na(error[[i]])){c(error[[i]])}})) @@ -230,8 +202,6 @@ rmse <- function(error){ #' Use the FlexMix algorithm for creating a model to find the best fitting #' model to evaluate the classification (unimodal,bimodal or other) of genes. #' -#' -#' #' @param expression (Simulated) Gene Expression data frame which genes shall #' be classified as unimodal, bimodal or other #' @param verbose logical. Whether to print progress messages @@ -239,7 +209,7 @@ rmse <- function(error){ #' @return flexmixOutput The formatted output of the best fitting models from flexmix #' #' @examples -#' params <- newParams(nUniGenes = 100, nBiGenes = 100, nGuGenes = 100) +#' params <- newParams(nGenes = list("1"=100,"2"=list("gauss"= 100,"gamma"= 100))) #' simulation <- simulateExpression(params= params,verbose=FALSE) #' expression <- simulation$expressionData #' #with messages printed to console @@ -247,6 +217,8 @@ rmse <- function(error){ #' #without messages printed to console #' flexmixOutput <- useFlexmix(expression = expression, verbose = FALSE) #' +#' @import flexmix +#' @importFrom plyr compact #' @export useFlexmix <- function(expression, verbose = TRUE){ mo1 <- flexmix::FLXMRglm(family = "gaussian") @@ -366,12 +338,11 @@ useFlexmix <- function(expression, verbose = TRUE){ c(3) } })) - #save(bestModel, file = "flexModels.Rdata") if(verbose){message("Creating the formatted output with the best fitting models")} outputs <- lapply(1:length(bestModel),function(i){ if(bestModel[i]==1){ - modus <- "unimodal" + modus <- 1 means <- (flexModelsUni[[i]]$means) sds <- c(flexModelsUni[[i]]$sds) sizes <- c(flexModelsUni[[i]]$sizes) @@ -382,10 +353,9 @@ useFlexmix <- function(expression, verbose = TRUE){ }})) }) - #c(flexModelsUni[[i]][1],NA,flexModelsUni[[i]][2],NA,(flexModelsUni[[i]][3])/100,NA) }else if(bestModel[i]==2){ - modus <- "bimodal" + modus <- 2 means <- c(flexModelsBi[[i]]$means) sds <- c(flexModelsBi[[i]]$sds) sizes <- c(flexModelsBi[[i]]$sizes) @@ -397,7 +367,7 @@ useFlexmix <- function(expression, verbose = TRUE){ }) }else if(bestModel[i]==3){ - modus <- "bimodal" + modus <- 2 means <- c(flexModelsGU[[i]]$means) sds <- c(flexModelsGU[[i]]$sds) sizes <- c(flexModelsGU[[i]]$sizes) @@ -408,8 +378,8 @@ useFlexmix <- function(expression, verbose = TRUE){ }})) }) } - if(modus=="bimodal"&&(sizes[1]==ncol(expression2)||sizes[2]==ncol(expression2))){ - modus="unimodal" + if(modus==2&&(sizes[1]==ncol(expression2)||sizes[2]==ncol(expression2))){ + modus=1 means <- means[!is.na(means)] sds <- sds[!is.na(sds)] sizes <- sizes[!is.na(sizes)] diff --git a/R/evaluate.R b/R/evaluate.R index 5f22355..defc453 100644 --- a/R/evaluate.R +++ b/R/evaluate.R @@ -2,41 +2,31 @@ #' #' Compare the scenario of each gene that was assigned by the algorithm with #' the scenario that is written in the validation Data. -#' +#' @param modality numerical, for which modality the scenarioComparing is calculated, 1 for unimodal, +#' 2 for bimodal,... #' @param algorithmOutput The output to be compared #' @param validationData The validationData behind the simulated gene expression data frame #' #' @return Returns a list that classifies each assignment as #' TN (TRUE Negative),TP (TRUE Positive), FN (FALSE Negative), FP (FALSE Positive) #' @details -#' TN (TRUE Negative): validationData: unimodal, assigned scenario: unimodal +#' TN (TRUE Negative): validationData: not modality, assigned scenario: not modality #' -#' TP (TRUE Positive): validationData: bimodal, assigned scenario: bimodal +#' TP (TRUE Positive): validationData: modality, assigned scenario: modality #' -#' FN (FALSE Negative): validationData: bimodal, assigned scenario: unimodal -#' or -#' FN (FALSE Negative): validationData: unimodal/bimodal, -#' assigned scenario: more than bimodal +#' FN (FALSE Negative): validationData: modality, assigned scenario: not modality #' -#' FP (FALSE Positive): validationData: unimodal, assigned scenario: bimodal +#' FP (FALSE Positive): validationData: not modality, assigned scenario: modality #' -compareScenarios <- function(algorithmOutput,validationData){ +compareScenarios <- function(modality, algorithmOutput,validationData){ compareScenarios <- (unlist(lapply(1:length(algorithmOutput$Genes),function(i){ - if(validationData[i,2]==1&&algorithmOutput$Genes[[i]]$modus=="unimodal"){ - #unimodal + classified as unimodal + if(validationData[[i]]$modus != modality && algorithmOutput$Genes[[i]]$modus!=modality){ evaluation <- "TN" - }else if((validationData[i,2]==2||validationData[i,2]==3)&&algorithmOutput$Genes[[i]]$modus=="bimodal"){ - #bimodal + classified as bimodal + }else if(validationData[[i]]$modus ==modality && algorithmOutput$Genes[[i]]$modus==modality){ evaluation <- "TP" - }else if((validationData[i,2]==1)&&algorithmOutput$Genes[[i]]$modus=="bimodal"){ - #unimodal + classified as bimodal + }else if(validationData[[i]]$modus !=modality && algorithmOutput$Genes[[i]]$modus==modality){ evaluation <- "FP" - }else if((validationData[i,2]==2||validationData[i,2]==3)&&algorithmOutput$Genes[[i]]$modus=="unimodal"){ - #bimodal + classified as unimodal - evaluation <- "FN" - }else if((validationData[i,2]==1||validationData[i,2]==2||validationData[i,2]==3)&& - algorithmOutput$Genes[[i]]$modus=="multimodal"){ - #unimodal/bimodal + classified as more than multimodal + }else if(validationData[[i]]$modus ==modality && algorithmOutput$Genes[[i]]$modus!=modality){ evaluation <- "FN" }else{ evaluation <- NA @@ -66,49 +56,10 @@ isApproxSame<- function(validationVal,classVal,maxDistance){ } } -#' Expand the validationData -#' -#' Adds two columns to the validationData: mean2 and sdGamma -#' @param validationData not expanded validationData data frame -#' @return returns validationData with the two added columns -#' -#' @details -#' For scenario 1, no mean2 and no sdGamma is added (NA added) -#' For scenario 2 the mean2 (calculated out of mean1 and foldChange) is added, -#' but no sdGamma is added (NA added) -#' For scenario 3, the mean 2 is not added(NA added, foldChange was already -#' used), but sdGamma is calculated with sqrt(gammaShape*(gammaScale^2)) -#' gammaScale is 1/gammaRate -#' -expandValidationData <- function(validationData){ - validationData <- cbind(validationData,unlist(lapply(1:nrow(validationData),function(i){ - if(as.numeric(validationData[i,2])==1){#is unimodal, no mean2 - addValue <- NA - }else if(as.numeric(validationData[i,2])==2){#calculates mean2 with mean1*FC - addValue <- as.numeric(validationData[i,3])*as.numeric(validationData[i,6]) - }else if(as.numeric(validationData[i,2])==3){#calculates mean2 with shape*scale - gammaScale <- 1/as.numeric(validationData[i,10]) - mean2 <- as.numeric(validationData[i,9])*gammaScale - addValue <- mean2 - } - - }))) - colnames(validationData)[11]<-"Mean2" - validationData <- cbind(validationData,unlist(lapply(1:nrow(validationData),function(i){ - if((as.numeric(validationData[i,2])==1) || (as.numeric(validationData[i,2])==2)){#is unimodal or bimodal without gamma, no sdGamma2 - addValue <- NA - }else if(as.numeric(validationData[i,2])==3){#calculates sd2 with sqrt(shape*(scale^2)) - gammaScale <- 1/as.numeric(validationData[i,10]) - sdGamma <- sqrt(as.numeric(validationData[i,9])*(gammaScale*gammaScale)) - addValue <- sdGamma - } - }))) - colnames(validationData)[12]<-"sdGamma" - return(validationData) -} - #' Evaluate AlgorithmOutput #' +#' @param modality numerical, for which modality output is evaluated, 1 for unimodal, +#' 2 for bimodal,... #' @param algorithmOutput The formatted output of the algorithm that is now #' evaluated #' @param validationData the validationData data.frame @@ -116,16 +67,16 @@ expandValidationData <- function(validationData){ #' value and the validation data value with which the estimated is classified #' as TRUE (correctly estimated) #' @examples -#' params <- newParams(nUniGenes = 100, nBiGenes = 100, nGuGenes = 100) +#' params <- newParams(nGenes = list("1"=100,"2"=list("gauss"= 100,"gamma"= 100))) #' simulation <- simulateExpression(params= params,verbose=FALSE) #' expression <- simulation$expressionData #' validationData <- simulation$validationData #' output <- useMclust(expression, verbose=FALSE) #' # TRUE if less than +- 10 % difference to value of validationData -#' evaluation <- evaluateAlgorithmOutput(algorithmOutput = output, +#' evaluation <- evaluateAlgorithmOutput(modality = 2,algorithmOutput = output, #' validationData = validationData) #' # TRUE if less than +- 20 % difference to value of validationData -#' evaluation <- evaluateAlgorithmOutput(algorithmOutput = output, +#' evaluation <- evaluateAlgorithmOutput(modality = 2,algorithmOutput = output, #' validationData = validationData,maxDifference = 20) #' @return Returns a data.frame where for each as TP classified gene it was #' evaluated whether it was estimated approximately correct or not (TRUE or @@ -138,62 +89,41 @@ expandValidationData <- function(validationData){ #' or equal to 10%, the estimation is classified as TRUE (correctly estimated) #' #' @export -evaluateAlgorithmOutput <- function(algorithmOutput,validationData,maxDifference = 10){ - validationDataExpanded <- expandValidationData(validationData = validationData) - compared <- compareScenarios(algorithmOutput = algorithmOutput,validationData = validationDataExpanded) - - genes <- validationDataExpanded[,1] #row 'header' +evaluateAlgorithmOutput <- function(modality,algorithmOutput,validationData,maxDifference = 10){ + compared <- compareScenarios(modality = modality,algorithmOutput = algorithmOutput,validationData = validationData) - numCol <- ncol(algorithmOutput) - perAt <- floor(numCol/3) + genes <- names(validationData) - approxSame <- (lapply(1:length(algorithmOutput$Genes),function(i){ - if((algorithmOutput$Genes[[i]]$modus=="bimodal") && (as.numeric(validationDataExpanded[i,2])==2||as.numeric(validationDataExpanded[i,2])==3)){ - mean1 <- isApproxSame(validationVal = min(as.numeric(validationDataExpanded[i,3]),as.numeric(validationDataExpanded[i,11])), - classVal = min(algorithmOutput$Genes[[i]]$means[1],algorithmOutput$Genes[[i]]$means[2]), - maxDistance = maxDifference) - mean2 <- isApproxSame(validationVal = max(as.numeric(validationDataExpanded[i,3]),as.numeric(validationDataExpanded[i,11])), - classVal = max(algorithmOutput$Genes[[i]]$means[1],algorithmOutput$Genes[[i]]$means[2]), - maxDistance = maxDifference) - if(as.numeric(validationDataExpanded[i,2])==2){ - sd1 <- isApproxSame(validationVal = (min(as.numeric(validationDataExpanded[i,4]),as.numeric(validationDataExpanded[i,5]))), - classVal = min(algorithmOutput$Genes[[i]]$sds[1],algorithmOutput$Genes[[i]]$sds[1]), - maxDistance = maxDifference) - sd2 <- isApproxSame(validationVal = (max(as.numeric(validationDataExpanded[i,4]),as.numeric(validationDataExpanded[i,5]))), - classVal = max(algorithmOutput$Genes[[i]]$sds[1],algorithmOutput$Genes[[i]]$sds[2]), - maxDistance = maxDifference) - }else if(as.numeric(validationDataExpanded[i,2])==3){ - sd1 <- isApproxSame(validationVal = (min(as.numeric(validationDataExpanded[i,4]),as.numeric(validationDataExpanded[i,12]))), - classVal = min(algorithmOutput$Genes[[i]]$sds[1],algorithmOutput$Genes[[i]]$sds[2]), - maxDistance = maxDifference) - sd2 <- isApproxSame(validationVal = (max(as.numeric(validationDataExpanded[i,4]),as.numeric(validationDataExpanded[i,12]))), - classVal = max(algorithmOutput$Genes[[i]]$sds[1],algorithmOutput$Genes[[i]]$sds[2]), - maxDistance = maxDifference) - } - if((algorithmOutput$Genes[[i]]$sizes[1]==100)&&(is.na(algorithmOutput$Genes[[i]]$sizes[2]))){ - prop1 <- isApproxSame(validationVal = ((max(as.numeric(validationDataExpanded[i,7]),as.numeric(validationDataExpanded[i,8])))), - classVal = algorithmOutput$Genes[[i]]$sizes[1], - maxDistance = maxDifference) - prop2 <- FALSE - } - else{ - prop1 <- isApproxSame(validationVal = ((min(as.numeric(validationDataExpanded[i,7]),as.numeric(validationDataExpanded[i,8])))), - classVal = min(algorithmOutput$Genes[[i]]$sizes[1],algorithmOutput$Genes[[i]]$sizes[2]), - maxDistance = maxDifference) - prop2 <- isApproxSame(validationVal = ((max(as.numeric(validationDataExpanded[i,7]),as.numeric(validationDataExpanded[i,8])))), - classVal = max(algorithmOutput$Genes[[i]]$sizes[1],algorithmOutput$Genes[[i]]$sizes[2]), - maxDistance = maxDifference) - } + approxSame <- (lapply(1:length(algorithmOutput$Genes),function(i){ + if((algorithmOutput$Genes[[i]]$modus==modality) && (validationData[[i]]$modus==modality)){ + classValsMean <- sort(algorithmOutput$Genes[[i]]$means) + x <- algorithmOutput$Genes[[i]]$sds + names(x) <- order(algorithmOutput$Genes[[i]]$means) + classValSDs <- x[order(names(x))] + y <- algorithmOutput$Genes[[i]]$sizes + names(y) <- order(algorithmOutput$Genes[[i]]$means) + classValProps <- y[order(names(y))] + means <- unlist(lapply(1:length(validationData[[i]]$means),function(j){ + isApproxSame(validationVal = validationData[[i]]$means[j],classVal = classValsMean[j],maxDistance = maxDifference) + })) + sds <- unlist(lapply(1:length(validationData[[i]]$sds),function(j){ + isApproxSame(validationVal = validationData[[i]]$sds[j],classVal = classValSDs[j],maxDistance = maxDifference) + })) + props <- unlist(lapply(1:length(validationData[[i]]$sizes),function(j){ + isApproxSame(validationVal = validationData[[i]]$sizes[j],classVal = classValProps[j],maxDistance = maxDifference) + })) - c(paste0("Gene",i),mean1,mean2,sd1,sd2,prop1,prop2) - } - })) - evaluation<- data.frame(do.call('rbind',approxSame)) - result <- data.frame(evaluation[,2:ncol(evaluation)]) - rownames(result) <- evaluation[,1] - colnames(result) <- c("mean1","mean2","sd1","sd2","prop1","prop2") + c(means,sds,props) + } + })) + names(approxSame) <- genes + evaluation<- data.frame(do.call('rbind',approxSame)) + meanNames <- sprintf(c("mean%s"),(1:(ncol(evaluation)/3))) + sdNames <- sprintf(c("sd%s"),(1:(ncol(evaluation)/3))) + propNames <- sprintf(c("prop%s"),(1:(ncol(evaluation)/3))) + colnames(evaluation) <- c(meanNames,sdNames,propNames) - return(result) + return(evaluation) } #' getClassification @@ -201,21 +131,22 @@ evaluateAlgorithmOutput <- function(algorithmOutput,validationData,maxDifference #' Returns the numbers of TN (TRUE Negative),TP (TRUE Positive), #' FN (FALSE Negative) and FP (FALSE Positive) for one of the algorithm's #' formatted outputs. -#' +#' @param modality numerical, for which modality the classification is calculated, 1 for unimodal, +#' 2 for bimodal,... #' @param algorithmOutput The formatted output of the algorithm #' @param algorithmName The name of the evaluated algorithm #' @param validationData The validationData behind the simulated gene #' expression data frame -#' #' @examples -#' params <- newParams(nUniGenes = 100, nBiGenes = 100, nGuGenes = 100) +#' params <- newParams(nGenes = list("1"=100,"2"=list("gauss"= 100,"gamma"= 100))) #' simulation <- simulateExpression(params= params,verbose=FALSE) #' expression <- simulation$expressionData #' validationData <- simulation$validationData #' mclustOutput <- useMclust(expression, verbose=FALSE) #' evaluation <- evaluateAlgorithmOutput(algorithmOutput = output, #' validationData = validationData) -#' getClassification(algorithmOutput = mclustOutput,algorithmName = "mclust", +#' getClassification(modality= 2,algorithmOutput = mclustOutput, +#' algorithmName = "mclust", #' validationData = validationData) #' @return a classification data frame that contains the number of #' FN, FP, TN and TP @@ -232,9 +163,9 @@ evaluateAlgorithmOutput <- function(algorithmOutput,validationData,maxDifference #' #' FP (FALSE Positive): validationData: unimodal, assigned scenario: bimodal #' -getClassification <- function(algorithmOutput,algorithmName="mclust",validationData){ +getClassification <- function(modality,algorithmOutput,algorithmName="mclust",validationData){ - compared <- compareScenarios( + compared <- compareScenarios(modality = modality, algorithmOutput = algorithmOutput,validationData = validationData) fn <- table(compared)["FN"] @@ -259,14 +190,14 @@ getClassification <- function(algorithmOutput,algorithmName="mclust",validationD #' @param evaluation The output of the evaluation of the formatted output #' @param algorithmName The name of the algorithm which created the output #' @return A data frame with the percentual proportion of as TRUE or FALSE evaluated values -#' #' @examples -#' params <- newParams(nUniGenes = 100, nBiGenes = 100, nGuGenes = 100) +#' params <- newParams(nGenes = list("1"=100,"2"=list("gauss"= 100,"gamma"= 100))) #' simulation <- simulateExpression(params= params,verbose=FALSE) #' expression <- simulation$expressionData #' validationData <- simulation$validationData #' mclustOutput <- useMclust(expression, verbose=FALSE) -#' mclustEvaluation <- evaluateAlgorithmOutput(algorithmOutput = mclustOutput,validationData = validationData) +#' mclustEvaluation <- evaluateAlgorithmOutput(algorithmOutput = mclustOutput, +#' validationData = validationData) #' getStatistic(evaluation = mclustEvaluation,algorithmName = "mclust") #' getStatistic <- function(evaluation,algorithmName ="mclust"){ @@ -287,7 +218,7 @@ getStatistic <- function(evaluation,algorithmName ="mclust"){ #' @param evaluations list of named evaluations #' @return A data frame with the percentual proportion of as TRUE or FALSE evaluated values #' @examples -#' params <- newParams(nUniGenes = 100, nBiGenes = 100, nGuGenes = 100) +#' params <- newParams(nGenes = list("1"=100,"2"=list("gauss"= 100,"gamma"= 100))) #' simulation <- simulateExpression(params= params,verbose=FALSE) #' expression <- simulation$expressionData #' validationData <- simulation$validationData @@ -322,14 +253,15 @@ getStatistics <- function(evaluations) { #' Returns the numbers of TN (TRUE Negative),TP (TRUE Positive), #' FN (FALSE Negative) and FP (FALSE Positive) for each of the algorithm's #' formatted outputs. -#' +#' @param modality numerical, for which modality the classification is calculated, 1 for unimodal, +#' 2 for bimodal,... #' @param outputs list of named outputs #' @param validationData The validationData behind the simulated gene expression data frame #' #' @return a classification data frame that contains the number of #' FN, FP, TN and TP #' @examples -#' params <- newParams(nUniGenes = 100, nBiGenes = 100, nGuGenes = 100) +#' params <- newParams(nGenes = list("1"=100,"2"=list("gauss"= 100,"gamma"= 100))) #' simulation <- simulateExpression(params= params,verbose=FALSE) #' expression <- simulation$expressionData #' validationData <- simulation$validationData @@ -338,9 +270,11 @@ getStatistics <- function(evaluations) { #' flexmixOutput <- useFlexmix(expression = expression,verbose = FALSE) #' #' #getClassifications of one algorithm -#' getClassifications(validationData,outputs = list("mclust"=mclustOutput)) +#' getClassifications(modality=2,validationData, +#' outputs = list("mclust"=mclustOutput)) #' #getClassifications of two or more algorithms -#' getClassifications(validationData,outputs = list("mclust"=mclustOutput,"HDBSCAN"=hdbscanOutput)) +#' getClassifications(modality=2,validationData, +#' outputs = list("mclust"=mclustOutput,"HDBSCAN"=hdbscanOutput)) #' #' @details #' TN (TRUE Negative): validationData: unimodal, assigned scenario: unimodal @@ -356,10 +290,10 @@ getStatistics <- function(evaluations) { #' #' #' @export -getClassifications <- function(validationData,outputs){ +getClassifications <- function(modality,validationData,outputs){ numOutputs <- (length(outputs)) classification <- lapply(1:numOutputs,function(i){ - getClassification(algorithmOutput = outputs[[i]], + getClassification(modality,algorithmOutput = outputs[[i]], algorithmName = names(outputs)[i], validationData = validationData) diff --git a/R/params.R b/R/params.R index 7242aae..5a7ec2e 100644 --- a/R/params.R +++ b/R/params.R @@ -9,7 +9,7 @@ #' #' @examples #' params <- newParams() -#' getParam(params, "nUniGenes") +#' getParam(object = params,name = "nGenes") #' #' @rdname getParam #' @export @@ -27,9 +27,10 @@ setGeneric("getParam", function(object, name) {standardGeneric("getParam")}) #' #' @examples #' params <- newParams() -#' params = setParam(params, "nBiGenes", 100) +#' params <- setParam(object = params,name = "nPatients",value = 250) #' #' @rdname setParam +#' @importFrom methods new rbind2 slot slot<- slotNames validObject #' @export setGeneric("setParam", function(object, name, value) { @@ -79,59 +80,51 @@ setGeneric("setParam", #' @aliases Params-class #' @exportClass Params setClass("Params", - slots = c(nUniGenes = "numeric", - nBiGenes = "numeric", - nGuGenes ="numeric", - nPatients = "numeric", - seed = "numeric", - sdParms = "vector", - uniMean = "vector", - biMean = "vector", - foldChange ="vector", + slots = c(nGenes ="list", + means = "list", + foldChanges = "list", + sd = "list", + gamma = "list", proportions = "vector", - guMean = "vector", - gammaShape = "numeric", - gammaRate = "numeric"), - prototype = prototype(nUniGenes = 300, nBiGenes = 500, nGuGenes = 200, - nPatients = 100, seed = sample(1:1e6, 1), - sdParms = c(0.61,2.21), uniMean = c(2,5), - biMean = c(2,5), foldChange =c(2,4), - proportions = c(10,20,30,40,50,60,70,80,90), guMean = c(2,5), - gammaShape = 2, gammaRate = 2)) + nPatients = "numeric", + seed = "numeric"), + prototype = prototype(nGenes = list("1"= 700, "2" = list("gauss"=150,"gamma"=150)), + means = list("1"=c(2,4),"2" = c(2,4)), + foldChanges = list("1" = c(2,4),"2" = list("gauss"= c(2,4),"gamma" = c(3,5))), + sd = list("mu"= 0.61,"lambda"=2.21), + gamma = list("shape" = 2, "rate" = 2), + proportions = c(10,20,30,40,50,60,70,80,90), + nPatients = 200, + seed = sample(1:1e6, 1))) #' @rdname getParam setMethod("getParam", "Params", function(object, name) { - slot(object, name) + methods::slot(object, name) }) #' @rdname setParam setMethod("setParam", "Params", function(object, name, value) { checkmate::assertString(name) - slot(object, name) <- value - validObject(object) + methods::slot(object, name) <- value + methods::validObject(object) return(object) }) setMethod("show", "Params", function(object) { - pp <- list("Global:" = c("UnimodalGenes" = "nUniGenes", - "BimodalGenes" = "nBiGenes", - "GammaUnimodalGenes" = "nGuGenes", - "Patients" = "nPatients", - "Seed" = "seed", - "Mu,Lambda for sd"="sdParms", - "Unimodal Mean" = "uniMean", - "Bimodal Mean" = "biMean", - "FoldChange" = "foldChange", + pp <- list("Global:" = c("nGenes" = "nGenes", + "Means" = "means", + "FoldChanges" = "foldChanges", + "sd" ="sd", + "Gamma" ="gamma", "Proportions" = "proportions", - "GammaUnimodal Mean" = "guMean", - "Gamma Shape" = "gammaShape", - "Gamma Rate" = "gammaRate")) + "nPatients" = "nPatients", + "Seed" = "seed")) cat("A Params object of class", class(object), "\n") cat("Parameters can be Default' or 'NOT DEFAULT'.", "\n\n") showPP(object, pp) - cat(length(slotNames(object)) - 13, "additional parameters", "\n\n") + cat(length(methods::slotNames(object)) - 8, "additional parameters", "\n\n") }) #' New Params @@ -144,13 +137,19 @@ setMethod("show", "Params", function(object) { #' @return New Params object. #' @examples #' params <- newParams() -#' params <- newParams(nUniGenes = 200, nPatients = 10) +#' params <- newParams("nGenes"=list("1"=100,"2"=list("gauss"=100,"gamma"=100)), +#' "proportions"=c(30,70)) +#' params <- newParams( +#' nGenes=list("1"=c(700),"2"=list(gauss = 150, gamma = 150),"3"=list(gauss = 150, gamma = 150)), +#' means=list("1"= c(2, 4),"2"= c(2, 4),"3"= c(2, 4)), +#' foldChanges = list("1"=NA,"2"= list(gauss = c(2, 4), gamma = c(2, 4)), +#' "3"= list(gauss = list(c(2, 4), c(2, 4)), gamma = list(c(2, 4), c(2, 4))))) #' #' @rdname newParams #' @export newParams <- function(...) { - params <- new("Params") + params <- methods::new("Params") params <- setParams(params, ...) return(params) @@ -166,7 +165,7 @@ newParams <- function(...) { #' @return List with the values of the selected parameters. #' @examples #' params <- newParams() -#' getParams(params, c("nUniGenes", "nPatients", "uniMean")) +#' getParams(params = params,names = c("nGenes","foldChanges","proportions","means")) #' @export getParams <- function(params, names) { @@ -198,10 +197,12 @@ getParams <- function(params, names) { #' params <- newParams() #' params #' # Set individually -#' params <- setParams(params, nUniGenes = 1000, nPatients = 50) +#' params <- params <- setParams(params,proportions = c(80,20),foldChanges = list("1"=c(2,4), +#' "2"= list("gauss"=c(2,5),"gamma"=c(3,5)))) #' params #' # Set via update list -#' params <- setParams(params, list(nUniGenes = 1000, nPatients = 50)) +#' params <- setParams(params,update = list("nGenes"=list("1"=100, +#' "2"=list("gauss"=200,"gamma"=100)),"means"=list("1"=c(1,3),"2"=c(2,4)),"nPatients"=700)) #' params #' @export setParams <- function(params, update = NULL, ...) { @@ -236,12 +237,12 @@ showPP <- function(params, pp) { checkmate::assertClass(params, classes = "Params") checkmate::assertList(pp, types = "character", min.len = 1) - default <- new(class(params)) + default <- methods::new(class(params)) for (category in names(pp)) { parameters <- pp[[category]] values <- getParams(params, parameters) short.values <- sapply(values, function(x) { - if (length(x) > 4) { + if (length(x) > 10) { paste0(paste(head(x, n = 4), collapse = ", "), ",...") } else { paste(x, collapse = ", ") diff --git a/R/simulation.R b/R/simulation.R index 101ed7d..69a2467 100644 --- a/R/simulation.R +++ b/R/simulation.R @@ -13,7 +13,7 @@ #' simulated from the mean and the standard deviation taken from the validation data #' for this gene. The number of values that are simulated for each #' distribution equals the number of Patients {"nPatients"}: -#' \code{expression <- rnorm(nPatients,as.numeric(valData[i,3]), as.numeric(valData[i,4]))} +#' \code{expression <- stats::rnorm(nPatients,as.numeric(valData[i,3]), as.numeric(valData[i,4]))} #' #' Scenario 2 - Bimodal with two unimodal gaussian distributions #' Gene expression for the bimodal scenario 2 (two gaussian distributions) is @@ -22,7 +22,7 @@ #' deviation taken from the validation data for this gene. The number of values that are #' simulated ("nBi1") are calculated by multiplicating the number of Patients with the #' first proportion and rounding this number off: -#' \code{data1 <- rnorm(nBi1,as.numeric(valData[i,3]),as.numeric(valData[i,4]))} +#' \code{data1 <- stats::rnorm(nBi1,as.numeric(valData[i,3]),as.numeric(valData[i,4]))} #' The second distribution is simulated with the mean calculated by #' multiplying the mean of the first distribution with the foldChange #' from the validation data of this gene. The standard deviation is also taken from the validation data of this gene. The number of values that are @@ -31,7 +31,7 @@ #' the mean and the standard deviation taken from the validation data for this gene. #' The number of values that are simulated for each distribution equals the #' number of Patients {"nPatients"}: -#' \code{data2 <- rnorm(nBi2,foldBi,as.numeric(valData[i,5]))} +#' \code{data2 <- stats::rnorm(nBi2,foldBi,as.numeric(valData[i,5]))} #' At the end both of the simulated distributions are put into one expression. #' \code{expression <- c(data1,data2)} #' @@ -44,7 +44,7 @@ #' The number of values that are simulated ("nGu1") are calculated by #' multiplying the number of Patients with the first proportion and rounding #' this number off. -#' \code{data1 <- rgamma(n = nGu1,shape = as.numeric(valData[i,9]), +#' \code{data1 <- stats::rgamma(n = nGu1,shape = as.numeric(valData[i,9]), #' rate = as.numeric(valData[i,10]))} #' The second distribution, which is the unimodal gaussian distribution is #' simulated with the calculated mean, which is calculated by multiplying the @@ -52,7 +52,7 @@ #' deviation is taken from the validation data of this gene. The number of values that #' are simulated ("nGu2") are calculated by multiplying the number of Patients #' with the second proportion and rounding this number up. -#' \code{data2 <- rnorm(nGu2,foldGamma,as.numeric(valData[i,4]))} +#' \code{data2 <- stats::rnorm(nGu2,foldGamma,as.numeric(valData[i,4]))} #' At the end both of the simulated distributions are put into one expression. #' \code{expression <- c(data1,data2)} #' @@ -63,11 +63,13 @@ #' simulation <- simulateExpression(params = newParams()) #' # Override default parameters #' # Not recommended, better use setParams; See \code{\link{Params}} -#' simulation <- simulateExpression(nUniGenes = 1000, nPatients = 50) +#' simulation <- simulateExpression(nGenes = list("1"=100,"2"=list("gauss"= 100,"gamma"= 100)), nPatients = 50) #' # without messages printed to the console +#' params <- newParams(nGenes = list("1"=100,"2"=list("gauss"= 100,"gamma"= 100))) #' simulation <- simulateExpression(params = params,verbose = FALSE) #' @export #' @importFrom checkmate assertClass assertString assertCharacter assertList +#' @importFrom stats dnorm rgamma rnorm runif sd #' simulateExpression <- function(params = newParams(), verbose = TRUE,...) { @@ -75,43 +77,48 @@ simulateExpression <- function(params = newParams(), verbose = TRUE,...) { checkmate::assertClass(params, "Params") params <- setParams(params, ...) - # Set random seed seed <- getParam(params, "seed") set.seed(seed) valData <- createValidationData(params = params, verbose = verbose) nPatients <- getParam(params,"nPatients") - nUniGenes <- getParam(params, "nUniGenes") - nBiGenes <- getParam(params,"nBiGenes") - nGuGenes <- getParam(params,"nGuGenes") - nGenes <- nUniGenes+nBiGenes+nGuGenes + + numGenes <- sum(unlist(getParam(params,name = "nGenes"))) if (verbose){message("Drawing gene expression values...")} - ##STEP 2 - Generating the simulation result - sim <- lapply(1:nGenes,function(i){ - if(valData[i,2]==1){ - expression <- rnorm(nPatients,as.numeric(valData[i,3]),as.numeric(valData[i,4])) - }else if(valData[i,2]==2){ - nBi1 = floor(nPatients*as.numeric(valData[i,7])/100) - nBi2 = ceiling(nPatients*as.numeric(valData[i,8])/100) - foldBi = as.numeric(valData[i,3])*as.numeric(valData[i,6]) - data1 <- rnorm(nBi1,as.numeric(valData[i,3]),as.numeric(valData[i,4])) - data2 <- rnorm(nBi2,foldBi,as.numeric(valData[i,5])) - expression <- c(data1,data2) - }else if(valData[i,2]==3){ - nGu1 = floor(nPatients*as.numeric(valData[i,7])/100) - nGu2 = ceiling(nPatients*as.numeric(valData[i,8])/100) - data1 <- rgamma(n = nGu1,shape = as.numeric(valData[i,9]),rate = as.numeric(valData[i,10])) - data2 <- rnorm(nGu2,as.numeric(valData[i,3]),as.numeric(valData[i,4])) - expression <- c(data1,data2) + simNotAdjusted <- lapply(1:numGenes,function(i){ + if(valData[[i]]$scenario=="unimodal-gaussian"){ + expression <- stats::rnorm(n = nPatients,mean = valData[[i]]$means,sd = valData[[i]]$sds) + }else if(valData[[i]]$scenario=="multimodal-gaussian"){ + expression <- unlist(lapply(1:valData[[i]]$modus,function(j){ + expression <- stats::rnorm(n = valData[[i]]$sizes[j],mean = valData[[i]]$means[j],sd = valData[[i]]$sds[j]) + })) + }else if(valData[[i]]$scenario=="multimodal-gamma+gaussian"){ + expression <- unlist(lapply(1:valData[[i]]$modus,function(j){ + if(j==1){ + expression <- stats::rgamma(n = valData[[i]]$sizes[j],shape = params@gamma$shape,rate = params@gamma$rate) + }else{ + expression <- stats::rnorm(n = valData[[i]]$sizes[j],mean = valData[[i]]$means[j],sd = valData[[i]]$sds[j]) + } + })) } expression }) + simNotAdjusted <- data.frame(do.call('rbind',simNotAdjusted)) + sim <- lapply(1:nrow(simNotAdjusted),function(i){ + values <- unlist(lapply(1:ncol(simNotAdjusted),function(j){ + if(simNotAdjusted[i,j]<0){ + 0 + }else{ + simNotAdjusted[i,j] + } + })) + }) sim <- data.frame(do.call('rbind',sim)) - genes <- valData[,1] + genes <- names(valData) patients <- sprintf("Patient%04d", 1:nPatients) #columns 'header' rownames(sim) <- genes colnames(sim) <- patients @@ -166,57 +173,134 @@ simulateExpression <- function(params = newParams(), verbose = TRUE,...) { #' #' gammaRate: numeric, Default value: 2 #' -#' @export +#' #' @importFrom LaplacesDemon rinvgaussian #' createValidationData <- function(params = newParams(),verbose = TRUE){ - # Get the parameters we are going to use - nPatients <- getParam(params, "nPatients") - nUniGenes <- getParam(params, "nUniGenes") - nBiGenes <- getParam(params,"nBiGenes") - nGuGenes <- getParam(params,"nGuGenes") - nGenes <- nUniGenes+nBiGenes+nGuGenes - nSDs <- nUniGenes+ 2*nBiGenes+nGuGenes - sdParms <- getParam(params, "sdParms") - uniMean <- getParam(params, "uniMean") - biMean <- getParam(params, "biMean") - foldChange <- getParam(params, "foldChange") - proportions <- getParam(params, "proportions") - guMean <- getParam(params, "guMean") - gammaShape <- getParam(params, "gammaShape") - gammaRate <- getParam(params,"gammaRate") + numGenes <- sum(unlist(getParam(params,name = "nGenes"))) if (verbose) { message("Drawing parameters for expression simulation...") } - genes <- sprintf("Gene%04d",1:nGenes) - scenarios <- sample(rep(c(1,2,3), times = c(nUniGenes, nBiGenes, nGuGenes)), size = nGenes, replace = F) + genes <- sprintf("Gene%04d",1:numGenes) + reps <- c() + reps <- unlist(lapply(1:length(unlist(params@nGenes)),function(i){ + if(i == 1){ + reps <- rep(x = i,times = params@nGenes$`1`) + }else{ + if(i%%2==0){ + reps <- c(reps,rep(x = i, times = sum(unlist(eval(parse(text=(paste0("params@nGenes$`",floor(i/2)+1,"`$gauss")))))))) + }else{ + reps <- c(reps,rep(x = i, times = sum(unlist(eval(parse(text=(paste0("params@nGenes$`",floor(i/2)+1,"`$gamma")))))))) + } + } - drawGeneParams <- function(proportions, mean.min, mean.max, sd.mu, sd.lambda, fc.min = NULL, fc.max = NULL, gamma.shape = NA, gamma.rate = NA) { - ranMean <- runif(1, min = mean.min, max = mean.max) - ranSd <- LaplacesDemon::rinvgaussian(n = 1, mu = sd.mu, lambda = sd.lambda) - ranProp <- ifelse(length(proportions) > 1, sample(proportions, size = 1), proportions) - ranProp2 <- 100 - ranProp + })) + scenarios <- sample(x = reps,size = numGenes,replace = FALSE) + drawGeneParams <- function(nPatients,modality, proportions,means=NULL,sdParms,foldChanges = NULL,gammaParms = NULL){ + ranMean1 <- c() + ranSd1 <- c() + if(!is.null(gammaParms)){ + ranMean1 <- gammaParms$shape/gammaParms$rate + ranSd1 <- sqrt(gammaParms$shape/(gammaParms$rate^2)) + }else{ + ranMean1 <- stats::runif(1, min = means[1], max = means[2]) + ranSd1 <- LaplacesDemon::rinvgaussian(n = 1, mu = sdParms$mu, lambda = sdParms$lambda) + } + ranProps <- 100 + ranFCs <- NA + means <- c() + ranSd <- c() + if(modality >= 2){ + combs <- t(utils::combn(x = proportions,m = modality,simplify=TRUE)) + combinations <- unlist(lapply(1:nrow(combs),function(i){ + if(sum(combs[i,])==100){ + i + } + })) + sampleProp <- sample(combinations,1,replace = FALSE) + ranProps <- combs[sampleProp,] - ranSd2 <- NA - ranFC <- NA + ranSd<- unlist(lapply(2:modality,function(i){ + eval(parse(text = paste(paste0("ranSd",i), + "<- ", + LaplacesDemon::rinvgaussian(n = 1, mu = sdParms$mu, lambda = sdParms$lambda)))) + })) + if(modality==2){ + ranFCs <- unlist(lapply(1:1,function(i){ + eval(parse(text = paste(paste0("ranFC",i), + "<-",(stats::runif(1, min = foldChanges[1], max = foldChanges[2]))))) + })) + }else{ + ranFCs <- unlist(lapply(1:(length(foldChanges)),function(i){ + if(modality ==2){ + eval(parse(text = paste(paste0("ranFC",i), + "<-",(stats::runif(1, min = foldChanges[1], max = foldChanges[2]))))) + }else{ + eval(parse(text = paste(paste0("ranFC",i), + "<-",(stats::runif(1, min = foldChanges[[i]][1], max = foldChanges[[i]][2]))))) + } - if(!is.null(fc.min) & !is.null(fc.max)) { - ranSd2 <- LaplacesDemon::rinvgaussian(n = 1, mu = sd.mu, lambda = sd.lambda) - ranFC <- runif(1, min = fc.min, max = fc.max) + })) + } + eval(parse(text = paste(paste0("ranMean",2:modality), + "<-",NA))) + for(i in 1:modality){ + if(i!=1){ + eval(parse(text = paste(paste0("ranMean",(i)),"<-",paste0("ranMean",(i-1)),"*",paste0("ranFCs[",(i-1),"]")))) + } + } } - return(data.frame(Mean = ranMean, Sd1 = ranSd, Sd2 = ranSd2, FC = ranFC, Prop = ranProp, Prop2 = ranProp2, Shape = gamma.shape, Rate = gamma.rate)) + sds <- c(ranSd1,ranSd) + means <- unlist(lapply(1:modality,function(i){ + eval(parse(text = paste(paste0("ranMean",(i))))) + })) + sizes <- (ranProps/100)*nPatients + patients <- 1 + groups <- lapply(1:length(sizes),function(i){ + if(i==1){ + sprintf("Patient%d",1:sizes[i]) + }else{ + sprintf("Patient%d",(sum(sizes[1:(i-1)])+1):(sizes[i]+sum(sizes[1:(i-1)]))) + } + }) + scenario <-c() + if(modality==1){ + scenario = "unimodal-gaussian" + }else if(modality>1&&is.null(gammaParms)){ + scenario = "multimodal-gaussian" + }else{ + scenario <- "multimodal-gamma+gaussian" + } + + return(list("modus"=modality,"scenario"=scenario,"means"=means,"foldChanges"=ranFCs,"sds"=sds,"proportions"=ranProps,"sizes"=sizes,"groups"=groups)) } - simData <- lapply(scenarios, function(s) { - if(s == 1) gP <- drawGeneParams(100, uniMean[1], uniMean[2], sdParms[1], sdParms[2]) - if(s == 2) gP <- drawGeneParams(proportions, biMean[1], biMean[2], sdParms[1], sdParms[2], fc.min = foldChange[1], fc.max = foldChange[2]) - if(s == 3) gP <- drawGeneParams(proportions, guMean[1], guMean[2], sdParms[1], sdParms[2], gamma.shape = gammaShape, gamma.rate = gammaRate) + valData <- lapply(1:length(scenarios), function(i) { + if(scenarios[i]==1){ + gP <- drawGeneParams(nPatients = params@nPatients, + modality = 1, + proportions = params@proportions, + means = params@means$`1`, + sdParms = params@sd) + } + if((scenarios[i]>1)&&(scenarios[i]%%2==0)){ + gP <- drawGeneParams(nPatients = params@nPatients, + modality = (floor(scenarios[i]/2))+1, + proportions = params@proportions, + means = eval(parse(text = paste(paste0("params@means$`",((floor(scenarios[i]/2))+1),"`")))), + sdParms = params@sd, + foldChanges = eval(parse(text = paste(paste0("params@foldChanges$`",((floor(scenarios[i]/2))+1),"`$gauss"))))) + } + if((scenarios[i]>1)&&(scenarios[i]%%2!=0)){ + gP <- drawGeneParams(nPatients = params@nPatients, + modality = (floor(scenarios[i]/2))+1, + proportions = params@proportions, + sdParms = params@sd, + foldChanges = eval(parse(text = paste(paste0("params@foldChanges$`",((floor(scenarios[i]/2))+1),"`$gamma")))), + gammaParms = params@gamma) + } return(gP) }) - simData <- do.call("rbind", simData) - - valData <- cbind(genes, scenarios, simData) - colnames(valData) <- c("Gene","Scenario","Mean","SD1","SD2", - "FC","prop1","prop2","GammaShape","GammaRate") + names(valData)<- genes if(verbose){ message("Finished creating validation data.") } return(valData) } From 778d7d1e1bfb8ddbbaefaf34e1e35679a469ee71 Mon Sep 17 00:00:00 2001 From: sbeck Date: Wed, 21 Feb 2018 11:22:44 +0100 Subject: [PATCH 2/6] Revert "Added possibility of simulating and evaluating multimodality (higher than bimodal)" This reverts commit 5d0f9cfcc9f93e0d9b91ecc5c2ab5a2c937adaba. --- R/algorithmSpecificFunctions.R | 84 +++++++++---- R/evaluate.R | 200 ++++++++++++++++++++---------- R/params.R | 87 +++++++------ R/simulation.R | 218 ++++++++++----------------------- 4 files changed, 300 insertions(+), 289 deletions(-) diff --git a/R/algorithmSpecificFunctions.R b/R/algorithmSpecificFunctions.R index 604c8e5..f587b26 100644 --- a/R/algorithmSpecificFunctions.R +++ b/R/algorithmSpecificFunctions.R @@ -3,8 +3,6 @@ #' Use the mclust algorithm for generating output to evaluate the #' classification (unimodal,bimodal or other) of genes. #' -#' -#' #' @param expression (Simulated) Gene Expression data frame which genes shall #' be classified as unimodal, bimodal or other #' @param verbose logical. Whether to print progress messages @@ -12,7 +10,7 @@ #' @return The formatted output of the mclust algorithm #' #' @examples -#' params <- newParams(nGenes = list("1"=100,"2"=list("gauss"= 100,"gamma"= 100))) +#' params <- newParams(nUniGenes = 100, nBiGenes = 100, nGuGenes = 100) #' simulation <- simulateExpression(params= params,verbose=FALSE) #' expression <- simulation$expressionData #' #with messages printed to console @@ -20,18 +18,18 @@ #' #without messages printed to console #' mclustOutput <- useMclust(expression = expression, verbose = FALSE) #' -#' @import mclust #' @export useMclust <- function(expression,verbose = TRUE,parallel = FALSE) { - if (verbose) {message("mclust-models are calculated...")} if(parallel==FALSE){ + # Calculate mclust models sequentially models <- lapply(1:nrow(expression), function(i) { mclust::Mclust((expression[i,]),modelNames=c("V"),verbose = FALSE) }) } if(parallel==TRUE){ + # Calculate mclust models in parallel n.cores = parallel::detectCores()-1 cl = parallel::makeCluster(n.cores) parallel::clusterExport(cl, "expression") @@ -39,9 +37,11 @@ useMclust <- function(expression,verbose = TRUE,parallel = FALSE) { models <- parallel::parLapply(cl, 1:nrow(expression), function(i) { mclust::Mclust((expression[i,]), modelNames=c("V")) }) } + #unser Model zeigt schon log2 Werte if(parallel==TRUE){parallel::stopCluster(cl)} names(models) = rownames(expression) patients <- names(expression) + #save(models, file = "MClustModels.Rdata") mclustOutput <- mclustClassification(models = models,patients = patients) return(mclustOutput) @@ -59,12 +59,11 @@ useMclust <- function(expression,verbose = TRUE,parallel = FALSE) { #' @param patients List of the patients #' #' @return The formatted output of the mclust algorithm -#' -#' -#' mclustClassification <- function(models,patients){ - genes <- names(models) + + ##get values from mclust into a data.frame + genes <- names(models) #row 'header' outputs <- lapply(1:length(genes),function(i){ @@ -76,7 +75,16 @@ mclustClassification <- function(models,patients){ }})) }) proportions <- models[[i]]$parameters$pro - modus <- length(models[[i]]$parameters$mean) + modus <- "modal" + if(length(models[[i]]$parameters$mean)==1){ + modus <- "unimodal" + } + if(length(models[[i]]$parameters$mean)==2){ + modus <- "bimodal" + } + if(length(models[[i]]$parameters$mean)>2){ + modus <- "multimodal" + } output <- list( "modus" = modus, "means" = unname(models[[i]]$parameters$mean), @@ -106,7 +114,7 @@ mclustClassification <- function(models,patients){ #' @return hdbscanOutput The formatted output of the HDBSCAN clustering #' #' @examples -#' params <- newParams(nGenes = list("1"=100,"2"=list("gauss"= 100,"gamma"= 100))) +#' params <- newParams(nUniGenes = 100, nBiGenes = 100, nGuGenes = 100) #' simulation <- simulateExpression(params= params,verbose=FALSE) #' expression <- simulation$expressionData #' #with messages printed to console @@ -116,11 +124,14 @@ mclustClassification <- function(models,patients){ #' #without messages printed to console #' hdbscanOutput <- useHdbscan(expression = expression, verbose = FALSE) #' -#' @import dbscan #' @export useHdbscan <- function(expression,minPts = 10,verbose = TRUE){ if(verbose){message("HDBSCAN-clusters are calculated...")} - genes <- rownames(expression) + #wichtige Fragen: + #wenn wir die NoisePunkte ebenfalls zu den Gruppen zuordnen, sollen + #die sizes ebenfalls angepasst werden? Oder sollen die sizes die Originalzahlen darstellen + #müssen ebenso means und sds neu berechnet werden oder nicht? + genes <- rownames(expression) #row 'header' patients <- names(expression) outputs <- lapply(1:nrow(expression),function(i){ @@ -160,9 +171,13 @@ useHdbscan <- function(expression,minPts = 10,verbose = TRUE){ } modus <- "modal" if((max(classification))==0){ - modus <- 1 - }else{ - modus <- max(classification) + modus <- "unimodal" + } + if(max(classification)==2){ + modus <- "bimodal" + } + if(max(classification)>2){ + modus <- "multimodal" } output <- list( @@ -176,6 +191,20 @@ useHdbscan <- function(expression,minPts = 10,verbose = TRUE){ names(outputs)<- genes hdbscanOutput <- list(Genes = outputs,"ClinicalData"="mnt/agnerds/xyz","Expressionmatrix"="mnt/agnerds/xyzyy") + + # hdbscanClusters <- lapply(1:nrow(expression), function(i) { + # + # + # proportions <- unlist(lapply(0:(max(clustering$cluster)),function(n){ + # length((which(clustering$cluster == n))) / length(clustering$cluster) + # })) + # c(means,sds,proportions) + # }) + # names(hdbscanClusters) = rownames(expression) + # save(hdbscanClusters, file = "hdbscanClusters.Rdata") + # if(verbose){message("Output of clustering is processed and formatted...")} + # hdbscanOutput <- hdbscanClassification(hdbscanClusters = hdbscanClusters) + return(hdbscanOutput) } @@ -186,10 +215,9 @@ useHdbscan <- function(expression,minPts = 10,verbose = TRUE){ #' predicted by a model or an estimator and the values actually observed. #' NAs are filtered out before calculating the RMSE #' -#' @param error The difference of a value predicted by a model and an actually -#' observed value -#' +#' @param error The difference of a value predicted by a model and an actually observed value #' +#' @return The RMSE value rmse <- function(error){ error <- unlist(error) error2 <- unlist(lapply(1:length(error),function(i){if(!is.na(error[[i]])){c(error[[i]])}})) @@ -202,6 +230,8 @@ rmse <- function(error){ #' Use the FlexMix algorithm for creating a model to find the best fitting #' model to evaluate the classification (unimodal,bimodal or other) of genes. #' +#' +#' #' @param expression (Simulated) Gene Expression data frame which genes shall #' be classified as unimodal, bimodal or other #' @param verbose logical. Whether to print progress messages @@ -209,7 +239,7 @@ rmse <- function(error){ #' @return flexmixOutput The formatted output of the best fitting models from flexmix #' #' @examples -#' params <- newParams(nGenes = list("1"=100,"2"=list("gauss"= 100,"gamma"= 100))) +#' params <- newParams(nUniGenes = 100, nBiGenes = 100, nGuGenes = 100) #' simulation <- simulateExpression(params= params,verbose=FALSE) #' expression <- simulation$expressionData #' #with messages printed to console @@ -217,8 +247,6 @@ rmse <- function(error){ #' #without messages printed to console #' flexmixOutput <- useFlexmix(expression = expression, verbose = FALSE) #' -#' @import flexmix -#' @importFrom plyr compact #' @export useFlexmix <- function(expression, verbose = TRUE){ mo1 <- flexmix::FLXMRglm(family = "gaussian") @@ -338,11 +366,12 @@ useFlexmix <- function(expression, verbose = TRUE){ c(3) } })) + #save(bestModel, file = "flexModels.Rdata") if(verbose){message("Creating the formatted output with the best fitting models")} outputs <- lapply(1:length(bestModel),function(i){ if(bestModel[i]==1){ - modus <- 1 + modus <- "unimodal" means <- (flexModelsUni[[i]]$means) sds <- c(flexModelsUni[[i]]$sds) sizes <- c(flexModelsUni[[i]]$sizes) @@ -353,9 +382,10 @@ useFlexmix <- function(expression, verbose = TRUE){ }})) }) + #c(flexModelsUni[[i]][1],NA,flexModelsUni[[i]][2],NA,(flexModelsUni[[i]][3])/100,NA) }else if(bestModel[i]==2){ - modus <- 2 + modus <- "bimodal" means <- c(flexModelsBi[[i]]$means) sds <- c(flexModelsBi[[i]]$sds) sizes <- c(flexModelsBi[[i]]$sizes) @@ -367,7 +397,7 @@ useFlexmix <- function(expression, verbose = TRUE){ }) }else if(bestModel[i]==3){ - modus <- 2 + modus <- "bimodal" means <- c(flexModelsGU[[i]]$means) sds <- c(flexModelsGU[[i]]$sds) sizes <- c(flexModelsGU[[i]]$sizes) @@ -378,8 +408,8 @@ useFlexmix <- function(expression, verbose = TRUE){ }})) }) } - if(modus==2&&(sizes[1]==ncol(expression2)||sizes[2]==ncol(expression2))){ - modus=1 + if(modus=="bimodal"&&(sizes[1]==ncol(expression2)||sizes[2]==ncol(expression2))){ + modus="unimodal" means <- means[!is.na(means)] sds <- sds[!is.na(sds)] sizes <- sizes[!is.na(sizes)] diff --git a/R/evaluate.R b/R/evaluate.R index defc453..5f22355 100644 --- a/R/evaluate.R +++ b/R/evaluate.R @@ -2,31 +2,41 @@ #' #' Compare the scenario of each gene that was assigned by the algorithm with #' the scenario that is written in the validation Data. -#' @param modality numerical, for which modality the scenarioComparing is calculated, 1 for unimodal, -#' 2 for bimodal,... +#' #' @param algorithmOutput The output to be compared #' @param validationData The validationData behind the simulated gene expression data frame #' #' @return Returns a list that classifies each assignment as #' TN (TRUE Negative),TP (TRUE Positive), FN (FALSE Negative), FP (FALSE Positive) #' @details -#' TN (TRUE Negative): validationData: not modality, assigned scenario: not modality +#' TN (TRUE Negative): validationData: unimodal, assigned scenario: unimodal #' -#' TP (TRUE Positive): validationData: modality, assigned scenario: modality +#' TP (TRUE Positive): validationData: bimodal, assigned scenario: bimodal #' -#' FN (FALSE Negative): validationData: modality, assigned scenario: not modality +#' FN (FALSE Negative): validationData: bimodal, assigned scenario: unimodal +#' or +#' FN (FALSE Negative): validationData: unimodal/bimodal, +#' assigned scenario: more than bimodal #' -#' FP (FALSE Positive): validationData: not modality, assigned scenario: modality +#' FP (FALSE Positive): validationData: unimodal, assigned scenario: bimodal #' -compareScenarios <- function(modality, algorithmOutput,validationData){ +compareScenarios <- function(algorithmOutput,validationData){ compareScenarios <- (unlist(lapply(1:length(algorithmOutput$Genes),function(i){ - if(validationData[[i]]$modus != modality && algorithmOutput$Genes[[i]]$modus!=modality){ + if(validationData[i,2]==1&&algorithmOutput$Genes[[i]]$modus=="unimodal"){ + #unimodal + classified as unimodal evaluation <- "TN" - }else if(validationData[[i]]$modus ==modality && algorithmOutput$Genes[[i]]$modus==modality){ + }else if((validationData[i,2]==2||validationData[i,2]==3)&&algorithmOutput$Genes[[i]]$modus=="bimodal"){ + #bimodal + classified as bimodal evaluation <- "TP" - }else if(validationData[[i]]$modus !=modality && algorithmOutput$Genes[[i]]$modus==modality){ + }else if((validationData[i,2]==1)&&algorithmOutput$Genes[[i]]$modus=="bimodal"){ + #unimodal + classified as bimodal evaluation <- "FP" - }else if(validationData[[i]]$modus ==modality && algorithmOutput$Genes[[i]]$modus!=modality){ + }else if((validationData[i,2]==2||validationData[i,2]==3)&&algorithmOutput$Genes[[i]]$modus=="unimodal"){ + #bimodal + classified as unimodal + evaluation <- "FN" + }else if((validationData[i,2]==1||validationData[i,2]==2||validationData[i,2]==3)&& + algorithmOutput$Genes[[i]]$modus=="multimodal"){ + #unimodal/bimodal + classified as more than multimodal evaluation <- "FN" }else{ evaluation <- NA @@ -56,10 +66,49 @@ isApproxSame<- function(validationVal,classVal,maxDistance){ } } +#' Expand the validationData +#' +#' Adds two columns to the validationData: mean2 and sdGamma +#' @param validationData not expanded validationData data frame +#' @return returns validationData with the two added columns +#' +#' @details +#' For scenario 1, no mean2 and no sdGamma is added (NA added) +#' For scenario 2 the mean2 (calculated out of mean1 and foldChange) is added, +#' but no sdGamma is added (NA added) +#' For scenario 3, the mean 2 is not added(NA added, foldChange was already +#' used), but sdGamma is calculated with sqrt(gammaShape*(gammaScale^2)) +#' gammaScale is 1/gammaRate +#' +expandValidationData <- function(validationData){ + validationData <- cbind(validationData,unlist(lapply(1:nrow(validationData),function(i){ + if(as.numeric(validationData[i,2])==1){#is unimodal, no mean2 + addValue <- NA + }else if(as.numeric(validationData[i,2])==2){#calculates mean2 with mean1*FC + addValue <- as.numeric(validationData[i,3])*as.numeric(validationData[i,6]) + }else if(as.numeric(validationData[i,2])==3){#calculates mean2 with shape*scale + gammaScale <- 1/as.numeric(validationData[i,10]) + mean2 <- as.numeric(validationData[i,9])*gammaScale + addValue <- mean2 + } + + }))) + colnames(validationData)[11]<-"Mean2" + validationData <- cbind(validationData,unlist(lapply(1:nrow(validationData),function(i){ + if((as.numeric(validationData[i,2])==1) || (as.numeric(validationData[i,2])==2)){#is unimodal or bimodal without gamma, no sdGamma2 + addValue <- NA + }else if(as.numeric(validationData[i,2])==3){#calculates sd2 with sqrt(shape*(scale^2)) + gammaScale <- 1/as.numeric(validationData[i,10]) + sdGamma <- sqrt(as.numeric(validationData[i,9])*(gammaScale*gammaScale)) + addValue <- sdGamma + } + }))) + colnames(validationData)[12]<-"sdGamma" + return(validationData) +} + #' Evaluate AlgorithmOutput #' -#' @param modality numerical, for which modality output is evaluated, 1 for unimodal, -#' 2 for bimodal,... #' @param algorithmOutput The formatted output of the algorithm that is now #' evaluated #' @param validationData the validationData data.frame @@ -67,16 +116,16 @@ isApproxSame<- function(validationVal,classVal,maxDistance){ #' value and the validation data value with which the estimated is classified #' as TRUE (correctly estimated) #' @examples -#' params <- newParams(nGenes = list("1"=100,"2"=list("gauss"= 100,"gamma"= 100))) +#' params <- newParams(nUniGenes = 100, nBiGenes = 100, nGuGenes = 100) #' simulation <- simulateExpression(params= params,verbose=FALSE) #' expression <- simulation$expressionData #' validationData <- simulation$validationData #' output <- useMclust(expression, verbose=FALSE) #' # TRUE if less than +- 10 % difference to value of validationData -#' evaluation <- evaluateAlgorithmOutput(modality = 2,algorithmOutput = output, +#' evaluation <- evaluateAlgorithmOutput(algorithmOutput = output, #' validationData = validationData) #' # TRUE if less than +- 20 % difference to value of validationData -#' evaluation <- evaluateAlgorithmOutput(modality = 2,algorithmOutput = output, +#' evaluation <- evaluateAlgorithmOutput(algorithmOutput = output, #' validationData = validationData,maxDifference = 20) #' @return Returns a data.frame where for each as TP classified gene it was #' evaluated whether it was estimated approximately correct or not (TRUE or @@ -89,41 +138,62 @@ isApproxSame<- function(validationVal,classVal,maxDistance){ #' or equal to 10%, the estimation is classified as TRUE (correctly estimated) #' #' @export -evaluateAlgorithmOutput <- function(modality,algorithmOutput,validationData,maxDifference = 10){ - compared <- compareScenarios(modality = modality,algorithmOutput = algorithmOutput,validationData = validationData) +evaluateAlgorithmOutput <- function(algorithmOutput,validationData,maxDifference = 10){ + validationDataExpanded <- expandValidationData(validationData = validationData) + compared <- compareScenarios(algorithmOutput = algorithmOutput,validationData = validationDataExpanded) - genes <- names(validationData) + genes <- validationDataExpanded[,1] #row 'header' - approxSame <- (lapply(1:length(algorithmOutput$Genes),function(i){ - if((algorithmOutput$Genes[[i]]$modus==modality) && (validationData[[i]]$modus==modality)){ - classValsMean <- sort(algorithmOutput$Genes[[i]]$means) - x <- algorithmOutput$Genes[[i]]$sds - names(x) <- order(algorithmOutput$Genes[[i]]$means) - classValSDs <- x[order(names(x))] - y <- algorithmOutput$Genes[[i]]$sizes - names(y) <- order(algorithmOutput$Genes[[i]]$means) - classValProps <- y[order(names(y))] - means <- unlist(lapply(1:length(validationData[[i]]$means),function(j){ - isApproxSame(validationVal = validationData[[i]]$means[j],classVal = classValsMean[j],maxDistance = maxDifference) - })) - sds <- unlist(lapply(1:length(validationData[[i]]$sds),function(j){ - isApproxSame(validationVal = validationData[[i]]$sds[j],classVal = classValSDs[j],maxDistance = maxDifference) - })) - props <- unlist(lapply(1:length(validationData[[i]]$sizes),function(j){ - isApproxSame(validationVal = validationData[[i]]$sizes[j],classVal = classValProps[j],maxDistance = maxDifference) - })) + numCol <- ncol(algorithmOutput) + perAt <- floor(numCol/3) - c(means,sds,props) - } - })) - names(approxSame) <- genes - evaluation<- data.frame(do.call('rbind',approxSame)) - meanNames <- sprintf(c("mean%s"),(1:(ncol(evaluation)/3))) - sdNames <- sprintf(c("sd%s"),(1:(ncol(evaluation)/3))) - propNames <- sprintf(c("prop%s"),(1:(ncol(evaluation)/3))) - colnames(evaluation) <- c(meanNames,sdNames,propNames) + approxSame <- (lapply(1:length(algorithmOutput$Genes),function(i){ + if((algorithmOutput$Genes[[i]]$modus=="bimodal") && (as.numeric(validationDataExpanded[i,2])==2||as.numeric(validationDataExpanded[i,2])==3)){ + mean1 <- isApproxSame(validationVal = min(as.numeric(validationDataExpanded[i,3]),as.numeric(validationDataExpanded[i,11])), + classVal = min(algorithmOutput$Genes[[i]]$means[1],algorithmOutput$Genes[[i]]$means[2]), + maxDistance = maxDifference) + mean2 <- isApproxSame(validationVal = max(as.numeric(validationDataExpanded[i,3]),as.numeric(validationDataExpanded[i,11])), + classVal = max(algorithmOutput$Genes[[i]]$means[1],algorithmOutput$Genes[[i]]$means[2]), + maxDistance = maxDifference) + if(as.numeric(validationDataExpanded[i,2])==2){ + sd1 <- isApproxSame(validationVal = (min(as.numeric(validationDataExpanded[i,4]),as.numeric(validationDataExpanded[i,5]))), + classVal = min(algorithmOutput$Genes[[i]]$sds[1],algorithmOutput$Genes[[i]]$sds[1]), + maxDistance = maxDifference) + sd2 <- isApproxSame(validationVal = (max(as.numeric(validationDataExpanded[i,4]),as.numeric(validationDataExpanded[i,5]))), + classVal = max(algorithmOutput$Genes[[i]]$sds[1],algorithmOutput$Genes[[i]]$sds[2]), + maxDistance = maxDifference) + }else if(as.numeric(validationDataExpanded[i,2])==3){ + sd1 <- isApproxSame(validationVal = (min(as.numeric(validationDataExpanded[i,4]),as.numeric(validationDataExpanded[i,12]))), + classVal = min(algorithmOutput$Genes[[i]]$sds[1],algorithmOutput$Genes[[i]]$sds[2]), + maxDistance = maxDifference) + sd2 <- isApproxSame(validationVal = (max(as.numeric(validationDataExpanded[i,4]),as.numeric(validationDataExpanded[i,12]))), + classVal = max(algorithmOutput$Genes[[i]]$sds[1],algorithmOutput$Genes[[i]]$sds[2]), + maxDistance = maxDifference) + } + if((algorithmOutput$Genes[[i]]$sizes[1]==100)&&(is.na(algorithmOutput$Genes[[i]]$sizes[2]))){ + prop1 <- isApproxSame(validationVal = ((max(as.numeric(validationDataExpanded[i,7]),as.numeric(validationDataExpanded[i,8])))), + classVal = algorithmOutput$Genes[[i]]$sizes[1], + maxDistance = maxDifference) + prop2 <- FALSE + } + else{ + prop1 <- isApproxSame(validationVal = ((min(as.numeric(validationDataExpanded[i,7]),as.numeric(validationDataExpanded[i,8])))), + classVal = min(algorithmOutput$Genes[[i]]$sizes[1],algorithmOutput$Genes[[i]]$sizes[2]), + maxDistance = maxDifference) + prop2 <- isApproxSame(validationVal = ((max(as.numeric(validationDataExpanded[i,7]),as.numeric(validationDataExpanded[i,8])))), + classVal = max(algorithmOutput$Genes[[i]]$sizes[1],algorithmOutput$Genes[[i]]$sizes[2]), + maxDistance = maxDifference) + } - return(evaluation) + c(paste0("Gene",i),mean1,mean2,sd1,sd2,prop1,prop2) + } + })) + evaluation<- data.frame(do.call('rbind',approxSame)) + result <- data.frame(evaluation[,2:ncol(evaluation)]) + rownames(result) <- evaluation[,1] + colnames(result) <- c("mean1","mean2","sd1","sd2","prop1","prop2") + + return(result) } #' getClassification @@ -131,22 +201,21 @@ evaluateAlgorithmOutput <- function(modality,algorithmOutput,validationData,maxD #' Returns the numbers of TN (TRUE Negative),TP (TRUE Positive), #' FN (FALSE Negative) and FP (FALSE Positive) for one of the algorithm's #' formatted outputs. -#' @param modality numerical, for which modality the classification is calculated, 1 for unimodal, -#' 2 for bimodal,... +#' #' @param algorithmOutput The formatted output of the algorithm #' @param algorithmName The name of the evaluated algorithm #' @param validationData The validationData behind the simulated gene #' expression data frame +#' #' @examples -#' params <- newParams(nGenes = list("1"=100,"2"=list("gauss"= 100,"gamma"= 100))) +#' params <- newParams(nUniGenes = 100, nBiGenes = 100, nGuGenes = 100) #' simulation <- simulateExpression(params= params,verbose=FALSE) #' expression <- simulation$expressionData #' validationData <- simulation$validationData #' mclustOutput <- useMclust(expression, verbose=FALSE) #' evaluation <- evaluateAlgorithmOutput(algorithmOutput = output, #' validationData = validationData) -#' getClassification(modality= 2,algorithmOutput = mclustOutput, -#' algorithmName = "mclust", +#' getClassification(algorithmOutput = mclustOutput,algorithmName = "mclust", #' validationData = validationData) #' @return a classification data frame that contains the number of #' FN, FP, TN and TP @@ -163,9 +232,9 @@ evaluateAlgorithmOutput <- function(modality,algorithmOutput,validationData,maxD #' #' FP (FALSE Positive): validationData: unimodal, assigned scenario: bimodal #' -getClassification <- function(modality,algorithmOutput,algorithmName="mclust",validationData){ +getClassification <- function(algorithmOutput,algorithmName="mclust",validationData){ - compared <- compareScenarios(modality = modality, + compared <- compareScenarios( algorithmOutput = algorithmOutput,validationData = validationData) fn <- table(compared)["FN"] @@ -190,14 +259,14 @@ getClassification <- function(modality,algorithmOutput,algorithmName="mclust",va #' @param evaluation The output of the evaluation of the formatted output #' @param algorithmName The name of the algorithm which created the output #' @return A data frame with the percentual proportion of as TRUE or FALSE evaluated values +#' #' @examples -#' params <- newParams(nGenes = list("1"=100,"2"=list("gauss"= 100,"gamma"= 100))) +#' params <- newParams(nUniGenes = 100, nBiGenes = 100, nGuGenes = 100) #' simulation <- simulateExpression(params= params,verbose=FALSE) #' expression <- simulation$expressionData #' validationData <- simulation$validationData #' mclustOutput <- useMclust(expression, verbose=FALSE) -#' mclustEvaluation <- evaluateAlgorithmOutput(algorithmOutput = mclustOutput, -#' validationData = validationData) +#' mclustEvaluation <- evaluateAlgorithmOutput(algorithmOutput = mclustOutput,validationData = validationData) #' getStatistic(evaluation = mclustEvaluation,algorithmName = "mclust") #' getStatistic <- function(evaluation,algorithmName ="mclust"){ @@ -218,7 +287,7 @@ getStatistic <- function(evaluation,algorithmName ="mclust"){ #' @param evaluations list of named evaluations #' @return A data frame with the percentual proportion of as TRUE or FALSE evaluated values #' @examples -#' params <- newParams(nGenes = list("1"=100,"2"=list("gauss"= 100,"gamma"= 100))) +#' params <- newParams(nUniGenes = 100, nBiGenes = 100, nGuGenes = 100) #' simulation <- simulateExpression(params= params,verbose=FALSE) #' expression <- simulation$expressionData #' validationData <- simulation$validationData @@ -253,15 +322,14 @@ getStatistics <- function(evaluations) { #' Returns the numbers of TN (TRUE Negative),TP (TRUE Positive), #' FN (FALSE Negative) and FP (FALSE Positive) for each of the algorithm's #' formatted outputs. -#' @param modality numerical, for which modality the classification is calculated, 1 for unimodal, -#' 2 for bimodal,... +#' #' @param outputs list of named outputs #' @param validationData The validationData behind the simulated gene expression data frame #' #' @return a classification data frame that contains the number of #' FN, FP, TN and TP #' @examples -#' params <- newParams(nGenes = list("1"=100,"2"=list("gauss"= 100,"gamma"= 100))) +#' params <- newParams(nUniGenes = 100, nBiGenes = 100, nGuGenes = 100) #' simulation <- simulateExpression(params= params,verbose=FALSE) #' expression <- simulation$expressionData #' validationData <- simulation$validationData @@ -270,11 +338,9 @@ getStatistics <- function(evaluations) { #' flexmixOutput <- useFlexmix(expression = expression,verbose = FALSE) #' #' #getClassifications of one algorithm -#' getClassifications(modality=2,validationData, -#' outputs = list("mclust"=mclustOutput)) +#' getClassifications(validationData,outputs = list("mclust"=mclustOutput)) #' #getClassifications of two or more algorithms -#' getClassifications(modality=2,validationData, -#' outputs = list("mclust"=mclustOutput,"HDBSCAN"=hdbscanOutput)) +#' getClassifications(validationData,outputs = list("mclust"=mclustOutput,"HDBSCAN"=hdbscanOutput)) #' #' @details #' TN (TRUE Negative): validationData: unimodal, assigned scenario: unimodal @@ -290,10 +356,10 @@ getStatistics <- function(evaluations) { #' #' #' @export -getClassifications <- function(modality,validationData,outputs){ +getClassifications <- function(validationData,outputs){ numOutputs <- (length(outputs)) classification <- lapply(1:numOutputs,function(i){ - getClassification(modality,algorithmOutput = outputs[[i]], + getClassification(algorithmOutput = outputs[[i]], algorithmName = names(outputs)[i], validationData = validationData) diff --git a/R/params.R b/R/params.R index 5a7ec2e..7242aae 100644 --- a/R/params.R +++ b/R/params.R @@ -9,7 +9,7 @@ #' #' @examples #' params <- newParams() -#' getParam(object = params,name = "nGenes") +#' getParam(params, "nUniGenes") #' #' @rdname getParam #' @export @@ -27,10 +27,9 @@ setGeneric("getParam", function(object, name) {standardGeneric("getParam")}) #' #' @examples #' params <- newParams() -#' params <- setParam(object = params,name = "nPatients",value = 250) +#' params = setParam(params, "nBiGenes", 100) #' #' @rdname setParam -#' @importFrom methods new rbind2 slot slot<- slotNames validObject #' @export setGeneric("setParam", function(object, name, value) { @@ -80,51 +79,59 @@ setGeneric("setParam", #' @aliases Params-class #' @exportClass Params setClass("Params", - slots = c(nGenes ="list", - means = "list", - foldChanges = "list", - sd = "list", - gamma = "list", - proportions = "vector", + slots = c(nUniGenes = "numeric", + nBiGenes = "numeric", + nGuGenes ="numeric", nPatients = "numeric", - seed = "numeric"), - prototype = prototype(nGenes = list("1"= 700, "2" = list("gauss"=150,"gamma"=150)), - means = list("1"=c(2,4),"2" = c(2,4)), - foldChanges = list("1" = c(2,4),"2" = list("gauss"= c(2,4),"gamma" = c(3,5))), - sd = list("mu"= 0.61,"lambda"=2.21), - gamma = list("shape" = 2, "rate" = 2), - proportions = c(10,20,30,40,50,60,70,80,90), - nPatients = 200, - seed = sample(1:1e6, 1))) + seed = "numeric", + sdParms = "vector", + uniMean = "vector", + biMean = "vector", + foldChange ="vector", + proportions = "vector", + guMean = "vector", + gammaShape = "numeric", + gammaRate = "numeric"), + prototype = prototype(nUniGenes = 300, nBiGenes = 500, nGuGenes = 200, + nPatients = 100, seed = sample(1:1e6, 1), + sdParms = c(0.61,2.21), uniMean = c(2,5), + biMean = c(2,5), foldChange =c(2,4), + proportions = c(10,20,30,40,50,60,70,80,90), guMean = c(2,5), + gammaShape = 2, gammaRate = 2)) #' @rdname getParam setMethod("getParam", "Params", function(object, name) { - methods::slot(object, name) + slot(object, name) }) #' @rdname setParam setMethod("setParam", "Params", function(object, name, value) { checkmate::assertString(name) - methods::slot(object, name) <- value - methods::validObject(object) + slot(object, name) <- value + validObject(object) return(object) }) setMethod("show", "Params", function(object) { - pp <- list("Global:" = c("nGenes" = "nGenes", - "Means" = "means", - "FoldChanges" = "foldChanges", - "sd" ="sd", - "Gamma" ="gamma", + pp <- list("Global:" = c("UnimodalGenes" = "nUniGenes", + "BimodalGenes" = "nBiGenes", + "GammaUnimodalGenes" = "nGuGenes", + "Patients" = "nPatients", + "Seed" = "seed", + "Mu,Lambda for sd"="sdParms", + "Unimodal Mean" = "uniMean", + "Bimodal Mean" = "biMean", + "FoldChange" = "foldChange", "Proportions" = "proportions", - "nPatients" = "nPatients", - "Seed" = "seed")) + "GammaUnimodal Mean" = "guMean", + "Gamma Shape" = "gammaShape", + "Gamma Rate" = "gammaRate")) cat("A Params object of class", class(object), "\n") cat("Parameters can be Default' or 'NOT DEFAULT'.", "\n\n") showPP(object, pp) - cat(length(methods::slotNames(object)) - 8, "additional parameters", "\n\n") + cat(length(slotNames(object)) - 13, "additional parameters", "\n\n") }) #' New Params @@ -137,19 +144,13 @@ setMethod("show", "Params", function(object) { #' @return New Params object. #' @examples #' params <- newParams() -#' params <- newParams("nGenes"=list("1"=100,"2"=list("gauss"=100,"gamma"=100)), -#' "proportions"=c(30,70)) -#' params <- newParams( -#' nGenes=list("1"=c(700),"2"=list(gauss = 150, gamma = 150),"3"=list(gauss = 150, gamma = 150)), -#' means=list("1"= c(2, 4),"2"= c(2, 4),"3"= c(2, 4)), -#' foldChanges = list("1"=NA,"2"= list(gauss = c(2, 4), gamma = c(2, 4)), -#' "3"= list(gauss = list(c(2, 4), c(2, 4)), gamma = list(c(2, 4), c(2, 4))))) +#' params <- newParams(nUniGenes = 200, nPatients = 10) #' #' @rdname newParams #' @export newParams <- function(...) { - params <- methods::new("Params") + params <- new("Params") params <- setParams(params, ...) return(params) @@ -165,7 +166,7 @@ newParams <- function(...) { #' @return List with the values of the selected parameters. #' @examples #' params <- newParams() -#' getParams(params = params,names = c("nGenes","foldChanges","proportions","means")) +#' getParams(params, c("nUniGenes", "nPatients", "uniMean")) #' @export getParams <- function(params, names) { @@ -197,12 +198,10 @@ getParams <- function(params, names) { #' params <- newParams() #' params #' # Set individually -#' params <- params <- setParams(params,proportions = c(80,20),foldChanges = list("1"=c(2,4), -#' "2"= list("gauss"=c(2,5),"gamma"=c(3,5)))) +#' params <- setParams(params, nUniGenes = 1000, nPatients = 50) #' params #' # Set via update list -#' params <- setParams(params,update = list("nGenes"=list("1"=100, -#' "2"=list("gauss"=200,"gamma"=100)),"means"=list("1"=c(1,3),"2"=c(2,4)),"nPatients"=700)) +#' params <- setParams(params, list(nUniGenes = 1000, nPatients = 50)) #' params #' @export setParams <- function(params, update = NULL, ...) { @@ -237,12 +236,12 @@ showPP <- function(params, pp) { checkmate::assertClass(params, classes = "Params") checkmate::assertList(pp, types = "character", min.len = 1) - default <- methods::new(class(params)) + default <- new(class(params)) for (category in names(pp)) { parameters <- pp[[category]] values <- getParams(params, parameters) short.values <- sapply(values, function(x) { - if (length(x) > 10) { + if (length(x) > 4) { paste0(paste(head(x, n = 4), collapse = ", "), ",...") } else { paste(x, collapse = ", ") diff --git a/R/simulation.R b/R/simulation.R index 69a2467..101ed7d 100644 --- a/R/simulation.R +++ b/R/simulation.R @@ -13,7 +13,7 @@ #' simulated from the mean and the standard deviation taken from the validation data #' for this gene. The number of values that are simulated for each #' distribution equals the number of Patients {"nPatients"}: -#' \code{expression <- stats::rnorm(nPatients,as.numeric(valData[i,3]), as.numeric(valData[i,4]))} +#' \code{expression <- rnorm(nPatients,as.numeric(valData[i,3]), as.numeric(valData[i,4]))} #' #' Scenario 2 - Bimodal with two unimodal gaussian distributions #' Gene expression for the bimodal scenario 2 (two gaussian distributions) is @@ -22,7 +22,7 @@ #' deviation taken from the validation data for this gene. The number of values that are #' simulated ("nBi1") are calculated by multiplicating the number of Patients with the #' first proportion and rounding this number off: -#' \code{data1 <- stats::rnorm(nBi1,as.numeric(valData[i,3]),as.numeric(valData[i,4]))} +#' \code{data1 <- rnorm(nBi1,as.numeric(valData[i,3]),as.numeric(valData[i,4]))} #' The second distribution is simulated with the mean calculated by #' multiplying the mean of the first distribution with the foldChange #' from the validation data of this gene. The standard deviation is also taken from the validation data of this gene. The number of values that are @@ -31,7 +31,7 @@ #' the mean and the standard deviation taken from the validation data for this gene. #' The number of values that are simulated for each distribution equals the #' number of Patients {"nPatients"}: -#' \code{data2 <- stats::rnorm(nBi2,foldBi,as.numeric(valData[i,5]))} +#' \code{data2 <- rnorm(nBi2,foldBi,as.numeric(valData[i,5]))} #' At the end both of the simulated distributions are put into one expression. #' \code{expression <- c(data1,data2)} #' @@ -44,7 +44,7 @@ #' The number of values that are simulated ("nGu1") are calculated by #' multiplying the number of Patients with the first proportion and rounding #' this number off. -#' \code{data1 <- stats::rgamma(n = nGu1,shape = as.numeric(valData[i,9]), +#' \code{data1 <- rgamma(n = nGu1,shape = as.numeric(valData[i,9]), #' rate = as.numeric(valData[i,10]))} #' The second distribution, which is the unimodal gaussian distribution is #' simulated with the calculated mean, which is calculated by multiplying the @@ -52,7 +52,7 @@ #' deviation is taken from the validation data of this gene. The number of values that #' are simulated ("nGu2") are calculated by multiplying the number of Patients #' with the second proportion and rounding this number up. -#' \code{data2 <- stats::rnorm(nGu2,foldGamma,as.numeric(valData[i,4]))} +#' \code{data2 <- rnorm(nGu2,foldGamma,as.numeric(valData[i,4]))} #' At the end both of the simulated distributions are put into one expression. #' \code{expression <- c(data1,data2)} #' @@ -63,13 +63,11 @@ #' simulation <- simulateExpression(params = newParams()) #' # Override default parameters #' # Not recommended, better use setParams; See \code{\link{Params}} -#' simulation <- simulateExpression(nGenes = list("1"=100,"2"=list("gauss"= 100,"gamma"= 100)), nPatients = 50) +#' simulation <- simulateExpression(nUniGenes = 1000, nPatients = 50) #' # without messages printed to the console -#' params <- newParams(nGenes = list("1"=100,"2"=list("gauss"= 100,"gamma"= 100))) #' simulation <- simulateExpression(params = params,verbose = FALSE) #' @export #' @importFrom checkmate assertClass assertString assertCharacter assertList -#' @importFrom stats dnorm rgamma rnorm runif sd #' simulateExpression <- function(params = newParams(), verbose = TRUE,...) { @@ -77,48 +75,43 @@ simulateExpression <- function(params = newParams(), verbose = TRUE,...) { checkmate::assertClass(params, "Params") params <- setParams(params, ...) + # Set random seed seed <- getParam(params, "seed") set.seed(seed) valData <- createValidationData(params = params, verbose = verbose) nPatients <- getParam(params,"nPatients") - - numGenes <- sum(unlist(getParam(params,name = "nGenes"))) + nUniGenes <- getParam(params, "nUniGenes") + nBiGenes <- getParam(params,"nBiGenes") + nGuGenes <- getParam(params,"nGuGenes") + nGenes <- nUniGenes+nBiGenes+nGuGenes if (verbose){message("Drawing gene expression values...")} - simNotAdjusted <- lapply(1:numGenes,function(i){ - if(valData[[i]]$scenario=="unimodal-gaussian"){ - expression <- stats::rnorm(n = nPatients,mean = valData[[i]]$means,sd = valData[[i]]$sds) - }else if(valData[[i]]$scenario=="multimodal-gaussian"){ - expression <- unlist(lapply(1:valData[[i]]$modus,function(j){ - expression <- stats::rnorm(n = valData[[i]]$sizes[j],mean = valData[[i]]$means[j],sd = valData[[i]]$sds[j]) - })) - }else if(valData[[i]]$scenario=="multimodal-gamma+gaussian"){ - expression <- unlist(lapply(1:valData[[i]]$modus,function(j){ - if(j==1){ - expression <- stats::rgamma(n = valData[[i]]$sizes[j],shape = params@gamma$shape,rate = params@gamma$rate) - }else{ - expression <- stats::rnorm(n = valData[[i]]$sizes[j],mean = valData[[i]]$means[j],sd = valData[[i]]$sds[j]) - } - })) + ##STEP 2 - Generating the simulation result + sim <- lapply(1:nGenes,function(i){ + if(valData[i,2]==1){ + expression <- rnorm(nPatients,as.numeric(valData[i,3]),as.numeric(valData[i,4])) + }else if(valData[i,2]==2){ + nBi1 = floor(nPatients*as.numeric(valData[i,7])/100) + nBi2 = ceiling(nPatients*as.numeric(valData[i,8])/100) + foldBi = as.numeric(valData[i,3])*as.numeric(valData[i,6]) + data1 <- rnorm(nBi1,as.numeric(valData[i,3]),as.numeric(valData[i,4])) + data2 <- rnorm(nBi2,foldBi,as.numeric(valData[i,5])) + expression <- c(data1,data2) + }else if(valData[i,2]==3){ + nGu1 = floor(nPatients*as.numeric(valData[i,7])/100) + nGu2 = ceiling(nPatients*as.numeric(valData[i,8])/100) + data1 <- rgamma(n = nGu1,shape = as.numeric(valData[i,9]),rate = as.numeric(valData[i,10])) + data2 <- rnorm(nGu2,as.numeric(valData[i,3]),as.numeric(valData[i,4])) + expression <- c(data1,data2) } expression }) - simNotAdjusted <- data.frame(do.call('rbind',simNotAdjusted)) - sim <- lapply(1:nrow(simNotAdjusted),function(i){ - values <- unlist(lapply(1:ncol(simNotAdjusted),function(j){ - if(simNotAdjusted[i,j]<0){ - 0 - }else{ - simNotAdjusted[i,j] - } - })) - }) sim <- data.frame(do.call('rbind',sim)) - genes <- names(valData) + genes <- valData[,1] patients <- sprintf("Patient%04d", 1:nPatients) #columns 'header' rownames(sim) <- genes colnames(sim) <- patients @@ -173,134 +166,57 @@ simulateExpression <- function(params = newParams(), verbose = TRUE,...) { #' #' gammaRate: numeric, Default value: 2 #' -#' +#' @export #' @importFrom LaplacesDemon rinvgaussian #' createValidationData <- function(params = newParams(),verbose = TRUE){ - numGenes <- sum(unlist(getParam(params,name = "nGenes"))) + # Get the parameters we are going to use + nPatients <- getParam(params, "nPatients") + nUniGenes <- getParam(params, "nUniGenes") + nBiGenes <- getParam(params,"nBiGenes") + nGuGenes <- getParam(params,"nGuGenes") + nGenes <- nUniGenes+nBiGenes+nGuGenes + nSDs <- nUniGenes+ 2*nBiGenes+nGuGenes + sdParms <- getParam(params, "sdParms") + uniMean <- getParam(params, "uniMean") + biMean <- getParam(params, "biMean") + foldChange <- getParam(params, "foldChange") + proportions <- getParam(params, "proportions") + guMean <- getParam(params, "guMean") + gammaShape <- getParam(params, "gammaShape") + gammaRate <- getParam(params,"gammaRate") if (verbose) { message("Drawing parameters for expression simulation...") } - genes <- sprintf("Gene%04d",1:numGenes) - reps <- c() - reps <- unlist(lapply(1:length(unlist(params@nGenes)),function(i){ - if(i == 1){ - reps <- rep(x = i,times = params@nGenes$`1`) - }else{ - if(i%%2==0){ - reps <- c(reps,rep(x = i, times = sum(unlist(eval(parse(text=(paste0("params@nGenes$`",floor(i/2)+1,"`$gauss")))))))) - }else{ - reps <- c(reps,rep(x = i, times = sum(unlist(eval(parse(text=(paste0("params@nGenes$`",floor(i/2)+1,"`$gamma")))))))) - } - } + genes <- sprintf("Gene%04d",1:nGenes) + scenarios <- sample(rep(c(1,2,3), times = c(nUniGenes, nBiGenes, nGuGenes)), size = nGenes, replace = F) - })) - scenarios <- sample(x = reps,size = numGenes,replace = FALSE) - drawGeneParams <- function(nPatients,modality, proportions,means=NULL,sdParms,foldChanges = NULL,gammaParms = NULL){ - ranMean1 <- c() - ranSd1 <- c() - if(!is.null(gammaParms)){ - ranMean1 <- gammaParms$shape/gammaParms$rate - ranSd1 <- sqrt(gammaParms$shape/(gammaParms$rate^2)) - }else{ - ranMean1 <- stats::runif(1, min = means[1], max = means[2]) - ranSd1 <- LaplacesDemon::rinvgaussian(n = 1, mu = sdParms$mu, lambda = sdParms$lambda) - } - ranProps <- 100 - ranFCs <- NA - means <- c() - ranSd <- c() - if(modality >= 2){ - combs <- t(utils::combn(x = proportions,m = modality,simplify=TRUE)) - combinations <- unlist(lapply(1:nrow(combs),function(i){ - if(sum(combs[i,])==100){ - i - } - })) - sampleProp <- sample(combinations,1,replace = FALSE) - ranProps <- combs[sampleProp,] + drawGeneParams <- function(proportions, mean.min, mean.max, sd.mu, sd.lambda, fc.min = NULL, fc.max = NULL, gamma.shape = NA, gamma.rate = NA) { + ranMean <- runif(1, min = mean.min, max = mean.max) + ranSd <- LaplacesDemon::rinvgaussian(n = 1, mu = sd.mu, lambda = sd.lambda) + ranProp <- ifelse(length(proportions) > 1, sample(proportions, size = 1), proportions) + ranProp2 <- 100 - ranProp - ranSd<- unlist(lapply(2:modality,function(i){ - eval(parse(text = paste(paste0("ranSd",i), - "<- ", - LaplacesDemon::rinvgaussian(n = 1, mu = sdParms$mu, lambda = sdParms$lambda)))) - })) - if(modality==2){ - ranFCs <- unlist(lapply(1:1,function(i){ - eval(parse(text = paste(paste0("ranFC",i), - "<-",(stats::runif(1, min = foldChanges[1], max = foldChanges[2]))))) - })) - }else{ - ranFCs <- unlist(lapply(1:(length(foldChanges)),function(i){ - if(modality ==2){ - eval(parse(text = paste(paste0("ranFC",i), - "<-",(stats::runif(1, min = foldChanges[1], max = foldChanges[2]))))) - }else{ - eval(parse(text = paste(paste0("ranFC",i), - "<-",(stats::runif(1, min = foldChanges[[i]][1], max = foldChanges[[i]][2]))))) - } + ranSd2 <- NA + ranFC <- NA - })) - } - eval(parse(text = paste(paste0("ranMean",2:modality), - "<-",NA))) - for(i in 1:modality){ - if(i!=1){ - eval(parse(text = paste(paste0("ranMean",(i)),"<-",paste0("ranMean",(i-1)),"*",paste0("ranFCs[",(i-1),"]")))) - } - } + if(!is.null(fc.min) & !is.null(fc.max)) { + ranSd2 <- LaplacesDemon::rinvgaussian(n = 1, mu = sd.mu, lambda = sd.lambda) + ranFC <- runif(1, min = fc.min, max = fc.max) } - sds <- c(ranSd1,ranSd) - means <- unlist(lapply(1:modality,function(i){ - eval(parse(text = paste(paste0("ranMean",(i))))) - })) - sizes <- (ranProps/100)*nPatients - patients <- 1 - groups <- lapply(1:length(sizes),function(i){ - if(i==1){ - sprintf("Patient%d",1:sizes[i]) - }else{ - sprintf("Patient%d",(sum(sizes[1:(i-1)])+1):(sizes[i]+sum(sizes[1:(i-1)]))) - } - }) - scenario <-c() - if(modality==1){ - scenario = "unimodal-gaussian" - }else if(modality>1&&is.null(gammaParms)){ - scenario = "multimodal-gaussian" - }else{ - scenario <- "multimodal-gamma+gaussian" - } - - return(list("modus"=modality,"scenario"=scenario,"means"=means,"foldChanges"=ranFCs,"sds"=sds,"proportions"=ranProps,"sizes"=sizes,"groups"=groups)) + return(data.frame(Mean = ranMean, Sd1 = ranSd, Sd2 = ranSd2, FC = ranFC, Prop = ranProp, Prop2 = ranProp2, Shape = gamma.shape, Rate = gamma.rate)) } - valData <- lapply(1:length(scenarios), function(i) { - if(scenarios[i]==1){ - gP <- drawGeneParams(nPatients = params@nPatients, - modality = 1, - proportions = params@proportions, - means = params@means$`1`, - sdParms = params@sd) - } - if((scenarios[i]>1)&&(scenarios[i]%%2==0)){ - gP <- drawGeneParams(nPatients = params@nPatients, - modality = (floor(scenarios[i]/2))+1, - proportions = params@proportions, - means = eval(parse(text = paste(paste0("params@means$`",((floor(scenarios[i]/2))+1),"`")))), - sdParms = params@sd, - foldChanges = eval(parse(text = paste(paste0("params@foldChanges$`",((floor(scenarios[i]/2))+1),"`$gauss"))))) - } - if((scenarios[i]>1)&&(scenarios[i]%%2!=0)){ - gP <- drawGeneParams(nPatients = params@nPatients, - modality = (floor(scenarios[i]/2))+1, - proportions = params@proportions, - sdParms = params@sd, - foldChanges = eval(parse(text = paste(paste0("params@foldChanges$`",((floor(scenarios[i]/2))+1),"`$gamma")))), - gammaParms = params@gamma) - } + simData <- lapply(scenarios, function(s) { + if(s == 1) gP <- drawGeneParams(100, uniMean[1], uniMean[2], sdParms[1], sdParms[2]) + if(s == 2) gP <- drawGeneParams(proportions, biMean[1], biMean[2], sdParms[1], sdParms[2], fc.min = foldChange[1], fc.max = foldChange[2]) + if(s == 3) gP <- drawGeneParams(proportions, guMean[1], guMean[2], sdParms[1], sdParms[2], gamma.shape = gammaShape, gamma.rate = gammaRate) return(gP) }) - names(valData)<- genes + simData <- do.call("rbind", simData) + + valData <- cbind(genes, scenarios, simData) + colnames(valData) <- c("Gene","Scenario","Mean","SD1","SD2", + "FC","prop1","prop2","GammaShape","GammaRate") if(verbose){ message("Finished creating validation data.") } return(valData) } From 065abd19f59bc54a2acd13251e3784351a069845 Mon Sep 17 00:00:00 2001 From: sbeck Date: Wed, 21 Feb 2018 11:24:30 +0100 Subject: [PATCH 3/6] Added possibility of simulating and evaluating multimodality (higher than bimodal The params object was changed. Especially, the slots were updated and renamed. The params object can now hold the parameters for unlimited high modality simulation. The creation of the validationData() was updated to work with the new params object structure. Also, the validationData now has a new structure which resembles approximately the output data format. The validationData now has is a list with one entry per gene. Each entry contains information about the modus (modality as numbers: 1=unimodal, 2=bimodal,...), the scenario(unimodal gaussian, multimodal gaussian or multimodal gamma+gaussian), the means, the foldChanges between the means, the standard deviations of the means, the proportions of the different distributions, the sizes of the groups of the different distributions and the groups with the patients that belong to each group. With the new validationData format the simulation of the expression was also changed. The simulation of unlimited modality gene expression was added. The three possible scenarios with which gene expression can be simulated are now: unimodal gaussian, multimodal gaussian and multimodal gamma+gaussian. The "modus" output of the useXXX() functions is now shown as a number instead of a string. All evaluation functions that work with the validationData are now updated to work with the new structure. The expandValidationData() function was deleted, because it is no longer necessary. The following functions have a new parameter "modality" which is used to evaluate the output of the useXXX functions for a certain modality: - compareScenarios(), - evaluateAlgorithmOutput(), -getClassification(), -getClassifications() The examples in the roxygen documentation of the functions were updated, as well as the documentation itself --- R/algorithmSpecificFunctions.R | 84 ++++--------- R/evaluate.R | 200 ++++++++++-------------------- R/params.R | 87 ++++++------- R/simulation.R | 218 +++++++++++++++++++++++---------- 4 files changed, 289 insertions(+), 300 deletions(-) diff --git a/R/algorithmSpecificFunctions.R b/R/algorithmSpecificFunctions.R index f587b26..604c8e5 100644 --- a/R/algorithmSpecificFunctions.R +++ b/R/algorithmSpecificFunctions.R @@ -3,6 +3,8 @@ #' Use the mclust algorithm for generating output to evaluate the #' classification (unimodal,bimodal or other) of genes. #' +#' +#' #' @param expression (Simulated) Gene Expression data frame which genes shall #' be classified as unimodal, bimodal or other #' @param verbose logical. Whether to print progress messages @@ -10,7 +12,7 @@ #' @return The formatted output of the mclust algorithm #' #' @examples -#' params <- newParams(nUniGenes = 100, nBiGenes = 100, nGuGenes = 100) +#' params <- newParams(nGenes = list("1"=100,"2"=list("gauss"= 100,"gamma"= 100))) #' simulation <- simulateExpression(params= params,verbose=FALSE) #' expression <- simulation$expressionData #' #with messages printed to console @@ -18,18 +20,18 @@ #' #without messages printed to console #' mclustOutput <- useMclust(expression = expression, verbose = FALSE) #' +#' @import mclust #' @export useMclust <- function(expression,verbose = TRUE,parallel = FALSE) { + if (verbose) {message("mclust-models are calculated...")} if(parallel==FALSE){ - # Calculate mclust models sequentially models <- lapply(1:nrow(expression), function(i) { mclust::Mclust((expression[i,]),modelNames=c("V"),verbose = FALSE) }) } if(parallel==TRUE){ - # Calculate mclust models in parallel n.cores = parallel::detectCores()-1 cl = parallel::makeCluster(n.cores) parallel::clusterExport(cl, "expression") @@ -37,11 +39,9 @@ useMclust <- function(expression,verbose = TRUE,parallel = FALSE) { models <- parallel::parLapply(cl, 1:nrow(expression), function(i) { mclust::Mclust((expression[i,]), modelNames=c("V")) }) } - #unser Model zeigt schon log2 Werte if(parallel==TRUE){parallel::stopCluster(cl)} names(models) = rownames(expression) patients <- names(expression) - #save(models, file = "MClustModels.Rdata") mclustOutput <- mclustClassification(models = models,patients = patients) return(mclustOutput) @@ -59,11 +59,12 @@ useMclust <- function(expression,verbose = TRUE,parallel = FALSE) { #' @param patients List of the patients #' #' @return The formatted output of the mclust algorithm +#' +#' +#' mclustClassification <- function(models,patients){ - - ##get values from mclust into a data.frame - genes <- names(models) #row 'header' + genes <- names(models) outputs <- lapply(1:length(genes),function(i){ @@ -75,16 +76,7 @@ mclustClassification <- function(models,patients){ }})) }) proportions <- models[[i]]$parameters$pro - modus <- "modal" - if(length(models[[i]]$parameters$mean)==1){ - modus <- "unimodal" - } - if(length(models[[i]]$parameters$mean)==2){ - modus <- "bimodal" - } - if(length(models[[i]]$parameters$mean)>2){ - modus <- "multimodal" - } + modus <- length(models[[i]]$parameters$mean) output <- list( "modus" = modus, "means" = unname(models[[i]]$parameters$mean), @@ -114,7 +106,7 @@ mclustClassification <- function(models,patients){ #' @return hdbscanOutput The formatted output of the HDBSCAN clustering #' #' @examples -#' params <- newParams(nUniGenes = 100, nBiGenes = 100, nGuGenes = 100) +#' params <- newParams(nGenes = list("1"=100,"2"=list("gauss"= 100,"gamma"= 100))) #' simulation <- simulateExpression(params= params,verbose=FALSE) #' expression <- simulation$expressionData #' #with messages printed to console @@ -124,14 +116,11 @@ mclustClassification <- function(models,patients){ #' #without messages printed to console #' hdbscanOutput <- useHdbscan(expression = expression, verbose = FALSE) #' +#' @import dbscan #' @export useHdbscan <- function(expression,minPts = 10,verbose = TRUE){ if(verbose){message("HDBSCAN-clusters are calculated...")} - #wichtige Fragen: - #wenn wir die NoisePunkte ebenfalls zu den Gruppen zuordnen, sollen - #die sizes ebenfalls angepasst werden? Oder sollen die sizes die Originalzahlen darstellen - #müssen ebenso means und sds neu berechnet werden oder nicht? - genes <- rownames(expression) #row 'header' + genes <- rownames(expression) patients <- names(expression) outputs <- lapply(1:nrow(expression),function(i){ @@ -171,13 +160,9 @@ useHdbscan <- function(expression,minPts = 10,verbose = TRUE){ } modus <- "modal" if((max(classification))==0){ - modus <- "unimodal" - } - if(max(classification)==2){ - modus <- "bimodal" - } - if(max(classification)>2){ - modus <- "multimodal" + modus <- 1 + }else{ + modus <- max(classification) } output <- list( @@ -191,20 +176,6 @@ useHdbscan <- function(expression,minPts = 10,verbose = TRUE){ names(outputs)<- genes hdbscanOutput <- list(Genes = outputs,"ClinicalData"="mnt/agnerds/xyz","Expressionmatrix"="mnt/agnerds/xyzyy") - - # hdbscanClusters <- lapply(1:nrow(expression), function(i) { - # - # - # proportions <- unlist(lapply(0:(max(clustering$cluster)),function(n){ - # length((which(clustering$cluster == n))) / length(clustering$cluster) - # })) - # c(means,sds,proportions) - # }) - # names(hdbscanClusters) = rownames(expression) - # save(hdbscanClusters, file = "hdbscanClusters.Rdata") - # if(verbose){message("Output of clustering is processed and formatted...")} - # hdbscanOutput <- hdbscanClassification(hdbscanClusters = hdbscanClusters) - return(hdbscanOutput) } @@ -215,9 +186,10 @@ useHdbscan <- function(expression,minPts = 10,verbose = TRUE){ #' predicted by a model or an estimator and the values actually observed. #' NAs are filtered out before calculating the RMSE #' -#' @param error The difference of a value predicted by a model and an actually observed value +#' @param error The difference of a value predicted by a model and an actually +#' observed value +#' #' -#' @return The RMSE value rmse <- function(error){ error <- unlist(error) error2 <- unlist(lapply(1:length(error),function(i){if(!is.na(error[[i]])){c(error[[i]])}})) @@ -230,8 +202,6 @@ rmse <- function(error){ #' Use the FlexMix algorithm for creating a model to find the best fitting #' model to evaluate the classification (unimodal,bimodal or other) of genes. #' -#' -#' #' @param expression (Simulated) Gene Expression data frame which genes shall #' be classified as unimodal, bimodal or other #' @param verbose logical. Whether to print progress messages @@ -239,7 +209,7 @@ rmse <- function(error){ #' @return flexmixOutput The formatted output of the best fitting models from flexmix #' #' @examples -#' params <- newParams(nUniGenes = 100, nBiGenes = 100, nGuGenes = 100) +#' params <- newParams(nGenes = list("1"=100,"2"=list("gauss"= 100,"gamma"= 100))) #' simulation <- simulateExpression(params= params,verbose=FALSE) #' expression <- simulation$expressionData #' #with messages printed to console @@ -247,6 +217,8 @@ rmse <- function(error){ #' #without messages printed to console #' flexmixOutput <- useFlexmix(expression = expression, verbose = FALSE) #' +#' @import flexmix +#' @importFrom plyr compact #' @export useFlexmix <- function(expression, verbose = TRUE){ mo1 <- flexmix::FLXMRglm(family = "gaussian") @@ -366,12 +338,11 @@ useFlexmix <- function(expression, verbose = TRUE){ c(3) } })) - #save(bestModel, file = "flexModels.Rdata") if(verbose){message("Creating the formatted output with the best fitting models")} outputs <- lapply(1:length(bestModel),function(i){ if(bestModel[i]==1){ - modus <- "unimodal" + modus <- 1 means <- (flexModelsUni[[i]]$means) sds <- c(flexModelsUni[[i]]$sds) sizes <- c(flexModelsUni[[i]]$sizes) @@ -382,10 +353,9 @@ useFlexmix <- function(expression, verbose = TRUE){ }})) }) - #c(flexModelsUni[[i]][1],NA,flexModelsUni[[i]][2],NA,(flexModelsUni[[i]][3])/100,NA) }else if(bestModel[i]==2){ - modus <- "bimodal" + modus <- 2 means <- c(flexModelsBi[[i]]$means) sds <- c(flexModelsBi[[i]]$sds) sizes <- c(flexModelsBi[[i]]$sizes) @@ -397,7 +367,7 @@ useFlexmix <- function(expression, verbose = TRUE){ }) }else if(bestModel[i]==3){ - modus <- "bimodal" + modus <- 2 means <- c(flexModelsGU[[i]]$means) sds <- c(flexModelsGU[[i]]$sds) sizes <- c(flexModelsGU[[i]]$sizes) @@ -408,8 +378,8 @@ useFlexmix <- function(expression, verbose = TRUE){ }})) }) } - if(modus=="bimodal"&&(sizes[1]==ncol(expression2)||sizes[2]==ncol(expression2))){ - modus="unimodal" + if(modus==2&&(sizes[1]==ncol(expression2)||sizes[2]==ncol(expression2))){ + modus=1 means <- means[!is.na(means)] sds <- sds[!is.na(sds)] sizes <- sizes[!is.na(sizes)] diff --git a/R/evaluate.R b/R/evaluate.R index 5f22355..defc453 100644 --- a/R/evaluate.R +++ b/R/evaluate.R @@ -2,41 +2,31 @@ #' #' Compare the scenario of each gene that was assigned by the algorithm with #' the scenario that is written in the validation Data. -#' +#' @param modality numerical, for which modality the scenarioComparing is calculated, 1 for unimodal, +#' 2 for bimodal,... #' @param algorithmOutput The output to be compared #' @param validationData The validationData behind the simulated gene expression data frame #' #' @return Returns a list that classifies each assignment as #' TN (TRUE Negative),TP (TRUE Positive), FN (FALSE Negative), FP (FALSE Positive) #' @details -#' TN (TRUE Negative): validationData: unimodal, assigned scenario: unimodal +#' TN (TRUE Negative): validationData: not modality, assigned scenario: not modality #' -#' TP (TRUE Positive): validationData: bimodal, assigned scenario: bimodal +#' TP (TRUE Positive): validationData: modality, assigned scenario: modality #' -#' FN (FALSE Negative): validationData: bimodal, assigned scenario: unimodal -#' or -#' FN (FALSE Negative): validationData: unimodal/bimodal, -#' assigned scenario: more than bimodal +#' FN (FALSE Negative): validationData: modality, assigned scenario: not modality #' -#' FP (FALSE Positive): validationData: unimodal, assigned scenario: bimodal +#' FP (FALSE Positive): validationData: not modality, assigned scenario: modality #' -compareScenarios <- function(algorithmOutput,validationData){ +compareScenarios <- function(modality, algorithmOutput,validationData){ compareScenarios <- (unlist(lapply(1:length(algorithmOutput$Genes),function(i){ - if(validationData[i,2]==1&&algorithmOutput$Genes[[i]]$modus=="unimodal"){ - #unimodal + classified as unimodal + if(validationData[[i]]$modus != modality && algorithmOutput$Genes[[i]]$modus!=modality){ evaluation <- "TN" - }else if((validationData[i,2]==2||validationData[i,2]==3)&&algorithmOutput$Genes[[i]]$modus=="bimodal"){ - #bimodal + classified as bimodal + }else if(validationData[[i]]$modus ==modality && algorithmOutput$Genes[[i]]$modus==modality){ evaluation <- "TP" - }else if((validationData[i,2]==1)&&algorithmOutput$Genes[[i]]$modus=="bimodal"){ - #unimodal + classified as bimodal + }else if(validationData[[i]]$modus !=modality && algorithmOutput$Genes[[i]]$modus==modality){ evaluation <- "FP" - }else if((validationData[i,2]==2||validationData[i,2]==3)&&algorithmOutput$Genes[[i]]$modus=="unimodal"){ - #bimodal + classified as unimodal - evaluation <- "FN" - }else if((validationData[i,2]==1||validationData[i,2]==2||validationData[i,2]==3)&& - algorithmOutput$Genes[[i]]$modus=="multimodal"){ - #unimodal/bimodal + classified as more than multimodal + }else if(validationData[[i]]$modus ==modality && algorithmOutput$Genes[[i]]$modus!=modality){ evaluation <- "FN" }else{ evaluation <- NA @@ -66,49 +56,10 @@ isApproxSame<- function(validationVal,classVal,maxDistance){ } } -#' Expand the validationData -#' -#' Adds two columns to the validationData: mean2 and sdGamma -#' @param validationData not expanded validationData data frame -#' @return returns validationData with the two added columns -#' -#' @details -#' For scenario 1, no mean2 and no sdGamma is added (NA added) -#' For scenario 2 the mean2 (calculated out of mean1 and foldChange) is added, -#' but no sdGamma is added (NA added) -#' For scenario 3, the mean 2 is not added(NA added, foldChange was already -#' used), but sdGamma is calculated with sqrt(gammaShape*(gammaScale^2)) -#' gammaScale is 1/gammaRate -#' -expandValidationData <- function(validationData){ - validationData <- cbind(validationData,unlist(lapply(1:nrow(validationData),function(i){ - if(as.numeric(validationData[i,2])==1){#is unimodal, no mean2 - addValue <- NA - }else if(as.numeric(validationData[i,2])==2){#calculates mean2 with mean1*FC - addValue <- as.numeric(validationData[i,3])*as.numeric(validationData[i,6]) - }else if(as.numeric(validationData[i,2])==3){#calculates mean2 with shape*scale - gammaScale <- 1/as.numeric(validationData[i,10]) - mean2 <- as.numeric(validationData[i,9])*gammaScale - addValue <- mean2 - } - - }))) - colnames(validationData)[11]<-"Mean2" - validationData <- cbind(validationData,unlist(lapply(1:nrow(validationData),function(i){ - if((as.numeric(validationData[i,2])==1) || (as.numeric(validationData[i,2])==2)){#is unimodal or bimodal without gamma, no sdGamma2 - addValue <- NA - }else if(as.numeric(validationData[i,2])==3){#calculates sd2 with sqrt(shape*(scale^2)) - gammaScale <- 1/as.numeric(validationData[i,10]) - sdGamma <- sqrt(as.numeric(validationData[i,9])*(gammaScale*gammaScale)) - addValue <- sdGamma - } - }))) - colnames(validationData)[12]<-"sdGamma" - return(validationData) -} - #' Evaluate AlgorithmOutput #' +#' @param modality numerical, for which modality output is evaluated, 1 for unimodal, +#' 2 for bimodal,... #' @param algorithmOutput The formatted output of the algorithm that is now #' evaluated #' @param validationData the validationData data.frame @@ -116,16 +67,16 @@ expandValidationData <- function(validationData){ #' value and the validation data value with which the estimated is classified #' as TRUE (correctly estimated) #' @examples -#' params <- newParams(nUniGenes = 100, nBiGenes = 100, nGuGenes = 100) +#' params <- newParams(nGenes = list("1"=100,"2"=list("gauss"= 100,"gamma"= 100))) #' simulation <- simulateExpression(params= params,verbose=FALSE) #' expression <- simulation$expressionData #' validationData <- simulation$validationData #' output <- useMclust(expression, verbose=FALSE) #' # TRUE if less than +- 10 % difference to value of validationData -#' evaluation <- evaluateAlgorithmOutput(algorithmOutput = output, +#' evaluation <- evaluateAlgorithmOutput(modality = 2,algorithmOutput = output, #' validationData = validationData) #' # TRUE if less than +- 20 % difference to value of validationData -#' evaluation <- evaluateAlgorithmOutput(algorithmOutput = output, +#' evaluation <- evaluateAlgorithmOutput(modality = 2,algorithmOutput = output, #' validationData = validationData,maxDifference = 20) #' @return Returns a data.frame where for each as TP classified gene it was #' evaluated whether it was estimated approximately correct or not (TRUE or @@ -138,62 +89,41 @@ expandValidationData <- function(validationData){ #' or equal to 10%, the estimation is classified as TRUE (correctly estimated) #' #' @export -evaluateAlgorithmOutput <- function(algorithmOutput,validationData,maxDifference = 10){ - validationDataExpanded <- expandValidationData(validationData = validationData) - compared <- compareScenarios(algorithmOutput = algorithmOutput,validationData = validationDataExpanded) - - genes <- validationDataExpanded[,1] #row 'header' +evaluateAlgorithmOutput <- function(modality,algorithmOutput,validationData,maxDifference = 10){ + compared <- compareScenarios(modality = modality,algorithmOutput = algorithmOutput,validationData = validationData) - numCol <- ncol(algorithmOutput) - perAt <- floor(numCol/3) + genes <- names(validationData) - approxSame <- (lapply(1:length(algorithmOutput$Genes),function(i){ - if((algorithmOutput$Genes[[i]]$modus=="bimodal") && (as.numeric(validationDataExpanded[i,2])==2||as.numeric(validationDataExpanded[i,2])==3)){ - mean1 <- isApproxSame(validationVal = min(as.numeric(validationDataExpanded[i,3]),as.numeric(validationDataExpanded[i,11])), - classVal = min(algorithmOutput$Genes[[i]]$means[1],algorithmOutput$Genes[[i]]$means[2]), - maxDistance = maxDifference) - mean2 <- isApproxSame(validationVal = max(as.numeric(validationDataExpanded[i,3]),as.numeric(validationDataExpanded[i,11])), - classVal = max(algorithmOutput$Genes[[i]]$means[1],algorithmOutput$Genes[[i]]$means[2]), - maxDistance = maxDifference) - if(as.numeric(validationDataExpanded[i,2])==2){ - sd1 <- isApproxSame(validationVal = (min(as.numeric(validationDataExpanded[i,4]),as.numeric(validationDataExpanded[i,5]))), - classVal = min(algorithmOutput$Genes[[i]]$sds[1],algorithmOutput$Genes[[i]]$sds[1]), - maxDistance = maxDifference) - sd2 <- isApproxSame(validationVal = (max(as.numeric(validationDataExpanded[i,4]),as.numeric(validationDataExpanded[i,5]))), - classVal = max(algorithmOutput$Genes[[i]]$sds[1],algorithmOutput$Genes[[i]]$sds[2]), - maxDistance = maxDifference) - }else if(as.numeric(validationDataExpanded[i,2])==3){ - sd1 <- isApproxSame(validationVal = (min(as.numeric(validationDataExpanded[i,4]),as.numeric(validationDataExpanded[i,12]))), - classVal = min(algorithmOutput$Genes[[i]]$sds[1],algorithmOutput$Genes[[i]]$sds[2]), - maxDistance = maxDifference) - sd2 <- isApproxSame(validationVal = (max(as.numeric(validationDataExpanded[i,4]),as.numeric(validationDataExpanded[i,12]))), - classVal = max(algorithmOutput$Genes[[i]]$sds[1],algorithmOutput$Genes[[i]]$sds[2]), - maxDistance = maxDifference) - } - if((algorithmOutput$Genes[[i]]$sizes[1]==100)&&(is.na(algorithmOutput$Genes[[i]]$sizes[2]))){ - prop1 <- isApproxSame(validationVal = ((max(as.numeric(validationDataExpanded[i,7]),as.numeric(validationDataExpanded[i,8])))), - classVal = algorithmOutput$Genes[[i]]$sizes[1], - maxDistance = maxDifference) - prop2 <- FALSE - } - else{ - prop1 <- isApproxSame(validationVal = ((min(as.numeric(validationDataExpanded[i,7]),as.numeric(validationDataExpanded[i,8])))), - classVal = min(algorithmOutput$Genes[[i]]$sizes[1],algorithmOutput$Genes[[i]]$sizes[2]), - maxDistance = maxDifference) - prop2 <- isApproxSame(validationVal = ((max(as.numeric(validationDataExpanded[i,7]),as.numeric(validationDataExpanded[i,8])))), - classVal = max(algorithmOutput$Genes[[i]]$sizes[1],algorithmOutput$Genes[[i]]$sizes[2]), - maxDistance = maxDifference) - } + approxSame <- (lapply(1:length(algorithmOutput$Genes),function(i){ + if((algorithmOutput$Genes[[i]]$modus==modality) && (validationData[[i]]$modus==modality)){ + classValsMean <- sort(algorithmOutput$Genes[[i]]$means) + x <- algorithmOutput$Genes[[i]]$sds + names(x) <- order(algorithmOutput$Genes[[i]]$means) + classValSDs <- x[order(names(x))] + y <- algorithmOutput$Genes[[i]]$sizes + names(y) <- order(algorithmOutput$Genes[[i]]$means) + classValProps <- y[order(names(y))] + means <- unlist(lapply(1:length(validationData[[i]]$means),function(j){ + isApproxSame(validationVal = validationData[[i]]$means[j],classVal = classValsMean[j],maxDistance = maxDifference) + })) + sds <- unlist(lapply(1:length(validationData[[i]]$sds),function(j){ + isApproxSame(validationVal = validationData[[i]]$sds[j],classVal = classValSDs[j],maxDistance = maxDifference) + })) + props <- unlist(lapply(1:length(validationData[[i]]$sizes),function(j){ + isApproxSame(validationVal = validationData[[i]]$sizes[j],classVal = classValProps[j],maxDistance = maxDifference) + })) - c(paste0("Gene",i),mean1,mean2,sd1,sd2,prop1,prop2) - } - })) - evaluation<- data.frame(do.call('rbind',approxSame)) - result <- data.frame(evaluation[,2:ncol(evaluation)]) - rownames(result) <- evaluation[,1] - colnames(result) <- c("mean1","mean2","sd1","sd2","prop1","prop2") + c(means,sds,props) + } + })) + names(approxSame) <- genes + evaluation<- data.frame(do.call('rbind',approxSame)) + meanNames <- sprintf(c("mean%s"),(1:(ncol(evaluation)/3))) + sdNames <- sprintf(c("sd%s"),(1:(ncol(evaluation)/3))) + propNames <- sprintf(c("prop%s"),(1:(ncol(evaluation)/3))) + colnames(evaluation) <- c(meanNames,sdNames,propNames) - return(result) + return(evaluation) } #' getClassification @@ -201,21 +131,22 @@ evaluateAlgorithmOutput <- function(algorithmOutput,validationData,maxDifference #' Returns the numbers of TN (TRUE Negative),TP (TRUE Positive), #' FN (FALSE Negative) and FP (FALSE Positive) for one of the algorithm's #' formatted outputs. -#' +#' @param modality numerical, for which modality the classification is calculated, 1 for unimodal, +#' 2 for bimodal,... #' @param algorithmOutput The formatted output of the algorithm #' @param algorithmName The name of the evaluated algorithm #' @param validationData The validationData behind the simulated gene #' expression data frame -#' #' @examples -#' params <- newParams(nUniGenes = 100, nBiGenes = 100, nGuGenes = 100) +#' params <- newParams(nGenes = list("1"=100,"2"=list("gauss"= 100,"gamma"= 100))) #' simulation <- simulateExpression(params= params,verbose=FALSE) #' expression <- simulation$expressionData #' validationData <- simulation$validationData #' mclustOutput <- useMclust(expression, verbose=FALSE) #' evaluation <- evaluateAlgorithmOutput(algorithmOutput = output, #' validationData = validationData) -#' getClassification(algorithmOutput = mclustOutput,algorithmName = "mclust", +#' getClassification(modality= 2,algorithmOutput = mclustOutput, +#' algorithmName = "mclust", #' validationData = validationData) #' @return a classification data frame that contains the number of #' FN, FP, TN and TP @@ -232,9 +163,9 @@ evaluateAlgorithmOutput <- function(algorithmOutput,validationData,maxDifference #' #' FP (FALSE Positive): validationData: unimodal, assigned scenario: bimodal #' -getClassification <- function(algorithmOutput,algorithmName="mclust",validationData){ +getClassification <- function(modality,algorithmOutput,algorithmName="mclust",validationData){ - compared <- compareScenarios( + compared <- compareScenarios(modality = modality, algorithmOutput = algorithmOutput,validationData = validationData) fn <- table(compared)["FN"] @@ -259,14 +190,14 @@ getClassification <- function(algorithmOutput,algorithmName="mclust",validationD #' @param evaluation The output of the evaluation of the formatted output #' @param algorithmName The name of the algorithm which created the output #' @return A data frame with the percentual proportion of as TRUE or FALSE evaluated values -#' #' @examples -#' params <- newParams(nUniGenes = 100, nBiGenes = 100, nGuGenes = 100) +#' params <- newParams(nGenes = list("1"=100,"2"=list("gauss"= 100,"gamma"= 100))) #' simulation <- simulateExpression(params= params,verbose=FALSE) #' expression <- simulation$expressionData #' validationData <- simulation$validationData #' mclustOutput <- useMclust(expression, verbose=FALSE) -#' mclustEvaluation <- evaluateAlgorithmOutput(algorithmOutput = mclustOutput,validationData = validationData) +#' mclustEvaluation <- evaluateAlgorithmOutput(algorithmOutput = mclustOutput, +#' validationData = validationData) #' getStatistic(evaluation = mclustEvaluation,algorithmName = "mclust") #' getStatistic <- function(evaluation,algorithmName ="mclust"){ @@ -287,7 +218,7 @@ getStatistic <- function(evaluation,algorithmName ="mclust"){ #' @param evaluations list of named evaluations #' @return A data frame with the percentual proportion of as TRUE or FALSE evaluated values #' @examples -#' params <- newParams(nUniGenes = 100, nBiGenes = 100, nGuGenes = 100) +#' params <- newParams(nGenes = list("1"=100,"2"=list("gauss"= 100,"gamma"= 100))) #' simulation <- simulateExpression(params= params,verbose=FALSE) #' expression <- simulation$expressionData #' validationData <- simulation$validationData @@ -322,14 +253,15 @@ getStatistics <- function(evaluations) { #' Returns the numbers of TN (TRUE Negative),TP (TRUE Positive), #' FN (FALSE Negative) and FP (FALSE Positive) for each of the algorithm's #' formatted outputs. -#' +#' @param modality numerical, for which modality the classification is calculated, 1 for unimodal, +#' 2 for bimodal,... #' @param outputs list of named outputs #' @param validationData The validationData behind the simulated gene expression data frame #' #' @return a classification data frame that contains the number of #' FN, FP, TN and TP #' @examples -#' params <- newParams(nUniGenes = 100, nBiGenes = 100, nGuGenes = 100) +#' params <- newParams(nGenes = list("1"=100,"2"=list("gauss"= 100,"gamma"= 100))) #' simulation <- simulateExpression(params= params,verbose=FALSE) #' expression <- simulation$expressionData #' validationData <- simulation$validationData @@ -338,9 +270,11 @@ getStatistics <- function(evaluations) { #' flexmixOutput <- useFlexmix(expression = expression,verbose = FALSE) #' #' #getClassifications of one algorithm -#' getClassifications(validationData,outputs = list("mclust"=mclustOutput)) +#' getClassifications(modality=2,validationData, +#' outputs = list("mclust"=mclustOutput)) #' #getClassifications of two or more algorithms -#' getClassifications(validationData,outputs = list("mclust"=mclustOutput,"HDBSCAN"=hdbscanOutput)) +#' getClassifications(modality=2,validationData, +#' outputs = list("mclust"=mclustOutput,"HDBSCAN"=hdbscanOutput)) #' #' @details #' TN (TRUE Negative): validationData: unimodal, assigned scenario: unimodal @@ -356,10 +290,10 @@ getStatistics <- function(evaluations) { #' #' #' @export -getClassifications <- function(validationData,outputs){ +getClassifications <- function(modality,validationData,outputs){ numOutputs <- (length(outputs)) classification <- lapply(1:numOutputs,function(i){ - getClassification(algorithmOutput = outputs[[i]], + getClassification(modality,algorithmOutput = outputs[[i]], algorithmName = names(outputs)[i], validationData = validationData) diff --git a/R/params.R b/R/params.R index 7242aae..5a7ec2e 100644 --- a/R/params.R +++ b/R/params.R @@ -9,7 +9,7 @@ #' #' @examples #' params <- newParams() -#' getParam(params, "nUniGenes") +#' getParam(object = params,name = "nGenes") #' #' @rdname getParam #' @export @@ -27,9 +27,10 @@ setGeneric("getParam", function(object, name) {standardGeneric("getParam")}) #' #' @examples #' params <- newParams() -#' params = setParam(params, "nBiGenes", 100) +#' params <- setParam(object = params,name = "nPatients",value = 250) #' #' @rdname setParam +#' @importFrom methods new rbind2 slot slot<- slotNames validObject #' @export setGeneric("setParam", function(object, name, value) { @@ -79,59 +80,51 @@ setGeneric("setParam", #' @aliases Params-class #' @exportClass Params setClass("Params", - slots = c(nUniGenes = "numeric", - nBiGenes = "numeric", - nGuGenes ="numeric", - nPatients = "numeric", - seed = "numeric", - sdParms = "vector", - uniMean = "vector", - biMean = "vector", - foldChange ="vector", + slots = c(nGenes ="list", + means = "list", + foldChanges = "list", + sd = "list", + gamma = "list", proportions = "vector", - guMean = "vector", - gammaShape = "numeric", - gammaRate = "numeric"), - prototype = prototype(nUniGenes = 300, nBiGenes = 500, nGuGenes = 200, - nPatients = 100, seed = sample(1:1e6, 1), - sdParms = c(0.61,2.21), uniMean = c(2,5), - biMean = c(2,5), foldChange =c(2,4), - proportions = c(10,20,30,40,50,60,70,80,90), guMean = c(2,5), - gammaShape = 2, gammaRate = 2)) + nPatients = "numeric", + seed = "numeric"), + prototype = prototype(nGenes = list("1"= 700, "2" = list("gauss"=150,"gamma"=150)), + means = list("1"=c(2,4),"2" = c(2,4)), + foldChanges = list("1" = c(2,4),"2" = list("gauss"= c(2,4),"gamma" = c(3,5))), + sd = list("mu"= 0.61,"lambda"=2.21), + gamma = list("shape" = 2, "rate" = 2), + proportions = c(10,20,30,40,50,60,70,80,90), + nPatients = 200, + seed = sample(1:1e6, 1))) #' @rdname getParam setMethod("getParam", "Params", function(object, name) { - slot(object, name) + methods::slot(object, name) }) #' @rdname setParam setMethod("setParam", "Params", function(object, name, value) { checkmate::assertString(name) - slot(object, name) <- value - validObject(object) + methods::slot(object, name) <- value + methods::validObject(object) return(object) }) setMethod("show", "Params", function(object) { - pp <- list("Global:" = c("UnimodalGenes" = "nUniGenes", - "BimodalGenes" = "nBiGenes", - "GammaUnimodalGenes" = "nGuGenes", - "Patients" = "nPatients", - "Seed" = "seed", - "Mu,Lambda for sd"="sdParms", - "Unimodal Mean" = "uniMean", - "Bimodal Mean" = "biMean", - "FoldChange" = "foldChange", + pp <- list("Global:" = c("nGenes" = "nGenes", + "Means" = "means", + "FoldChanges" = "foldChanges", + "sd" ="sd", + "Gamma" ="gamma", "Proportions" = "proportions", - "GammaUnimodal Mean" = "guMean", - "Gamma Shape" = "gammaShape", - "Gamma Rate" = "gammaRate")) + "nPatients" = "nPatients", + "Seed" = "seed")) cat("A Params object of class", class(object), "\n") cat("Parameters can be Default' or 'NOT DEFAULT'.", "\n\n") showPP(object, pp) - cat(length(slotNames(object)) - 13, "additional parameters", "\n\n") + cat(length(methods::slotNames(object)) - 8, "additional parameters", "\n\n") }) #' New Params @@ -144,13 +137,19 @@ setMethod("show", "Params", function(object) { #' @return New Params object. #' @examples #' params <- newParams() -#' params <- newParams(nUniGenes = 200, nPatients = 10) +#' params <- newParams("nGenes"=list("1"=100,"2"=list("gauss"=100,"gamma"=100)), +#' "proportions"=c(30,70)) +#' params <- newParams( +#' nGenes=list("1"=c(700),"2"=list(gauss = 150, gamma = 150),"3"=list(gauss = 150, gamma = 150)), +#' means=list("1"= c(2, 4),"2"= c(2, 4),"3"= c(2, 4)), +#' foldChanges = list("1"=NA,"2"= list(gauss = c(2, 4), gamma = c(2, 4)), +#' "3"= list(gauss = list(c(2, 4), c(2, 4)), gamma = list(c(2, 4), c(2, 4))))) #' #' @rdname newParams #' @export newParams <- function(...) { - params <- new("Params") + params <- methods::new("Params") params <- setParams(params, ...) return(params) @@ -166,7 +165,7 @@ newParams <- function(...) { #' @return List with the values of the selected parameters. #' @examples #' params <- newParams() -#' getParams(params, c("nUniGenes", "nPatients", "uniMean")) +#' getParams(params = params,names = c("nGenes","foldChanges","proportions","means")) #' @export getParams <- function(params, names) { @@ -198,10 +197,12 @@ getParams <- function(params, names) { #' params <- newParams() #' params #' # Set individually -#' params <- setParams(params, nUniGenes = 1000, nPatients = 50) +#' params <- params <- setParams(params,proportions = c(80,20),foldChanges = list("1"=c(2,4), +#' "2"= list("gauss"=c(2,5),"gamma"=c(3,5)))) #' params #' # Set via update list -#' params <- setParams(params, list(nUniGenes = 1000, nPatients = 50)) +#' params <- setParams(params,update = list("nGenes"=list("1"=100, +#' "2"=list("gauss"=200,"gamma"=100)),"means"=list("1"=c(1,3),"2"=c(2,4)),"nPatients"=700)) #' params #' @export setParams <- function(params, update = NULL, ...) { @@ -236,12 +237,12 @@ showPP <- function(params, pp) { checkmate::assertClass(params, classes = "Params") checkmate::assertList(pp, types = "character", min.len = 1) - default <- new(class(params)) + default <- methods::new(class(params)) for (category in names(pp)) { parameters <- pp[[category]] values <- getParams(params, parameters) short.values <- sapply(values, function(x) { - if (length(x) > 4) { + if (length(x) > 10) { paste0(paste(head(x, n = 4), collapse = ", "), ",...") } else { paste(x, collapse = ", ") diff --git a/R/simulation.R b/R/simulation.R index 101ed7d..69a2467 100644 --- a/R/simulation.R +++ b/R/simulation.R @@ -13,7 +13,7 @@ #' simulated from the mean and the standard deviation taken from the validation data #' for this gene. The number of values that are simulated for each #' distribution equals the number of Patients {"nPatients"}: -#' \code{expression <- rnorm(nPatients,as.numeric(valData[i,3]), as.numeric(valData[i,4]))} +#' \code{expression <- stats::rnorm(nPatients,as.numeric(valData[i,3]), as.numeric(valData[i,4]))} #' #' Scenario 2 - Bimodal with two unimodal gaussian distributions #' Gene expression for the bimodal scenario 2 (two gaussian distributions) is @@ -22,7 +22,7 @@ #' deviation taken from the validation data for this gene. The number of values that are #' simulated ("nBi1") are calculated by multiplicating the number of Patients with the #' first proportion and rounding this number off: -#' \code{data1 <- rnorm(nBi1,as.numeric(valData[i,3]),as.numeric(valData[i,4]))} +#' \code{data1 <- stats::rnorm(nBi1,as.numeric(valData[i,3]),as.numeric(valData[i,4]))} #' The second distribution is simulated with the mean calculated by #' multiplying the mean of the first distribution with the foldChange #' from the validation data of this gene. The standard deviation is also taken from the validation data of this gene. The number of values that are @@ -31,7 +31,7 @@ #' the mean and the standard deviation taken from the validation data for this gene. #' The number of values that are simulated for each distribution equals the #' number of Patients {"nPatients"}: -#' \code{data2 <- rnorm(nBi2,foldBi,as.numeric(valData[i,5]))} +#' \code{data2 <- stats::rnorm(nBi2,foldBi,as.numeric(valData[i,5]))} #' At the end both of the simulated distributions are put into one expression. #' \code{expression <- c(data1,data2)} #' @@ -44,7 +44,7 @@ #' The number of values that are simulated ("nGu1") are calculated by #' multiplying the number of Patients with the first proportion and rounding #' this number off. -#' \code{data1 <- rgamma(n = nGu1,shape = as.numeric(valData[i,9]), +#' \code{data1 <- stats::rgamma(n = nGu1,shape = as.numeric(valData[i,9]), #' rate = as.numeric(valData[i,10]))} #' The second distribution, which is the unimodal gaussian distribution is #' simulated with the calculated mean, which is calculated by multiplying the @@ -52,7 +52,7 @@ #' deviation is taken from the validation data of this gene. The number of values that #' are simulated ("nGu2") are calculated by multiplying the number of Patients #' with the second proportion and rounding this number up. -#' \code{data2 <- rnorm(nGu2,foldGamma,as.numeric(valData[i,4]))} +#' \code{data2 <- stats::rnorm(nGu2,foldGamma,as.numeric(valData[i,4]))} #' At the end both of the simulated distributions are put into one expression. #' \code{expression <- c(data1,data2)} #' @@ -63,11 +63,13 @@ #' simulation <- simulateExpression(params = newParams()) #' # Override default parameters #' # Not recommended, better use setParams; See \code{\link{Params}} -#' simulation <- simulateExpression(nUniGenes = 1000, nPatients = 50) +#' simulation <- simulateExpression(nGenes = list("1"=100,"2"=list("gauss"= 100,"gamma"= 100)), nPatients = 50) #' # without messages printed to the console +#' params <- newParams(nGenes = list("1"=100,"2"=list("gauss"= 100,"gamma"= 100))) #' simulation <- simulateExpression(params = params,verbose = FALSE) #' @export #' @importFrom checkmate assertClass assertString assertCharacter assertList +#' @importFrom stats dnorm rgamma rnorm runif sd #' simulateExpression <- function(params = newParams(), verbose = TRUE,...) { @@ -75,43 +77,48 @@ simulateExpression <- function(params = newParams(), verbose = TRUE,...) { checkmate::assertClass(params, "Params") params <- setParams(params, ...) - # Set random seed seed <- getParam(params, "seed") set.seed(seed) valData <- createValidationData(params = params, verbose = verbose) nPatients <- getParam(params,"nPatients") - nUniGenes <- getParam(params, "nUniGenes") - nBiGenes <- getParam(params,"nBiGenes") - nGuGenes <- getParam(params,"nGuGenes") - nGenes <- nUniGenes+nBiGenes+nGuGenes + + numGenes <- sum(unlist(getParam(params,name = "nGenes"))) if (verbose){message("Drawing gene expression values...")} - ##STEP 2 - Generating the simulation result - sim <- lapply(1:nGenes,function(i){ - if(valData[i,2]==1){ - expression <- rnorm(nPatients,as.numeric(valData[i,3]),as.numeric(valData[i,4])) - }else if(valData[i,2]==2){ - nBi1 = floor(nPatients*as.numeric(valData[i,7])/100) - nBi2 = ceiling(nPatients*as.numeric(valData[i,8])/100) - foldBi = as.numeric(valData[i,3])*as.numeric(valData[i,6]) - data1 <- rnorm(nBi1,as.numeric(valData[i,3]),as.numeric(valData[i,4])) - data2 <- rnorm(nBi2,foldBi,as.numeric(valData[i,5])) - expression <- c(data1,data2) - }else if(valData[i,2]==3){ - nGu1 = floor(nPatients*as.numeric(valData[i,7])/100) - nGu2 = ceiling(nPatients*as.numeric(valData[i,8])/100) - data1 <- rgamma(n = nGu1,shape = as.numeric(valData[i,9]),rate = as.numeric(valData[i,10])) - data2 <- rnorm(nGu2,as.numeric(valData[i,3]),as.numeric(valData[i,4])) - expression <- c(data1,data2) + simNotAdjusted <- lapply(1:numGenes,function(i){ + if(valData[[i]]$scenario=="unimodal-gaussian"){ + expression <- stats::rnorm(n = nPatients,mean = valData[[i]]$means,sd = valData[[i]]$sds) + }else if(valData[[i]]$scenario=="multimodal-gaussian"){ + expression <- unlist(lapply(1:valData[[i]]$modus,function(j){ + expression <- stats::rnorm(n = valData[[i]]$sizes[j],mean = valData[[i]]$means[j],sd = valData[[i]]$sds[j]) + })) + }else if(valData[[i]]$scenario=="multimodal-gamma+gaussian"){ + expression <- unlist(lapply(1:valData[[i]]$modus,function(j){ + if(j==1){ + expression <- stats::rgamma(n = valData[[i]]$sizes[j],shape = params@gamma$shape,rate = params@gamma$rate) + }else{ + expression <- stats::rnorm(n = valData[[i]]$sizes[j],mean = valData[[i]]$means[j],sd = valData[[i]]$sds[j]) + } + })) } expression }) + simNotAdjusted <- data.frame(do.call('rbind',simNotAdjusted)) + sim <- lapply(1:nrow(simNotAdjusted),function(i){ + values <- unlist(lapply(1:ncol(simNotAdjusted),function(j){ + if(simNotAdjusted[i,j]<0){ + 0 + }else{ + simNotAdjusted[i,j] + } + })) + }) sim <- data.frame(do.call('rbind',sim)) - genes <- valData[,1] + genes <- names(valData) patients <- sprintf("Patient%04d", 1:nPatients) #columns 'header' rownames(sim) <- genes colnames(sim) <- patients @@ -166,57 +173,134 @@ simulateExpression <- function(params = newParams(), verbose = TRUE,...) { #' #' gammaRate: numeric, Default value: 2 #' -#' @export +#' #' @importFrom LaplacesDemon rinvgaussian #' createValidationData <- function(params = newParams(),verbose = TRUE){ - # Get the parameters we are going to use - nPatients <- getParam(params, "nPatients") - nUniGenes <- getParam(params, "nUniGenes") - nBiGenes <- getParam(params,"nBiGenes") - nGuGenes <- getParam(params,"nGuGenes") - nGenes <- nUniGenes+nBiGenes+nGuGenes - nSDs <- nUniGenes+ 2*nBiGenes+nGuGenes - sdParms <- getParam(params, "sdParms") - uniMean <- getParam(params, "uniMean") - biMean <- getParam(params, "biMean") - foldChange <- getParam(params, "foldChange") - proportions <- getParam(params, "proportions") - guMean <- getParam(params, "guMean") - gammaShape <- getParam(params, "gammaShape") - gammaRate <- getParam(params,"gammaRate") + numGenes <- sum(unlist(getParam(params,name = "nGenes"))) if (verbose) { message("Drawing parameters for expression simulation...") } - genes <- sprintf("Gene%04d",1:nGenes) - scenarios <- sample(rep(c(1,2,3), times = c(nUniGenes, nBiGenes, nGuGenes)), size = nGenes, replace = F) + genes <- sprintf("Gene%04d",1:numGenes) + reps <- c() + reps <- unlist(lapply(1:length(unlist(params@nGenes)),function(i){ + if(i == 1){ + reps <- rep(x = i,times = params@nGenes$`1`) + }else{ + if(i%%2==0){ + reps <- c(reps,rep(x = i, times = sum(unlist(eval(parse(text=(paste0("params@nGenes$`",floor(i/2)+1,"`$gauss")))))))) + }else{ + reps <- c(reps,rep(x = i, times = sum(unlist(eval(parse(text=(paste0("params@nGenes$`",floor(i/2)+1,"`$gamma")))))))) + } + } - drawGeneParams <- function(proportions, mean.min, mean.max, sd.mu, sd.lambda, fc.min = NULL, fc.max = NULL, gamma.shape = NA, gamma.rate = NA) { - ranMean <- runif(1, min = mean.min, max = mean.max) - ranSd <- LaplacesDemon::rinvgaussian(n = 1, mu = sd.mu, lambda = sd.lambda) - ranProp <- ifelse(length(proportions) > 1, sample(proportions, size = 1), proportions) - ranProp2 <- 100 - ranProp + })) + scenarios <- sample(x = reps,size = numGenes,replace = FALSE) + drawGeneParams <- function(nPatients,modality, proportions,means=NULL,sdParms,foldChanges = NULL,gammaParms = NULL){ + ranMean1 <- c() + ranSd1 <- c() + if(!is.null(gammaParms)){ + ranMean1 <- gammaParms$shape/gammaParms$rate + ranSd1 <- sqrt(gammaParms$shape/(gammaParms$rate^2)) + }else{ + ranMean1 <- stats::runif(1, min = means[1], max = means[2]) + ranSd1 <- LaplacesDemon::rinvgaussian(n = 1, mu = sdParms$mu, lambda = sdParms$lambda) + } + ranProps <- 100 + ranFCs <- NA + means <- c() + ranSd <- c() + if(modality >= 2){ + combs <- t(utils::combn(x = proportions,m = modality,simplify=TRUE)) + combinations <- unlist(lapply(1:nrow(combs),function(i){ + if(sum(combs[i,])==100){ + i + } + })) + sampleProp <- sample(combinations,1,replace = FALSE) + ranProps <- combs[sampleProp,] - ranSd2 <- NA - ranFC <- NA + ranSd<- unlist(lapply(2:modality,function(i){ + eval(parse(text = paste(paste0("ranSd",i), + "<- ", + LaplacesDemon::rinvgaussian(n = 1, mu = sdParms$mu, lambda = sdParms$lambda)))) + })) + if(modality==2){ + ranFCs <- unlist(lapply(1:1,function(i){ + eval(parse(text = paste(paste0("ranFC",i), + "<-",(stats::runif(1, min = foldChanges[1], max = foldChanges[2]))))) + })) + }else{ + ranFCs <- unlist(lapply(1:(length(foldChanges)),function(i){ + if(modality ==2){ + eval(parse(text = paste(paste0("ranFC",i), + "<-",(stats::runif(1, min = foldChanges[1], max = foldChanges[2]))))) + }else{ + eval(parse(text = paste(paste0("ranFC",i), + "<-",(stats::runif(1, min = foldChanges[[i]][1], max = foldChanges[[i]][2]))))) + } - if(!is.null(fc.min) & !is.null(fc.max)) { - ranSd2 <- LaplacesDemon::rinvgaussian(n = 1, mu = sd.mu, lambda = sd.lambda) - ranFC <- runif(1, min = fc.min, max = fc.max) + })) + } + eval(parse(text = paste(paste0("ranMean",2:modality), + "<-",NA))) + for(i in 1:modality){ + if(i!=1){ + eval(parse(text = paste(paste0("ranMean",(i)),"<-",paste0("ranMean",(i-1)),"*",paste0("ranFCs[",(i-1),"]")))) + } + } } - return(data.frame(Mean = ranMean, Sd1 = ranSd, Sd2 = ranSd2, FC = ranFC, Prop = ranProp, Prop2 = ranProp2, Shape = gamma.shape, Rate = gamma.rate)) + sds <- c(ranSd1,ranSd) + means <- unlist(lapply(1:modality,function(i){ + eval(parse(text = paste(paste0("ranMean",(i))))) + })) + sizes <- (ranProps/100)*nPatients + patients <- 1 + groups <- lapply(1:length(sizes),function(i){ + if(i==1){ + sprintf("Patient%d",1:sizes[i]) + }else{ + sprintf("Patient%d",(sum(sizes[1:(i-1)])+1):(sizes[i]+sum(sizes[1:(i-1)]))) + } + }) + scenario <-c() + if(modality==1){ + scenario = "unimodal-gaussian" + }else if(modality>1&&is.null(gammaParms)){ + scenario = "multimodal-gaussian" + }else{ + scenario <- "multimodal-gamma+gaussian" + } + + return(list("modus"=modality,"scenario"=scenario,"means"=means,"foldChanges"=ranFCs,"sds"=sds,"proportions"=ranProps,"sizes"=sizes,"groups"=groups)) } - simData <- lapply(scenarios, function(s) { - if(s == 1) gP <- drawGeneParams(100, uniMean[1], uniMean[2], sdParms[1], sdParms[2]) - if(s == 2) gP <- drawGeneParams(proportions, biMean[1], biMean[2], sdParms[1], sdParms[2], fc.min = foldChange[1], fc.max = foldChange[2]) - if(s == 3) gP <- drawGeneParams(proportions, guMean[1], guMean[2], sdParms[1], sdParms[2], gamma.shape = gammaShape, gamma.rate = gammaRate) + valData <- lapply(1:length(scenarios), function(i) { + if(scenarios[i]==1){ + gP <- drawGeneParams(nPatients = params@nPatients, + modality = 1, + proportions = params@proportions, + means = params@means$`1`, + sdParms = params@sd) + } + if((scenarios[i]>1)&&(scenarios[i]%%2==0)){ + gP <- drawGeneParams(nPatients = params@nPatients, + modality = (floor(scenarios[i]/2))+1, + proportions = params@proportions, + means = eval(parse(text = paste(paste0("params@means$`",((floor(scenarios[i]/2))+1),"`")))), + sdParms = params@sd, + foldChanges = eval(parse(text = paste(paste0("params@foldChanges$`",((floor(scenarios[i]/2))+1),"`$gauss"))))) + } + if((scenarios[i]>1)&&(scenarios[i]%%2!=0)){ + gP <- drawGeneParams(nPatients = params@nPatients, + modality = (floor(scenarios[i]/2))+1, + proportions = params@proportions, + sdParms = params@sd, + foldChanges = eval(parse(text = paste(paste0("params@foldChanges$`",((floor(scenarios[i]/2))+1),"`$gamma")))), + gammaParms = params@gamma) + } return(gP) }) - simData <- do.call("rbind", simData) - - valData <- cbind(genes, scenarios, simData) - colnames(valData) <- c("Gene","Scenario","Mean","SD1","SD2", - "FC","prop1","prop2","GammaShape","GammaRate") + names(valData)<- genes if(verbose){ message("Finished creating validation data.") } return(valData) } From e568d7ed3967919da36b124197d929fe85578e11 Mon Sep 17 00:00:00 2001 From: sbeck Date: Wed, 21 Feb 2018 14:45:59 +0100 Subject: [PATCH 4/6] Code was cleaned, sampling the proportions of the different distributions for multimodal genes was changed The code was cleaned (unnecessary comments were deleted). The proportions of the different distributions is now calculated by taking the proportion parameter from the params object and replicating it as often as the modality is. Then all possible combinations are generated, the ones that add up to 100 are kept , sorted and cleaned so that each combination is unique. Then 1 is randomly sampled. --- R/algorithmSpecificFunctions.R | 11 ++++------- R/evaluate.R | 1 - R/params.R | 1 + R/simulation.R | 19 ++++++++++++------- 4 files changed, 17 insertions(+), 15 deletions(-) diff --git a/R/algorithmSpecificFunctions.R b/R/algorithmSpecificFunctions.R index 604c8e5..da23892 100644 --- a/R/algorithmSpecificFunctions.R +++ b/R/algorithmSpecificFunctions.R @@ -23,8 +23,6 @@ #' @import mclust #' @export useMclust <- function(expression,verbose = TRUE,parallel = FALSE) { - - if (verbose) {message("mclust-models are calculated...")} if(parallel==FALSE){ @@ -167,15 +165,15 @@ useHdbscan <- function(expression,minPts = 10,verbose = TRUE){ output <- list( "modus" = modus, - "means" = c(unname(means)), # means newly calculated (noise points are included) - "sds" = c(unname(sds)), #sds newly calculated - "sizes" = unname(table(classificationMinDistance[])), #with changed Groups for Noise Points + "means" = c(unname(means)), + "sds" = c(unname(sds)), + "sizes" = unname(table(classificationMinDistance[])), "groups" = groups ) }) names(outputs)<- genes hdbscanOutput <- list(Genes = outputs,"ClinicalData"="mnt/agnerds/xyz","Expressionmatrix"="mnt/agnerds/xyzyy") - + return(hdbscanOutput) } @@ -352,7 +350,6 @@ useFlexmix <- function(expression, verbose = TRUE){ c(patients[k]) }})) }) - }else if(bestModel[i]==2){ modus <- 2 diff --git a/R/evaluate.R b/R/evaluate.R index defc453..d091ed3 100644 --- a/R/evaluate.R +++ b/R/evaluate.R @@ -17,7 +17,6 @@ #' FN (FALSE Negative): validationData: modality, assigned scenario: not modality #' #' FP (FALSE Positive): validationData: not modality, assigned scenario: modality -#' compareScenarios <- function(modality, algorithmOutput,validationData){ compareScenarios <- (unlist(lapply(1:length(algorithmOutput$Genes),function(i){ if(validationData[[i]]$modus != modality && algorithmOutput$Genes[[i]]$modus!=modality){ diff --git a/R/params.R b/R/params.R index 5a7ec2e..91ba846 100644 --- a/R/params.R +++ b/R/params.R @@ -258,3 +258,4 @@ showPP <- function(params, pp) { cat("\n") } } + diff --git a/R/simulation.R b/R/simulation.R index 69a2467..8d6472a 100644 --- a/R/simulation.R +++ b/R/simulation.R @@ -65,7 +65,7 @@ #' # Not recommended, better use setParams; See \code{\link{Params}} #' simulation <- simulateExpression(nGenes = list("1"=100,"2"=list("gauss"= 100,"gamma"= 100)), nPatients = 50) #' # without messages printed to the console -#' params <- newParams(nGenes = list("1"=100,"2"=list("gauss"= 100,"gamma"= 100))) +#' params <- newParams() #' simulation <- simulateExpression(params = params,verbose = FALSE) #' @export #' @importFrom checkmate assertClass assertString assertCharacter assertList @@ -81,9 +81,7 @@ simulateExpression <- function(params = newParams(), verbose = TRUE,...) { set.seed(seed) valData <- createValidationData(params = params, verbose = verbose) - nPatients <- getParam(params,"nPatients") - numGenes <- sum(unlist(getParam(params,name = "nGenes"))) if (verbose){message("Drawing gene expression values...")} @@ -119,7 +117,7 @@ simulateExpression <- function(params = newParams(), verbose = TRUE,...) { }) sim <- data.frame(do.call('rbind',sim)) genes <- names(valData) - patients <- sprintf("Patient%04d", 1:nPatients) #columns 'header' + patients <- sprintf("Patient%04d", 1:nPatients) rownames(sim) <- genes colnames(sim) <- patients @@ -210,14 +208,21 @@ createValidationData <- function(params = newParams(),verbose = TRUE){ means <- c() ranSd <- c() if(modality >= 2){ - combs <- t(utils::combn(x = proportions,m = modality,simplify=TRUE)) + repProp <- rep(proportions,modality) + combs <- t(utils::combn(x = repProp,m = modality,simplify=TRUE)) combinations <- unlist(lapply(1:nrow(combs),function(i){ if(sum(combs[i,])==100){ i } })) - sampleProp <- sample(combinations,1,replace = FALSE) - ranProps <- combs[sampleProp,] + selectedCombs <- combs[combinations,] + combsSorted <- lapply(1:nrow(selectedCombs),function(i){ + sort(selectedCombs[i,]) + }) + combs <- do.call('rbind',combsSorted) + uniqueCombs <- unique.array(combs) + sampleProp <- sample(1:nrow(uniqueCombs),1,replace = FALSE) + ranProps <- uniqueCombs[sampleProp,] ranSd<- unlist(lapply(2:modality,function(i){ eval(parse(text = paste(paste0("ranSd",i), From 88808e2feb257a759bbf53d17132373e41583f77 Mon Sep 17 00:00:00 2001 From: sbeck Date: Thu, 1 Mar 2018 08:59:10 +0100 Subject: [PATCH 5/6] Some mistakes were fixed to ensure correct number of patients in simulation The number of patients was not always computed correctly, added some checks and rounding to get the correct number of patients when simulating the gene expression matrix and creating the validationData --- R/simulation.R | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/R/simulation.R b/R/simulation.R index 8d6472a..e32e380 100644 --- a/R/simulation.R +++ b/R/simulation.R @@ -91,14 +91,26 @@ simulateExpression <- function(params = newParams(), verbose = TRUE,...) { expression <- stats::rnorm(n = nPatients,mean = valData[[i]]$means,sd = valData[[i]]$sds) }else if(valData[[i]]$scenario=="multimodal-gaussian"){ expression <- unlist(lapply(1:valData[[i]]$modus,function(j){ - expression <- stats::rnorm(n = valData[[i]]$sizes[j],mean = valData[[i]]$means[j],sd = valData[[i]]$sds[j]) + if((sum(valData[[i]]$sizes)-valData[[i]]$sizes[j]+ceiling(valData[[i]]$sizes[j]))<=nPatients){ + expression <- stats::rnorm(n = ceiling(valData[[i]]$sizes[j]),mean = valData[[i]]$means[j],sd = valData[[i]]$sds[j]) + }else{ + expression <- stats::rnorm(n = (valData[[i]]$sizes[j]),mean = valData[[i]]$means[j],sd = valData[[i]]$sds[j]) + } })) }else if(valData[[i]]$scenario=="multimodal-gamma+gaussian"){ expression <- unlist(lapply(1:valData[[i]]$modus,function(j){ if(j==1){ - expression <- stats::rgamma(n = valData[[i]]$sizes[j],shape = params@gamma$shape,rate = params@gamma$rate) + if((sum(valData[[i]]$sizes)-valData[[i]]$sizes[j]+ceiling(valData[[i]]$sizes[j]))<=nPatients){ + expression <- stats::rgamma(n = ceiling(valData[[i]]$sizes[j]),shape = params@gamma$shape,rate = params@gamma$rate) + }else{ + expression <- stats::rgamma(n = (valData[[i]]$sizes[j]),shape = params@gamma$shape,rate = params@gamma$rate) + } }else{ - expression <- stats::rnorm(n = valData[[i]]$sizes[j],mean = valData[[i]]$means[j],sd = valData[[i]]$sds[j]) + if((sum(valData[[i]]$sizes)-valData[[i]]$sizes[j]+ceiling(valData[[i]]$sizes[j]))<=nPatients){ + expression <- stats::rnorm(n = ceiling(valData[[i]]$sizes[j]),mean = valData[[i]]$means[j],sd = valData[[i]]$sds[j]) + }else{ + expression <- stats::rnorm(n = (valData[[i]]$sizes[j]),mean = valData[[i]]$means[j],sd = valData[[i]]$sds[j]) + } } })) } @@ -262,9 +274,9 @@ createValidationData <- function(params = newParams(),verbose = TRUE){ patients <- 1 groups <- lapply(1:length(sizes),function(i){ if(i==1){ - sprintf("Patient%d",1:sizes[i]) + sprintf("Patient%g",1:sizes[i]) }else{ - sprintf("Patient%d",(sum(sizes[1:(i-1)])+1):(sizes[i]+sum(sizes[1:(i-1)]))) + sprintf("Patient%g",((sum(sizes[1:(i-1)])+1):(sizes[i]+sum(sizes[1:(i-1)])))) } }) scenario <-c() From 92255852f3351b35af33386bfa4be3a2772ec977 Mon Sep 17 00:00:00 2001 From: sbeck Date: Fri, 2 Mar 2018 11:20:01 +0100 Subject: [PATCH 6/6] Changed useFlexmix() function to be able to create models for higher modalities The useFlexmix() function now takes another function parameter 'maxModality'. This parameter is used to create models up to this modality (beginning at 1, ending at maxModality). The calculation was changed. First, all gaussian models (beginning with 1 gaussian distribution, ending with maxModality gaussian distributions) are computed and the rmse compared. This is repeated for the gamma + gaussian distribution models. (First, only a gamma distribution is calculated, at last one gamma distribution is calculated, as well as maxModality-1 gaussian distributions) The rmse's of the best model of gaussian and gamma gaussian is compared and the best model is therefore chosen. The values for means, sds, etc are taken from that chosen model and saved in the output. Furthermore, the parallel computation of the models was enabled and can be chosen via the logical parameter 'parallel' The examples of other functions were changed to work with the updated useFlexmix() function --- R/algorithmSpecificFunctions.R | 274 ++++++++++++++++++--------------- R/evaluate.R | 25 ++- R/simulation.R | 25 ++- 3 files changed, 181 insertions(+), 143 deletions(-) diff --git a/R/algorithmSpecificFunctions.R b/R/algorithmSpecificFunctions.R index da23892..27880fc 100644 --- a/R/algorithmSpecificFunctions.R +++ b/R/algorithmSpecificFunctions.R @@ -173,7 +173,7 @@ useHdbscan <- function(expression,minPts = 10,verbose = TRUE){ }) names(outputs)<- genes hdbscanOutput <- list(Genes = outputs,"ClinicalData"="mnt/agnerds/xyz","Expressionmatrix"="mnt/agnerds/xyzyy") - + return(hdbscanOutput) } @@ -194,7 +194,6 @@ rmse <- function(error){ sqrt(mean(error2^2)) } - #' UseFlexmix #' #' Use the FlexMix algorithm for creating a model to find the best fitting @@ -202,7 +201,9 @@ rmse <- function(error){ #' #' @param expression (Simulated) Gene Expression data frame which genes shall #' be classified as unimodal, bimodal or other +#' @param maxModality numerical, maximum modality for that models will be calculated #' @param verbose logical. Whether to print progress messages +#' @param parallel logical. Whether to calculate the models in parallel #' #' @return flexmixOutput The formatted output of the best fitting models from flexmix #' @@ -211,74 +212,88 @@ rmse <- function(error){ #' simulation <- simulateExpression(params= params,verbose=FALSE) #' expression <- simulation$expressionData #' #with messages printed to console -#' flexmixOutput <- useFlexmix(expression = expression) +#' flexmixOutput <- useFlexmix(expression = expression, maxModality = 2) #' #without messages printed to console -#' flexmixOutput <- useFlexmix(expression = expression, verbose = FALSE) +#' flexmixOutput <- useFlexmix(expression = expression, maxModality = 2, verbose = FALSE) #' #' @import flexmix #' @importFrom plyr compact #' @export -useFlexmix <- function(expression, verbose = TRUE){ - mo1 <- flexmix::FLXMRglm(family = "gaussian") - mo2 <- flexmix::FLXMRglm(family = "gaussian") - mo3 <- flexmix::FLXMRglm(family = "Gamma") +useFlexmix <- function(expression,maxModality, verbose = TRUE,parallel=FALSE){ + modelGauss <- c() + modelGamma <- c() genes <- rownames(expression) patients <- names(expression) - if(verbose){message("Creating and fitting the unimodal regression model with a gaussian distribution to each gene")} - flexModelsUni <- lapply(1:nrow(expression), function(i) { - (flexfitUni <- flexmix::flexmix(t(expression[i,])~1,data = expression,k = 1, model = mo1)) - c1 <- flexmix::parameters(flexfitUni, component=1) - lam <- table(flexmix::clusters(flexfitUni)) - c("means"=unname(c1[1]),"sds"=unname(c1[2]),"sizes"=unname(lam[1]),"classification"=list(flexmix::clusters(flexfitUni))) - }) - names(flexModelsUni) <- genes - predictedUni<- c() - predictedUni <- (lapply(1:length(flexModelsUni),function(i){ - predictedUni <- rnorm(n=((flexModelsUni[[i]]$sizes/100)*ncol(expression)),mean = flexModelsUni[[i]]$means,sd = flexModelsUni[[i]]$sds) - c(predictedUni) - })) - - rmseUni <- unlist(lapply(1:length(predictedUni),function(i){ - rmse((expression[i,] - predictedUni[[i]])) - })) - - if(verbose){message("Creating and fitting the bimodal regression model with two gaussian distributions to each gene")} - - flexModelsBi <- lapply(1:nrow(expression), function(i) { - - (flexfitBi <- flexmix::flexmix(t(expression[i,])~1,data = expression,k = 2, model = list(mo1,mo2))) - c1 <- flexmix::parameters(flexfitBi, component=1)[[1]] - - if(is.na(table(flexmix::clusters(flexfitBi))[2])){ - c2 <- c(NA,NA) + if(verbose){message("Creating and fitting the model with gaussian distributions to each gene")} + flexModelGauss <- lapply(1:maxModality,function(i){ + if(parallel==FALSE){ + flexModel <- lapply(1:nrow(expression),function(j){ + modelGauss <- flexmix::FLXMRglm(family = "gaussian") + modelGamma <- flexmix::FLXMRglm(family = "Gamma") + model <- rep(x = "modelGauss",i) + models <- sapply(model,function(y){eval(parse(text=y))}) + flexfit <- flexmix::flexmix(t(expression[j,])~1,data = expression,k = i, model = models) + if(i==1){ + means <- flexmix::parameters(flexfit)[1] + sds <- flexmix::parameters(flexfit)[2] + sizes <- unname(table(flexmix::clusters(flexfit))) + }else{ + means <- unlist(unname(flexmix::parameters(flexfit)[[1]][1,])) + sds <- unlist(unname(flexmix::parameters(flexfit)[[1]][2,])) + sizes <- unlist(unname(table(flexmix::clusters(flexfit)))) + } + c("means"=list(means),"sds"=list(sds),"sizes"=list(sizes),"classification"=list(flexmix::clusters(flexfit))) + }) } - else{ - c2 <- flexmix::parameters(flexfitBi, component=2)[[1]] - + if(parallel){ + modelGauss <- flexmix::FLXMRglm(family = "gaussian") + modelGamma <- flexmix::FLXMRglm(family = "Gamma") + n.cores = parallel::detectCores()-1 + cl = parallel::makeCluster(n.cores) + parallel::clusterExport(cl, c("expression")) + parallel::clusterEvalQ(cl, library(flexmix)) + flexModel <- parallel::parLapply(cl,1:nrow(expression),function(j){ + model <- rep(x = "modelGauss",i) + models <- sapply(model,function(y){eval(parse(text=y))}) + flexfit <- flexmix::flexmix(t(expression[j,])~1,data = expression,k = i, model = models) + if(i==1){ + means <- flexmix::parameters(flexfit)[1] + sds <- flexmix::parameters(flexfit)[2] + sizes <- unname(table(flexmix::clusters(flexfit))) + }else{ + means <- unlist(unname(flexmix::parameters(flexfit)[[1]][1,])) + sds <- unlist(unname(flexmix::parameters(flexfit)[[1]][2,])) + sizes <- unlist(unname(table(flexmix::clusters(flexfit)))) + } + c("means"=list(means),"sds"=list(sds),"sizes"=list(sizes),"classification"=list(flexmix::clusters(flexfit))) + }) } - lam <- table(flexmix::clusters(flexfitBi)) + if(parallel==TRUE){parallel::stopCluster(cl)} + names(flexModel) <- genes + flexModel - list("means"=c(c1[1],c2[1]),"sds"=c(c1[2],c2[2]),"sizes"=c(unname(lam[1]),(unname(lam[2]))),"classification"=c(flexmix::clusters(flexfitBi))) + }) + flexModelGaussRMSE <- lapply(1:maxModality,function(i){ + RMSE <- lapply(1:nrow(expression),function(j){ + prediction <- unlist(lapply(1:length(flexModelGauss[[i]][[j]]$sizes),function(k){ + predExpression <- stats::rnorm(n = flexModelGauss[[i]][[j]]$sizes[k], + mean = flexModelGauss[[i]][[j]]$means[k], + sd = flexModelGauss[[i]][[j]]$sds[k]) + })) + rmseVal <- unlist(rmse((expression[j,] - prediction))) + }) }) - names(flexModelsBi) <- genes - predictedBi<- c() - predictedBi <- (lapply(1:length(flexModelsBi),function(i){ - if(!is.na(flexModelsBi[[i]]$sizes[2])){ - predBi1 <- rnorm(n = floor((flexModelsBi[[i]]$sizes[1]/100)*ncol(expression)),mean = flexModelsBi[[i]]$means[1],sd = flexModelsBi[[i]]$sds[1]) - predBi2 <- rnorm(n = (ceiling((flexModelsBi[[i]]$means[2]/100)*ncol(expression))),mean = flexModelsBi[[i]]$means[2],sd = flexModelsBi[[1]]$sds[2]) - predBi <- c(predBi1,predBi2) - } - else{ - predBi <- rnorm(n=((flexModelsBi[[i]]$sizes[1]/100)*ncol(expression)),mean = flexModelsBi[[i]]$means[1],sd = flexModelsBi[[i]]$sds[1]) - } - c(predBi) - })) - rmseBi <- unlist(lapply(1:length(predictedBi),function(i){ - rmse((expression[i,] - predictedBi[[i]])) + bestModelGauss <- unlist(lapply(1:nrow(expression),function(i){ + values <- unlist(lapply(1:maxModality,function(j){ + flexModelGaussRMSE[[j]][[i]] + })) + which.min(values) })) + if(verbose){message("Creating and fitting the model with gamma and gaussian distributions to each gene")} + expression2 <- (lapply(1:nrow(expression),function(i){ unlist(lapply(1:length(expression),function(j){ if((expression[i,j]>0)){ @@ -292,88 +307,104 @@ useFlexmix <- function(expression, verbose = TRUE){ colnames(expression2) <- colnames(expression) rownames(expression2) <- rownames(expression) - if(verbose){message("Creating and fitting the bimodal regression model with a Gamma distribution and a gaussian distribution to each gene")} - flexModelsGU <- lapply(1:nrow(expression2), function(i) { - (flexfitGU <- flexmix::flexmix(t(expression2[i,])~1,data = expression2,k = 2, model = list(mo1,mo3))) - c1 <- flexmix::parameters(flexfitGU, component=1)[[1]] - if(!(is.na(table(flexmix::clusters(flexfitGU))[2]))){ - c2 <- flexmix::parameters(flexfitGU, component=2)[[1]] - } - else{ - c2 <- c(0,0) + flexModelGamma <- lapply(1:maxModality,function(i){ + + if(parallel==FALSE){ + modelGauss <- flexmix::FLXMRglm(family = "gaussian") + modelGamma <- flexmix::FLXMRglm(family = "Gamma") + flexModel <- lapply(1:nrow(expression2),function(j){ + model <- rep(x = c("modelGamma","modelGauss"),c(1,i-1)) + models <- sapply(model,function(y){eval(parse(text=y))}) + flexfit <- flexmix::flexmix(t(expression2[j,])~1,data = expression2,k = i, model = models) + if(i==1){ + means <- flexmix::parameters(flexfit)[1] + sds <- flexmix::parameters(flexfit)[2] + sizes <- unname(table(flexmix::clusters(flexfit))) + }else{ + means <- unlist(unname(flexmix::parameters(flexfit)[[1]][1,])) + sds <- unlist(unname(flexmix::parameters(flexfit)[[1]][2,])) + sizes <- unlist(unname(table(flexmix::clusters(flexfit)))) + } + c("means"=list(means),"sds"=list(sds),"sizes"=list(sizes),"classification"=list(flexmix::clusters(flexfit))) + }) } - lam <- table(flexmix::clusters(flexfitGU)) - if(is.na(lam[2])){ - lam[2] <- 0 + if(parallel){ + modelGauss <- flexmix::FLXMRglm(family = "gaussian") + modelGamma <- flexmix::FLXMRglm(family = "Gamma") + n.cores = parallel::detectCores()-1 + cl = parallel::makeCluster(n.cores) + #parallel::clusterExport(cl, c("expression2")) + parallel::clusterEvalQ(cl, library(flexmix)) + flexModel <- parallel::parLapply(cl,1:nrow(expression2),function(j){ + model <- rep(x = c("modelGamma","modelGauss"),c(1,i-1)) + models <- sapply(model,function(y){eval(parse(text=y))}) + flexfit <- flexmix::flexmix(t(expression2[j,])~1,data = expression2,k = i, model = models) + if(i==1){ + means <- flexmix::parameters(flexfit)[1] + sds <- flexmix::parameters(flexfit)[2] + sizes <- unname(table(flexmix::clusters(flexfit))) + }else{ + means <- unlist(unname(flexmix::parameters(flexfit)[[1]][1,])) + sds <- unlist(unname(flexmix::parameters(flexfit)[[1]][2,])) + sizes <- unlist(unname(table(flexmix::clusters(flexfit)))) + } + c("means"=list(means),"sds"=list(sds),"sizes"=list(sizes),"classification"=list(flexmix::clusters(flexfit))) + }) } - list("means"=c(c1[1],c2[1]),"sds"=c(c1[2],c2[2]),"sizes"=c(unname(lam[1]),(unname(lam[2]))),"classification"=c(flexmix::clusters(flexfitGU))) + if(parallel==TRUE){parallel::stopCluster(cl)} + names(flexModel) <- genes + flexModel + }) + flexModelGammaRMSE <- lapply(1:maxModality,function(i){ + RMSE <- lapply(1:nrow(expression2),function(j){ + prediction <- unlist(lapply(1:length(flexModelGamma[[i]][[j]]$sizes),function(k){ + PredExpression <- stats::rnorm(n = flexModelGamma[[i]][[j]]$sizes[k], + mean = flexModelGamma[[i]][[j]]$means[k], + sd = flexModelGamma[[i]][[j]]$sds[k]) + })) + rmseVal <- unlist(rmse((expression2[j,] - prediction))) + }) }) - names(flexModelsGU) <- genes - predictedGU<- c() - predictedGU <- (lapply(1:length(flexModelsGU),function(i){ - if(!is.na(flexModelsGU[[i]]$sizes[2])){ - predGu1 <- rnorm(n = floor((flexModelsGU[[i]]$sizes[1]/100)*ncol(expression2)),mean = flexModelsGU[[i]]$means[1],sd = flexModelsGU[[i]]$sds[1]) - predGu2 <- rnorm(n = ceiling((flexModelsGU[[i]]$sizes[2]/100)*ncol(expression2)),mean = flexModelsGU[[i]]$means[2],sd = flexModelsGU[[i]]$sds[2]) - predGu <- c(predGu1,predGu2) - } - else{ - predGu <- rnorm(n=((flexModelsGU[[i]]$sizes[1]/100)*ncol(expression2)),mean = flexModelsGU[[i]]$means[1],sd = flexModelsGU[[i]]$sds[1]) - } - c(predGu) + bestModelGamma <- unlist(lapply(1:nrow(expression2),function(i){ + values <- unlist(lapply(1:maxModality,function(j){ + flexModelGammaRMSE[[j]][[i]] + })) + which.min(values) })) - rmseGU <- unlist(lapply(1:length(predictedGU),function(i){ - rmse((expression2[i,] - predictedGU[[i]])) - })) - if(verbose){message("Comparing the root-mean-squared errors of each model to find the best fitting model for each gene")} - bestModel <- unlist(lapply(1:length(rmseUni),function(i){ - if(rmseUni[i]