Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
multimodalR/R/evaluate.R
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
374 lines (363 sloc)
16.5 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#' compare Scenarios | |
#' | |
#' Compare the scenario of each gene that was assigned by the algorithm with | |
#' the scenario that is written in the validation Data. | |
#' @param modality numerical, for which modality the scenarioComparing is calculated, 1 for unimodal, | |
#' 2 for bimodal,... | |
#' @param algorithmOutput The output to be compared | |
#' @param validationData The validationData behind the simulated gene expression data frame | |
#' | |
#' @return Returns a list that classifies each assignment as | |
#' TN (TRUE Negative),TP (TRUE Positive), FN (FALSE Negative), FP (FALSE Positive) | |
#' @details | |
#' TN (TRUE Negative): validationData: not modality, assigned scenario: not modality | |
#' | |
#' TP (TRUE Positive): validationData: modality, assigned scenario: modality | |
#' | |
#' FN (FALSE Negative): validationData: modality, assigned scenario: not modality | |
#' | |
#' FP (FALSE Positive): validationData: not modality, assigned scenario: modality | |
compareScenarios <- function(modality, algorithmOutput,validationData){ | |
compareScenarios <- (unlist(lapply(1:length(algorithmOutput$Genes),function(i){ | |
if(validationData[[i]]$modus != modality && algorithmOutput$Genes[[i]]$modus!=modality){ | |
evaluation <- "TN" | |
}else if(validationData[[i]]$modus ==modality && algorithmOutput$Genes[[i]]$modus==modality){ | |
evaluation <- "TP" | |
}else if(validationData[[i]]$modus !=modality && algorithmOutput$Genes[[i]]$modus==modality){ | |
evaluation <- "FP" | |
}else if(validationData[[i]]$modus ==modality && algorithmOutput$Genes[[i]]$modus!=modality){ | |
evaluation <- "FN" | |
}else{ | |
evaluation <- NA | |
} | |
}))) | |
return(compareScenarios) | |
} | |
#' Is Approximately the Same | |
#' | |
#' checks whether the difference between two values is less than or equal the | |
#' maxDistance | |
#' @param validationVal value of the validationData (exact value) | |
#' @param classVal estimated value (approx. value) | |
#' @param maxDistance percentage of maximum distance of the values | |
#' | |
#' @return TRUE, if the difference between the two values is less than or | |
#' equal the maxDistance; FALSE, if the difference between the two values is | |
#' greater than the maxDistance | |
#' | |
isApproxSame<- function(validationVal,classVal,maxDistance){ | |
difference <- (abs((validationVal - classVal)/validationVal)*100) | |
if(difference <= maxDistance){ | |
return(TRUE) | |
}else{ | |
return(FALSE) | |
} | |
} | |
#' Evaluate AlgorithmOutput | |
#' | |
#' @param modality numerical, for which modality output is evaluated, 1 for unimodal, | |
#' 2 for bimodal,... | |
#' @param algorithmOutput The formatted output of the algorithm that is now | |
#' evaluated | |
#' @param validationData the validationData data.frame | |
#' @param maxDifference the maximum percentage difference between the estimated | |
#' value and the validation data value with which the estimated is classified | |
#' as TRUE (correctly estimated) | |
#' @examples | |
#' \dontrun{ | |
#' params <- newParams(nGenes = list("1"=10,"2"=list("gauss"= 10,"gamma"= 10))) | |
#' simulation <- simulateExpression(params= params,verbose=FALSE) | |
#' expression <- simulation$expressionData | |
#' validationData <- simulation$validationData | |
#' output <- useMclust(expression, verbose=FALSE) | |
#' # TRUE if less than +- 10 % difference to value of validationData | |
#' evaluation <- evaluateAlgorithmOutput(modality = 2,algorithmOutput = output, | |
#' validationData = validationData) | |
#' # TRUE if less than +- 20 % difference to value of validationData | |
#' evaluation <- evaluateAlgorithmOutput(modality = 2,algorithmOutput = output, | |
#' validationData = validationData,maxDifference = 20) | |
#' } | |
#' @return Returns a data.frame where for each as TP classified gene it was | |
#' evaluated whether it was estimated approximately correct or not (TRUE or | |
#' FALSE) | |
#' @details | |
#' The estimated values of mean1, mean2, sd1, sd2, proportion1 and proportion2 | |
#' of each as TP classified gene are compared to those values of the validationData. | |
#' If the difference is greater than 10 %, the estimation is classified as | |
#' FALSE (not correctly estimated) for this value. If the difference is less | |
#' or equal to 10%, the estimation is classified as TRUE (correctly estimated) | |
#' | |
#' @export | |
evaluateAlgorithmOutput <- function(modality,algorithmOutput,validationData,maxDifference = 10){ | |
compared <- compareScenarios(modality = modality,algorithmOutput = algorithmOutput,validationData = validationData) | |
genes <- names(validationData) | |
approxSame <- (lapply(1:length(algorithmOutput$Genes),function(i){ | |
if((algorithmOutput$Genes[[i]]$modus==modality) && (validationData[[i]]$modus==modality)){ | |
classValsMean <- sort(algorithmOutput$Genes[[i]]$means) | |
x <- algorithmOutput$Genes[[i]]$sds | |
names(x) <- order(algorithmOutput$Genes[[i]]$means) | |
classValSDs <- x[order(names(x))] | |
y <- algorithmOutput$Genes[[i]]$sizes | |
names(y) <- order(algorithmOutput$Genes[[i]]$means) | |
classValProps <- y[order(names(y))] | |
means <- unlist(lapply(1:length(validationData[[i]]$means),function(j){ | |
isApproxSame(validationVal = validationData[[i]]$means[j],classVal = classValsMean[j],maxDistance = maxDifference) | |
})) | |
sds <- unlist(lapply(1:length(validationData[[i]]$sds),function(j){ | |
isApproxSame(validationVal = validationData[[i]]$sds[j],classVal = classValSDs[j],maxDistance = maxDifference) | |
})) | |
props <- unlist(lapply(1:length(validationData[[i]]$sizes),function(j){ | |
isApproxSame(validationVal = validationData[[i]]$sizes[j],classVal = classValProps[j],maxDistance = maxDifference) | |
})) | |
c(means,sds,props) | |
} | |
})) | |
names(approxSame) <- genes | |
evaluation<- data.frame(do.call('rbind',approxSame)) | |
meanNames <- sprintf(c("mean%s"),(1:(ncol(evaluation)/3))) | |
sdNames <- sprintf(c("sd%s"),(1:(ncol(evaluation)/3))) | |
propNames <- sprintf(c("prop%s"),(1:(ncol(evaluation)/3))) | |
colnames(evaluation) <- c(meanNames,sdNames,propNames) | |
return(evaluation) | |
} | |
#' getClassification | |
#' | |
#' Returns the numbers of TN (TRUE Negative),TP (TRUE Positive), | |
#' FN (FALSE Negative) and FP (FALSE Positive) for one of the algorithm's | |
#' formatted outputs. | |
#' @param modality numerical, for which modality the classification is calculated, 1 for unimodal, | |
#' 2 for bimodal,... | |
#' @param algorithmOutput The formatted output of the algorithm | |
#' @param algorithmName The name of the evaluated algorithm | |
#' @param validationData The validationData behind the simulated gene | |
#' expression data frame | |
#' @examples | |
#' \dontrun{ | |
#' params <- newParams(nGenes = list("1"=10,"2"=list("gauss"= 10,"gamma"= 10))) | |
#' simulation <- simulateExpression(params= params,verbose=FALSE) | |
#' expression <- simulation$expressionData | |
#' validationData <- simulation$validationData | |
#' mclustOutput <- useMclust(expression, verbose=FALSE) | |
#' getClassifications(modality = 2,validationData = validationData, | |
#' outputs = list("mclust" = mclustOutput)) | |
#' } | |
#' @return a classification data frame that contains the number of | |
#' FN, FP, TN and TP | |
#' | |
#' @details | |
#' TN (TRUE Negative): validationData: unimodal, assigned scenario: unimodal | |
#' | |
#' TP (TRUE Positive): validationData: bimodal, assigned scenario: bimodal | |
#' | |
#' FN (FALSE Negative): validationData: bimodal, assigned scenario: unimodal | |
#' or | |
#' FN (FALSE Negative): validationData: unimodal/bimodal, | |
#' assigned scenario: more than bimodal | |
#' | |
#' FP (FALSE Positive): validationData: unimodal, assigned scenario: bimodal | |
#' | |
getClassification <- function(modality,algorithmOutput,algorithmName="mclust",validationData){ | |
compared <- compareScenarios(modality = modality, | |
algorithmOutput = algorithmOutput,validationData = validationData) | |
fn <- table(compared)["FN"] | |
fp <- table(compared)["FP"] | |
tn <- table(compared)["TN"] | |
tp <- table(compared)["TP"] | |
classification <- data.frame(matrix(nrow = 1,ncol = 4,data = c(fn,fp,tn,tp), | |
byrow = TRUE)) | |
colnames(classification)<-c("FN","FP","TN","TP") | |
rownames(classification)<-c(algorithmName) | |
return(classification) | |
} | |
#' getStatistic | |
#' | |
#' This function produces informations about the evaluation of one | |
#' algorithms. The result is a data frame where the percentage of as TRUE and | |
#' as FALSE evaluated values of the genes | |
#' | |
#' @param evaluation The output of the evaluation of the formatted output | |
#' @param algorithmName The name of the algorithm which created the output | |
#' @return A data frame with the percentual proportion of as TRUE or FALSE evaluated values | |
#' @examples | |
#' \dontrun{ | |
#' params <- newParams(nGenes = list("1"=10,"2"=list("gauss"= 10,"gamma"= 10))) | |
#' simulation <- simulateExpression(params= params,verbose=FALSE) | |
#' expression <- simulation$expressionData | |
#' validationData <- simulation$validationData | |
#' mclustOutput <- useMclust(expression, verbose=FALSE) | |
#' mclustEvaluation <- evaluateAlgorithmOutput(modality = 2, algorithmOutput = mclustOutput, | |
#' validationData = validationData) | |
#' getStatistics(evaluations = list("mclust" = mclustEvaluation)) | |
#' } | |
getStatistic <- function(evaluation,algorithmName ="mclust"){ | |
true <- (table(unlist(evaluation))["TRUE"]/(table(unlist(evaluation))["FALSE"]+table(unlist(evaluation))["TRUE"]))*100 | |
false <- (table(unlist(evaluation))["FALSE"]/(table(unlist(evaluation))["FALSE"]+table(unlist(evaluation))["TRUE"]))*100 | |
stats <- data.frame(matrix(ncol = 1, nrow = 2,data = c(true,false),byrow = TRUE)) | |
colnames(stats)<- c(algorithmName) | |
rownames(stats)<- c("TRUE [%]","FALSE [%]") | |
return(stats) | |
} | |
#' Statistics | |
#' | |
#' This function produces informations about the evaluation of the three | |
#' algorithms. The result is a data frame where the percentage of as TRUE and | |
#' as FALSE evaluated values of the genes | |
#' | |
#' @param evaluations list of named evaluations | |
#' @return A data frame with the percentual proportion of as TRUE or FALSE evaluated values | |
#' @examples | |
#' \dontrun{ | |
#' params <- newParams(nGenes = list("1"=10,"2"=list("gauss"= 10,"gamma"= 10))) | |
#' simulation <- simulateExpression(params= params,verbose=FALSE) | |
#' expression <- simulation$expressionData | |
#' validationData <- simulation$validationData | |
#' mclustOutput <- useMclust(expression = expression,verbose = FALSE) | |
#' hdbscanOutput <- useHdbscan(expression = expression,verbose = FALSE) | |
#' mclustEvaluation <- evaluateAlgorithmOutput(modality = 2,algorithmOutput = mclustOutput, | |
#' validationData = validationData) | |
#' hdbscanEvaluation <- evaluateAlgorithmOutput(modality = 2,algorithmOutput = hdbscanOutput, | |
#' validationData = validationData) | |
#' | |
#' #getStatistics of one algorithm | |
#' getStatistics(evaluations = list("mclust"=mclustEvaluation)) | |
#' #getStatistics of two or more algorithms | |
#' getStatistics(evaluations = list("mclust"=mclustEvaluation,"HDBSCAN"=hdbscanEvaluation)) | |
#' } | |
#' @export | |
getStatistics <- function(evaluations) { | |
numEvaluations <- length(evaluations) | |
statistics <- lapply(1:numEvaluations, function(i) {getStatistic(evaluation = evaluations[[i]], algorithmName = names(evaluations)[i])}) | |
stats <- data.frame(do.call('cbind',statistics)) | |
colnames(stats)<- names(evaluations) | |
rownames(stats)<- c("TRUE [%]","FALSE [%]") | |
return(stats) | |
} | |
#' getClassifications | |
#' | |
#' Returns the numbers of TN (TRUE Negative),TP (TRUE Positive), | |
#' FN (FALSE Negative) and FP (FALSE Positive) for each of the algorithm's | |
#' formatted outputs. | |
#' @param modality numerical, for which modality the classification is calculated, 1 for unimodal, | |
#' 2 for bimodal,... | |
#' @param outputs list of named outputs | |
#' @param validationData The validationData behind the simulated gene expression data frame | |
#' | |
#' @return a classification data frame that contains the number of | |
#' FN, FP, TN and TP | |
#' @examples | |
#' \dontrun{ | |
#' params <- newParams(nGenes = list("1"=10,"2"=list("gauss"= 10,"gamma"= 10))) | |
#' simulation <- simulateExpression(params= params,verbose=FALSE) | |
#' expression <- simulation$expressionData | |
#' validationData <- simulation$validationData | |
#' mclustOutput <- useMclust(expression = expression,verbose = FALSE) | |
#' hdbscanOutput <- useHdbscan(expression = expression,verbose = FALSE) | |
#' | |
#' #getClassifications of one algorithm | |
#' getClassifications(modality=2,validationData, | |
#' outputs = list("mclust"=mclustOutput)) | |
#' #getClassifications of two or more algorithms | |
#' getClassifications(modality=2,validationData, | |
#' outputs = list("mclust"=mclustOutput,"HDBSCAN"=hdbscanOutput)) | |
#' } | |
#' | |
#' @details | |
#' TN (TRUE Negative): validationData: unimodal, assigned scenario: unimodal | |
#' | |
#' TP (TRUE Positive): validationData: bimodal, assigned scenario: bimodal | |
#' | |
#' FN (FALSE Negative): validationData: bimodal, assigned scenario: unimodal | |
#' or | |
#' FN (FALSE Negative): validationData: unimodal/bimodal, | |
#' assigned scenario: more than bimodal | |
#' | |
#' FP (FALSE Positive): validationData: unimodal, assigned scenario: bimodal | |
#' | |
#' @export | |
getClassifications <- function(modality,validationData,outputs){ | |
numOutputs <- (length(outputs)) | |
classification <- lapply(1:numOutputs,function(i){ | |
getClassification(modality,algorithmOutput = outputs[[i]], | |
algorithmName = names(outputs)[i], | |
validationData = validationData) | |
}) | |
classification <- data.frame(do.call('rbind',classification)) | |
colnames(classification)<-c("FN","FP","TN","TP") | |
rownames(classification)<-names(outputs) | |
return(classification) | |
} | |
#' Checks if output fulfills the criteria of the unified output data format | |
#' @details prints status messages about performed checks | |
#' @param output to be tested | |
#' @param expressionmatrix that was used by algorithm to create output | |
#' @return logical: TRUE if output is in the correct format, FALSE if any check was failed | |
#' @export | |
isValidUnifiedOutputDataFormat <- function(output,expressionmatrix){ | |
correctType <- FALSE | |
correctLengthOutput <- FALSE | |
correctLengthGenes <- FALSE | |
correctFormatClinicalData <- FALSE | |
correctFormatExpressionmatrix <- FALSE | |
correctFormatAllGenes <- FALSE | |
if(is.list(output)){ | |
message("Output is of type 'list'") | |
correctType <- TRUE | |
}else{ | |
message("Output is not of required type 'list'.") | |
return(FALSE)} | |
if(length(output)==3){ | |
message("Output has a length of 3.") | |
correctLengthOutput <- TRUE | |
}else{ | |
message("Output has not a length of 3.") | |
return(FALSE)} | |
if(length(output$Genes)==nrow(expressionmatrix)){ | |
message("Output$Genes holds the correct number of genes.") | |
correctLengthGenes <- TRUE | |
}else{ | |
message("Output does not hold the correct number of genes.") | |
return(FALSE)} | |
if((is.character(output$ClinicalData))&&(length(output$ClinicalData)==1)){ | |
message("Output$ClinicalData is of the required type 'character' and the required length of 1.") | |
correctFormatClinicalData <- TRUE | |
}else{ | |
message("Output$ClinicalData is not of the required type 'character' and/or not of the required length of 1.") | |
return(FALSE)} | |
if((is.character(output$Expressionmatrix))&&(length(output$Expressionmatrix)==1)){ | |
message("Output$Expressionmatrix is of the required type 'character' and of the required length of 1.") | |
correctFormatExpressionmatrix <- TRUE | |
}else{ | |
message("Output$Expressionmatrix is not of the required type 'character' and/or not of the required length of 1.") | |
return(FALSE)} | |
correctlyFormattedGenes <- try(unlist(lapply(1:length(output$Genes),function(i){ | |
isCorrectFormattedGene <- FALSE | |
allCorrectFormat <- is.integer(output$Genes[[i]]$modus)&& | |
is.double(output$Genes[[i]]$means)&&(length(output$Genes[[i]]$means)==output$Genes[[i]]$modus)&& | |
is.double(output$Genes[[i]]$sds)&&(length(output$Genes[[i]]$sds)==output$Genes[[i]]$modus)&& | |
is.integer(output$Genes[[i]]$sizes)&&(length(output$Genes[[i]]$sizes)==output$Genes[[i]]$modus)&& | |
is.list(output$Genes[[i]]$groups)&&(length(output$Genes[[i]]$groups)==output$Genes[[i]]$modus)&& | |
(sum(output$Genes[[i]]$sizes)==ncol(expressionmatrix)) | |
listCorrectLength<- unlist(lapply(1:length(output$Genes[[i]]$groups),function(j){ | |
length(output$Genes[[i]]$groups[[j]])==output$Genes[[i]]$sizes[[j]] | |
})) | |
allListsCorrectLength <- (!any(listCorrectLength==FALSE)) | |
isCorrectFormattedGene <- allCorrectFormat&&allListsCorrectLength | |
})),silent = T) | |
if(class(correctlyFormattedGenes)=="try-error"){ | |
message("The correct format of the genes could not be evaluated due to wrong characteristics of the Genes section.") | |
return(FALSE)} | |
else{ | |
correctFormatAllGenes <-(!any(correctlyFormattedGenes==FALSE)) | |
} | |
if(correctFormatAllGenes){ | |
message("All Genes are in the correct format.") | |
}else{ | |
message("One or more genes do not fulfil the required format of the Genes section.") | |
} | |
return(correctType&& | |
correctLengthOutput&& | |
correctLengthGenes&& | |
correctFormatClinicalData&& | |
correctFormatExpressionmatrix&& | |
correctFormatAllGenes) | |
} |