R/evaluate.R

#' compare Scenarios
#'
#' Compare the scenario of each gene that was assigned by the algorithm with
#' the scenario that is written in the validation Data.
#' @param modality numerical, for which modality the scenarioComparing is calculated, 1 for unimodal,
#' 2 for bimodal,...
#' @param algorithmOutput The output to be compared
#' @param validationData The validationData behind the simulated gene expression data frame
#'
#' @return Returns a list that classifies each assignment as
#' TN (TRUE Negative),TP (TRUE Positive), FN (FALSE Negative), FP (FALSE Positive)
#' @details
#' TN (TRUE Negative): validationData: not modality, assigned scenario: not modality
#'
#' TP (TRUE Positive): validationData: modality, assigned scenario: modality
#'
#' FN (FALSE Negative): validationData: modality, assigned scenario: not modality
#'
#' FP (FALSE Positive): validationData: not modality, assigned scenario: modality
compareScenarios <- function(modality, algorithmOutput,validationData){
  compareScenarios <- (unlist(lapply(1:length(algorithmOutput$Genes),function(i){
    if(validationData[[i]]$modus != modality && algorithmOutput$Genes[[i]]$modus!=modality){
      evaluation <- "TN"
    }else if(validationData[[i]]$modus ==modality && algorithmOutput$Genes[[i]]$modus==modality){
      evaluation <- "TP"
    }else if(validationData[[i]]$modus !=modality && algorithmOutput$Genes[[i]]$modus==modality){
      evaluation <- "FP"
    }else if(validationData[[i]]$modus ==modality && algorithmOutput$Genes[[i]]$modus!=modality){
      evaluation <- "FN"
    }else{
      evaluation <- NA
    }
  })))
  return(compareScenarios)
}

#' Is Approximately the Same
#'
#' checks whether the difference between two values is less than or equal the
#' maxDistance
#' @param validationVal value of the validationData (exact value)
#' @param classVal estimated value (approx. value)
#' @param maxDistance percentage of maximum distance of the values
#'
#' @return TRUE, if the difference between the two values is less than or
#' equal the maxDistance; FALSE, if the difference between the two values is
#' greater than the maxDistance
#'
isApproxSame<- function(validationVal,classVal,maxDistance){
  difference <- (abs((validationVal - classVal)/validationVal)*100)
  if(difference <= maxDistance){
    return(TRUE)
  }else{
    return(FALSE)
  }
}

#' Evaluate AlgorithmOutput
#'
#' @param modality numerical, for which modality output is evaluated, 1 for unimodal,
#' 2 for bimodal,...
#' @param algorithmOutput The formatted output of the algorithm that is now
#' evaluated
#' @param validationData the validationData data.frame
#' @param maxDifference the maximum percentage difference between the estimated
#' value and the validation data value with which the estimated is classified
#' as TRUE (correctly estimated)
#' @examples
#' \dontrun{
#' params <- newParams(nGenes = list("1"=10,"2"=list("gauss"= 10,"gamma"= 10)))
#' simulation <- simulateExpression(params= params,verbose=FALSE)
#' expression <- simulation$expressionData
#' validationData <- simulation$validationData
#' output <- useMclust(expression, verbose=FALSE)
#'  # TRUE if less than +- 10 % difference to value of validationData
#' evaluation <- evaluateAlgorithmOutput(modality = 2,algorithmOutput = output,
#' validationData = validationData)
#' # TRUE if less than +- 20 % difference to value of validationData
#' evaluation <- evaluateAlgorithmOutput(modality = 2,algorithmOutput = output,
#' validationData = validationData,maxDifference = 20)
#' }
#' @return Returns a data.frame where for each as TP classified gene it was
#' evaluated whether it was estimated approximately correct or not (TRUE or
#' FALSE)
#' @details
#' The estimated values of mean1, mean2, sd1, sd2, proportion1 and proportion2
#' of each as TP classified gene are compared to those values of the validationData.
#' If the difference is greater than 10 %, the estimation is classified as
#' FALSE (not correctly estimated) for this value. If the difference is less
#' or equal to 10%, the estimation is classified as TRUE (correctly estimated)
#'
#' @export
evaluateAlgorithmOutput <- function(modality,algorithmOutput,validationData,maxDifference = 10){
  compared <- compareScenarios(modality = modality,algorithmOutput = algorithmOutput,validationData = validationData)
  genes <- names(validationData)
  approxSame <- (lapply(1:length(algorithmOutput$Genes),function(i){
    if((algorithmOutput$Genes[[i]]$modus==modality) && (validationData[[i]]$modus==modality)){
      classValsMean <- sort(algorithmOutput$Genes[[i]]$means)
      x <- algorithmOutput$Genes[[i]]$sds
      names(x) <- order(algorithmOutput$Genes[[i]]$means)
      classValSDs <- x[order(names(x))]
      y <- algorithmOutput$Genes[[i]]$sizes
      names(y) <- order(algorithmOutput$Genes[[i]]$means)
      classValProps <- y[order(names(y))]
      means <- unlist(lapply(1:length(validationData[[i]]$means),function(j){
        isApproxSame(validationVal = validationData[[i]]$means[j],classVal = classValsMean[j],maxDistance = maxDifference)
      }))
      sds <- unlist(lapply(1:length(validationData[[i]]$sds),function(j){
        isApproxSame(validationVal = validationData[[i]]$sds[j],classVal = classValSDs[j],maxDistance = maxDifference)
      }))
      props <- unlist(lapply(1:length(validationData[[i]]$sizes),function(j){
        isApproxSame(validationVal = validationData[[i]]$sizes[j],classVal = classValProps[j],maxDistance = maxDifference)
      }))
      c(means,sds,props)
    }
  }))
  names(approxSame) <- genes
  evaluation<- data.frame(do.call('rbind',approxSame))
  meanNames <- sprintf(c("mean%s"),(1:(ncol(evaluation)/3)))
  sdNames <- sprintf(c("sd%s"),(1:(ncol(evaluation)/3)))
  propNames <- sprintf(c("prop%s"),(1:(ncol(evaluation)/3)))
  colnames(evaluation) <- c(meanNames,sdNames,propNames)

  return(evaluation)
}

#' getClassification
#'
#' Returns the numbers of TN (TRUE Negative),TP (TRUE Positive),
#' FN (FALSE Negative) and FP (FALSE Positive) for one of the algorithm's
#' formatted outputs.
#' @param modality numerical, for which modality the classification is calculated, 1 for unimodal,
#' 2 for bimodal,...
#' @param algorithmOutput The formatted output of the algorithm
#' @param algorithmName The name of the evaluated algorithm
#' @param validationData The validationData behind the simulated gene
#' expression data frame
#' @examples
#' \dontrun{
#' params <- newParams(nGenes = list("1"=10,"2"=list("gauss"= 10,"gamma"= 10)))
#' simulation <- simulateExpression(params= params,verbose=FALSE)
#' expression <- simulation$expressionData
#' validationData <- simulation$validationData
#' mclustOutput <- useMclust(expression, verbose=FALSE)
#' getClassifications(modality = 2,validationData = validationData,
#' outputs = list("mclust" = mclustOutput))
#' }
#' @return a classification data frame that contains the number of
#' FN, FP, TN and TP
#'
#' @details
#' TN (TRUE Negative): validationData: unimodal, assigned scenario: unimodal
#'
#' TP (TRUE Positive): validationData: bimodal, assigned scenario: bimodal
#'
#' FN (FALSE Negative): validationData: bimodal, assigned scenario: unimodal
#' or
#' FN (FALSE Negative): validationData: unimodal/bimodal,
#'  assigned scenario: more than bimodal
#'
#' FP (FALSE Positive): validationData: unimodal, assigned scenario: bimodal
#'
getClassification <- function(modality,algorithmOutput,algorithmName="mclust",validationData){
  compared <- compareScenarios(modality = modality,
                               algorithmOutput = algorithmOutput,validationData = validationData)
  fn <- table(compared)["FN"]
  fp <- table(compared)["FP"]
  tn <- table(compared)["TN"]
  tp <- table(compared)["TP"]
  classification <- data.frame(matrix(nrow = 1,ncol = 4,data = c(fn,fp,tn,tp),
                                      byrow = TRUE))
  colnames(classification)<-c("FN","FP","TN","TP")
  rownames(classification)<-c(algorithmName)
  return(classification)
}

#' getStatistic
#'
#' This function produces informations about the evaluation of one
#' algorithms. The result is a data frame where the percentage of as TRUE and
#' as FALSE evaluated values of the genes
#'
#' @param evaluation The output of the evaluation of the formatted output
#' @param algorithmName The name of the algorithm which created the output
#' @return A data frame with the percentual proportion of as TRUE or FALSE evaluated values
#' @examples
#' \dontrun{
#' params <- newParams(nGenes = list("1"=10,"2"=list("gauss"= 10,"gamma"= 10)))
#' simulation <- simulateExpression(params= params,verbose=FALSE)
#' expression <- simulation$expressionData
#' validationData <- simulation$validationData
#' mclustOutput <- useMclust(expression, verbose=FALSE)
#' mclustEvaluation <- evaluateAlgorithmOutput(modality = 2, algorithmOutput = mclustOutput,
#' validationData = validationData)
#' getStatistics(evaluations = list("mclust" = mclustEvaluation))
#' }
getStatistic <- function(evaluation,algorithmName ="mclust"){
  true <- (table(unlist(evaluation))["TRUE"]/(table(unlist(evaluation))["FALSE"]+table(unlist(evaluation))["TRUE"]))*100
  false <- (table(unlist(evaluation))["FALSE"]/(table(unlist(evaluation))["FALSE"]+table(unlist(evaluation))["TRUE"]))*100
  stats <- data.frame(matrix(ncol = 1, nrow = 2,data = c(true,false),byrow = TRUE))
  colnames(stats)<- c(algorithmName)
  rownames(stats)<- c("TRUE [%]","FALSE [%]")
  return(stats)
}

#' Statistics
#'
#' This function produces informations about the evaluation of the three
#' algorithms. The result is a data frame where the percentage of as TRUE and
#' as FALSE evaluated values of the genes
#'
#' @param evaluations list of named evaluations
#' @return A data frame with the percentual proportion of as TRUE or FALSE evaluated values
#' @examples
#' \dontrun{
#' params <- newParams(nGenes = list("1"=10,"2"=list("gauss"= 10,"gamma"= 10)))
#' simulation <- simulateExpression(params= params,verbose=FALSE)
#' expression <- simulation$expressionData
#' validationData <- simulation$validationData
#' mclustOutput <- useMclust(expression = expression,verbose = FALSE)
#' hdbscanOutput <- useHdbscan(expression = expression,verbose = FALSE)
#' mclustEvaluation <- evaluateAlgorithmOutput(modality = 2,algorithmOutput = mclustOutput,
#' validationData = validationData)
#' hdbscanEvaluation <- evaluateAlgorithmOutput(modality = 2,algorithmOutput = hdbscanOutput,
#' validationData = validationData)
#'
#' #getStatistics of one algorithm
#' getStatistics(evaluations = list("mclust"=mclustEvaluation))
#' #getStatistics of two or more algorithms
#' getStatistics(evaluations = list("mclust"=mclustEvaluation,"HDBSCAN"=hdbscanEvaluation))
#' }
#' @export
getStatistics <- function(evaluations) {
  numEvaluations <- length(evaluations)
  statistics <- lapply(1:numEvaluations, function(i) {getStatistic(evaluation = evaluations[[i]], algorithmName = names(evaluations)[i])})
  stats <- data.frame(do.call('cbind',statistics))
  colnames(stats)<- names(evaluations)
  rownames(stats)<- c("TRUE [%]","FALSE [%]")
  return(stats)
}

#' getClassifications
#'
#' Returns the numbers of TN (TRUE Negative),TP (TRUE Positive),
#' FN (FALSE Negative) and FP (FALSE Positive) for each of the algorithm's
#' formatted outputs.
#' @param modality numerical, for which modality the classification is calculated, 1 for unimodal,
#' 2 for bimodal,...
#' @param outputs list of named outputs
#' @param validationData The validationData behind the simulated gene expression data frame
#'
#' @return a classification data frame that contains the number of
#' FN, FP, TN and TP
#' @examples
#' \dontrun{
#' params <- newParams(nGenes = list("1"=10,"2"=list("gauss"= 10,"gamma"= 10)))
#' simulation <- simulateExpression(params= params,verbose=FALSE)
#' expression <- simulation$expressionData
#' validationData <- simulation$validationData
#' mclustOutput <- useMclust(expression = expression,verbose = FALSE)
#' hdbscanOutput <- useHdbscan(expression = expression,verbose = FALSE)
#'
#' #getClassifications of one algorithm
#' getClassifications(modality=2,validationData,
#' outputs = list("mclust"=mclustOutput))
#' #getClassifications of two or more algorithms
#' getClassifications(modality=2,validationData,
#' outputs = list("mclust"=mclustOutput,"HDBSCAN"=hdbscanOutput))
#' }
#'
#' @details
#' TN (TRUE Negative): validationData: unimodal, assigned scenario: unimodal
#'
#' TP (TRUE Positive): validationData: bimodal, assigned scenario: bimodal
#'
#' FN (FALSE Negative): validationData: bimodal, assigned scenario: unimodal
#' or
#' FN (FALSE Negative): validationData: unimodal/bimodal,
#'  assigned scenario: more than bimodal
#'
#' FP (FALSE Positive): validationData: unimodal, assigned scenario: bimodal
#'
#' @export
getClassifications <- function(modality,validationData,outputs){
  numOutputs <- (length(outputs))
  classification <- lapply(1:numOutputs,function(i){
    getClassification(modality,algorithmOutput = outputs[[i]],
                      algorithmName = names(outputs)[i],
                      validationData = validationData)
  })
  classification <- data.frame(do.call('rbind',classification))
  colnames(classification)<-c("FN","FP","TN","TP")
  rownames(classification)<-names(outputs)
  return(classification)
}

#' Checks if output fulfills the criteria of the unified output data format
#' @details prints status messages about performed checks
#' @param output to be tested
#' @param expressionmatrix that was used by algorithm to create output
#' @return logical: TRUE if output is in the correct format, FALSE if any check was failed
#' @export
isValidUnifiedOutputDataFormat <- function(output,expressionmatrix){
  correctType <- FALSE
  correctLengthOutput <- FALSE
  correctLengthGenes <- FALSE
  correctFormatClinicalData <- FALSE
  correctFormatExpressionmatrix <- FALSE
  correctFormatAllGenes <- FALSE

  if(is.list(output)){
    message("Output is of type 'list'")
    correctType <- TRUE
    }else{
      message("Output is not of required type 'list'.")
      return(FALSE)}
  if(length(output)==3){
    message("Output has a length of 3.")
    correctLengthOutput <- TRUE
  }else{
    message("Output has not a length of 3.")
    return(FALSE)}
  if(length(output$Genes)==nrow(expressionmatrix)){
    message("Output$Genes holds the correct number of genes.")
    correctLengthGenes <- TRUE
  }else{
    message("Output does not hold the correct number of genes.")
    return(FALSE)}
  if((is.character(output$ClinicalData))&&(length(output$ClinicalData)==1)){
    message("Output$ClinicalData is of the required type 'character' and the required length of 1.")
    correctFormatClinicalData <- TRUE
  }else{
    message("Output$ClinicalData is not of the required type 'character' and/or not of the required length of 1.")
    return(FALSE)}
  if((is.character(output$Expressionmatrix))&&(length(output$Expressionmatrix)==1)){
    message("Output$Expressionmatrix is of the required type 'character' and of the required length of 1.")
    correctFormatExpressionmatrix <- TRUE
  }else{
    message("Output$Expressionmatrix is not of the required type 'character' and/or not of the required length of 1.")
    return(FALSE)}
  correctlyFormattedGenes <- try(unlist(lapply(1:length(output$Genes),function(i){
    isCorrectFormattedGene <- FALSE
    allCorrectFormat <- is.integer(output$Genes[[i]]$modus)&&
      is.double(output$Genes[[i]]$means)&&(length(output$Genes[[i]]$means)==output$Genes[[i]]$modus)&&
      is.double(output$Genes[[i]]$sds)&&(length(output$Genes[[i]]$sds)==output$Genes[[i]]$modus)&&
      is.integer(output$Genes[[i]]$sizes)&&(length(output$Genes[[i]]$sizes)==output$Genes[[i]]$modus)&&
      is.list(output$Genes[[i]]$groups)&&(length(output$Genes[[i]]$groups)==output$Genes[[i]]$modus)&&
      (sum(output$Genes[[i]]$sizes)==ncol(expressionmatrix))
    listCorrectLength<- unlist(lapply(1:length(output$Genes[[i]]$groups),function(j){
      length(output$Genes[[i]]$groups[[j]])==output$Genes[[i]]$sizes[[j]]
    }))
    allListsCorrectLength <- (!any(listCorrectLength==FALSE))

    isCorrectFormattedGene <- allCorrectFormat&&allListsCorrectLength
  })),silent = T)
  if(class(correctlyFormattedGenes)=="try-error"){
    message("The correct format of the genes could not be evaluated due to wrong characteristics of the Genes section.")
    return(FALSE)}
  else{
    correctFormatAllGenes <-(!any(correctlyFormattedGenes==FALSE))
  }
  if(correctFormatAllGenes){
    message("All Genes are in the correct format.")
  }else{
    message("One or more genes do not fulfil the required format of the Genes section.")
  }

  return(correctType&&
           correctLengthOutput&&
           correctLengthGenes&&
           correctFormatClinicalData&&
           correctFormatExpressionmatrix&&
           correctFormatAllGenes)
}