diff --git a/.Rbuildignore b/.Rbuildignore index 91114bf..4b3f9f2 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -1,2 +1,3 @@ ^.*\.Rproj$ ^\.Rproj\.user$ +^data-raw$ diff --git a/DESCRIPTION b/DESCRIPTION index 3897b52..2eb0503 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,9 +1,9 @@ -Package: bimodalR -Title: Simulation of bimodal gene expression and evaluation of algorithms for bimodality detection +Package: multimodalR +Title: Simulation of multimodal gene expression, evaluation of algorithms detecting multimodality in gene expression data sets, and detection of multimodal genes in gene expression data sets Version: 0.0.0.9000 -Authors@R: c(person("Svenja", "Beck", email = "svenja-kristina.beck@mpi-bn.mpg.de", role = c("aut", "cre")), +Authors@R: c(person("Svenja Kristina", "Beck", email = "svenja-kristina.beck@mpi-bn.mpg.de", role = c("aut", "cre")), person("Jens", "Preussner", email = "jens.preussner@mpi-bn.mpg.de", role = c("aut"))) -Description: What the package does (one paragraph). +Description: multimodalR is an R package to simulate multimodal gene expression data, to evaluate different algorithms detecting multimodality and to detect bimodality or multimodality in data sets. Depends: R (>= 3.4.2) License: MIT + file LICENSE Encoding: UTF-8 @@ -17,8 +17,22 @@ Imports: LaplacesDemon, flexmix, plyr, mclust, - diptest -Suggests: genefilter, - knitr, + diptest, + ggplot2, + reshape2, + biomaRt, + doParallel, + foreach, + reticulate, + stats, + dplyr, + magrittr, + stringr, + survival, + ggkm, + KEGGREST, + cowplot, + utils +Suggests: knitr, rmarkdown VignetteBuilder: knitr diff --git a/NAMESPACE b/NAMESPACE index f85f09b..abd6ea1 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,40 +1,106 @@ # Generated by roxygen2: do not edit by hand -export(DipTestOverP) -export(SilvermanOverP) +export(cleanOutput) export(evaluateAlgorithmOutput) +export(filter0sumGenes) +export(filterForFoldChange) +export(filterForKnownInCancer) +export(filterForMean) +export(filterForXChromosomeGenes) +export(filterForYChromosomeGenes) +export(filterMultimodalGenes) +export(filterRarelyExpressedGenes) export(getClassifications) +export(getFilteredData) export(getParam) export(getParams) export(getStatistics) +export(isValidUnifiedOutputDataFormat) export(newParams) +export(plotClassifications) +export(plotExpression) +export(plotOutput) +export(plotStatistics) +export(plotSurvivalCurves) +export(processOneCancerGroup) export(setParam) export(setParams) export(simulateExpression) +export(toCancerDataGroups) +export(updateGeneNames) +export(useDiptest) export(useFlexmix) export(useHdbscan) export(useMclust) +export(useSilvermantest) exportClasses(Params) -import(dbscan) -import(diptest) import(flexmix) import(mclust) -import(silvermantest) +importFrom(KEGGREST,keggGet) importFrom(LaplacesDemon,rinvgaussian) +importFrom(biomaRt,getBM) +importFrom(biomaRt,useMart) importFrom(checkmate,assertCharacter) importFrom(checkmate,assertClass) importFrom(checkmate,assertList) importFrom(checkmate,assertString) +importFrom(cowplot,add_sub) +importFrom(cowplot,ggdraw) +importFrom(diptest,dip.test) +importFrom(doParallel,registerDoParallel) +importFrom(dplyr,mutate) +importFrom(dplyr,select) +importFrom(foreach,"%dopar%") +importFrom(foreach,foreach) +importFrom(ggkm,geom_km) +importFrom(ggplot2,aes) +importFrom(ggplot2,coord_fixed) +importFrom(ggplot2,element_text) +importFrom(ggplot2,facet_wrap) +importFrom(ggplot2,geom_bar) +importFrom(ggplot2,geom_density) +importFrom(ggplot2,ggplot) +importFrom(ggplot2,ggtitle) +importFrom(ggplot2,labs) +importFrom(ggplot2,scale_color_discrete) +importFrom(ggplot2,scale_fill_manual) +importFrom(ggplot2,stat_function) +importFrom(ggplot2,theme) +importFrom(ggplot2,theme_bw) +importFrom(ggplot2,xlab) +importFrom(ggplot2,ylab) +importFrom(grDevices,dev.off) +importFrom(grDevices,grey.colors) +importFrom(grDevices,pdf) +importFrom(graphics,hist) +importFrom(graphics,par) +importFrom(magrittr,"%>%") importFrom(methods,"slot<-") importFrom(methods,new) importFrom(methods,rbind2) importFrom(methods,slot) importFrom(methods,slotNames) importFrom(methods,validObject) +importFrom(parallel,clusterEvalQ) +importFrom(parallel,clusterExport) +importFrom(parallel,detectCores) +importFrom(parallel,makeCluster) +importFrom(parallel,parLapply) +importFrom(parallel,stopCluster) importFrom(plyr,compact) +importFrom(reshape2,melt) +importFrom(reticulate,import) +importFrom(silvermantest,silverman.test) +importFrom(stats,BIC) +importFrom(stats,as.formula) importFrom(stats,dnorm) +importFrom(stats,pchisq) importFrom(stats,rgamma) importFrom(stats,rnorm) importFrom(stats,runif) importFrom(stats,sd) +importFrom(stringr,str_replace_all) +importFrom(survival,Surv) +importFrom(survival,survdiff) +importFrom(utils,combn) importFrom(utils,head) diff --git a/R/TCGAFunctions.R b/R/TCGAFunctions.R new file mode 100644 index 0000000..2ec5dfb --- /dev/null +++ b/R/TCGAFunctions.R @@ -0,0 +1,213 @@ +#TCGA Routine functions + +#' toCancerDataGroups +#' Sorts the TCGA expression data with the metadata into groups of different cancer types +#' @details The groupsData that is returned consists of the different diagnosis groups +#' (http://www.icd10data.com/ICD10CM/Codes/C00-D49) +#' and the expression matrix of each of the diagnosis groups +#' @param cancerDataExpression TCGA cancer expression data +#' @param metadata TCGA cancer metadata +#' @param verbose logical. Whether to print progress messages +#' +#' @export +toCancerDataGroups <- function(cancerDataExpression, metadata, + verbose = TRUE) { + if (verbose) { + message("Filtering 0 sum rows...") + } + cancerDataCleared <- filter0sumGenes(cancerData = cancerDataExpression,parallel = TRUE) + if (verbose) { + message("Splitting the data sets into different cancer groups...") + } + icd10 <- unique(unlist(lapply(1:length(metadata$diagnosis), function(i) { + unlist(strsplit(metadata$diagnosis[i], split = "[.]"))[1] + }))) + patientIds <- colnames(cancerDataCleared) + caseIdsUnique <- make.unique(names = as.character(metadata$caseID)) + diagnosis <- unlist(lapply(1:length(patientIds), function(i) { + diagnosis <- metadata$diagnosis[(which(caseIdsUnique == patientIds[i]))] + names(diagnosis) <- patientIds[i] + diagnosis + })) + diagnosisGroups <- unique(unlist(lapply(1:length(diagnosis), function(i) { + unlist(strsplit(diagnosis[i], split = "[.]"))[1] + }))) + shortDiagnosis <- (unlist(lapply(1:length(diagnosis), function(i) { + unlist(strsplit(diagnosis[i], split = "[.]"))[1] + }))) + patientsInGroups <- (lapply(1:length(diagnosisGroups), function(i) { + names(diagnosis[which(shortDiagnosis == diagnosisGroups[i])]) + })) + groupsDataSets <- lapply(1:length(patientsInGroups), function(i) { + expr <- lapply(1:length(patientsInGroups[[i]]), function(j) { + cancerDataCleared[, which(colnames(cancerDataCleared) == patientsInGroups[[i]][j])] + }) + expression <- do.call("cbind", expr) + rownames(expression) <- rownames(cancerDataCleared) + colnames(expression) <- patientsInGroups[[i]] + expression + }) + return(list(diagnosisGroups = diagnosisGroups, groupsExpressionDataSets = groupsDataSets)) +} + +#' processOneCancerGroup +#' Processes one cancer group +#' @details Processes one cancer group by a) filtering 0 sum rows, +#' b) using Silverman's test for unimodality, +#' c) filtering genes with <2% of patients that have expression values greater 0, +#' d) use useMclust on cancer data, +#' e) creating list of output and filtered expression matrix +#' @param cancerData TCGA cancer expression data of one cancer group +#' @param minExpressedPercentage integer specifying the percentage of patients that should at least be expressed for the gene to be analysed +#' @param algorithm "mclust", "hdbscan" or "flexmix". Defines which algorithm should be used to process the cancer data +#' @param maxModality An integer specifying the highest modality to calculate models in with mclust and flexmix +#' @param minClusterSize Integer; The minimum number of samples +#' in a group for that group to be considered a cluster in hdbscan +#' @param minSamples Integer for hdbscan; The number of samples +#' in a neighborhood for a point to be considered as a core point. +#' This includes the point itself. If NULL: defaults to the min_cluster_size. +#' @param pathToClinicalData path to the clinical data +#' @param pathToExpressionmatrix path to the expression matrix +#' @param SilvermanP The p-value that is used to reject Silvermans Test for unimodality +#' (given by k=1 using the Hall/York adjustments) +#' @param verbose logical. Whether to print progress messages +#' +#' @export +processOneCancerGroup <- function(cancerData,minExpressedPercentage=2,algorithm="mclust", maxModality=3, + minClusterSize=50,minSamples=NULL, + pathToClinicalData = "pathToClinicalData", + pathToExpressionmatrix = "pathToExpressionmatrix", + SilvermanP = 0.05, verbose = TRUE) { + if (verbose) { + message("Filtering 0 sum rows...") + } + clearedData <- filter0sumGenes(cancerData = cancerData,parallel = TRUE) + if (verbose) { + message("Using SilvermanOverP...") + } + possibles <- useSilvermantest(clearedData =clearedData,parallel = TRUE,SilvermanP = SilvermanP) + if (verbose) { + message("Filter genes with only a few patients (0] + sizes <- unlist(lapply(1:length(groups),function(j){ + length(unlist(expression[i,groups[[j]]])) + })) + }else{ + modus <- mclustOutput$Genes[[i]]$modus + means <- mclustOutput$Genes[[i]]$means + sds <- mclustOutput$Genes[[i]]$sds + sizes <- mclustOutput$Genes[[i]]$sizes + groups <- mclustOutput$Genes[[i]]$groups + } + if(length(groups)!=modus){ + modus <- length(groups) + means <- unlist(lapply(1:length(groups),function(j){ + mean(unlist(expression[i,groups[[j]]])) + })) + sizes <- unlist(lapply(1:length(groups),function(j){ + length(groups[[j]]) + })) + sds <- unlist(lapply(1:length(groups),function(j){ + sd(unlist(expression[i,groups[[j]]])) + })) + } + mclustOutput <- list("modus"=as.integer(modus),"means"=means,"sds"=sds,"sizes"=as.integer(sizes),"groups"=groups) + mclustOutput + })) + names(cleanOutput) <- names(mclustOutput$Genes) + mclustOutput <- list("Genes"=cleanOutput, + "ClinicalData"=mclustOutput$ClinicalData, + "Expressionmatrix"=mclustOutput$Expressionmatrix) return(mclustOutput) } - #' mclustClassification #' #' Using the output of the mclust algorithm for generating output in a unified @@ -70,21 +115,19 @@ useMclust <- function(expression,clusterNumbers = NULL,verbose = TRUE,parallel = #' @param models Unformatted output of the Mclust function from the mclust #' package of the genes from the gene expression data frame #' @param patients List of the patients +#' @param pathToClinicalData If real data is used the path +#' to the clinical data can be stored here +#' @param pathToExpressionmatrix If real data is used the path +#' to the gene expression matrix can be stored here #' #' @return The formatted output of the mclust algorithm #' -mclustClassification <- function(models,patients){ - +mclustClassification <- function(models,patients,pathToClinicalData,pathToExpressionmatrix){ genes <- names(models) - outputs <- lapply(1:length(genes),function(i){ - - classification <- models[[i]]$classification + classification <- unname(models[[i]]$classification) groups <- lapply(1:max(classification),function(j){ - unlist(lapply(1:models[[i]]$n, function(k){ - if(classification[[k]]==j){ - c(patients[k]) - }})) + patients[(which(classification==j))] }) proportions <- models[[i]]$parameters$pro modus <- length(models[[i]]$parameters$mean) @@ -97,7 +140,7 @@ mclustClassification <- function(models,patients){ ) }) names(outputs)<- genes - mclustOutput <- list(Genes = outputs,"ClinicalData"="mnt/agnerds/xyz","Expressionmatrix"="mnt/agnerds/xyzyy") + mclustOutput <- list(Genes = outputs,"ClinicalData"=pathToClinicalData,"Expressionmatrix"=pathToExpressionmatrix) return(mclustOutput) } @@ -106,119 +149,150 @@ mclustClassification <- function(models,patients){ #' #' Use the HDBSCAN algorithm for generating output to evaluate the #' classification (unimodal,bimodal or other) of genes. -#' +#' @details +#' For details on how HDBSCAN works and +#' for details on what parameters with ** do, +#' see http://hdbscan.readthedocs.io/en/latest/ #' @param expression (Simulated) Gene Expression data frame which genes shall #' be classified as unimodal, bimodal or other -#' @param minPts Integer; Minimum size of clusters. See details of ?hdbscan +#' @param minClusterSize ** Integer; The minimum number of samples +#' in a group for that group to be considered a cluster +#' @param minSamples ** Integer; The number of samples +#' in a neighborhood for a point to be considered as a core point. +#' This includes the point itself. If NULL: defaults to the min_cluster_size. #' @param verbose logical. Whether to print progress messages +#' @param pathToClinicalData If real data is used the path +#' to the clinical data can be stored here +#' @param pathToExpressionmatrix If real data is used the path +#' to the gene expression matrix can be stored here #' #' @return hdbscanOutput The formatted output of the HDBSCAN clustering #' #' @examples +#' \dontrun{ #' params <- newParams(nGenes = list("1"=10,"2"=list("gauss"= 10,"gamma"= 10))) #' simulation <- simulateExpression(params= params,verbose=FALSE) #' expression <- simulation$expressionData #' #with messages printed to console #' hdbscanOutput <- useHdbscan(expression = expression) #' #with specified minPts parameter -#' hdbscanOutput <- useHdbscan(expression = expression, minPts = 15) +#' hdbscanOutput <- useHdbscan(expression = expression, minClusterSize = 15) #' #without messages printed to console #' hdbscanOutput <- useHdbscan(expression = expression, verbose = FALSE) -#' -#' @import dbscan +#' } +#' @importFrom reticulate import +#' @importFrom plyr compact #' @export -#' -useHdbscan <- function(expression,minPts = 10,verbose = TRUE){ +useHdbscan <- function(expression,minClusterSize = 10,minSamples=NULL,verbose = TRUE,pathToClinicalData="pathTo/clinicalData",pathToExpressionmatrix="pathTo/expressionmatrix"){ + if(is.null(minSamples)){ + minSamples=minClusterSize + } if(verbose){message("HDBSCAN-clusters are calculated...")} + py.hdbscan <- reticulate::import("hdbscan") + py.numpy <- reticulate::import("numpy") + hdbscan <- py.hdbscan$HDBSCAN(min_cluster_size = as.integer(minClusterSize),min_samples = as.integer(minSamples),allow_single_cluster = TRUE) genes <- rownames(expression) patients <- names(expression) - outputs <- lapply(1:nrow(expression),function(i){ - clustering <- dbscan::hdbscan(t(expression[i,]),minPts = minPts) - means <- by(as.numeric(expression[i,]),INDICES = clustering$cluster, - FUN = mean) - sds <- by(as.numeric(expression[i,]),INDICES = clustering$cluster, - FUN = sd) + expr <- t(expression[i,]) + clustering <- data.frame(expr,cluster=(hdbscan$fit_predict(as.matrix(unlist(expr))))) + means <- plyr::compact(unname(by(as.numeric(expression[i,]),INDICES = clustering$cluster, + FUN = mean))) + sds <- plyr::compact(unname(by(as.numeric(expression[i,]),INDICES = clustering$cluster, + FUN = sd))) classification <- clustering$cluster if(length(means)>1){ classificationMinDistance <- unlist(lapply(1:ncol(expression), function(j){ - if(classification[[j]]==0){ - difference <- abs(means[2:(max(classification)+1)]-expression[i,j]) + if(classification[[j]]==-1){ + difference <- abs(means[2:length(table(classification))]-expression[i,j]) minDif <- min(difference) - unlist(lapply(1:max(classification),function(k){ - if(minDif==difference[k]){ - c(k) - }})) + which(difference == minDif)-1 }else{ classification[j] } })) - means <- by(as.numeric(expression[i,]),INDICES = classificationMinDistance, - FUN = mean) - sds <- by(as.numeric(expression[i,]),INDICES = classificationMinDistance, - FUN = sd) - groups <- lapply(1:max(classification),function(j){ - unlist(lapply(1:length(classificationMinDistance), function(k){ - if(classificationMinDistance[k]==j){ - c(patients[k]) - } - })) - }) + means <- plyr::compact(unname(by(as.numeric(expression[i,]),INDICES = classificationMinDistance, + FUN = mean))) + sds <- plyr::compact(unname(by(as.numeric(expression[i,]),INDICES = classificationMinDistance, + FUN = sd))) + if(length(means)>1){ + groups <- lapply(0:(length(means)-1),function(j){ + patients[(which(classificationMinDistance==j))] + }) + }else{ + groups <- list(patients) + } }else{ - groups <- patients + groups <- list(patients) classificationMinDistance <- classification } modus <- "modal" - if((max(classification))==0){ + if((max(classification))==0||(max(classification))==-1){ modus <- 1 }else{ - modus <- max(classification) + modus <- max(classification)+1 } - output <- list( - "modus" = modus, - "means" = c(unname(means)), - "sds" = c(unname(sds)), - "sizes" = unname(table(classificationMinDistance[])), + "modus" = as.integer(modus), + "means" = means, + "sds" = sds, + "sizes" = unname(table(classificationMinDistance)), "groups" = groups ) }) names(outputs)<- genes - hdbscanOutput <- list(Genes = outputs,"ClinicalData"="mnt/agnerds/xyz","Expressionmatrix"="mnt/agnerds/xyzyy") - - return(hdbscanOutput) -} + hdbscanOutput <- list(Genes = outputs,"ClinicalData"=pathToClinicalData,"Expressionmatrix"=pathToExpressionmatrix) + cleanOutput <- (lapply(1:length(hdbscanOutput$Genes),function(i){ + #corrects genes with an empty group to the correct modus, means, sds, sizes and groups + isNull <- unlist(lapply(1:length(hdbscanOutput$Genes[[i]]$groups),function(j){ + if(is.null(unlist(hdbscanOutput$Genes[[i]]$groups[j]))||length(unlist(hdbscanOutput$Genes[[i]]$groups[j]))==0){j} + })) + if(!is.null(isNull)){ + modus <- hdbscanOutput$Genes[[i]]$modus -1 + means <- hdbscanOutput$Genes[[i]]$means[-isNull] + sds <- hdbscanOutput$Genes[[i]]$sds[-isNull] + groups <- plyr::compact(hdbscanOutput$Genes[[i]]$groups) + groups <- groups[lapply(groups,length)>0] + sizes <- unlist(lapply(1:length(groups),function(j){ + length(unlist(expression[i,groups[[j]]])) + })) + }else{ + modus <- hdbscanOutput$Genes[[i]]$modus + means <- hdbscanOutput$Genes[[i]]$means + sds <- hdbscanOutput$Genes[[i]]$sds + sizes <- hdbscanOutput$Genes[[i]]$sizes + groups <- hdbscanOutput$Genes[[i]]$groups + } + hdbscanOutput <- list("modus"=as.integer(modus),"means"=means,"sds"=sds,"sizes"=as.integer(sizes),"groups"=groups) + hdbscanOutput + })) + names(cleanOutput) <- names(hdbscanOutput$Genes) + hdbscanOutput <- list("Genes"=cleanOutput, + "ClinicalData"=hdbscanOutput$ClinicalData, + "Expressionmatrix"=hdbscanOutput$Expressionmatrix) -#' RMSE - root-mean-square error -#' -#' The RMSE is frquently used as a measure of the differences between values -#' predicted by a model or an estimator and the values actually observed. -#' NAs are filtered out before calculating the RMSE -#' -#' @param error The difference of a value predicted by a model and an actually -#' observed value -#' -rmse <- function(error){ - error <- unlist(error) - error2 <- unlist(lapply(1:length(error),function(i){if(!is.na(error[[i]])){c(error[[i]])}})) - sqrt(mean(error2^2)) + return(hdbscanOutput) } #' UseFlexmix #' -#' Use the FlexMix algorithm for creating a model to find the best fitting +#' Use the FlexMix algorithm to create models and to find the best fitting #' model to evaluate the classification (unimodal,bimodal or other) of genes. #' #' @param expression (Simulated) Gene Expression data frame which genes shall #' be classified as unimodal, bimodal or other -#' @param maxModality numerical, maximum modality for that models will be calculated +#' @param maxModality numerical. maximum modality for that models will be calculated #' @param verbose logical. Whether to print progress messages #' @param parallel logical. Whether to calculate the models in parallel -#' +#' @param pathToClinicalData If real data is used the path +#' to the clinical data can be stored here +#' @param pathToExpressionmatrix If real data is used the path +#' to the gene expression matrix can be stored here #' @return flexmixOutput The formatted output of the best fitting models from flexmix #' #' @examples +#' \dontrun{ #' params <- newParams(nGenes = list("1"=75,"2"=list("gauss"= 75,"gamma"= 75))) #' simulation <- simulateExpression(params= params,verbose=FALSE) #' expression <- simulation$expressionData @@ -226,86 +300,20 @@ rmse <- function(error){ #' flexmixOutput <- useFlexmix(expression = expression, maxModality = 2) #' #without messages printed to console #' flexmixOutput <- useFlexmix(expression = expression, maxModality = 2, verbose = FALSE) -#' +#' } #' @import flexmix +#' @importFrom doParallel registerDoParallel +#' @importFrom parallel detectCores makeCluster stopCluster +#' @importFrom foreach foreach "%dopar%" #' @importFrom plyr compact +#' @importFrom stats BIC #' @export #' -useFlexmix <- function(expression,maxModality, verbose = TRUE,parallel=FALSE){ - modelGauss <- c() - modelGamma <- c() +useFlexmix <- function(expression,maxModality=3,verbose = TRUE,parallel=FALSE,pathToClinicalData="pathTo/clinicalData",pathToExpressionmatrix="pathTo/expressionmatrix"){ + '%dopar%' <- foreach::`%dopar%` + if (verbose){message("flexmix models for gaussian and gamma mixture models are calculated and compared...")} genes <- rownames(expression) patients <- names(expression) - - if(verbose){message("Creating and fitting the model with gaussian distributions to each gene")} - flexModelGauss <- lapply(1:maxModality,function(i){ - if(parallel==FALSE){ - flexModel <- lapply(1:nrow(expression),function(j){ - modelGauss <- flexmix::FLXMRglm(family = "gaussian") - modelGamma <- flexmix::FLXMRglm(family = "Gamma") - model <- rep(x = "modelGauss",i) - models <- sapply(model,function(y){eval(parse(text=y))}) - flexfit <- flexmix::flexmix(t(expression[j,])~1,data = expression,k = i, model = models) - if(i==1){ - means <- flexmix::parameters(flexfit)[1] - sds <- flexmix::parameters(flexfit)[2] - sizes <- unname(table(flexmix::clusters(flexfit))) - }else{ - means <- unlist(unname(flexmix::parameters(flexfit)[[1]][1,])) - sds <- unlist(unname(flexmix::parameters(flexfit)[[1]][2,])) - sizes <- unlist(unname(table(flexmix::clusters(flexfit)))) - } - c("means"=list(means),"sds"=list(sds),"sizes"=list(sizes),"classification"=list(flexmix::clusters(flexfit))) - }) - } - if(parallel){ - modelGauss <- flexmix::FLXMRglm(family = "gaussian") - modelGamma <- flexmix::FLXMRglm(family = "Gamma") - n.cores = parallel::detectCores()-1 - cl = parallel::makeCluster(n.cores) - parallel::clusterExport(cl, c("expression")) - parallel::clusterEvalQ(cl, library(flexmix)) - flexModel <- parallel::parLapply(cl,1:nrow(expression),function(j){ - model <- rep(x = "modelGauss",i) - models <- sapply(model,function(y){eval(parse(text=y))}) - flexfit <- flexmix::flexmix(t(expression[j,])~1,data = expression,k = i, model = models) - if(i==1){ - means <- flexmix::parameters(flexfit)[1] - sds <- flexmix::parameters(flexfit)[2] - sizes <- unname(table(flexmix::clusters(flexfit))) - }else{ - means <- unlist(unname(flexmix::parameters(flexfit)[[1]][1,])) - sds <- unlist(unname(flexmix::parameters(flexfit)[[1]][2,])) - sizes <- unlist(unname(table(flexmix::clusters(flexfit)))) - } - c("means"=list(means),"sds"=list(sds),"sizes"=list(sizes),"classification"=list(flexmix::clusters(flexfit))) - }) - } - if(parallel==TRUE){parallel::stopCluster(cl)} - names(flexModel) <- genes - flexModel - - }) - - flexModelGaussRMSE <- lapply(1:maxModality,function(i){ - RMSE <- lapply(1:nrow(expression),function(j){ - prediction <- unlist(lapply(1:length(flexModelGauss[[i]][[j]]$sizes),function(k){ - predExpression <- stats::rnorm(n = flexModelGauss[[i]][[j]]$sizes[k], - mean = flexModelGauss[[i]][[j]]$means[k], - sd = flexModelGauss[[i]][[j]]$sds[k]) - })) - rmseVal <- unlist(rmse((expression[j,] - prediction))) - }) - }) - bestModelGauss <- unlist(lapply(1:nrow(expression),function(i){ - values <- unlist(lapply(1:maxModality,function(j){ - flexModelGaussRMSE[[j]][[i]] - })) - which.min(values) - })) - - if(verbose){message("Creating and fitting the model with gamma and gaussian distributions to each gene")} - expression2 <- (lapply(1:nrow(expression),function(i){ unlist(lapply(1:length(expression),function(j){ if((expression[i,j]>0)){ @@ -318,127 +326,215 @@ useFlexmix <- function(expression,maxModality, verbose = TRUE,parallel=FALSE){ expression2 <- data.frame(do.call('rbind',expression2)) colnames(expression2) <- colnames(expression) rownames(expression2) <- rownames(expression) + if(parallel){ + modelGauss <- flexmix::FLXMCnorm1() + modelGamma <- flexmix::FLXMCdist1(dist = "gamma") + nCores = parallel::detectCores()-1 + cl <- parallel::makeCluster(nCores) + doParallel::registerDoParallel(cl) + outputs <- foreach::foreach(i = 1:nrow(expression),.packages = c("flexmix")) %dopar% { + flexModelGammaAll <- NULL + flexModelGaussAll <- NULL + flexModelGaussAll <- plyr::compact(lapply(1:maxModality,function(j){ + modeled <- try(flexmix::flexmix(t(expression[i,])~1,model = modelGauss,k=j),silent = TRUE) + if(class(modeled)!="try-error"){ + modeled + } + })) + if(length(flexModelGaussAll)>0){ + BICModelGauss <- unlist(lapply(1:length(flexModelGaussAll),function(j){ + stats::BIC(flexModelGaussAll[[j]]) + })) + bestBICGauss <- unique(min(BICModelGauss)) + bestModelGauss <- flexModelGaussAll[[min(which(BICModelGauss==bestBICGauss))]] + } - flexModelGamma <- lapply(1:maxModality,function(i){ - - if(parallel==FALSE){ - modelGauss <- flexmix::FLXMRglm(family = "gaussian") - modelGamma <- flexmix::FLXMRglm(family = "Gamma") - flexModel <- lapply(1:nrow(expression2),function(j){ - model <- rep(x = c("modelGamma","modelGauss"),c(1,i-1)) - models <- sapply(model,function(y){eval(parse(text=y))}) - flexfit <- flexmix::flexmix(t(expression2[j,])~1,data = expression2,k = i, model = models) - if(i==1){ - means <- flexmix::parameters(flexfit)[1] - sds <- flexmix::parameters(flexfit)[2] - sizes <- unname(table(flexmix::clusters(flexfit))) + flexModelGammaAll <- plyr::compact(lapply(1:maxModality,function(j){ + modeled <- try(flexmix::flexmix(t(expression[i,])~1,model = modelGamma,k=j),silent = TRUE) + if(class(modeled)!="try-error"){ + modeled + } + })) + if(length(flexModelGammaAll)>0){ + BICModelGamma <- unlist(lapply(1:length(flexModelGammaAll),function(j){ + stats::BIC(flexModelGammaAll[[j]]) + })) + bestBICGamma <- unique(min(BICModelGamma)) + bestModelGamma <- flexModelGammaAll[[min(which(BICModelGamma==bestBICGamma))]] + } + if(length(flexModelGaussAll)==0||length(flexModelGammaAll)==0){ + if(length(flexModelGaussAll)==0&&length(flexModelGammaAll)==0){ + stop('no model fit') }else{ - means <- unlist(unname(flexmix::parameters(flexfit)[[1]][1,])) - sds <- unlist(unname(flexmix::parameters(flexfit)[[1]][2,])) - sizes <- unlist(unname(table(flexmix::clusters(flexfit)))) + if(length(flexModelGammaAll)==0){ + bestModel <- "gauss" + }else{ + bestModel <- "gamma" + } } - c("means"=list(means),"sds"=list(sds),"sizes"=list(sizes),"classification"=list(flexmix::clusters(flexfit))) - }) - } - if(parallel){ - modelGauss <- flexmix::FLXMRglm(family = "gaussian") - modelGamma <- flexmix::FLXMRglm(family = "Gamma") - n.cores = parallel::detectCores()-1 - cl = parallel::makeCluster(n.cores) - #parallel::clusterExport(cl, c("expression2")) - parallel::clusterEvalQ(cl, library(flexmix)) - flexModel <- parallel::parLapply(cl,1:nrow(expression2),function(j){ - model <- rep(x = c("modelGamma","modelGauss"),c(1,i-1)) - models <- sapply(model,function(y){eval(parse(text=y))}) - flexfit <- flexmix::flexmix(t(expression2[j,])~1,data = expression2,k = i, model = models) - if(i==1){ - means <- flexmix::parameters(flexfit)[1] - sds <- flexmix::parameters(flexfit)[2] - sizes <- unname(table(flexmix::clusters(flexfit))) + }else{ + bestModel <- c() + if(bestBICGauss0){ + BICModelGauss <- unlist(lapply(1:length(flexModelGaussAll),function(j){ + stats::BIC(flexModelGaussAll[[j]]) + })) + bestBICGauss <- unique(min(BICModelGauss)) + bestModelGauss <- flexModelGaussAll[[min(which(BICModelGauss==bestBICGauss))]] + } + flexModelGammaAll <- plyr::compact(lapply(1:maxModality,function(j){ + modeled <- try(flexmix::flexmix(t(expression[i,])~1,model = modelGamma,k=j),silent = TRUE) + if(class(modeled)!="try-error"){ + modeled + } + })) + if(length(flexModelGammaAll)>0){ + BICModelGamma <- unlist(lapply(1:length(flexModelGammaAll),function(j){ + stats::BIC(flexModelGammaAll[[j]]) + })) + bestBICGamma <- unique(min(BICModelGamma)) + bestModelGamma <- flexModelGammaAll[[min(which(BICModelGamma==bestBICGamma))]] + } + if(length(flexModelGaussAll)==0||length(flexModelGammaAll)==0){ + if(length(flexModelGaussAll)==0&&length(flexModelGammaAll)==0){ + stop('no model fit') + }else{ + if(length(flexModelGammaAll)==0){ + bestModel <- "gauss" + }else{ + bestModel <- "gamma" + } + } + }else{ + bestModel <- c() + if(bestBICGauss0] + sizes <- unlist(lapply(1:length(groups),function(j){ + length(unlist(expression[i,groups[[j]]])) + })) + }else{ + modus <- flexmixOutput$Genes[[i]]$modus + means <- flexmixOutput$Genes[[i]]$means + sds <- flexmixOutput$Genes[[i]]$sds + sizes <- flexmixOutput$Genes[[i]]$sizes + groups <- flexmixOutput$Genes[[i]]$groups } - if(modus==2&&(sizes[1]==ncol(expression2)||sizes[2]==ncol(expression2))){ - modus=1 - means <- means[!is.na(means)] - sds <- sds[!is.na(sds)] - sizes <- sizes[!is.na(sizes)] - groups <- plyr::compact(groups) - } - - output <- list( - "modus" = modus, - "means" = means, - "sds" = sds, - "sizes" = sizes, - "groups" = groups - ) - - }) - - names(outputs)<- genes - flexmixOutput <- list(Genes = outputs,"ClinicalData"="mnt/agnerds/xyz","Expressionmatrix"="mnt/agnerds/xyzyy") + flexmixOutput <- list("modus"=as.integer(modus),"means"=means,"sds"=sds,"sizes"=as.integer(sizes),"groups"=groups) + flexmixOutput + })) + names(cleanOutput) <- names(flexmixOutput$Genes) + flexmixOutput <- list("Genes"=cleanOutput, + "ClinicalData"=flexmixOutput$ClinicalData, + "Expressionmatrix"=flexmixOutput$Expressionmatrix) return(flexmixOutput) } - diff --git a/R/evaluate.R b/R/evaluate.R index e56ca4b..e1ce4e9 100644 --- a/R/evaluate.R +++ b/R/evaluate.R @@ -66,6 +66,7 @@ isApproxSame<- function(validationVal,classVal,maxDistance){ #' value and the validation data value with which the estimated is classified #' as TRUE (correctly estimated) #' @examples +#' \dontrun{ #' params <- newParams(nGenes = list("1"=10,"2"=list("gauss"= 10,"gamma"= 10))) #' simulation <- simulateExpression(params= params,verbose=FALSE) #' expression <- simulation$expressionData @@ -77,6 +78,7 @@ isApproxSame<- function(validationVal,classVal,maxDistance){ #' # TRUE if less than +- 20 % difference to value of validationData #' evaluation <- evaluateAlgorithmOutput(modality = 2,algorithmOutput = output, #' validationData = validationData,maxDifference = 20) +#' } #' @return Returns a data.frame where for each as TP classified gene it was #' evaluated whether it was estimated approximately correct or not (TRUE or #' FALSE) @@ -90,9 +92,7 @@ isApproxSame<- function(validationVal,classVal,maxDistance){ #' @export evaluateAlgorithmOutput <- function(modality,algorithmOutput,validationData,maxDifference = 10){ compared <- compareScenarios(modality = modality,algorithmOutput = algorithmOutput,validationData = validationData) - genes <- names(validationData) - approxSame <- (lapply(1:length(algorithmOutput$Genes),function(i){ if((algorithmOutput$Genes[[i]]$modus==modality) && (validationData[[i]]$modus==modality)){ classValsMean <- sort(algorithmOutput$Genes[[i]]$means) @@ -111,7 +111,6 @@ evaluateAlgorithmOutput <- function(modality,algorithmOutput,validationData,maxD props <- unlist(lapply(1:length(validationData[[i]]$sizes),function(j){ isApproxSame(validationVal = validationData[[i]]$sizes[j],classVal = classValProps[j],maxDistance = maxDifference) })) - c(means,sds,props) } })) @@ -137,6 +136,7 @@ evaluateAlgorithmOutput <- function(modality,algorithmOutput,validationData,maxD #' @param validationData The validationData behind the simulated gene #' expression data frame #' @examples +#' \dontrun{ #' params <- newParams(nGenes = list("1"=10,"2"=list("gauss"= 10,"gamma"= 10))) #' simulation <- simulateExpression(params= params,verbose=FALSE) #' expression <- simulation$expressionData @@ -144,6 +144,7 @@ evaluateAlgorithmOutput <- function(modality,algorithmOutput,validationData,maxD #' mclustOutput <- useMclust(expression, verbose=FALSE) #' getClassifications(modality = 2,validationData = validationData, #' outputs = list("mclust" = mclustOutput)) +#' } #' @return a classification data frame that contains the number of #' FN, FP, TN and TP #' @@ -160,16 +161,12 @@ evaluateAlgorithmOutput <- function(modality,algorithmOutput,validationData,maxD #' FP (FALSE Positive): validationData: unimodal, assigned scenario: bimodal #' getClassification <- function(modality,algorithmOutput,algorithmName="mclust",validationData){ - compared <- compareScenarios(modality = modality, - algorithmOutput = algorithmOutput,validationData = validationData) - + algorithmOutput = algorithmOutput,validationData = validationData) fn <- table(compared)["FN"] fp <- table(compared)["FP"] tn <- table(compared)["TN"] tp <- table(compared)["TP"] - - classification <- data.frame(matrix(nrow = 1,ncol = 4,data = c(fn,fp,tn,tp), byrow = TRUE)) colnames(classification)<-c("FN","FP","TN","TP") @@ -187,6 +184,7 @@ getClassification <- function(modality,algorithmOutput,algorithmName="mclust",va #' @param algorithmName The name of the algorithm which created the output #' @return A data frame with the percentual proportion of as TRUE or FALSE evaluated values #' @examples +#' \dontrun{ #' params <- newParams(nGenes = list("1"=10,"2"=list("gauss"= 10,"gamma"= 10))) #' simulation <- simulateExpression(params= params,verbose=FALSE) #' expression <- simulation$expressionData @@ -195,6 +193,7 @@ getClassification <- function(modality,algorithmOutput,algorithmName="mclust",va #' mclustEvaluation <- evaluateAlgorithmOutput(modality = 2, algorithmOutput = mclustOutput, #' validationData = validationData) #' getStatistics(evaluations = list("mclust" = mclustEvaluation)) +#' } getStatistic <- function(evaluation,algorithmName ="mclust"){ true <- (table(unlist(evaluation))["TRUE"]/(table(unlist(evaluation))["FALSE"]+table(unlist(evaluation))["TRUE"]))*100 false <- (table(unlist(evaluation))["FALSE"]/(table(unlist(evaluation))["FALSE"]+table(unlist(evaluation))["TRUE"]))*100 @@ -213,6 +212,7 @@ getStatistic <- function(evaluation,algorithmName ="mclust"){ #' @param evaluations list of named evaluations #' @return A data frame with the percentual proportion of as TRUE or FALSE evaluated values #' @examples +#' \dontrun{ #' params <- newParams(nGenes = list("1"=10,"2"=list("gauss"= 10,"gamma"= 10))) #' simulation <- simulateExpression(params= params,verbose=FALSE) #' expression <- simulation$expressionData @@ -228,10 +228,11 @@ getStatistic <- function(evaluation,algorithmName ="mclust"){ #' getStatistics(evaluations = list("mclust"=mclustEvaluation)) #' #getStatistics of two or more algorithms #' getStatistics(evaluations = list("mclust"=mclustEvaluation,"HDBSCAN"=hdbscanEvaluation)) +#' } #' @export getStatistics <- function(evaluations) { numEvaluations <- length(evaluations) - statistics <- lapply(1:numEvaluations, function(i) getStatistic(evaluation = evaluations[[i]], algorithmName = names(evaluations)[i])) + statistics <- lapply(1:numEvaluations, function(i) {getStatistic(evaluation = evaluations[[i]], algorithmName = names(evaluations)[i])}) stats <- data.frame(do.call('cbind',statistics)) colnames(stats)<- names(evaluations) rownames(stats)<- c("TRUE [%]","FALSE [%]") @@ -251,6 +252,7 @@ getStatistics <- function(evaluations) { #' @return a classification data frame that contains the number of #' FN, FP, TN and TP #' @examples +#' \dontrun{ #' params <- newParams(nGenes = list("1"=10,"2"=list("gauss"= 10,"gamma"= 10))) #' simulation <- simulateExpression(params= params,verbose=FALSE) #' expression <- simulation$expressionData @@ -264,6 +266,7 @@ getStatistics <- function(evaluations) { #' #getClassifications of two or more algorithms #' getClassifications(modality=2,validationData, #' outputs = list("mclust"=mclustOutput,"HDBSCAN"=hdbscanOutput)) +#' } #' #' @details #' TN (TRUE Negative): validationData: unimodal, assigned scenario: unimodal @@ -281,10 +284,9 @@ getStatistics <- function(evaluations) { getClassifications <- function(modality,validationData,outputs){ numOutputs <- (length(outputs)) classification <- lapply(1:numOutputs,function(i){ - getClassification(modality,algorithmOutput = outputs[[i]], - algorithmName = names(outputs)[i], - validationData = validationData) - + getClassification(modality,algorithmOutput = outputs[[i]], + algorithmName = names(outputs)[i], + validationData = validationData) }) classification <- data.frame(do.call('rbind',classification)) colnames(classification)<-c("FN","FP","TN","TP") @@ -292,3 +294,81 @@ getClassifications <- function(modality,validationData,outputs){ return(classification) } +#' Checks if output fulfills the criteria of the unified output data format +#' @details prints status messages about performed checks +#' @param output to be tested +#' @param expressionmatrix that was used by algorithm to create output +#' @return logical: TRUE if output is in the correct format, FALSE if any check was failed +#' @export +isValidUnifiedOutputDataFormat <- function(output,expressionmatrix){ + correctType <- FALSE + correctLengthOutput <- FALSE + correctLengthGenes <- FALSE + correctFormatClinicalData <- FALSE + correctFormatExpressionmatrix <- FALSE + correctFormatAllGenes <- FALSE + + if(is.list(output)){ + message("Output is of type 'list'") + correctType <- TRUE + }else{ + message("Output is not of required type 'list'.") + return(FALSE)} + if(length(output)==3){ + message("Output has a length of 3.") + correctLengthOutput <- TRUE + }else{ + message("Output has not a length of 3.") + return(FALSE)} + if(length(output$Genes)==nrow(expressionmatrix)){ + message("Output$Genes holds the correct number of genes.") + correctLengthGenes <- TRUE + }else{ + message("Output does not hold the correct number of genes.") + return(FALSE)} + if((is.character(output$ClinicalData))&&(length(output$ClinicalData)==1)){ + message("Output$ClinicalData is of the required type 'character' and the required length of 1.") + correctFormatClinicalData <- TRUE + }else{ + message("Output$ClinicalData is not of the required type 'character' and/or not of the required length of 1.") + return(FALSE)} + if((is.character(output$Expressionmatrix))&&(length(output$Expressionmatrix)==1)){ + message("Output$Expressionmatrix is of the required type 'character' and of the required length of 1.") + correctFormatExpressionmatrix <- TRUE + }else{ + message("Output$Expressionmatrix is not of the required type 'character' and/or not of the required length of 1.") + return(FALSE)} + correctlyFormattedGenes <- try(unlist(lapply(1:length(output$Genes),function(i){ + isCorrectFormattedGene <- FALSE + allCorrectFormat <- is.integer(output$Genes[[i]]$modus)&& + is.double(output$Genes[[i]]$means)&&(length(output$Genes[[i]]$means)==output$Genes[[i]]$modus)&& + is.double(output$Genes[[i]]$sds)&&(length(output$Genes[[i]]$sds)==output$Genes[[i]]$modus)&& + is.integer(output$Genes[[i]]$sizes)&&(length(output$Genes[[i]]$sizes)==output$Genes[[i]]$modus)&& + is.list(output$Genes[[i]]$groups)&&(length(output$Genes[[i]]$groups)==output$Genes[[i]]$modus)&& + (sum(output$Genes[[i]]$sizes)==ncol(expressionmatrix)) + listCorrectLength<- unlist(lapply(1:length(output$Genes[[i]]$groups),function(j){ + length(output$Genes[[i]]$groups[[j]])==output$Genes[[i]]$sizes[[j]] + })) + allListsCorrectLength <- (!any(listCorrectLength==FALSE)) + + isCorrectFormattedGene <- allCorrectFormat&&allListsCorrectLength + })),silent = T) + if(class(correctlyFormattedGenes)=="try-error"){ + message("The correct format of the genes could not be evaluated due to wrong characteristics of the Genes section.") + return(FALSE)} + else{ + correctFormatAllGenes <-(!any(correctlyFormattedGenes==FALSE)) + } + if(correctFormatAllGenes){ + message("All Genes are in the correct format.") + }else{ + message("One or more genes do not fulfil the required format of the Genes section.") + } + + return(correctType&& + correctLengthOutput&& + correctLengthGenes&& + correctFormatClinicalData&& + correctFormatExpressionmatrix&& + correctFormatAllGenes) +} diff --git a/R/filter.R b/R/filter.R index 48cbfca..f6fe9b6 100644 --- a/R/filter.R +++ b/R/filter.R @@ -1,14 +1,54 @@ +#' Filter 0-sum genes +#' +#' @param cancerData TCGA cancer expression data of one cancer group +#' @param parallel logical. Whether to calculate in parallel +#' +#' @return cleared data without 0 sum genes +#' @importFrom doParallel registerDoParallel +#' @importFrom parallel detectCores makeCluster stopCluster +#' @importFrom foreach foreach "%dopar%" +#' @importFrom plyr compact +#' @export +filter0sumGenes <- function(cancerData,parallel =TRUE){ + '%dopar%' <- foreach::`%dopar%` + if(parallel){ + cores <- parallel::detectCores() - 1 + cl <- parallel::makeCluster(cores) + doParallel::registerDoParallel(cl) + i <- c() + notZeroSum <- unlist(plyr::compact(foreach::foreach(i = 1:nrow(cancerData)) %dopar% { + if (sum(cancerData[i, ]) != 0) { + i + } + })) + parallel::stopCluster(cl) + }else{ + notZeroSum <- unlist(plyr::compact(lapply(1:nrow(cancerData),function(i){ + if (sum(cancerData[i, ]) != 0) { + i + } + }))) + } + clearedData <- cancerData[notZeroSum, ] + return(clearedData) +} + #' SilvermanOverP returns a filter function with bindings for P. -#' This function evaluates to TRUE if the Silverman Test's for unimodality can be rejected at the significance level P. #' -#' @param p The P-value that is used to reject Silvermans Test for unimodality (given by k=1, m = 1 using the Hall/York adjustments) +#' @details The null hypothesis of a silvermantest is that the underlying +#' density has at most k modes (H0: number of modes <= k). By rejecting +#' this hypothesis (returned p-value is smaller than given level of +#' significance) one can verify that the underlying density has more +#' than k modes. +#' This function evaluates to TRUE if the Silverman Test's for unimodality (k=1) +#' can be rejected at the significance level p. +#' @param p The p-value that is used to reject Silvermans Test for unimodality (given by k=1 using the Hall/York adjustments) #' @param R The number of bootstrap replicates for each test. #' @param na.rm Whether NAs should be removed. #' #' @return A function with bindings for p and R. -#' @import silvermantest +#' @importFrom silvermantest silverman.test #' -#' @export SilvermanOverP <- function(p, R = 100, na.rm = T) { function(x) { if (na.rm) @@ -18,16 +58,53 @@ SilvermanOverP <- function(p, R = 100, na.rm = T) { } } +#' useSilvermantest uses the SilvermanOverP function and returns filtered data of genes for that SilvermanOverP was TRUE +#' +#' @param clearedData cleared TCGA cancer expression data of one cancer group without 0sum genes +#' @param parallel logical. Whether to calculate in parallel +#' @param SilvermanP The p-value that is used to reject Silvermans Test for unimodality (given by k=1 using the Hall/York adjustments) +#' +#' @return filtered gene expression data of possibly multimodal data +#' @importFrom parallel detectCores makeCluster stopCluster +#' @importFrom doParallel registerDoParallel +#' @importFrom plyr compact +#' @importFrom foreach foreach "%dopar%" +#' @export +useSilvermantest <- function(clearedData,parallel,SilvermanP=0.05){ + '%dopar%' <- foreach::`%dopar%` + if(parallel){ + cores <- parallel::detectCores() - 1 + cl <- parallel::makeCluster(cores) + doParallel::registerDoParallel(cl) + i <- c() + filtered <- foreach::foreach(i = 1:nrow(clearedData),.export="SilvermanOverP") %dopar% + { + apply(clearedData[i, ], 1, FUN = SilvermanOverP(p = SilvermanP)) + } + parallel::stopCluster(cl) + }else{ + filtered <- lapply(1:nrow(clearedData),function(i){ + apply(clearedData[i, ], 1, FUN = SilvermanOverP(p = SilvermanP)) + }) + } + + mayBeBimodal <- unlist(lapply(1:length(filtered),function(i){ + if (filtered[i] == TRUE) { + i + } + })) + mayBeBimodal <- unlist(plyr::compact(mayBeBimodal)) + possibles <- clearedData[mayBeBimodal, ] + return(possibles) +} + #' DipTestOverP returns a filter function with bindings for P. #' This function evaluates to TRUE if the Hartigans Dip Test for unimodality can be rejected at the significance level P. -#' -#' @param p The P-value that is used to reject Hartigans Dip Test of unimodality ( +#' @param p The P-value that is used to reject Hartigans Dip Test of unimodality #' @param R The number of bootstrap replicates for each test. #' @param na.rm Whether NAs should be removed. -#' #' @return A function with bindings for p and R. -#' @import diptest -#' @export +#' @importFrom diptest dip.test DipTestOverP <- function(p, R = 100, na.rm = T) { function(x) { if (na.rm) @@ -36,3 +113,709 @@ DipTestOverP <- function(p, R = 100, na.rm = T) { h$p.value < p } } + +#' useDiptest uses the DipTestOverP function and returns filtered data of genes for that DipTestOverP was TRUE +#' +#' @param clearedData cleared TCGA cancer expression data of one cancer group without 0sum genes +#' @param parallel logical. Whether to calculate in parallel +#' @param DipTestP The p-value that is used to reject Hartigans Dip Test +#' +#' @return filtered gene expression data of possibly multimodal data +#' @importFrom parallel detectCores makeCluster stopCluster +#' @importFrom doParallel registerDoParallel +#' @importFrom plyr compact +#' @importFrom foreach foreach "%dopar%" +#' @export +useDiptest <- function(clearedData,parallel,DipTestP=0.05){ + '%dopar%' <- foreach::`%dopar%` + if(parallel){ + cores <- parallel::detectCores() - 1 + cl <- parallel::makeCluster(cores) + doParallel::registerDoParallel(cl) + i <- c() + filtered <- foreach::foreach(i = 1:nrow(clearedData),.export = "DipTestOverP") %dopar% + { + apply(clearedData[i, ], 1, FUN = DipTestOverP(p = DipTestP)) + } + parallel::stopCluster(cl) + }else{ + filtered <- lapply(1:nrow(clearedData),function(i){ + apply(clearedData[i, ], 1, FUN = DipTestOverP(p = DipTestP)) + }) + } + + mayBeBimodal <- unlist(lapply(1:length(filtered),function(i){ + if (filtered[i] == TRUE) { + i + } + })) + mayBeBimodal <- unlist(plyr::compact(mayBeBimodal)) + possibles <- clearedData[mayBeBimodal, ] + return(possibles) +} + +#' Filter rarely expressed genes +#' +#' @details filters out genes that are rarely expressed +#' (expressed by less than minPercentage of patients) +#' @param possibles cancerData of possible multimodal genes +#' @param minExpressedPercentage numerical. minimum percentage cutoff Value for +#' the minimum percentage of patients +#' @param parallel logical. Whether to calculate in parallel +#' @return cancerData of genes that are not rarely expressed +#' @importFrom parallel detectCores makeCluster stopCluster +#' @importFrom doParallel registerDoParallel +#' @importFrom foreach foreach "%dopar%" +#' @export +filterRarelyExpressedGenes<- function(possibles,minExpressedPercentage= 2,parallel=TRUE){ + '%dopar%' <- foreach::`%dopar%` + if(parallel){ + cores <- parallel::detectCores() - 1 + cl <- parallel::makeCluster(cores) + doParallel::registerDoParallel(cl) + i <- c() + greaterZero <- unlist(foreach::foreach(i = 1:nrow(possibles)) %dopar% + { + areGreaterZero <- (table(unlist(possibles[i, ]) > 0)["TRUE"]) + if (is.na(areGreaterZero)) { + areGreaterZero <- 0 + } + + if (unname(areGreaterZero > (length(possibles[i, ]) * minExpressedPercentage/100))) { + i + } + }) + parallel::stopCluster(cl) + }else{ + greaterZero <- unlist(lapply(1:nrow(possibles),function(i){ + areGreaterZero <- (table(unlist(possibles[i, ]) > 0)["TRUE"]) + if (is.na(areGreaterZero)) { + areGreaterZero <- 0 + } + if (unname(areGreaterZero > (length(possibles[i, ]) * minExpressedPercentage/100))) { + i + } + })) + } + toBeProcessed <- possibles[greaterZero, ] + return(toBeProcessed) +} + +#' cleanOutput +#' returns the cleaned output of algorithms +#' @details This function cleans up the output as empty groups will be deleted, +#' genes with groups < minPercentage of the sum of patients are corrected and +#' modus, means, sds, group sizes and patients belonging to each group will be updated +#' @param cancerGroupOutput output of one cancer group +#' @param cancerGroupExpressionmatrix expression matrix of this one cancer group +#' @param minPercentage minimum percentage of groups. +#' Groups with lower percentages will be sorted into the other groups +#' @param coresToUse The number of cores to use for parallel computing +#' @param parallel logical. Whether cleanOutput should be computed in parallel +#' @return cleaned output of algorithms +#' @importFrom parallel makeCluster stopCluster +#' @importFrom doParallel registerDoParallel +#' @importFrom plyr compact +#' @importFrom foreach foreach "%dopar%" +#' @export +cleanOutput <- function(cancerGroupOutput,cancerGroupExpressionmatrix,minPercentage=10,coresToUse = 20,parallel=TRUE){ + '%dopar%' <- foreach::`%dopar%` + i <- NULL + cleanOutput <- (lapply(1:length(cancerGroupOutput$Genes),function(i){ + #corrects genes with an empty group to the correct modus, means, sds, sizes and groups + isNull <- unlist(lapply(1:length(cancerGroupOutput$Genes[[i]]$groups),function(j){ + if(is.null(unlist(cancerGroupOutput$Genes[[i]]$groups[j]))||length(unlist(cancerGroupOutput$Genes[[i]]$groups[j]))==0){j} + })) + if(!is.null(isNull)){ + modus <- cancerGroupOutput$Genes[[i]]$modus -1 + means <- cancerGroupOutput$Genes[[i]]$means[-isNull] + sds <- cancerGroupOutput$Genes[[i]]$sds[-isNull] + groups <- plyr::compact(cancerGroupOutput$Genes[[i]]$groups) + groups <- groups[lapply(groups,length)>0] + sizes <- unlist(lapply(1:length(groups),function(j){ + length(unlist(cancerGroupExpressionmatrix[i,groups[[j]]])) + })) + }else{ + modus <- cancerGroupOutput$Genes[[i]]$modus + means <- cancerGroupOutput$Genes[[i]]$means + sds <- cancerGroupOutput$Genes[[i]]$sds + sizes <- cancerGroupOutput$Genes[[i]]$sizes + groups <- cancerGroupOutput$Genes[[i]]$groups + } + output <- list("modus"=as.integer(modus),"means"=means,"sds"=sds,"sizes"=as.integer(sizes),"groups"=groups) + output + })) + names(cleanOutput) <- names(cancerGroupOutput$Genes) + outputFiltered <- list("Genes"=cleanOutput, + "ClinicalData"=cancerGroupOutput$ClinicalData, + "Expressionmatrix"=cancerGroupOutput$Expressionmatrix) + + ##corrects genes with groups < minPercentage of patientsum --> patients are sorted to groups with nearest mean, + ## sizes are then newly counted, groups are updated, as well as means, modus and sds + cores <- coresToUse + cleanedOutput <- c() + if(parallel){ + cl <- parallel::makeCluster(cores) + doParallel::registerDoParallel(cl) + cleanedOutput <- (foreach::foreach(i= 1:length(outputFiltered$Genes)) %dopar% { + geneName <- names(outputFiltered$Genes)[i] + if(any(((outputFiltered$Genes[[geneName]]$sizes)/sum(outputFiltered$Genes[[geneName]]$sizes)*100)0){ + changeNow <- which(outputFiltered$Genes[[geneName]]$sizes==min(outputFiltered$Genes[[geneName]]$sizes)) + if(any(((outputFiltered$Genes[[geneName]]$sizes)/sum(outputFiltered$Genes[[geneName]]$sizes)*100)=3 --> first two groups are combined, rest remains unchanged + if(modus>=3){ + sizes <- c(sum(outputFiltered$Genes[[geneName]]$sizes[1:2]), + outputFiltered$Genes[[geneName]]$sizes[3:modus]) + groups <- list(unlist(c(outputFiltered$Genes[[geneName]]$groups[1], + outputFiltered$Genes[[geneName]]$groups[2])), + outputFiltered$Genes[[geneName]]$groups[3:modus]) + means <- unlist(lapply(1:length(groups),function(j){ + mean(unlist(cancerGroupExpressionmatrix[geneName,unlist(groups[j])])) + })) + sds <- unlist(lapply(1:length(groups),function(j){ + sd(unlist(cancerGroupExpressionmatrix[geneName,unlist(groups[j])])) + })) + modus <- modus -1 + }else{ + #modus <=2 (==2 because unimodals can't have size numbers unequal to patient number) + #groups are combined + sizes <- c(sum(outputFiltered$Genes[[geneName]]$sizes[1:2])) + groups <- list(unlist(c(outputFiltered$Genes[[geneName]]$groups[1], + outputFiltered$Genes[[geneName]]$groups[2]))) + means <- unlist(lapply(1:length(groups),function(j){ + mean(unlist(cancerGroupExpressionmatrix[geneName,unlist(groups[j])])) + })) + sds <- unlist(lapply(1:length(groups),function(j){ + sd(unlist(cancerGroupExpressionmatrix[geneName,unlist(groups[j])])) + })) + modus <- modus -1 + } + }else if(changeNow==max(1:modus)&&modus>1){ + #changeNow is greatest mean + if(modus>=3){ + #highest two groups are combined (last 2 groups), groups 1 to modus-2 remain unchanged + sizes <- c(outputFiltered$Genes[[geneName]]$sizes[1:(modus-2)], + sum(outputFiltered$Genes[[geneName]]$sizes[(modus-1):(modus)])) + groups <- list(outputFiltered$Genes[[geneName]]$groups[1:(modus-2)], + unlist(c(outputFiltered$Genes[[geneName]]$groups[(modus-1)], + outputFiltered$Genes[[geneName]]$groups[modus]))) + means <- unlist(lapply(1:length(groups),function(j){ + mean(unlist(cancerGroupExpressionmatrix[geneName,unlist(groups[j])])) + })) + sds <- unlist(lapply(1:length(groups),function(j){ + sd(unlist(cancerGroupExpressionmatrix[geneName,unlist(groups[j])])) + })) + modus <- modus -1 + }else{ + #modus <=2 (==2 because unimodals can't have size numbers unequal to patient number) + #groups are combined + sizes <- c(sum(outputFiltered$Genes[[geneName]]$sizes[1:2])) + groups <- list(unlist(c(outputFiltered$Genes[[geneName]]$groups[1], + outputFiltered$Genes[[geneName]]$groups[2]))) + means <- unlist(lapply(1:length(groups),function(j){ + mean(unlist(cancerGroupExpressionmatrix[geneName,unlist(groups[j])])) + })) + sds <- unlist(lapply(1:length(groups),function(j){ + sd(unlist(cancerGroupExpressionmatrix[geneName,unlist(groups[j])])) + })) + modus <- modus -1 + } + } + }else{ + ##changeNow is neither biggest nor smallest mean and + ##modus is >=3 because if modus 2 it would have to be greatest/smallest + left <- changeNow -1 + right <- changeNow +1 + patientsToSort <- cancerGroupExpressionmatrix[geneName,unlist(outputFiltered$Genes[[geneName]]$groups[changeNow])] + means <- outputFiltered$Genes[[geneName]]$means + sorted <- unlist(lapply(1:length(patientsToSort),function(j){ + if(abs(means[left]-patientsToSort[j])0){ + changeNow <- which(outputFiltered$Genes[[geneName]]$sizes==min(outputFiltered$Genes[[geneName]]$sizes)) + if(any(((outputFiltered$Genes[[geneName]]$sizes)/sum(outputFiltered$Genes[[geneName]]$sizes)*100)=3 --> first two groups are combined, rest remains unchanged + if(modus>=3){ + sizes <- c(sum(outputFiltered$Genes[[geneName]]$sizes[1:2]), + outputFiltered$Genes[[geneName]]$sizes[3:modus]) + groups <- list(unlist(c(outputFiltered$Genes[[geneName]]$groups[1], + outputFiltered$Genes[[geneName]]$groups[2])), + outputFiltered$Genes[[geneName]]$groups[3:modus]) + means <- unlist(lapply(1:length(groups),function(j){ + mean(unlist(cancerGroupExpressionmatrix[geneName,unlist(groups[j])])) + })) + sds <- unlist(lapply(1:length(groups),function(j){ + sd(unlist(cancerGroupExpressionmatrix[geneName,unlist(groups[j])])) + })) + modus <- modus -1 + }else{ + #modus <=2 (==2 because unimodals can't have size numbers unequal to patient number) + #groups are combined + sizes <- c(sum(outputFiltered$Genes[[geneName]]$sizes[1:2])) + groups <- list(unlist(c(outputFiltered$Genes[[geneName]]$groups[1], + outputFiltered$Genes[[geneName]]$groups[2]))) + means <- unlist(lapply(1:length(groups),function(j){ + mean(unlist(cancerGroupExpressionmatrix[geneName,unlist(groups[j])])) + })) + sds <- unlist(lapply(1:length(groups),function(j){ + sd(unlist(cancerGroupExpressionmatrix[geneName,unlist(groups[j])])) + })) + modus <- modus -1 + } + }else if(changeNow==max(1:modus)&&modus>1){ + #changeNow is greatest mean + if(modus>=3){ + #highest two groups are combined (last 2 groups), groups 1 to modus-2 remain unchanged + sizes <- c(outputFiltered$Genes[[geneName]]$sizes[1:(modus-2)], + sum(outputFiltered$Genes[[geneName]]$sizes[(modus-1):(modus)])) + groups <- list(outputFiltered$Genes[[geneName]]$groups[1:(modus-2)], + unlist(c(outputFiltered$Genes[[geneName]]$groups[(modus-1)], + outputFiltered$Genes[[geneName]]$groups[modus]))) + means <- unlist(lapply(1:length(groups),function(j){ + mean(unlist(cancerGroupExpressionmatrix[geneName,unlist(groups[j])])) + })) + sds <- unlist(lapply(1:length(groups),function(j){ + sd(unlist(cancerGroupExpressionmatrix[geneName,unlist(groups[j])])) + })) + modus <- modus -1 + }else{ + #modus <=2 (==2 because unimodals can't have size numbers unequal to patient number) + #groups are combined + sizes <- c(sum(outputFiltered$Genes[[geneName]]$sizes[1:2])) + groups <- list(unlist(c(outputFiltered$Genes[[geneName]]$groups[1], + outputFiltered$Genes[[geneName]]$groups[2]))) + means <- unlist(lapply(1:length(groups),function(j){ + mean(unlist(cancerGroupExpressionmatrix[geneName,unlist(groups[j])])) + })) + sds <- unlist(lapply(1:length(groups),function(j){ + sd(unlist(cancerGroupExpressionmatrix[geneName,unlist(groups[j])])) + })) + modus <- modus -1 + } + } + }else{ + ##changeNow is neither biggest nor smallest mean and + ##modus is >=3 because if modus 2 it would have to be greatest/smallest + left <- changeNow -1 + right <- changeNow +1 + patientsToSort <- cancerGroupExpressionmatrix[geneName,unlist(outputFiltered$Genes[[geneName]]$groups[changeNow])] + means <- outputFiltered$Genes[[geneName]]$means + sorted <- unlist(lapply(1:length(patientsToSort),function(j){ + if(abs(means[left]-patientsToSort[j])0] + sizes <- unlist(lapply(1:length(groups),function(j){ + length(unlist(cancerGroupExpressionmatrix[i,groups[[j]]])) + })) + + }else{ + modus <- cleanedOutputGroup$Genes[[i]]$modus + means <- cleanedOutputGroup$Genes[[i]]$means + sds <- cleanedOutputGroup$Genes[[i]]$sds + sizes <- cleanedOutputGroup$Genes[[i]]$sizes + groups <- cleanedOutputGroup$Genes[[i]]$groups + } + + groups <- lapply(1:length(groups),function(j){ + unlist(groups[j]) + }) + output <- list("modus"=as.integer(modus),"means"=means,"sds"=sds,"sizes"=as.integer(sizes),"groups"=groups) + output + })) + + names(cleanOutput2) <- names(cleanedOutputGroup$Genes) + output <- list("Genes"=cleanOutput2, + "ClinicalData"=cleanedOutputGroup$ClinicalData, + "Expressionmatrix"=cleanedOutputGroup$Expressionmatrix) + return(output) +} + +#' filterMultimodalGenes +#' returns the output and expression matrix of multimodal genes +#' @details This function returns a list of the output and expression matrix of multimodal genes +#' @param cleanedOutput cleaned output of one cancer group +#' @param processedExpressionmatrix expression matrix of the processed cancer group +#' @return list of the output and expression matrix of multimodal genes +#' @export +filterMultimodalGenes <- function(cleanedOutput,processedExpressionmatrix){ + modus <- unlist(lapply(1:length(cleanedOutput$Genes),function(i){ + cleanedOutput$Genes[[i]]$modus + })) + groupedByModus <- lapply(1:max(modus),function(i){ + unlist(lapply(1:length(modus),function(j){ + if(modus[[j]]==i){ + j + } + })) + }) + multimodalModi <- groupedByModus[2:length(groupedByModus)] + multimodalOutput<- list("Genes"= cleanedOutput$Genes[unlist(multimodalModi)], + "ClinicalData"=cleanedOutput$ClinicalData, + "Expressionmatrix"=cleanedOutput$Expressionmatrix) + multimodalEm<- processedExpressionmatrix[(unlist(names(multimodalOutput$Genes))),] + return(list("Output"=multimodalOutput,"Expressionmatrix"=multimodalEm)) +} + +#' filterForMean +#' returns the output and expression matrix of filtered multimodal genes +#' @details This function returns a list of the output and expression matrix of filtered multimodal genes +#' @param multimodalOutput cleaned output of multimodal genes of one cancer group +#' @param multimodalExpressionmatrix expression matrix of the multimodal genes of one cancer group +#' @param minMean minimum mean that has to exist in any mean of the multimodal genes +#' @return list of the output and expression matrix of filtered multimodal genes +#' @export +filterForMean <- function(multimodalOutput,multimodalExpressionmatrix,minMean = 2){ + filtered <-(unlist(lapply(1:length(multimodalOutput$Genes),function(i){ + geneName <- names(multimodalOutput$Genes)[i] + if(any(unlist(multimodalOutput$Genes[[i]]$means)>minMean)){ + geneName} + }))) + filteredOutput <- list("Genes"= multimodalOutput$Genes[unlist(filtered)], + "ClinicalData"=multimodalOutput$ClinicalData, + "Expressionmatrix"=multimodalOutput$Expressionmatrix) + filteredExpressionmatrix<- multimodalExpressionmatrix[(unlist(filtered)),] + return(list("Output"=filteredOutput,"Expressionmatrix"=filteredExpressionmatrix)) +} + +#' filterForFoldChange +#' returns the output and expression matrix of filtered multimodal genes +#' @details This function returns a list of the output and expression matrix of filtered multimodal genes +#' @param filteredMeanOutput output that was filtered for mean of multimodal genes of one cancer group +#' @param filteredMeanExpressionmatrix expression matrix that was filtered for mean of the multimodal genes of one cancer group +#' @param minFoldChange minimum fold change that has to exist between two adjacent means of the multimodal genes +#' @return list of the output and expression matrix of filtered multimodal genes +#' @export +filterForFoldChange <- function(filteredMeanOutput,filteredMeanExpressionmatrix,minFoldChange=2){ + filtered <-(unlist(lapply(1:length(filteredMeanOutput$Genes),function(i){ + geneName <- names(filteredMeanOutput$Genes)[i] + if(any(diff(unlist(filteredMeanOutput$Genes[[i]]$means))>minFoldChange)){ + geneName} + }))) + filteredOutput <- list("Genes"= filteredMeanOutput$Genes[unlist(filtered)], + "ClinicalData"=filteredMeanOutput$ClinicalData, + "Expressionmatrix"=filteredMeanOutput$Expressionmatrix) + filteredExpressionmatrix<- filteredMeanExpressionmatrix[(unlist(filtered)),] + return(list("Output"=filteredOutput,"Expressionmatrix"=filteredExpressionmatrix)) +} + +#' updateGeneNames +#' returns the output and expression matrix with the updated gene names +#' @details returns the output and expression matrix with the updated gene names of the given output and expression matrix +#' the gene names are loaded from ensembl via the biomaRt package +#' @details This function returns a list of the output and expression matrix of filtered multimodal genes +#' @param filteredOutput output of multimodal genes of one cancer group that was filtered twice +#' @param filteredExpressionmatrix expression matrix of the multimodal genes of one cancer group that was filtered twice +#' @return list of the output and expression matrix with updated gene names taken from Ensembl +#' @importFrom biomaRt useMart getBM +#' @export +updateGeneNames <- function(filteredOutput,filteredExpressionmatrix){ + namesMulti <-(unlist(lapply(1:length(filteredOutput$Genes),function(i){ + unlist(strsplit(names(filteredOutput$Genes)[i],split = "[.]"))[1] + }))) + mart = biomaRt::useMart(biomart="ensembl", dataset="hsapiens_gene_ensembl",host = "www.ensembl.org") + genes = lapply(1:length(namesMulti),function(i){ + geneName <- biomaRt::getBM(attributes = c("ensembl_gene_id","external_gene_name"), + filters = "ensembl_gene_id", + values = namesMulti[i], + mart = mart, + quote = "\\") + if(nrow(geneName)==0){ + geneName <- data.frame("ensembl_gene_id" =namesMulti[i],"external_gene_name"=namesMulti[i]) + } + geneName + }) + genes2 <- do.call('rbind',genes) + geneNames <- rownames(filteredExpressionmatrix)<- make.unique(unlist(genes2[,2])) + names(filteredOutput$Genes) <- rownames((filteredExpressionmatrix)) + return(list("Output"=filteredOutput,"Expressionmatrix"=filteredExpressionmatrix)) +} + +#' Filter output and expressionmatrix for genes that are known in KEGG's "Pathways in Cancer" +#' returns the filtered output and expression matrix +#' @details needs the updated Ensembl Gene Names +#' @param output output of multimodal genes of one cancer group +#' @param expressionmatrix expression matrix of the multimodal genes of one cancer group +#' @return list of the output and expression matrix of genes that were found in KEGG's "Pathways in cancer" +#' @importFrom KEGGREST keggGet +#' @importFrom plyr compact +#' @export +filterForKnownInCancer <- function(output,expressionmatrix){ + query <- KEGGREST::keggGet(c("hsa05200")) + geneEntries <- unlist(plyr::compact(lapply(1:length(query[[1]]$GENE),function(i){ + if(i%%2==0){ + entry <- query[[1]]$GENE[[i]] + strsplit(x = entry,split = ";")[[1]][[1]] + } + }))) + geneNames <- names(output$Genes) + foundInCancerPathways <- unlist(plyr::compact(lapply(1:length(geneNames),function(i){ + if(any(geneEntries==geneNames[[i]])){ + geneNames[[i]] + } + }))) + outputFoundInCancerPathways <- list("Genes"=output$Genes[foundInCancerPathways],"ClinicalData"=output$ClinicalData,"Expressionmatrix"=output$Expressionmatrix) + emFoundInCancerPathways <- expressionmatrix[foundInCancerPathways,] + return(list("Output"=outputFoundInCancerPathways,"Expressionmatrix"=emFoundInCancerPathways)) +} + +#' Filter output and expressionmatrix for genes that are on the X Chromosome of the Human Genome +#' returns filtered output and expression matrix without genes on the X chromosome +#' @details needs the updated Ensembl Gene Names; uses gencode.v28.annotation.gtf +#' @param output output of multimodal genes of one cancer group +#' @param expressionmatrix expression matrix of the multimodal genes of one cancer group +#' @return list of the output and expression matrix of genes that were not found on the X Chromosome +#' @importFrom plyr compact +#' @export +filterForXChromosomeGenes <- function(output,expressionmatrix){ + geneNames <- names(output$Genes) + notXChromGenes <- unlist(plyr::compact(lapply(1:length(geneNames),function(i){ + if(!any(geneNamesX==geneNames[[i]])){ + geneNames[[i]] + } + }))) + filteredOutput <- list("Genes"= output$Genes[unlist(notXChromGenes)], + "ClinicalData"=output$ClinicalData, + "Expressionmatrix"=output$Expressionmatrix) + filteredExpressionmatrix<- expressionmatrix[(unlist(notXChromGenes)),] + return(list("Output"=filteredOutput,"Expressionmatrix"=filteredExpressionmatrix)) +} + +#' Filter output and expressionmatrix for genes that are on the Y Chromosome of the Human Genome +#' returns filtered output and expression matrix without genes on the Y chromosome +#' @details needs the updated Ensembl Gene Names; uses gencode.v28.annotation.gtf +#' @param output output of multimodal genes of one cancer group +#' @param expressionmatrix expression matrix of the multimodal genes of one cancer group +#' @return list of the output and expression matrix of genes that were not found on the Y Chromosome +#' @importFrom plyr compact +#' @export +filterForYChromosomeGenes <- function(output,expressionmatrix){ + geneNames <- names(output$Genes) + notYChromGenes <- unlist(plyr::compact(lapply(1:length(geneNames),function(i){ + if(!any(geneNamesY==geneNames[[i]])){ + geneNames[[i]] + } + }))) + filteredOutput <- list("Genes"= output$Genes[unlist(notYChromGenes)], + "ClinicalData"=output$ClinicalData, + "Expressionmatrix"=output$Expressionmatrix) + filteredExpressionmatrix<- expressionmatrix[(unlist(notYChromGenes)),] + return(list("Output"=filteredOutput,"Expressionmatrix"=filteredExpressionmatrix)) +} diff --git a/R/genesXY.R b/R/genesXY.R new file mode 100644 index 0000000..1f94c0a --- /dev/null +++ b/R/genesXY.R @@ -0,0 +1,19 @@ +#' Genes on the X Chromosome of Homo sapiens +#' +#' Created from gencode.v28.annotation.gtf +#' The result is a vector of gene names +#' Is used in "filterForXChromosomeGenes" function +#' @source \url{https://www.gencodegenes.org/releases/current.html} +#' @docType data +#' @usage data("geneNamesX") +"geneNamesX" + +#' Genes on the Y Chromosome of Homo sapiens +#' +#' Created from gencode.v19.annotation.gtf +#' The result is a vector of gene names +#' Is used in "filterForYChromosomeGenes" function +#' @source \url{https://www.gencodegenes.org/releases/current.html} +#' @docType data +#' @usage data("geneNamesY") +"geneNamesY" diff --git a/R/params.R b/R/params.R index f68df39..abe3d00 100644 --- a/R/params.R +++ b/R/params.R @@ -8,9 +8,10 @@ #' @return The extracted parameter value #' #' @examples +#' \dontrun{ #' params <- newParams() #' getParam(object = params,name = "nGenes") -#' +#' } #' @rdname getParam #' @export setGeneric("getParam", function(object, name) {standardGeneric("getParam")}) @@ -26,9 +27,10 @@ setGeneric("getParam", function(object, name) {standardGeneric("getParam")}) #' @return Object with new parameter value. #' #' @examples +#' \dontrun{ #' params <- newParams() #' params <- setParam(object = params,name = "nPatients",value = 250) -#' +#' } #' @rdname setParam #' @importFrom methods new rbind2 slot slot<- slotNames validObject #' @export @@ -37,21 +39,18 @@ setGeneric("setParam", standardGeneric("setParam") }) - #' The Params virtual class #' -#' Virtual S4 class that all other Params classes inherit from. +#' Virtual S4 class that stores all parameters needed for the simulation of gene expression data. #' #' @section Parameters: -#' #' The Params class defines the following parameters: -#' #' \describe{ #' \item{\code{nGenes}}{A list containing the numbers of genes to simulate for #' each scenario and modality.} #' \item{\code{means}}{A list containing the parameter range for the mean for #' each scenario and modality.} -#' \item{\code{foldChanges}}A list containing the parameter range for the log2 +#' \item{\code{foldChanges}}{A list containing the parameter range for the log2 #' foldChanges between distributions for multimodal genes. Has to contain one #' entry less than the modality that is simulated.} #' \item{\code{sd}}{The parameters mu and lambda for the generation of the @@ -119,23 +118,23 @@ setMethod("show", "Params", function(object) { #' New Params #' -#' Create a new Params object. Functions exist for each of the different -#' Params subtypes. +#' Create a new Params object. #' #' @param ... additional parameters passed to \code{\link{setParams}}. #' #' @return New Params object. #' @examples +#' \dontrun{ #' params <- newParams() -#' params <- newParams("nGenes"=list("1"=100,"2"=list("gauss"=100,"gamma"=100)), +#' params <- newParams("nGenes"=list("1"=10,"2"=list("gauss"=10,"gamma"=10)), #' "proportions"=c(30,70)) #' params <- newParams( -#' nGenes=list("1"=c(700),"2"=list(gauss = 150, gamma = 150), -#' "3"=list(gauss = 150, gamma = 150)), +#' nGenes=list("1"=c(70),"2"=list(gauss = 15, gamma = 15), +#' "3"=list(gauss = 15, gamma = 15)), #' means=list("1"= c(2, 4),"2"= c(2, 4),"3"= c(2, 4)), #' foldChanges = list("1"=NA,"2"= list(gauss = c(2, 4), gamma = c(2, 4)), #' "3"= list(gauss = list(c(2, 4), c(2, 4)), gamma = list(c(2, 4), c(2, 4))))) -#' +#' } #' @rdname newParams #' @export newParams <- function(...) { @@ -155,8 +154,10 @@ newParams <- function(...) { #' #' @return List with the values of the selected parameters. #' @examples +#' \dontrun{ #' params <- newParams() #' getParams(params = params,names = c("nGenes","foldChanges","proportions","means")) +#' } #' @export getParams <- function(params, names) { @@ -185,6 +186,7 @@ getParams <- function(params, names) { #' #' @return Params object with updated values. #' @examples +#' \dontrun{ #' params <- newParams() #' params #' # Set individually @@ -192,10 +194,11 @@ getParams <- function(params, names) { #' foldChanges = list("1"=c(2,4),"2"= list("gauss"=c(2,5),"gamma"=c(3,5)))) #' params #' # Set via update list -#' params <- setParams(params,update = list("nGenes"=list("1"=100, -#' "2"=list("gauss"=200,"gamma"=100)),"means"=list("1"=c(1,3),"2"=c(2,4)), +#' params <- setParams(params,update = list("nGenes"=list("1"=10, +#' "2"=list("gauss"=20,"gamma"=10)),"means"=list("1"=c(1,3),"2"=c(2,4)), #' "nPatients"=700)) #' params +#' } #' @export setParams <- function(params, update = NULL, ...) { @@ -216,7 +219,7 @@ setParams <- function(params, update = NULL, ...) { #' Show pretty print #' -#' Function used for pretty printing params object. +#' Function used for pretty printing Params object. #' #' @param params object to show. #' @param pp list specifying how the object should be displayed. @@ -250,4 +253,3 @@ showPP <- function(params, pp) { cat("\n") } } - diff --git a/R/plotting.R b/R/plotting.R new file mode 100644 index 0000000..820446f --- /dev/null +++ b/R/plotting.R @@ -0,0 +1,418 @@ +#' Plot Gene Expression +#' +#' Plots the (simulated) gene expression data frame, creates a pdf file +#' @param expression The gene expression data frame that shall be plotted +#' @param plotsPerPage The number of plots per Page (has to be a divisor of +#' GenesToPlot) +#' @param GenesToPlot The number of genes to be plotted +#' @param plotName The name of the plot without .pdf +#' @examples +#' \dontrun{ +#' params <- newParams(nGenes = list("1"=5,"2"=list("gauss"= 5,"gamma"= 5))) +#' simulation <- simulateExpression(params= params,verbose=FALSE) +#' expression <- simulation$expressionData +#' plotExpression(expression,plotsPerPage = 4,GenesToPlot = 12,plotName = "plot12Genes") +#' plotExpression(expression,plotsPerPage = 5,GenesToPlot = nrow(expression), +#' plotName = "plotAllGenes") +#' } +#' +#' @return The name of the plot in the pdf file +#' @importFrom ggplot2 aes ggplot geom_density facet_wrap labs theme element_text +#' @importFrom grDevices dev.off pdf +#' @importFrom reshape2 melt +#' @export +plotExpression <- function(expression,plotsPerPage = 1,GenesToPlot = 100, + plotName="expression"){ + plotName <- sprintf(paste0(plotName, ".pdf")) + grDevices::pdf(plotName, 7, 5) + + for (i in seq(0,((GenesToPlot/plotsPerPage))-1,1)){ + section <- (c(((i*plotsPerPage)+1):((i+1)*plotsPerPage))) + #models[((i*plotsPerPage)+1):((i+1)*plotsPerPage)] + expressionVector<-c() + for(j in section){ + expressionVector<-rbind2(expressionVector, expression[j,]) + #transformate data for ggplot + meltedExpression <- reshape2::melt(data = t(expressionVector)) + } + #facetted Plot + value <- meltedExpression + a<- ggplot2::ggplot(meltedExpression, ggplot2::aes(x= value)) + + ggplot2::geom_density()+ + ggplot2::facet_wrap(~Var2)+ + ggplot2::labs(x = "log2 Expression",y = "Density")+ + ggplot2::theme( + axis.title.x = ggplot2::element_text(size = 12), + axis.title.y = ggplot2::element_text(size = 12) + ) + plot(a) + } + grDevices::dev.off() + return(plotName) +} + +#' Source: http://tinyheero.github.io/2015/10/13/mixture-model.html +#' Plot a Mixture Component +#' +#' @param x Input data +#' @param mu Mean of component +#' @param sigma Standard deviation of component +#' @param lam Mixture weight of component +plot_mix_comps <- function(x, mu, sigma, lam) { + lam * dnorm(x, mu, sigma) +} + +#' Plot Output +#' +#' Plots the output of algorithms with the right data format +#' with the (simulated) gene expression, creates a pdf file +#' @param expression The gene expression data frame that shall be plotted +#' @param algorithmOutput The output of the algorithm +#' @param GenesToPlot The number of genes to be plotted +#' @param name The name of the plot without .pdf +#' @examples +#' \dontrun{ +#' params <- newParams(nGenes = list("1"=5,"2"=list("gauss"= 5,"gamma"= 5))) +#' simulation <- simulateExpression(params= params,verbose=FALSE) +#' expression <- simulation$expressionData +#' output <- useMclust(expression = expression, verbose = FALSE) +#' plotOutput(expression = expression,algorithmOutput = output, +#' GenesToPlot = 10,name = "plot10GenesWExpression") +#' plotOutput(expression = expression,algorithmOutput = output, +#' GenesToPlot = nrow(expression),name = "plotAllGenesWExpression") +#' } +#' +#' @return The plot name of the plot in the pdf file +#' @importFrom ggplot2 ggplot geom_density aes stat_function labs theme element_text +#' @importFrom grDevices dev.off pdf +#' @export +plotOutput <- function(expression,algorithmOutput,GenesToPlot = 100,name="output"){ + name <- sprintf(paste0(name, ".pdf")) + grDevices::pdf(name, 7, 5) + lapply(1:GenesToPlot, function(i){ + nPatients<- sum(algorithmOutput$Genes[[i]]$sizes) + titles <- rownames(expression) + title <- titles[i] + outputPlot <- ggplot2::ggplot(data.frame(t(expression[i,]))) + + ggplot2::geom_density(size = 1.1,ggplot2::aes(t(expression[i,])), + show.legend = TRUE) + + lapply(1:length(algorithmOutput$Genes[[i]]$means),function(x){ + ggplot2::stat_function(geom = "line", fun = plot_mix_comps, + args = list(algorithmOutput$Genes[[i]]$means[x], algorithmOutput$Genes[[i]]$sds[x], + algorithmOutput$Genes[[i]]$sizes[x]/nPatients), + colour = "red", lwd = 1.1,show.legend = TRUE) + })+ggplot2::labs(x = "log2 Expression",y = "Density",title= title)+ + ggplot2::theme( + plot.title = ggplot2::element_text(face = "bold",size = 16,hjust = 0.5), + axis.title.x = ggplot2::element_text(size = 12), + axis.title.y = ggplot2::element_text(size = 12) + ) + plot(outputPlot) + }) + grDevices::dev.off() + return(name) +} +#' Plot Statistics +#' +#' Plots the output of the statistics as a bar plot, creates a pdf file +#' @param statistics The output of the getStatistics function +#' @param name The name of the plot without .pdf +#' @examples +#' \dontrun{ +#' params <- newParams(nGenes = list("1"=5,"2"=list("gauss"= 5,"gamma"= 5))) +#' simulation <- simulateExpression(params= params,verbose=FALSE) +#' expression <- simulation$expressionData +#' validationData <- simulation$validationData +#' outputMclust <- useMclust(expression = expression, verbose = FALSE) +#' outputHdbscan <- useHdbscan(expression = expression, verbose = FALSE) +#' evaluationMclust <- evaluateAlgorithmOutput(modality = 2, +#' algorithmOutput = outputMclust, +#' validationData = validationData, +#' maxDifference = 10) +#' evaluationHDBSCAN <- evaluateAlgorithmOutput(modality = 2, +#' algorithmOutput = outputHdbscan, +#' validationData = validationData,maxDifference = 10) +#' statistics <- getStatistics(evaluations = list("mclust"=evaluationMclust, +#' "HDBSCAN"=evaluationHDBSCAN)) +#' plotStatistics(statistics = statistics,name = "mclustHDBSCAN") +#' } +#' +#' @importFrom ggplot2 ggplot aes geom_bar scale_fill_manual theme_bw theme element_text labs +#' @importFrom grDevices dev.off grey.colors pdf +#' @export +plotStatistics <- function(statistics,name="statistics"){ + statistics[is.na(statistics)] <- 0 + name <- sprintf(paste0(name, ".pdf")) + grDevices::pdf(name, 7, 5) + lapply(1:length(statistics),function(i){ + value<- statistics[,i] + name<-c("TRUE","FALSE") + data <- data.frame(name,value) + statPlot <- ggplot2::ggplot(data = data,ggplot2::aes(x=name,y = value))+ + ggplot2::geom_bar(stat = "identity", + ggplot2::aes(fill=grey.colors(2,0,1)))+ + ggplot2::scale_fill_manual(values = c("grey57","grey30"))+ + ggplot2::theme_bw()+ggplot2::theme(legend.position = "none",plot.title = ggplot2::element_text(hjust = 0.5))+ + ggplot2::labs(title=names(statistics)[i],x="Statistic",y="Rate in [%]") + plot(statPlot) + }) + grDevices::dev.off() + return(name) +} +#' Plot Classifications +#' +#' Plots the output of the classifications as a bar plot with percentages, +#' creates a pdf file +#' @param classifications The output of the getStatistics function +#' @param name The name of the plot without .pdf +#' @examples +#' \dontrun{ +#' params <- newParams(nGenes = list("1"=5,"2"=list("gauss"= 5,"gamma"= 5))) +#' simulation <- simulateExpression(params= params,verbose=FALSE) +#' expression <- simulation$expressionData +#' validationData <- simulation$validationData +#' outputMclust <- useMclust(expression = expression, verbose = FALSE) +#' outputHdbscan <- useHdbscan(expression = expression, verbose = FALSE) +#' classifications <- getClassifications(modality = 2, +#' validationData = validationData, +#' outputs = list("mclust"=outputMclust,"HDBSCAN"=outputHdbscan)) +#' plotClassifications(classifications = classifications, +#' name = "mclustHDBSCAN_Classification") +#' } +#' +#' @importFrom ggplot2 ggplot aes geom_bar scale_fill_manual theme_bw theme element_text labs +#' @importFrom grDevices dev.off pdf +#' @export +plotClassifications <- function(classifications, name="classifications"){ + classifications[is.na(classifications)] <- 0 + classificationsPerc <- classifications/sum(classifications[1,])*100 + name <- sprintf(paste0(name, ".pdf")) + grDevices::pdf(name, 7, 5) + lapply(1:nrow(classificationsPerc),function(i){ + nameClassifications=c("FN","FP","TN","TP") + value <- unname(unlist(classificationsPerc[i,])) + data <- data.frame(nameClassifications,value) + classPlot <- ggplot2::ggplot(data = data,ggplot2::aes(x=name,y=value))+ + ggplot2::geom_bar(stat = "identity",fill= c("red4","firebrick2","limegreen","lawngreen"))+ + ggplot2::scale_fill_manual(values = c("red4","firebrick2","limegreen","lawngreen"))+ + ggplot2::theme_bw()+ggplot2::theme(legend.position = "none",plot.title = ggplot2::element_text(hjust = 0.5))+ + ggplot2::labs(title=rownames(classificationsPerc[i,]),x="Classification", y="Rate in [%]") + plot(classPlot) + }) + grDevices::dev.off() + return(name) +} + +#' Plot Survival Curves +#' +#' Plots the survival curves of TCGA data and calculates whether survival curves of patient groups within a gene are significantly different with a log-rank test or Mantel-Haenszel test. p-Values are calculated between all patient groups. +#' @param metadata The metadata of the TCGA data +#' @param output (Filtered) Algorithm Output in the unified output data format +#' @param sampleType Sample Type of the TCGA data that was used +#' @param name The name of the plot without .pdf +#' +#' +#' @importFrom ggplot2 ggplot aes ggtitle scale_color_discrete theme element_text coord_fixed xlab ylab +#' @importFrom cowplot ggdraw add_sub +#' @importFrom grDevices dev.off pdf +#' @importFrom magrittr "%>%" +#' @importFrom stringr str_replace_all +#' @importFrom graphics hist par +#' @importFrom stats as.formula pchisq +#' @importFrom dplyr mutate select +#' @importFrom survival Surv survdiff +#' @importFrom ggkm geom_km +#' @export +plotSurvivalCurves <- function(metadata,output,sampleType="Primary Tumor",name="survivalCurves"){ + `%>%` <- magrittr::`%>%` + correctSampleType <- which(metadata$sampleType==sampleType) + metadataSampleType <- metadata[correctSampleType,] + metadataPatientIDs <- unlist(lapply(1:nrow(metadataSampleType),function(i){ + id <- stringr::str_replace_all(string = metadataSampleType$caseID[[i]],pattern = "-",replacement = ".") + if(any(c(0:9)==substr(x = id,start = 1,stop = 1))){ + id <- paste0("X",id) + } + id + })) + metadataUniqueIDs <- make.unique(metadataPatientIDs) + genes <- names(output$Genes) + + patients <- unlist(output$Genes[[1]]$groups) + patientMetadata <- sort(unlist(lapply(patients,function(i){ + patientNumber <- which(make.unique(metadataUniqueIDs)==i) + }))) + metadataSampleType$caseID <- metadataUniqueIDs + smallMetadata <- metadataSampleType[patientMetadata,] + + phenotypes <- smallMetadata %>% + dplyr::mutate(age = trunc(age*(-1)/365), + age_group = cut(age, breaks=c(-Inf, 30, 40, 50, 60, Inf), labels = c("0-30","31-40","41-50","51-60","61+")), + survival = ifelse(is.na(deathIn), followUpIn, deathIn), + is_dead = ifelse(is.na(deathIn), FALSE, TRUE)) %>% + dplyr::select(-deathIn, -followUpIn, -age) -> phenotypes + + pVals <- calcPValues(phenotypes = phenotypes,output = output,smallMetadata = smallMetadata) + pValsAll <- lapply(1:length(pVals),function(i){ + pVals[[i]]$All + }) + names(pValsAll) <- names(pVals) + + #pAdjAll <- p.adjust(unlist(pValsAll),method = pAdjMethod) + + pValsCombination <- lapply(1:length(pVals),function(i){ + pVals[[i]]$PerCombination + }) + names(pValsCombination) <- names(pVals) + # modalityGreaterTwo <- length(unlist(lapply(1:length(output$Genes),function(i){ + # if(output$Genes[[i]]$modus>2){i} + # }))) + # + # maxCombinations <- max(unlist(lapply(1:length(pValsCombination),function(i){length(pValsCombination[[i]])}))) + # pValuesperCombination <- lapply(1:maxCombinations,function(i){ + # unlist(lapply(1:length(pValsCombination),function(j){ + # pValsCombination[[j]][i] + # })) + # }) + # pAdjCombination <- lapply(1:maxCombinations,function(i){ + # perCombination <- p.adjust(unlist(pValuesperCombination[[i]]),method = pAdjMethod) + # perCombination <- c(rep(NA,(length(pValsCombination)-length(perCombination))),perCombination) + # names(perCombination) <- names(pValsCombination) + # perCombination + # }) + # pAdjPerGene <- lapply(1:length(pValsCombination),function(i){ + # unlist(lapply(1:maxCombinations,function(j){ + # perGene <- pAdjCombination[[j]][i] + # if(is.null(pValsCombination[[i]])){ + # names(perGene) <- NA + # }else{ + # names(perGene) <- names(pValsCombination[[i]])[j] + # } + # perGene + # })) + # }) + # names(pAdjPerGene) <- names(pValsCombination) + + grDevices::pdf(paste0(name,"_pVals.pdf"), 7, 5) + # graphics::par(mfrow=c(1,2)) + graphics::hist(unlist(c(pValsAll,pValsCombination)), breaks=10,xlab = "pValues",main = "Histogram of all pValues") + # graphics::hist(unlist(c(pAdjAll,pAdjPerGene)), breaks=10,xlab = "adjusted pValues",main = "Histogram of all adjusted pValues") + + graphics::hist(unlist(pValsAll), breaks=10,xlab = "pValues of all Groups",main = "Histogram of all group pValues") + # graphics::hist(unlist(pAdjAll), breaks=10,xlab = "adjusted pValues of all Groups",main = "Histogram of all group adj. pValues") + + graphics::hist(unlist(pValsCombination), breaks=10,xlab = "pValues of Combinations",main = "Histogram of all combination pValues") + # graphics::hist(unlist(pAdjPerGene), breaks=10,xlab = "adjusted pValues of Combinations",main = "Histogram of all combination adj. pValues") + # graphics::par(mfrow=c(1,1)) + dev.off() + + name <- sprintf(paste0(name, ".pdf")) + grDevices::pdf(name, 7, 5) + plots<- lapply(1:length(output$Genes),function(i){ + geneName <- names(output$Genes)[[i]] + allPatients <-unlist(output$Genes[[i]]$groups) + names(allPatients) <- rep(1:length(output$Genes[[i]]$groups),output$Genes[[i]]$sizes) + groupMembership <- unlist(lapply(1:nrow(smallMetadata),function(j){ + as.integer(names(allPatients[which(allPatients==smallMetadata$caseID[j])])) + })) + + main = paste(geneName,"(p-value", ifelse(pValsAll[[geneName]]<0.001, "< 0.001", paste("=",round(pValsAll[[geneName]],digits=3))),")") + annotation <- c() + if(!is.null(pValsCombination[[geneName]])){ + annotation <- unlist(lapply(1:length(pValsCombination[[geneName]]),function(j){ + paste(names(pValsCombination[[geneName]])[[j]],ifelse(pValsCombination[[geneName]][[j]]<0.001,"< 0.001",paste("=",round(pValsCombination[[geneName]][[j]],digits = 3))),"\n") + })) + annotation<- paste(annotation,collapse = "") + annotation <- paste0("p-Values:\n",annotation) + } + maxDays = max(phenotypes$survival, na.rm = T) + survivalCurve = ggplot2::ggplot(data = phenotypes, ggplot2::aes(time=phenotypes$survival, status=as.numeric(phenotypes$is_dead),color=as.factor(groupMembership))) + + ggkm::geom_km() + + ggplot2::coord_fixed(ratio = maxDays) + + ggplot2::xlab("Time") + + ggplot2::ylab("Survival") + + ggplot2::ggtitle(main)+ + ggplot2::scale_color_discrete(name = "groups")+ + ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5),legend.justification = c(1,1),legend.position = c(1,1)) + plot(cowplot::ggdraw(cowplot::add_sub(survivalCurve,annotation))) + # plot(survivalCurve) + }) + dev.off() + # name <- sprintf(paste0(name, ".pdf")) + # grDevices::pdf(name, 7, 5) + # plots<- lapply(1:length(output$Genes),function(i){ + # geneName <- names(output$Genes)[[i]] + # allPatients <-unlist(output$Genes[[i]]$groups) + # names(allPatients) <- rep(1:length(output$Genes[[i]]$groups),output$Genes[[i]]$sizes) + # groupMembership <- unlist(lapply(1:nrow(smallMetadata),function(j){ + # as.integer(names(allPatients[which(allPatients==smallMetadata$caseID[j])])) + # })) + # + # main = paste(geneName,"(p-value", ifelse(pAdjAll[[paste0(geneName,".All")]]<0.001, "< 0.001", paste("=",round(pAdjAll[[paste0(geneName,".All")]],digits=3))),")") + # annotation <- c() + # if(!is.na(pAdjPerGene[[geneName]][1])){ + # annotation <- unlist(lapply(1:length(pAdjPerGene[[geneName]]),function(j){ + # paste(names(pAdjPerGene[[geneName]])[[j]],ifelse(pAdjPerGene[[geneName]][[j]]<0.001,"< 0.001",paste("=",round(pAdjPerGene[[geneName]][[j]],digits = 3))),"\n") + # })) + # annotation<- paste(annotation,collapse = "") + # annotation <- paste0("p-Values:\n",annotation) + # } + # maxDays = max(phenotypes$survival, na.rm = T) + # survivalCurve = ggplot2::ggplot(data = phenotypes, ggplot2::aes(time=phenotypes$survival, status=as.numeric(phenotypes$is_dead),color=as.factor(groupMembership))) + + # ggkm::geom_km() + + # ggplot2::coord_fixed(ratio = maxDays) + + # ggplot2::xlab("Time") + + # ggplot2::ylab("Survival") + + # ggplot2::ggtitle(main)+ + # ggplot2::scale_color_discrete(name = "groups")+ + # ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5),legend.justification = c(1,1),legend.position = c(1,1)) + # plot(cowplot::ggdraw(cowplot::add_sub(survivalCurve,annotation))) + # # plot(survivalCurve) + # }) + # dev.off() + return(name) +} +#'Function to calculate the p-Values of the survival curves +#' +#' @details calculates p-Values of the Chi-squared function for all combinations of patient groups per gene +#' @param phenotypes converted metadata with columns "survival" and "is_dead" +#' @param output (Filtered) Algorithm Output in the unified output data format +#' @param smallMetadata metadata of relevant patients +#' @return returns the p-Values of each gene as a list with categories "All" and "perCombination" +#' +#' @importFrom survival Surv survdiff +#' @importFrom utils combn +calcPValues <- function(phenotypes,output,smallMetadata){ + surv <- survival::Surv(time = phenotypes$survival, event = phenotypes$is_dead) + genes <- names(output$Genes) + pVals <- sapply(genes, function(i) { + allPatients <-unlist(output$Genes[[i]]$groups) + names(allPatients) <- rep(1:length(output$Genes[[i]]$groups),output$Genes[[i]]$sizes) + groupMembership <- unlist(lapply(1:nrow(smallMetadata),function(j){ + as.integer(names(allPatients[which(allPatients==smallMetadata$caseID[j])])) + })) + d <- survival::survdiff(formula = as.formula("surv ~ groupMembership"), data=phenotypes) + pValAll <- pchisq(d$chisq, length(d$n)-1,lower.tail = FALSE) + names(pValAll) <- "All" + if(max(groupMembership)>2){ + groups <- lapply(1:max(groupMembership),function(j){ + which(groupMembership==j) + }) + combinations <- utils::combn(1:max(groupMembership),2) + pVals1 <- unlist(lapply(1:ncol(combinations),function(j){ + group1 <- which(groupMembership==combinations[,j][1]) + group2 <- which(groupMembership==combinations[,j][2]) + smallPhenotypes <- phenotypes[c(group1,group2),] + smallGroupMembership<- groupMembership[(sort(c(group1,group2)))] + smallSurv <- survival::Surv(time = smallPhenotypes$survival,event = smallPhenotypes$is_dead) + d <- survival::survdiff(formula = as.formula("smallSurv ~ smallGroupMembership"),data = smallPhenotypes) + pVal <- pchisq(d$chisq,length(d$n)-1,lower.tail = FALSE) + names(pVal) <- paste(combinations[,j][1],":",combinations[,j][2]) + pVal + })) + pVals <- list("All"=pValAll,"PerCombination"=pVals1) + }else{ + pVals <- list("All"=pValAll) + } + }) + return(pVals) +} diff --git a/R/simulation.R b/R/simulation.R index fdcbaa5..ea667ed 100644 --- a/R/simulation.R +++ b/R/simulation.R @@ -8,28 +8,28 @@ #' @param ... any additional parameter settings to override what is provided in \code{params}. #' #' @details -#' The "modus" is the number of occurring distributions within one gene. -#' Modus "1" - Unimodal gaussian distribution -#' Gene expression for the unimodal gaussian distribution (scenario "1") is +#' The 'modus' is the number of occurring distributions within one gene. +#' Modus '1' - Unimodal gaussian distribution +#' Gene expression for the unimodal gaussian distribution (scenario '1') is #' simulated from the mean and the standard deviation taken from the validation data #' for this gene. The number of values that are simulated for each -#' distribution equals the number of Patients {"nPatients"}: +#' distribution equals the number of Patients {'nPatients'}: #' #' All other modi consist of two different scenarios: -#' In the "gauss" scenario all occurring distributions are gaussian distributions. -#' In the "gamma" scenario the first occuring distribution is a gamma distribution. +#' In the 'gauss' scenario all occurring distributions are gaussian distributions. +#' In the 'gamma' scenario the first occuring distribution is a gamma distribution. #' All other distributions are gaussian distributions. #' -#' Modus "2" or higher - Bimodal distributions or higher modalities +#' Modus '2' or higher - Bimodal distributions or higher modalities #' #' Gene expressions of the multimodal genes are #' simulated automatically for each distribution. -#' Scenario "gauss" +#' Scenario 'gauss' #' The different gaussian distributions are simulated with the corresponding mean #' and a standard deviation and number of values taken from the validation data #' for this gene. #' -#' Scenario "gamma" +#' Scenario 'gamma' #' The gamma distribution is simulated with the gamma shape and rate values from #' the params object. The other gaussian distributions are simulated with the values #' from the validation data. @@ -37,6 +37,7 @@ #' @return A named list with two components: 'validationData' and 'expressionData' #' #' @examples +#' \dontrun{ #' simulation <- simulateExpression(params = newParams()) #' # Override default parameters #' # Not recommended, better use setParams; See \code{\link{Params}} @@ -45,6 +46,7 @@ #' # without messages printed to the console #' params <- newParams() #' simulation <- simulateExpression(params = params,verbose = FALSE) +#' } #' @export #' @importFrom checkmate assertClass assertString assertCharacter assertList #' @importFrom stats dnorm rgamma rnorm runif sd @@ -234,9 +236,9 @@ createValidationData <- function(params = newParams(),verbose = TRUE){ patients <- 1 groups <- lapply(1:length(sizes),function(i){ if(i==1){ - sprintf("Patient%g",1:sizes[i]) + sprintf("Patient%04d",1:sizes[i]) }else{ - sprintf("Patient%g",((sum(sizes[1:(i-1)])+1):(sizes[i]+sum(sizes[1:(i-1)])))) + sprintf("Patient%04d",((sum(sizes[1:(i-1)])+1):(sizes[i]+sum(sizes[1:(i-1)])))) } }) scenario <-c() diff --git a/README.md b/README.md index 7bb026f..8ab87a3 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,2 @@ -# bimodalR -Simulation of bimodal gene expression and evaluation of algorithms for bimodality detection. +# multimodalR +Simulation of multimodal gene expression, evaluation of algorithms detecting multimodality in gene expression data sets, and detection of multimodal genes in gene expression data sets. diff --git a/data-raw/XYChromosomeGenes.R b/data-raw/XYChromosomeGenes.R new file mode 100644 index 0000000..6812af9 --- /dev/null +++ b/data-raw/XYChromosomeGenes.R @@ -0,0 +1,19 @@ +library(rtracklayer) + +my_file <- ("data-raw/gencode.v28.annotation.gtf.gz") +gtf <- import(gzfile(my_file)) +entryX <- which(gtf@seqnames@values=="chrX") +entryY <- which(gtf@seqnames@values=="chrY") +lengthX <- gtf@seqnames@lengths[entryX] +lengthY <- gtf@seqnames@lengths[entryY] +lengthsBeforeX <- sum(gtf@seqnames@lengths[1:(entryX-1)]) +lengthsBeforeY <- sum(gtf@seqnames@lengths[1:(entryY-1)]) + +gtfX <- gtf[(lengthsBeforeX+1):(lengthsBeforeX+lengthX)] +gtfY <- gtf[(lengthsBeforeY+1):(lengthsBeforeY+lengthY)] + +geneNamesX <- unique(gtfX$gene_name) +geneNamesY <- unique(gtfY$gene_name) + +devtools::use_data(geneNamesX,compress = "xz",overwrite=T) +devtools::use_data(geneNamesY,compress = "xz",overwrite=T) diff --git a/data-raw/gencode.v28.annotation.gtf.gz b/data-raw/gencode.v28.annotation.gtf.gz new file mode 100644 index 0000000..5da7c0d Binary files /dev/null and b/data-raw/gencode.v28.annotation.gtf.gz differ diff --git a/data/geneNamesX.rda b/data/geneNamesX.rda new file mode 100644 index 0000000..369ab3a Binary files /dev/null and b/data/geneNamesX.rda differ diff --git a/data/geneNamesY.rda b/data/geneNamesY.rda new file mode 100644 index 0000000..7317933 Binary files /dev/null and b/data/geneNamesY.rda differ diff --git a/man/DipTestOverP.Rd b/man/DipTestOverP.Rd index 6c27416..2513a42 100644 --- a/man/DipTestOverP.Rd +++ b/man/DipTestOverP.Rd @@ -8,7 +8,7 @@ This function evaluates to TRUE if the Hartigans Dip Test for unimodality can be DipTestOverP(p, R = 100, na.rm = T) } \arguments{ -\item{p}{The P-value that is used to reject Hartigans Dip Test of unimodality (} +\item{p}{The P-value that is used to reject Hartigans Dip Test of unimodality} \item{R}{The number of bootstrap replicates for each test.} diff --git a/man/Params.Rd b/man/Params.Rd index 0c9675f..a67ad8e 100644 --- a/man/Params.Rd +++ b/man/Params.Rd @@ -6,5 +6,28 @@ \alias{Params-class} \title{The Params virtual class} \description{ -Virtual S4 class that all other Params classes inherit from. +Virtual S4 class that stores all parameters needed for the simulation of gene expression data. } +\section{Parameters}{ + +The Params class defines the following parameters: +\describe{ + \item{\code{nGenes}}{A list containing the numbers of genes to simulate for + each scenario and modality.} + \item{\code{means}}{A list containing the parameter range for the mean for + each scenario and modality.} + \item{\code{foldChanges}}{A list containing the parameter range for the log2 + foldChanges between distributions for multimodal genes. Has to contain one + entry less than the modality that is simulated.} + \item{\code{sd}}{The parameters mu and lambda for the generation of the + standard deviation of each distribution via an Inverse Gaussian distribution.} + \item{\code{gamma}}{The shape and rate parameter for generating the gamma + distributions.} + \item{\code{proportions}}{Parameters for the allowed proportions of + multimodal distributions.} + \item{\code{nPatients}}{The number of patients to simulate and how + many columns the simulated gene expression should have per gene.} + \item{\code{seed}}{Seed to use for generating random numbers.} +} +} + diff --git a/man/SilvermanOverP.Rd b/man/SilvermanOverP.Rd index 5970c7c..597fadb 100644 --- a/man/SilvermanOverP.Rd +++ b/man/SilvermanOverP.Rd @@ -2,13 +2,12 @@ % Please edit documentation in R/filter.R \name{SilvermanOverP} \alias{SilvermanOverP} -\title{SilvermanOverP returns a filter function with bindings for P. -This function evaluates to TRUE if the Silverman Test's for unimodality can be rejected at the significance level P.} +\title{SilvermanOverP returns a filter function with bindings for P.} \usage{ SilvermanOverP(p, R = 100, na.rm = T) } \arguments{ -\item{p}{The P-value that is used to reject Silvermans Test for unimodality (given by k=1, m = 1 using the Hall/York adjustments)} +\item{p}{The p-value that is used to reject Silvermans Test for unimodality (given by k=1 using the Hall/York adjustments)} \item{R}{The number of bootstrap replicates for each test.} @@ -19,5 +18,13 @@ A function with bindings for p and R. } \description{ SilvermanOverP returns a filter function with bindings for P. -This function evaluates to TRUE if the Silverman Test's for unimodality can be rejected at the significance level P. +} +\details{ +The null hypothesis of a silvermantest is that the underlying +density has at most k modes (H0: number of modes <= k). By rejecting +this hypothesis (returned p-value is smaller than given level of +significance) one can verify that the underlying density has more +than k modes. +This function evaluates to TRUE if the Silverman Test's for unimodality (k=1) +can be rejected at the significance level p. } diff --git a/man/calcPValues.Rd b/man/calcPValues.Rd new file mode 100644 index 0000000..2bf8d1d --- /dev/null +++ b/man/calcPValues.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plotting.R +\name{calcPValues} +\alias{calcPValues} +\title{Function to calculate the p-Values of the survival curves} +\usage{ +calcPValues(phenotypes, output, smallMetadata) +} +\arguments{ +\item{phenotypes}{converted metadata with columns "survival" and "is_dead"} + +\item{output}{(Filtered) Algorithm Output in the unified output data format} + +\item{smallMetadata}{metadata of relevant patients} +} +\value{ +returns the p-Values of each gene as a list with categories "All" and "perCombination" +} +\description{ +Function to calculate the p-Values of the survival curves +} +\details{ +calculates p-Values of the Chi-squared function for all combinations of patient groups per gene +} diff --git a/man/cleanOutput.Rd b/man/cleanOutput.Rd new file mode 100644 index 0000000..6193245 --- /dev/null +++ b/man/cleanOutput.Rd @@ -0,0 +1,34 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/filter.R +\name{cleanOutput} +\alias{cleanOutput} +\title{cleanOutput +returns the cleaned output of algorithms} +\usage{ +cleanOutput(cancerGroupOutput, cancerGroupExpressionmatrix, + minPercentage = 10, coresToUse = 20, parallel = TRUE) +} +\arguments{ +\item{cancerGroupOutput}{output of one cancer group} + +\item{cancerGroupExpressionmatrix}{expression matrix of this one cancer group} + +\item{minPercentage}{minimum percentage of groups. +Groups with lower percentages will be sorted into the other groups} + +\item{coresToUse}{The number of cores to use for parallel computing} + +\item{parallel}{logical. Whether cleanOutput should be computed in parallel} +} +\value{ +cleaned output of algorithms +} +\description{ +cleanOutput +returns the cleaned output of algorithms +} +\details{ +This function cleans up the output as empty groups will be deleted, +genes with groups < minPercentage of the sum of patients are corrected and +modus, means, sds, group sizes and patients belonging to each group will be updated +} diff --git a/man/evaluateAlgorithmOutput.Rd b/man/evaluateAlgorithmOutput.Rd index 35d1918..0516acc 100644 --- a/man/evaluateAlgorithmOutput.Rd +++ b/man/evaluateAlgorithmOutput.Rd @@ -36,6 +36,7 @@ FALSE (not correctly estimated) for this value. If the difference is less or equal to 10%, the estimation is classified as TRUE (correctly estimated) } \examples{ +\dontrun{ params <- newParams(nGenes = list("1"=10,"2"=list("gauss"= 10,"gamma"= 10))) simulation <- simulateExpression(params= params,verbose=FALSE) expression <- simulation$expressionData @@ -48,3 +49,4 @@ validationData = validationData) evaluation <- evaluateAlgorithmOutput(modality = 2,algorithmOutput = output, validationData = validationData,maxDifference = 20) } +} diff --git a/man/filter0sumGenes.Rd b/man/filter0sumGenes.Rd new file mode 100644 index 0000000..8068c6b --- /dev/null +++ b/man/filter0sumGenes.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/filter.R +\name{filter0sumGenes} +\alias{filter0sumGenes} +\title{Filter 0-sum genes} +\usage{ +filter0sumGenes(cancerData, parallel = TRUE) +} +\arguments{ +\item{cancerData}{TCGA cancer expression data of one cancer group} + +\item{parallel}{logical. Whether to calculate in parallel} +} +\value{ +cleared data without 0 sum genes +} +\description{ +Filter 0-sum genes +} diff --git a/man/filterForFoldChange.Rd b/man/filterForFoldChange.Rd new file mode 100644 index 0000000..4a419eb --- /dev/null +++ b/man/filterForFoldChange.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/filter.R +\name{filterForFoldChange} +\alias{filterForFoldChange} +\title{filterForFoldChange +returns the output and expression matrix of filtered multimodal genes} +\usage{ +filterForFoldChange(filteredMeanOutput, filteredMeanExpressionmatrix, + minFoldChange = 2) +} +\arguments{ +\item{filteredMeanOutput}{output that was filtered for mean of multimodal genes of one cancer group} + +\item{filteredMeanExpressionmatrix}{expression matrix that was filtered for mean of the multimodal genes of one cancer group} + +\item{minFoldChange}{minimum fold change that has to exist between two adjacent means of the multimodal genes} +} +\value{ +list of the output and expression matrix of filtered multimodal genes +} +\description{ +filterForFoldChange +returns the output and expression matrix of filtered multimodal genes +} +\details{ +This function returns a list of the output and expression matrix of filtered multimodal genes +} diff --git a/man/filterForKnownInCancer.Rd b/man/filterForKnownInCancer.Rd new file mode 100644 index 0000000..9a43239 --- /dev/null +++ b/man/filterForKnownInCancer.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/filter.R +\name{filterForKnownInCancer} +\alias{filterForKnownInCancer} +\title{Filter output and expressionmatrix for genes that are known in KEGG's "Pathways in Cancer" +returns the filtered output and expression matrix} +\usage{ +filterForKnownInCancer(output, expressionmatrix) +} +\arguments{ +\item{output}{output of multimodal genes of one cancer group} + +\item{expressionmatrix}{expression matrix of the multimodal genes of one cancer group} +} +\value{ +list of the output and expression matrix of genes that were found in KEGG's "Pathways in cancer" +} +\description{ +Filter output and expressionmatrix for genes that are known in KEGG's "Pathways in Cancer" +returns the filtered output and expression matrix +} +\details{ +needs the updated Ensembl Gene Names +} diff --git a/man/filterForMean.Rd b/man/filterForMean.Rd new file mode 100644 index 0000000..6b1849e --- /dev/null +++ b/man/filterForMean.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/filter.R +\name{filterForMean} +\alias{filterForMean} +\title{filterForMean +returns the output and expression matrix of filtered multimodal genes} +\usage{ +filterForMean(multimodalOutput, multimodalExpressionmatrix, minMean = 2) +} +\arguments{ +\item{multimodalOutput}{cleaned output of multimodal genes of one cancer group} + +\item{multimodalExpressionmatrix}{expression matrix of the multimodal genes of one cancer group} + +\item{minMean}{minimum mean that has to exist in any mean of the multimodal genes} +} +\value{ +list of the output and expression matrix of filtered multimodal genes +} +\description{ +filterForMean +returns the output and expression matrix of filtered multimodal genes +} +\details{ +This function returns a list of the output and expression matrix of filtered multimodal genes +} diff --git a/man/filterForXChromosomeGenes.Rd b/man/filterForXChromosomeGenes.Rd new file mode 100644 index 0000000..59e1fe9 --- /dev/null +++ b/man/filterForXChromosomeGenes.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/filter.R +\name{filterForXChromosomeGenes} +\alias{filterForXChromosomeGenes} +\title{Filter output and expressionmatrix for genes that are on the X Chromosome of the Human Genome +returns filtered output and expression matrix without genes on the X chromosome} +\usage{ +filterForXChromosomeGenes(output, expressionmatrix) +} +\arguments{ +\item{output}{output of multimodal genes of one cancer group} + +\item{expressionmatrix}{expression matrix of the multimodal genes of one cancer group} +} +\value{ +list of the output and expression matrix of genes that were not found on the X Chromosome +} +\description{ +Filter output and expressionmatrix for genes that are on the X Chromosome of the Human Genome +returns filtered output and expression matrix without genes on the X chromosome +} +\details{ +needs the updated Ensembl Gene Names; uses gencode.v28.annotation.gtf +} diff --git a/man/filterForYChromosomeGenes.Rd b/man/filterForYChromosomeGenes.Rd new file mode 100644 index 0000000..9c285cc --- /dev/null +++ b/man/filterForYChromosomeGenes.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/filter.R +\name{filterForYChromosomeGenes} +\alias{filterForYChromosomeGenes} +\title{Filter output and expressionmatrix for genes that are on the Y Chromosome of the Human Genome +returns filtered output and expression matrix without genes on the Y chromosome} +\usage{ +filterForYChromosomeGenes(output, expressionmatrix) +} +\arguments{ +\item{output}{output of multimodal genes of one cancer group} + +\item{expressionmatrix}{expression matrix of the multimodal genes of one cancer group} +} +\value{ +list of the output and expression matrix of genes that were not found on the Y Chromosome +} +\description{ +Filter output and expressionmatrix for genes that are on the Y Chromosome of the Human Genome +returns filtered output and expression matrix without genes on the Y chromosome +} +\details{ +needs the updated Ensembl Gene Names; uses gencode.v28.annotation.gtf +} diff --git a/man/filterMultimodalGenes.Rd b/man/filterMultimodalGenes.Rd new file mode 100644 index 0000000..d7ad1c5 --- /dev/null +++ b/man/filterMultimodalGenes.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/filter.R +\name{filterMultimodalGenes} +\alias{filterMultimodalGenes} +\title{filterMultimodalGenes +returns the output and expression matrix of multimodal genes} +\usage{ +filterMultimodalGenes(cleanedOutput, processedExpressionmatrix) +} +\arguments{ +\item{cleanedOutput}{cleaned output of one cancer group} + +\item{processedExpressionmatrix}{expression matrix of the processed cancer group} +} +\value{ +list of the output and expression matrix of multimodal genes +} +\description{ +filterMultimodalGenes +returns the output and expression matrix of multimodal genes +} +\details{ +This function returns a list of the output and expression matrix of multimodal genes +} diff --git a/man/filterRarelyExpressedGenes.Rd b/man/filterRarelyExpressedGenes.Rd new file mode 100644 index 0000000..b41f7fd --- /dev/null +++ b/man/filterRarelyExpressedGenes.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/filter.R +\name{filterRarelyExpressedGenes} +\alias{filterRarelyExpressedGenes} +\title{Filter rarely expressed genes} +\usage{ +filterRarelyExpressedGenes(possibles, minExpressedPercentage = 2, + parallel = TRUE) +} +\arguments{ +\item{possibles}{cancerData of possible multimodal genes} + +\item{minExpressedPercentage}{numerical. minimum percentage cutoff Value for +the minimum percentage of patients} + +\item{parallel}{logical. Whether to calculate in parallel} +} +\value{ +cancerData of genes that are not rarely expressed +} +\description{ +Filter rarely expressed genes +} +\details{ +filters out genes that are rarely expressed +(expressed by less than minPercentage of patients) +} diff --git a/man/geneNamesX.Rd b/man/geneNamesX.Rd new file mode 100644 index 0000000..915f15b --- /dev/null +++ b/man/geneNamesX.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/genesXY.R +\docType{data} +\name{geneNamesX} +\alias{geneNamesX} +\title{Genes on the X Chromosome of Homo sapiens} +\format{An object of class \code{character} of length 2317.} +\source{ +\url{https://www.gencodegenes.org/releases/current.html} +} +\usage{ +data("geneNamesX") +} +\description{ +Created from gencode.v28.annotation.gtf +The result is a vector of gene names +Is used in "filterForXChromosomeGenes" function +} +\keyword{datasets} diff --git a/man/geneNamesY.Rd b/man/geneNamesY.Rd new file mode 100644 index 0000000..d15cbb8 --- /dev/null +++ b/man/geneNamesY.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/genesXY.R +\docType{data} +\name{geneNamesY} +\alias{geneNamesY} +\title{Genes on the Y Chromosome of Homo sapiens} +\format{An object of class \code{character} of length 561.} +\source{ +\url{https://www.gencodegenes.org/releases/current.html} +} +\usage{ +data("geneNamesY") +} +\description{ +Created from gencode.v19.annotation.gtf +The result is a vector of gene names +Is used in "filterForYChromosomeGenes" function +} +\keyword{datasets} diff --git a/man/getClassification.Rd b/man/getClassification.Rd index 97bb04a..a7024b9 100644 --- a/man/getClassification.Rd +++ b/man/getClassification.Rd @@ -40,6 +40,7 @@ FN (FALSE Negative): validationData: unimodal/bimodal, FP (FALSE Positive): validationData: unimodal, assigned scenario: bimodal } \examples{ +\dontrun{ params <- newParams(nGenes = list("1"=10,"2"=list("gauss"= 10,"gamma"= 10))) simulation <- simulateExpression(params= params,verbose=FALSE) expression <- simulation$expressionData @@ -48,3 +49,4 @@ mclustOutput <- useMclust(expression, verbose=FALSE) getClassifications(modality = 2,validationData = validationData, outputs = list("mclust" = mclustOutput)) } +} diff --git a/man/getClassifications.Rd b/man/getClassifications.Rd index c761c84..6135d90 100644 --- a/man/getClassifications.Rd +++ b/man/getClassifications.Rd @@ -36,6 +36,7 @@ FN (FALSE Negative): validationData: unimodal/bimodal, FP (FALSE Positive): validationData: unimodal, assigned scenario: bimodal } \examples{ +\dontrun{ params <- newParams(nGenes = list("1"=10,"2"=list("gauss"= 10,"gamma"= 10))) simulation <- simulateExpression(params= params,verbose=FALSE) expression <- simulation$expressionData @@ -49,5 +50,6 @@ outputs = list("mclust"=mclustOutput)) #getClassifications of two or more algorithms getClassifications(modality=2,validationData, outputs = list("mclust"=mclustOutput,"HDBSCAN"=hdbscanOutput)) +} } diff --git a/man/getFilteredData.Rd b/man/getFilteredData.Rd new file mode 100644 index 0000000..8bc2073 --- /dev/null +++ b/man/getFilteredData.Rd @@ -0,0 +1,70 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/TCGAFunctions.R +\name{getFilteredData} +\alias{getFilteredData} +\title{getFilteredData +Processes one cancer group with filtering steps} +\usage{ +getFilteredData(oneCancerGroupExpressionmatrix, maxModality = 3, + algorithm = "mclust", minClusterSize = 50, minSamples = NULL, + pathToClinicalData = "pathToClinicalData", + pathToExpressionmatrix = "pathToExpressionmatrix", + minExpressedPercentage = 2, SilvermanP = 0.05, minPercentage = 10, + minMean = 2, minFoldChange = 2, coresToUse = 6, parallel = FALSE, + updateToEnsembleGeneNames = FALSE, verbose = TRUE) +} +\arguments{ +\item{oneCancerGroupExpressionmatrix}{TCGA cancer expression data of one cancer group} + +\item{maxModality}{An integer specifying the highest modality to calculate models in with mclust and flexmix} + +\item{algorithm}{"mclust", "hdbscan" or "flexmix". Defines which algorithm should be used to process the cancer data} + +\item{minClusterSize}{Integer; The minimum number of samples +in a group for that group to be considered a cluster in hdbscan} + +\item{minSamples}{Integer for hdbscan; The number of samples +in a neighborhood for a point to be considered as a core point. +This includes the point itself. If NULL: defaults to the min_cluster_size.} + +\item{pathToClinicalData}{path to the clinical data} + +\item{pathToExpressionmatrix}{path to the expression matrix} + +\item{minExpressedPercentage}{integer specifying the percentage of patients that should at least be expressed for the gene to be analysed} + +\item{SilvermanP}{The p-value that is used to reject Silvermans Test for unimodality +(given by k=1 using the Hall/York adjustments)} + +\item{minPercentage}{minimum percentage of groups. +Groups with lower percentages will be sorted into the other groups} + +\item{minMean}{minimum mean that has to exist in any mean of the multimodal genes} + +\item{minFoldChange}{minimum fold change that has to exist between any adjacent means of multimodal genes} + +\item{coresToUse}{The number of cores to use for parallel computing of cleanOutput() if parallel is TRUE} + +\item{parallel}{logical. Whether cleanOutput() shall be computed in parallel} + +\item{updateToEnsembleGeneNames}{logical. Whether to convert Ensemble gene ids to gene names} + +\item{verbose}{logical. Whether to print progress messages} +} +\description{ +getFilteredData +Processes one cancer group with filtering steps +} +\details{ +Processes one cancer group by +a) filtering 0 sum rows, +b) using Silverman's test for unimodality, +c) filtering genes with <2% of patients that have expression values greater 0, +d) use useMclust on cancer data, +e) cleaning output of useMclust +f) filtering for multimodal genes +g) filtering for existing minimum mean of any mean of the multimodal genes +h) filtering for existing minimum fold change between any adjacent means of multimodal genes +i) getting gene names from Ensembl +j) returning a list of the filtered output and the filtered expression matrix +} diff --git a/man/getParam.Rd b/man/getParam.Rd index a37f076..cfee23f 100644 --- a/man/getParam.Rd +++ b/man/getParam.Rd @@ -22,7 +22,8 @@ The extracted parameter value Accessor function for getting parameter values. } \examples{ +\dontrun{ params <- newParams() getParam(object = params,name = "nGenes") - +} } diff --git a/man/getParams.Rd b/man/getParams.Rd index 0c54fda..59ffe9d 100644 --- a/man/getParams.Rd +++ b/man/getParams.Rd @@ -18,6 +18,8 @@ List with the values of the selected parameters. Get multiple parameter values from a Params object. } \examples{ +\dontrun{ params <- newParams() getParams(params = params,names = c("nGenes","foldChanges","proportions","means")) } +} diff --git a/man/getStatistic.Rd b/man/getStatistic.Rd index 36bdedd..e19fdb6 100644 --- a/man/getStatistic.Rd +++ b/man/getStatistic.Rd @@ -20,6 +20,7 @@ algorithms. The result is a data frame where the percentage of as TRUE and as FALSE evaluated values of the genes } \examples{ +\dontrun{ params <- newParams(nGenes = list("1"=10,"2"=list("gauss"= 10,"gamma"= 10))) simulation <- simulateExpression(params= params,verbose=FALSE) expression <- simulation$expressionData @@ -29,3 +30,4 @@ mclustEvaluation <- evaluateAlgorithmOutput(modality = 2, algorithmOutput = mclu validationData = validationData) getStatistics(evaluations = list("mclust" = mclustEvaluation)) } +} diff --git a/man/getStatistics.Rd b/man/getStatistics.Rd index 6fccb48..74ffcf1 100644 --- a/man/getStatistics.Rd +++ b/man/getStatistics.Rd @@ -18,6 +18,7 @@ algorithms. The result is a data frame where the percentage of as TRUE and as FALSE evaluated values of the genes } \examples{ +\dontrun{ params <- newParams(nGenes = list("1"=10,"2"=list("gauss"= 10,"gamma"= 10))) simulation <- simulateExpression(params= params,verbose=FALSE) expression <- simulation$expressionData @@ -34,3 +35,4 @@ getStatistics(evaluations = list("mclust"=mclustEvaluation)) #getStatistics of two or more algorithms getStatistics(evaluations = list("mclust"=mclustEvaluation,"HDBSCAN"=hdbscanEvaluation)) } +} diff --git a/man/isValidUnifiedOutputDataFormat.Rd b/man/isValidUnifiedOutputDataFormat.Rd new file mode 100644 index 0000000..c72e460 --- /dev/null +++ b/man/isValidUnifiedOutputDataFormat.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/evaluate.R +\name{isValidUnifiedOutputDataFormat} +\alias{isValidUnifiedOutputDataFormat} +\title{Checks if output fulfills the criteria of the unified output data format} +\usage{ +isValidUnifiedOutputDataFormat(output, expressionmatrix) +} +\arguments{ +\item{output}{to be tested} + +\item{expressionmatrix}{that was used by algorithm to create output} +} +\value{ +logical: TRUE if output is in the correct format, FALSE if any check was failed +} +\description{ +Checks if output fulfills the criteria of the unified output data format +} +\details{ +prints status messages about performed checks +} diff --git a/man/mclustClassification.Rd b/man/mclustClassification.Rd index ae05cac..0955780 100644 --- a/man/mclustClassification.Rd +++ b/man/mclustClassification.Rd @@ -4,13 +4,20 @@ \alias{mclustClassification} \title{mclustClassification} \usage{ -mclustClassification(models, patients) +mclustClassification(models, patients, pathToClinicalData, + pathToExpressionmatrix) } \arguments{ \item{models}{Unformatted output of the Mclust function from the mclust package of the genes from the gene expression data frame} \item{patients}{List of the patients} + +\item{pathToClinicalData}{If real data is used the path +to the clinical data can be stored here} + +\item{pathToExpressionmatrix}{If real data is used the path +to the gene expression matrix can be stored here} } \value{ The formatted output of the mclust algorithm diff --git a/man/newParams.Rd b/man/newParams.Rd index ee485c9..8873838 100644 --- a/man/newParams.Rd +++ b/man/newParams.Rd @@ -13,18 +13,18 @@ newParams(...) New Params object. } \description{ -Create a new Params object. Functions exist for each of the different -Params subtypes. +Create a new Params object. } \examples{ +\dontrun{ params <- newParams() -params <- newParams("nGenes"=list("1"=100,"2"=list("gauss"=100,"gamma"=100)), +params <- newParams("nGenes"=list("1"=10,"2"=list("gauss"=10,"gamma"=10)), "proportions"=c(30,70)) params <- newParams( -nGenes=list("1"=c(700),"2"=list(gauss = 150, gamma = 150), -"3"=list(gauss = 150, gamma = 150)), +nGenes=list("1"=c(70),"2"=list(gauss = 15, gamma = 15), +"3"=list(gauss = 15, gamma = 15)), means=list("1"= c(2, 4),"2"= c(2, 4),"3"= c(2, 4)), foldChanges = list("1"=NA,"2"= list(gauss = c(2, 4), gamma = c(2, 4)), "3"= list(gauss = list(c(2, 4), c(2, 4)), gamma = list(c(2, 4), c(2, 4))))) - +} } diff --git a/man/plotClassifications.Rd b/man/plotClassifications.Rd new file mode 100644 index 0000000..45caf72 --- /dev/null +++ b/man/plotClassifications.Rd @@ -0,0 +1,33 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plotting.R +\name{plotClassifications} +\alias{plotClassifications} +\title{Plot Classifications} +\usage{ +plotClassifications(classifications, name = "classifications") +} +\arguments{ +\item{classifications}{The output of the getStatistics function} + +\item{name}{The name of the plot without .pdf} +} +\description{ +Plots the output of the classifications as a bar plot with percentages, + creates a pdf file +} +\examples{ +\dontrun{ +params <- newParams(nGenes = list("1"=5,"2"=list("gauss"= 5,"gamma"= 5))) +simulation <- simulateExpression(params= params,verbose=FALSE) +expression <- simulation$expressionData +validationData <- simulation$validationData +outputMclust <- useMclust(expression = expression, verbose = FALSE) +outputHdbscan <- useHdbscan(expression = expression, verbose = FALSE) +classifications <- getClassifications(modality = 2, +validationData = validationData, +outputs = list("mclust"=outputMclust,"HDBSCAN"=outputHdbscan)) +plotClassifications(classifications = classifications, +name = "mclustHDBSCAN_Classification") +} + +} diff --git a/man/plotExpression.Rd b/man/plotExpression.Rd new file mode 100644 index 0000000..770d594 --- /dev/null +++ b/man/plotExpression.Rd @@ -0,0 +1,36 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plotting.R +\name{plotExpression} +\alias{plotExpression} +\title{Plot Gene Expression} +\usage{ +plotExpression(expression, plotsPerPage = 1, GenesToPlot = 100, + plotName = "expression") +} +\arguments{ +\item{expression}{The gene expression data frame that shall be plotted} + +\item{plotsPerPage}{The number of plots per Page (has to be a divisor of +GenesToPlot)} + +\item{GenesToPlot}{The number of genes to be plotted} + +\item{plotName}{The name of the plot without .pdf} +} +\value{ +The name of the plot in the pdf file +} +\description{ +Plots the (simulated) gene expression data frame, creates a pdf file +} +\examples{ +\dontrun{ +params <- newParams(nGenes = list("1"=5,"2"=list("gauss"= 5,"gamma"= 5))) +simulation <- simulateExpression(params= params,verbose=FALSE) +expression <- simulation$expressionData +plotExpression(expression,plotsPerPage = 4,GenesToPlot = 12,plotName = "plot12Genes") +plotExpression(expression,plotsPerPage = 5,GenesToPlot = nrow(expression), +plotName = "plotAllGenes") +} + +} diff --git a/man/plotOutput.Rd b/man/plotOutput.Rd new file mode 100644 index 0000000..119741e --- /dev/null +++ b/man/plotOutput.Rd @@ -0,0 +1,37 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plotting.R +\name{plotOutput} +\alias{plotOutput} +\title{Plot Output} +\usage{ +plotOutput(expression, algorithmOutput, GenesToPlot = 100, name = "output") +} +\arguments{ +\item{expression}{The gene expression data frame that shall be plotted} + +\item{algorithmOutput}{The output of the algorithm} + +\item{GenesToPlot}{The number of genes to be plotted} + +\item{name}{The name of the plot without .pdf} +} +\value{ +The plot name of the plot in the pdf file +} +\description{ +Plots the output of algorithms with the right data format +with the (simulated) gene expression, creates a pdf file +} +\examples{ +\dontrun{ +params <- newParams(nGenes = list("1"=5,"2"=list("gauss"= 5,"gamma"= 5))) +simulation <- simulateExpression(params= params,verbose=FALSE) +expression <- simulation$expressionData +output <- useMclust(expression = expression, verbose = FALSE) +plotOutput(expression = expression,algorithmOutput = output, +GenesToPlot = 10,name = "plot10GenesWExpression") +plotOutput(expression = expression,algorithmOutput = output, +GenesToPlot = nrow(expression),name = "plotAllGenesWExpression") +} + +} diff --git a/man/plotStatistics.Rd b/man/plotStatistics.Rd new file mode 100644 index 0000000..1735c37 --- /dev/null +++ b/man/plotStatistics.Rd @@ -0,0 +1,37 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plotting.R +\name{plotStatistics} +\alias{plotStatistics} +\title{Plot Statistics} +\usage{ +plotStatistics(statistics, name = "statistics") +} +\arguments{ +\item{statistics}{The output of the getStatistics function} + +\item{name}{The name of the plot without .pdf} +} +\description{ +Plots the output of the statistics as a bar plot, creates a pdf file +} +\examples{ +\dontrun{ +params <- newParams(nGenes = list("1"=5,"2"=list("gauss"= 5,"gamma"= 5))) +simulation <- simulateExpression(params= params,verbose=FALSE) +expression <- simulation$expressionData +validationData <- simulation$validationData +outputMclust <- useMclust(expression = expression, verbose = FALSE) +outputHdbscan <- useHdbscan(expression = expression, verbose = FALSE) +evaluationMclust <- evaluateAlgorithmOutput(modality = 2, +algorithmOutput = outputMclust, +validationData = validationData, +maxDifference = 10) +evaluationHDBSCAN <- evaluateAlgorithmOutput(modality = 2, +algorithmOutput = outputHdbscan, +validationData = validationData,maxDifference = 10) +statistics <- getStatistics(evaluations = list("mclust"=evaluationMclust, +"HDBSCAN"=evaluationHDBSCAN)) +plotStatistics(statistics = statistics,name = "mclustHDBSCAN") +} + +} diff --git a/man/plotSurvivalCurves.Rd b/man/plotSurvivalCurves.Rd new file mode 100644 index 0000000..c395fe5 --- /dev/null +++ b/man/plotSurvivalCurves.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plotting.R +\name{plotSurvivalCurves} +\alias{plotSurvivalCurves} +\title{Plot Survival Curves} +\usage{ +plotSurvivalCurves(metadata, output, sampleType = "Primary Tumor", + name = "survivalCurves") +} +\arguments{ +\item{metadata}{The metadata of the TCGA data} + +\item{output}{(Filtered) Algorithm Output in the unified output data format} + +\item{sampleType}{Sample Type of the TCGA data that was used} + +\item{name}{The name of the plot without .pdf} +} +\description{ +Plots the survival curves of TCGA data and calculates whether survival curves of patient groups within a gene are significantly different with a log-rank test or Mantel-Haenszel test. p-Values are calculated between all patient groups. +} diff --git a/man/plot_mix_comps.Rd b/man/plot_mix_comps.Rd new file mode 100644 index 0000000..8a36d1e --- /dev/null +++ b/man/plot_mix_comps.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plotting.R +\name{plot_mix_comps} +\alias{plot_mix_comps} +\title{Source: http://tinyheero.github.io/2015/10/13/mixture-model.html +Plot a Mixture Component} +\usage{ +plot_mix_comps(x, mu, sigma, lam) +} +\arguments{ +\item{x}{Input data} + +\item{mu}{Mean of component} + +\item{sigma}{Standard deviation of component} + +\item{lam}{Mixture weight of component} +} +\description{ +Source: http://tinyheero.github.io/2015/10/13/mixture-model.html +Plot a Mixture Component +} diff --git a/man/processOneCancerGroup.Rd b/man/processOneCancerGroup.Rd new file mode 100644 index 0000000..3494ba0 --- /dev/null +++ b/man/processOneCancerGroup.Rd @@ -0,0 +1,49 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/TCGAFunctions.R +\name{processOneCancerGroup} +\alias{processOneCancerGroup} +\title{processOneCancerGroup +Processes one cancer group} +\usage{ +processOneCancerGroup(cancerData, minExpressedPercentage = 2, + algorithm = "mclust", maxModality = 3, minClusterSize = 50, + minSamples = NULL, pathToClinicalData = "pathToClinicalData", + pathToExpressionmatrix = "pathToExpressionmatrix", SilvermanP = 0.05, + verbose = TRUE) +} +\arguments{ +\item{cancerData}{TCGA cancer expression data of one cancer group} + +\item{minExpressedPercentage}{integer specifying the percentage of patients that should at least be expressed for the gene to be analysed} + +\item{algorithm}{"mclust", "hdbscan" or "flexmix". Defines which algorithm should be used to process the cancer data} + +\item{maxModality}{An integer specifying the highest modality to calculate models in with mclust and flexmix} + +\item{minClusterSize}{Integer; The minimum number of samples +in a group for that group to be considered a cluster in hdbscan} + +\item{minSamples}{Integer for hdbscan; The number of samples +in a neighborhood for a point to be considered as a core point. +This includes the point itself. If NULL: defaults to the min_cluster_size.} + +\item{pathToClinicalData}{path to the clinical data} + +\item{pathToExpressionmatrix}{path to the expression matrix} + +\item{SilvermanP}{The p-value that is used to reject Silvermans Test for unimodality +(given by k=1 using the Hall/York adjustments)} + +\item{verbose}{logical. Whether to print progress messages} +} +\description{ +processOneCancerGroup +Processes one cancer group +} +\details{ +Processes one cancer group by a) filtering 0 sum rows, +b) using Silverman's test for unimodality, +c) filtering genes with <2% of patients that have expression values greater 0, +d) use useMclust on cancer data, +e) creating list of output and filtered expression matrix +} diff --git a/man/setParam.Rd b/man/setParam.Rd index d79c5a1..0c12159 100644 --- a/man/setParam.Rd +++ b/man/setParam.Rd @@ -24,7 +24,8 @@ Object with new parameter value. Accessor function for setting parameter values. } \examples{ +\dontrun{ params <- newParams() params <- setParam(object = params,name = "nPatients",value = 250) - +} } diff --git a/man/setParams.Rd b/man/setParams.Rd index 6a69fb6..d2fdfa8 100644 --- a/man/setParams.Rd +++ b/man/setParams.Rd @@ -29,6 +29,7 @@ collecting parameter values in some way) or individually (useful when setting them manually), see examples. } \examples{ +\dontrun{ params <- newParams() params # Set individually @@ -36,8 +37,9 @@ params <- params <- setParams(params,proportions = c(80,20), foldChanges = list("1"=c(2,4),"2"= list("gauss"=c(2,5),"gamma"=c(3,5)))) params # Set via update list -params <- setParams(params,update = list("nGenes"=list("1"=100, -"2"=list("gauss"=200,"gamma"=100)),"means"=list("1"=c(1,3),"2"=c(2,4)), +params <- setParams(params,update = list("nGenes"=list("1"=10, +"2"=list("gauss"=20,"gamma"=10)),"means"=list("1"=c(1,3),"2"=c(2,4)), "nPatients"=700)) params } +} diff --git a/man/showPP.Rd b/man/showPP.Rd index 8d4c7af..89c42a5 100644 --- a/man/showPP.Rd +++ b/man/showPP.Rd @@ -15,5 +15,5 @@ showPP(params, pp) Print params object to console } \description{ -Function used for pretty printing params object. +Function used for pretty printing Params object. } diff --git a/man/simulateExpression.Rd b/man/simulateExpression.Rd index 0da292d..8456235 100644 --- a/man/simulateExpression.Rd +++ b/man/simulateExpression.Rd @@ -21,33 +21,34 @@ Simulate gene expression for the different simulation scenarios: Unimodal, multimodal with gaussian distributions, multimodal starting with a gamma distribution and followed by gaussian distributions } \details{ -The "modus" is the number of occurring distributions within one gene. -Modus "1" - Unimodal gaussian distribution -Gene expression for the unimodal gaussian distribution (scenario "1") is +The 'modus' is the number of occurring distributions within one gene. +Modus '1' - Unimodal gaussian distribution +Gene expression for the unimodal gaussian distribution (scenario '1') is simulated from the mean and the standard deviation taken from the validation data for this gene. The number of values that are simulated for each -distribution equals the number of Patients {"nPatients"}: +distribution equals the number of Patients {'nPatients'}: All other modi consist of two different scenarios: -In the "gauss" scenario all occurring distributions are gaussian distributions. -In the "gamma" scenario the first occuring distribution is a gamma distribution. +In the 'gauss' scenario all occurring distributions are gaussian distributions. +In the 'gamma' scenario the first occuring distribution is a gamma distribution. All other distributions are gaussian distributions. -Modus "2" or higher - Bimodal distributions or higher modalities +Modus '2' or higher - Bimodal distributions or higher modalities Gene expressions of the multimodal genes are simulated automatically for each distribution. -Scenario "gauss" +Scenario 'gauss' The different gaussian distributions are simulated with the corresponding mean and a standard deviation and number of values taken from the validation data for this gene. -Scenario "gamma" +Scenario 'gamma' The gamma distribution is simulated with the gamma shape and rate values from the params object. The other gaussian distributions are simulated with the values from the validation data. } \examples{ +\dontrun{ simulation <- simulateExpression(params = newParams()) # Override default parameters # Not recommended, better use setParams; See \\code{\\link{Params}} @@ -57,3 +58,4 @@ simulation <- simulateExpression(nGenes = list("1"=10, params <- newParams() simulation <- simulateExpression(params = params,verbose = FALSE) } +} diff --git a/man/toCancerDataGroups.Rd b/man/toCancerDataGroups.Rd new file mode 100644 index 0000000..97ea704 --- /dev/null +++ b/man/toCancerDataGroups.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/TCGAFunctions.R +\name{toCancerDataGroups} +\alias{toCancerDataGroups} +\title{toCancerDataGroups +Sorts the TCGA expression data with the metadata into groups of different cancer types} +\usage{ +toCancerDataGroups(cancerDataExpression, metadata, verbose = TRUE) +} +\arguments{ +\item{cancerDataExpression}{TCGA cancer expression data} + +\item{metadata}{TCGA cancer metadata} + +\item{verbose}{logical. Whether to print progress messages} +} +\description{ +toCancerDataGroups +Sorts the TCGA expression data with the metadata into groups of different cancer types +} +\details{ +The groupsData that is returned consists of the different diagnosis groups +(http://www.icd10data.com/ICD10CM/Codes/C00-D49) +and the expression matrix of each of the diagnosis groups +} diff --git a/man/updateGeneNames.Rd b/man/updateGeneNames.Rd new file mode 100644 index 0000000..3b24167 --- /dev/null +++ b/man/updateGeneNames.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/filter.R +\name{updateGeneNames} +\alias{updateGeneNames} +\title{updateGeneNames +returns the output and expression matrix with the updated gene names} +\usage{ +updateGeneNames(filteredOutput, filteredExpressionmatrix) +} +\arguments{ +\item{filteredOutput}{output of multimodal genes of one cancer group that was filtered twice} + +\item{filteredExpressionmatrix}{expression matrix of the multimodal genes of one cancer group that was filtered twice} +} +\value{ +list of the output and expression matrix with updated gene names taken from Ensembl +} +\description{ +updateGeneNames +returns the output and expression matrix with the updated gene names +} +\details{ +returns the output and expression matrix with the updated gene names of the given output and expression matrix +the gene names are loaded from ensembl via the biomaRt package + +This function returns a list of the output and expression matrix of filtered multimodal genes +} diff --git a/man/useDiptest.Rd b/man/useDiptest.Rd new file mode 100644 index 0000000..a66ba1c --- /dev/null +++ b/man/useDiptest.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/filter.R +\name{useDiptest} +\alias{useDiptest} +\title{useDiptest uses the DipTestOverP function and returns filtered data of genes for that DipTestOverP was TRUE} +\usage{ +useDiptest(clearedData, parallel, DipTestP = 0.05) +} +\arguments{ +\item{clearedData}{cleared TCGA cancer expression data of one cancer group without 0sum genes} + +\item{parallel}{logical. Whether to calculate in parallel} + +\item{DipTestP}{The p-value that is used to reject Hartigans Dip Test} +} +\value{ +filtered gene expression data of possibly multimodal data +} +\description{ +useDiptest uses the DipTestOverP function and returns filtered data of genes for that DipTestOverP was TRUE +} diff --git a/man/useFlexmix.Rd b/man/useFlexmix.Rd index 61a27ac..ba8ac62 100644 --- a/man/useFlexmix.Rd +++ b/man/useFlexmix.Rd @@ -4,26 +4,35 @@ \alias{useFlexmix} \title{UseFlexmix} \usage{ -useFlexmix(expression, maxModality, verbose = TRUE, parallel = FALSE) +useFlexmix(expression, maxModality = 3, verbose = TRUE, parallel = FALSE, + pathToClinicalData = "pathTo/clinicalData", + pathToExpressionmatrix = "pathTo/expressionmatrix") } \arguments{ \item{expression}{(Simulated) Gene Expression data frame which genes shall be classified as unimodal, bimodal or other} -\item{maxModality}{numerical, maximum modality for that models will be calculated} +\item{maxModality}{numerical. maximum modality for that models will be calculated} \item{verbose}{logical. Whether to print progress messages} \item{parallel}{logical. Whether to calculate the models in parallel} + +\item{pathToClinicalData}{If real data is used the path +to the clinical data can be stored here} + +\item{pathToExpressionmatrix}{If real data is used the path +to the gene expression matrix can be stored here} } \value{ flexmixOutput The formatted output of the best fitting models from flexmix } \description{ -Use the FlexMix algorithm for creating a model to find the best fitting +Use the FlexMix algorithm to create models and to find the best fitting model to evaluate the classification (unimodal,bimodal or other) of genes. } \examples{ +\dontrun{ params <- newParams(nGenes = list("1"=75,"2"=list("gauss"= 75,"gamma"= 75))) simulation <- simulateExpression(params= params,verbose=FALSE) expression <- simulation$expressionData @@ -31,5 +40,5 @@ expression <- simulation$expressionData flexmixOutput <- useFlexmix(expression = expression, maxModality = 2) #without messages printed to console flexmixOutput <- useFlexmix(expression = expression, maxModality = 2, verbose = FALSE) - +} } diff --git a/man/useHdbscan.Rd b/man/useHdbscan.Rd index 1ae762a..8552aaa 100644 --- a/man/useHdbscan.Rd +++ b/man/useHdbscan.Rd @@ -4,15 +4,28 @@ \alias{useHdbscan} \title{UseHdbscan} \usage{ -useHdbscan(expression, minPts = 10, verbose = TRUE) +useHdbscan(expression, minClusterSize = 10, minSamples = NULL, + verbose = TRUE, pathToClinicalData = "pathTo/clinicalData", + pathToExpressionmatrix = "pathTo/expressionmatrix") } \arguments{ \item{expression}{(Simulated) Gene Expression data frame which genes shall be classified as unimodal, bimodal or other} -\item{minPts}{Integer; Minimum size of clusters. See details of ?hdbscan} +\item{minClusterSize}{** Integer; The minimum number of samples +in a group for that group to be considered a cluster} + +\item{minSamples}{** Integer; The number of samples +in a neighborhood for a point to be considered as a core point. +This includes the point itself. If NULL: defaults to the min_cluster_size.} \item{verbose}{logical. Whether to print progress messages} + +\item{pathToClinicalData}{If real data is used the path +to the clinical data can be stored here} + +\item{pathToExpressionmatrix}{If real data is used the path +to the gene expression matrix can be stored here} } \value{ hdbscanOutput The formatted output of the HDBSCAN clustering @@ -21,15 +34,21 @@ hdbscanOutput The formatted output of the HDBSCAN clustering Use the HDBSCAN algorithm for generating output to evaluate the classification (unimodal,bimodal or other) of genes. } +\details{ +For details on how HDBSCAN works and +for details on what parameters with ** do, +see http://hdbscan.readthedocs.io/en/latest/ +} \examples{ +\dontrun{ params <- newParams(nGenes = list("1"=10,"2"=list("gauss"= 10,"gamma"= 10))) simulation <- simulateExpression(params= params,verbose=FALSE) expression <- simulation$expressionData #with messages printed to console hdbscanOutput <- useHdbscan(expression = expression) #with specified minPts parameter -hdbscanOutput <- useHdbscan(expression = expression, minPts = 15) +hdbscanOutput <- useHdbscan(expression = expression, minClusterSize = 15) #without messages printed to console hdbscanOutput <- useHdbscan(expression = expression, verbose = FALSE) - +} } diff --git a/man/useMclust.Rd b/man/useMclust.Rd index 980e47a..8160793 100644 --- a/man/useMclust.Rd +++ b/man/useMclust.Rd @@ -5,7 +5,8 @@ \title{UseMclust} \usage{ useMclust(expression, clusterNumbers = NULL, verbose = TRUE, - parallel = FALSE) + parallel = FALSE, pathToClinicalData = "pathTo/clinicalData", + pathToExpressionmatrix = "pathTo/expressionmatrix") } \arguments{ \item{expression}{(Simulated) Gene Expression data frame which genes shall @@ -16,6 +17,12 @@ be classified as unimodal, bimodal or other} \item{verbose}{logical. Whether to print progress messages} \item{parallel}{logical. Whether to calculate the mclust models in parallel} + +\item{pathToClinicalData}{If real data is used the path +to the clinical data can be stored here} + +\item{pathToExpressionmatrix}{If real data is used the path +to the gene expression matrix can be stored here} } \value{ The formatted output of the mclust algorithm @@ -25,6 +32,7 @@ Use the mclust algorithm for generating output to evaluate the classification (unimodal,bimodal or other) of genes. } \examples{ +\dontrun{ params <- newParams(nGenes = list("1"=10,"2"=list("gauss"= 10,"gamma"= 10))) simulation <- simulateExpression(params= params,verbose=FALSE) expression <- simulation$expressionData @@ -33,5 +41,5 @@ mclustOutput <- useMclust(expression = expression) #without messages printed to console, at maximum 3 clusters mclustOutput <- useMclust(expression = expression, clusterNumbers = c(1:3), verbose = FALSE, parallel = FALSE) - +} } diff --git a/man/useSilvermantest.Rd b/man/useSilvermantest.Rd new file mode 100644 index 0000000..f4f415a --- /dev/null +++ b/man/useSilvermantest.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/filter.R +\name{useSilvermantest} +\alias{useSilvermantest} +\title{useSilvermantest uses the SilvermanOverP function and returns filtered data of genes for that SilvermanOverP was TRUE} +\usage{ +useSilvermantest(clearedData, parallel, SilvermanP = 0.05) +} +\arguments{ +\item{clearedData}{cleared TCGA cancer expression data of one cancer group without 0sum genes} + +\item{parallel}{logical. Whether to calculate in parallel} + +\item{SilvermanP}{The p-value that is used to reject Silvermans Test for unimodality (given by k=1 using the Hall/York adjustments)} +} +\value{ +filtered gene expression data of possibly multimodal data +} +\description{ +useSilvermantest uses the SilvermanOverP function and returns filtered data of genes for that SilvermanOverP was TRUE +} diff --git a/multimodalR.Rproj b/multimodalR.Rproj new file mode 100644 index 0000000..cba1b6b --- /dev/null +++ b/multimodalR.Rproj @@ -0,0 +1,21 @@ +Version: 1.0 + +RestoreWorkspace: No +SaveWorkspace: No +AlwaysSaveHistory: Default + +EnableCodeIndexing: Yes +UseSpacesForTab: Yes +NumSpacesForTab: 2 +Encoding: UTF-8 + +RnwWeave: Sweave +LaTeX: pdfLaTeX + +AutoAppendNewline: Yes +StripTrailingWhitespace: Yes + +BuildType: Package +PackageUseDevtools: Yes +PackageInstallArgs: --no-multiarch --with-keep.source +PackageRoxygenize: rd,collate,namespace diff --git a/vignettes/Using-multimodalR-on-real-data.Rmd b/vignettes/Using-multimodalR-on-real-data.Rmd new file mode 100644 index 0000000..27e1b4e --- /dev/null +++ b/vignettes/Using-multimodalR-on-real-data.Rmd @@ -0,0 +1,450 @@ +--- +title: "Using multimodalR on TCGA data" +author: "Svenja K. Beck" +date: "`r Sys.Date()`" +#output: rmarkdown::word_document +output: + html_document: + toc: true + number_sections: true + theme: readable + highlight: haddock + +#output: pdf_document +vignette: > + %\VignetteIndexEntry{Using multimodalR on TCGA data} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +# Introduction +Welcome to multimodalR! multimodalR is an R package to simulate +multimodal gene expression data, to evaluate different algorithms +detecting multimodality and to detect bimodality or multimodality +in gene expression data sets. +This vignette gives an overview to multimodalR's usage on real data with the example of TCGA data. For an overview and introduction to multimodalR's functionalities, please use the vignette 'multimodalR'. +It was written in R Markdown, using the knitr package for production. +See help(package="multimodalR") for further details (and references provided by citation("multimodalR").) + +# Installation +To install the most recent development version from Github use: + +```{r install-github, eval = FALSE} +# biocLite("loosolab/multimodalR", dependencies = TRUE,build_vignettes = TRUE) +``` + +#About TCGA +The Cancer Genome Atlas (TCGA) is a collaboration between the National Cancer Institute (NCI) and the National Human Genome Research Institute (NHGRI) that has generated comprehensive, multi-dimensional maps of the key genomic changes in 33 types of cancer. The TCGA dataset, comprising more than two petabytes of genomic data, has been made publically available, and this genomic information helps the cancer research community to improve the prevention, diagnosis, and treatment of cancer. +(https://cancergenome.nih.gov/abouttcga) + +##About the data +TCGA collected and characterized high-quality tumor and matched normal samples from over 11,000 patients. The TCGA dataset, available in the Genomic Data Commons (GDC), contains: + + * Clinical information about participants + * Metadata about the samples (e.g. the weight of a sample portion, etc.) + * Histopathology slide images from sample portions + * Molecular information derived from the samples (e.g. mRNA/miRNA expression, + protein expression, copy number, etc.) + +(https://cancergenome.nih.gov/abouttcga/aboutdata) + +##Downloading TCGA data +TCGA data can be downloaded from https://portal.gdc.cancer.gov/repository . +Select the facet 'Cases' and check the box of 'TCGA' at the 'Program' section. +At the 'Primary Site' section, check the boxes of cancer types you want to process with multimodalR and download the files in the JSON notation. +If only one cancer type is selected, the steps before 'processing single cancer groups' may be skipped. + + +# Quickstart +The following steps are an overview of the steps to use multimodalR on TCGA data: + +1. Import your data +1. Clean up your data + 1. Log-transform your data + 1. Exclude 0-sum rows + 1. Sort into groups + +**Processing single groups:** + +1. Collect data as data.frame +1. Exclude 0-sum rows +1. Selecting potential multimodal genes (with Silverman Test's for unimodality) +1. Use useMclust() (or any other algorithm producing the unified data output format) on data to find potential multimodal genes +1. Filter output +1. Get gene names from Ensembl +1. Exclude genes on allosomes +1. Create a plot of the filtered gene expression matrix & create survival curves plots +1. Check found multimodals against known bimodal genes + +![Workflow to process TCGA data](WorkflowTCGAData.png) + +#Use multimodalR on TCGA data with few functions + +## Import TCGA data +The data exported from TCGA comes in two big files: one contains only metadate, the other contains the FPKM expression values. + +### Metadata +The metadata shows the following structure, which is the same for each of the dicts that are stored as a 'list of dicts' in JSON notation: +```{r metadata head, eval=FALSE} +[ + { + "caseID": "028e99e9-5b9a-4954-bb6e-6d4709a3cea8", # uniquely identifies the patient + "fileID": "a347bee8-651f-4c8a-9e57-f21fcf955a1d", # uniquely identifies the data file + "fileName": "5d259206-1431-48f8-abc3-3001bab5243d.FPKM.txt.gz", # the actual name of the data file + "sampleType": "Solid Tissue Normal", # the sample type + "diagnosis": "c34.3", # The ICD10 classification of diagnosis (http://www.icd10data.com/ICD10CM/Codes/C00-D49) + "morphology": "8252/3", # Some weird code for cancer morphology + "age": null, # The age in days at data generation + "gender": "female", + "status": "alive", # Vitality status at end of study + "deathIn": null, # Days until death (NA if alive) + "followUpIn": 3261 # Days until last followUp (NA if dead) + }, # .. next entries ... +] +``` + +###FPKM expression values +The expression values are stored in a h5 data store: expression-fpkm.h5 + +Usage of the h5ls tool allows a peek into the files structure: +```{r TCGA FPKM file structure,eval=FALSE} +additional.metastatic Group +additional.new.primary Group +blood.derived Group +metastatic Group +primary.tumor Group +recurrent.tumor Group +solid.tissue.normal Group +``` +Each group contains an expression matrix. Row names correspond to gene IDs, +column names correspond to the case ID from the metadata. +For patients were the same tumour was measured multiple times, +a running number is added to avoud duplicated column names. + +###Reading the data in R +First, the packages RJSONIO, plyr and rhdf5 have to be installed to load the data into R. +To read the metadata into a data.frame use + +```{r load TCGA metadata, eval=FALSE} +library(RJSONIO) +library(plyr) +x <- fromJSON("pathToData/expression-fpkm.json", nullValue = NA, simplify = FALSE) +metadata <- ldply(x, data.frame) +``` +The expression data can be load directly from the h5 data store: +```{r load TCGA expression data, eval=FALSE} +library(rhdf5) +primary.tumors <- h5read("pathToData/expression-fpkm.h5", "/primary.tumor") +``` + +The resulting object is a list with three elements: + + * rownames: A 1D-Array with the Ensembl Gene ID (can be coerced to character) + * colnames: A 1D-Array with the Case ID (can be coerced to character), + with running counts in case of duplication, resembling R’s make.unique function. + * values: A high-dimensional expression matrix (has to be transposed before use!) + +**Beware of R column names** + +When creating data frames or matrices from the data, keep in mind that some Case IDs start with a numeric character. Those IDs will be preceded by an X, when put as colnames of an R object. + +## Clean up your data + +###Log-transform your data + +The first step of cleaning up data usually is the log-transformation of data that are skewed. +For this, use: +```{r logTransformingData,eval = FALSE} +cancerData <- log2(data.frame(t(primary.tumors$values+1))) +rownames(cancerData) <- primary.tumors$rownames +colnames(cancerData) <- primary.tumors$colnames +``` + +###Exclude 0-sum rows & sort expression values into groups + +For processing a huge amount of genes the usage of parallel processing is advisable. +**Please check the size of the cancer data object, how many cores and how much memory you have available before specifying the number of cores to be used for parallel processing.** +With the toCancerDataGroups function 0sum Rows are excluded and the data are sorted into groups. + +```{r toCancerGroups, eval=FALSE} +groupsData <- toCancerDataGroups(cancerDataExpression = cancerData, + metadata = metadata,coresToUse = 18) +``` + +## Processing single groups + +After using the 'toCancerDataGroups' function, groups that are interesting should be selected and isolated. The following steps will be executed for breast cancer patients, who are in diagnosisGroup "c50". All diagnosis groups can be looked up at http://www.icd10data.com/ICD10CM/Codes/C00-D49 . + +###Collect data as data.frame + +```{r breastCancerData,eval=FALSE} +groupsDataSets <- groupsData$groupsExpressionDataSets +diagnosisGroups <- groupsData$diagnosisGroups + +breastCancerData <- data.frame(groupsDataSets[[which(diagnosisGroups=="c50")]]) +``` + +###Use multimodalR on breast cancer data +The next steps are: +Processes one cancer group by + + * filtering 0-sum rows to clear out 0sum genes + * using Silverman's test for unimodality to find potential multimodal genes + * filtering genes with <2% of patients that have expression values greater 0 to clear those genes out + * useMclust() on data to find potential multimodal genes + * cleaning the output of useMclust + * filtering for multimodal genes + * filtering for existing minimum mean of any mean of the multimodal genes + * filtering for existing minimum fold change between any adjacent means of multimodal genes + * getting gene names from Ensembl + * returning a list of the filtered output and the filtered expression matrix + +All those steps are part of the 'getFilteredData' function, which will now be used on the breast cancer data. + +```{r process Breast Cancer Data, eval=FALSE} +#process breast cancer data +filteredBreastCancerData <- getFilteredData(oneCancerGroupExpressionmatrix = breastCancerData,coresToUse = 1,updateToEnsembleGeneNames = TRUE) +#get filtered output +filteredBreastCancerOutput <- filteredBreastCancerData$Output +#get filtered expression matrix +filteredBreastCancerEm <- filteredBreastCancerData$Expressionmatrix +``` + +The steps integrated in the 'getFilteredData' function can be used as functions via the Double Colon Operator ("::"). +Here is a list of the steps and corresponding functions: + + * filtering 0-sum rows to clear out 0sum genes: filter0sumGenes() + * using Silverman's test for unimodality to find potential multimodal genes: useSilvermantest() + * filtering genes with <2% of patients that have expression values greater 0 to clear those genes out: filterRarelyExpressedGenes() + * process data with an algorithm (default: useMclust()) to find potential multimodal genes: useMclust(), useHdbscan(), useFlexmix() + * cleaning the algorithm's output: cleanOutput() + * filtering for multimodal genes: filterMultimodalGenes() + * filtering for existing minimum mean of any mean of the multimodal genes: filterForMean() + * filtering for existing minimum fold change between any adjacent means of multimodal genes: filterForFoldChange() + * getting gene names from Ensembl: updateGeneNames() + +How to use these functions to create another workflow is described in the section "Use multimodalR filter functions" in detail. + +### Create a plot of the filtered gene expression matrix + +The next step of this fast workflow is to plot the filtered gene expression matrix. +```{r plotting the filtered gene expression matrix, eval=FALSE} +plotExpression(expression = filteredBreastCancerEm, + plotsPerPage = 1, + GenesToPlot = nrow(filteredBreastCancerEm), + plotName = "filteredBreastCancerData") + +``` + +### Create plots of the survival curves of the genes + +The plotting of the survival curves of the filtered gene expression matrix is the last step of this fast workflow +and can be achieved with the plotSurvivalCurves() function. The gene names should be updated before using this function because then the correct gene names will be displayed above the plots. + +```{r plotting the survival curves, eval=FALSE} +plotSurvivalCurves(metadata = metadata, + output = filteredBreastCancerOutput) + +``` + +## Use multimodalR filter functions + +The filter functions described in this section are used in the getFilteredData() function which is a fast and straightforward way to analyse TCGA data. The usage of the separate functions allows the user to integrate self-written filtering functions and gives the user more control of the filtering process. These functions are specifically designed to be used after a single cancer group was collected as a data frame. + +### Filter 0 sum rows + +Genes that are not expressed in any of the patient can't be classified as multimodal. Therefore, they are useless for the further analysis and excluded from the data set with the 'filter0sumgenes()' function: + +```{r filter0sumGenes, eval=FALSE} +clearedData <- filter0sumGenes(cancerData = oneGroupCancerData,parallel = TRUE) +``` + +The result is a cleared gene expression data frame which only holds those genes were the sum of the gene expression was greater than 0. + +### Filter genes with different tests for unimodality + +The package multimodalR includes two different tests for unimodality: +Silverman's Test for Unimodality and Hartigans' Dip Test for Unimodality. +Both tests can be used to filter the genes for possible multimodal genes. +Genes that are classified as unimodal with the tests are excluded. +Silverman's test was used in the analysis of TCGA data because it has better recognized +multimodal genes in a simulated gene expression data set. Hartigans' Test for Unimodality may also be used by the user. + +#### Use Silverman's Test for Unimodality + +Silverman's Test for Unimodality tests the null hypothesis that an underlying density has at most 1 mode. +By rejecting this hypothesis (returned p-value is smaller than given level of significance SilvermanP), +one can verify that the underlying density has more than k modes. +Silverman's Test is used in the 'useSilvermantest()' function: +```{r useSilvermantest, eval=FALSE} +possiblyMultimodal <- useSilvermantest(clearedData =clearedData,parallel = TRUE,SilvermanP = 0.05) +``` + +The result is a gene expression data frame which only holds those genes for that the null hypothesis of the Silverman test was rejected at the given level of significance. + +#### Use Hartigans' Dip Test for Unimodality + +Hartigans' Dip Test for Unimodality tests the null hypothesis that a given distribution is unimodal. +By rejecting this hypothesis, one can verify that the given distribution is not unimodal. +The null hypothesis is rejected if the returned p-value is smaller than the given level of significance DipTestP. +Hartigans' Dip Test for Unimodality is used in the 'useDiptest()' function: + +```{r useDiptest, eval=FALSE} +possiblyMultimodal <- useDiptest(clearedData =clearedData,parallel = TRUE,DipTestP = 0.05) +``` + +The result is a gene expression data frame which only holds those genes +for that the null hypothesis of Hartigans' Dip Test was rejected at the given level of significance. + +### Filter rarely expressed genes + +Genes that are only expressed in a few patients can't be classified as multimodal. Therefore, they are useless for the further analysis and excluded from the data set with the 'filterRarelyExpressedGenes()' function: + +```{r filterRarelyExpressedGenes, eval=FALSE} +toBeProcessed <- filterRarelyExpressedGenes(possibles = possiblyMultimodal,minExpressedPercentage = 2,parallel = TRUE) +``` + +The result is a gene expression data frame which only holds those genes for that more than the given minPercentage of patients showed a gene expression value greater than 0. + +### Process data with an algorithm + +The filtered gene expression data frame can be processed with any of the implemented algorithms: mclust, HDBSCAN and FlexMix: +```{r useAlgorithm, eval=FALSE} +#useMclust +output <- useMclust(expression = toBeProcessed,clusterNumbers = c(1:3),verbose = T,parallel = T) + +#useHdbscan +output <- useHdbscan(expression = toBeProcessed,minClusterSize = 20,minSamples = NULL,verbose = T) + +#useFlexmix +output <- useFlexmix(expression = toBeProcessed,maxModality = 3,verbose = T,parallel = T) +``` + +The output is in the unified output data format which is explained in the `multimodalR` vignette in detail. + +### Clean the algorithm's output + +The output of the algorithms may need cleaning. Sometimes groups are found that only consist of very few patients, those groups will be sorted into the other groups. Also mistakes that might have occured (e.g. empty groups that falsify the modus) are cleared. + +```{r cleanOutput, eval=FALSE} +cleanedOutput <- cleanOutput(cancerGroupOutput = output,cancerGroupExpressionmatrix = toBeProcessed, + minPercentage = 10,coresToUse = 6,parallel = FALSE) +``` + +The returned output is the cleaned output in the unified output data format. + +### Filter for multimodal genes + +The cleaned output can be filtered for multimodal genes with the `filterMultimodalGenes()` function. With this function all unimodal genes are excluded: + +```{r filterMultimodalGenes, eval=FALSE} +multimodalData <- filterMultimodalGenes(cleanedOutput = cleanedOutput, + processedExpressionmatrix = toBeProcessed) +multimodalOutput <- multimodalData$Output +multimodalEm <- multimodalData$Expressionmatrix +``` + +The result is a list of the filtered expression matrix and output. +The output is in the unified output data format. +The filtered expression matrix and output only hold those genes which are classified as multimodal. + +### Filter for minimum mean value + +The multimodal output can be filtered for a minimum mean value with the `filterForMean()` function. +If any of the means of a gene is greater than `minMean`, the gene is kept. +All other genes are excluded from the output. + +```{r filterForMean, eval=FALSE} +filteredMeanData <- filterForMean(multimodalExpressionmatrix = multimodalEm, + multimodalOutput = multimodalOutput, + minMean = 2) +filteredMeanOutput <- filteredMeanData$Output +filteredMeanEm <- filteredMeanData$Expressionmatrix +``` + +The result is a list of the filtered expression matrix and output. +The output is in the unified output data format. +The filtered expression matrix and output consist of only those genes which have a mean greater than `minMean`. + +### Filter for minimum fold change + +The multimodal output can be filtered for a minimum fold change between the means with the `filterForFoldChange()` function. +If any of the differences between two neighbouring means of a gene is greater than `minFoldChange`, the gene is kept. +All other genes are excluded from the output. + +```{r filterForFoldChange, eval=FALSE} +filteredFCData <- filterForFoldChange(filteredMeanExpressionmatrix =filteredMeanEm, + filteredMeanOutput = filteredMeanOutput, + minFoldChange = 2) +filteredFCOutput <- filteredFCData$Output +filteredFCEm <- filteredFCData$Expressionmatrix +``` + +The result is a list of the filtered expression matrix and output. +The output is in the unified output data format. +The filtered expression matrix and output consist of only those genes which have a fold change greater than `minFoldChange` between any neighbouring means. + + +### Update gene names + +The gene names that are specified in the TCGA data are Ensembl gene ids which may be converted to the real gene names. This can be done with the `updateGeneName()` function. + +```{r updateGeneNames, eval=FALSE} +updatedNamesData <- updateGeneNames(filteredExpressionmatrix = filteredFCEm,filteredOutput=filteredFCOutput) +filteredOutput <- updatedNamesData$Output +filteredEm <- updatedNamesData$Expressionmatrix +``` + +The result is a list of the filtered expression matrix and output with the updated gene names. +The output is in the unified output data format. +The filtered expression matrix has the updated gene names as row names. + +### Filter for genes not on the X chromosome + +The filtered output and expression matrix with the updated gene names can be further filtered to find the genes that are located on the X chromosome. Those genes will be excluded by the usage of the 'filterForXChromosomeGenes()' function. + +```{r filterForXChromosomeGenes, eval=FALSE} +notOnXChromosomeData <- filterForXChromosomeGenes(output = filteredOutput,expressionmatrix = filteredEm) +notOnXChromosomeOutput <- notOnXChromosomeData$Output +notOnXChromosomeEm <- notOnXChromosomeData$Expressionmatrix +``` + +The result is a list of the expression matrix and output with all genes that were not located on the X chromosome. +The output is in the unified output data format. +The filtered expression matrix has the updated gene names as row names. + +### Filter for genes not on the Y chromosome + +The filtered output and expression matrix with the updated gene names can be further filtered to find the genes that are located on the Y chromosome. Those genes will be excluded by the usage of the 'filterForYChromosomeGenes()' function. + +```{r filterForYChromosomeGenes, eval=FALSE} +notOnYChromosomeData <- filterForYChromosomeGenes(output = notOnXChromosomeOutput,expressionmatrix = notOnXChromosomeEm) +notOnYChromosomeOutput <- notOnYChromosomeData$Output +notOnYChromosomeEm <- notOnYChromosomeData$Expressionmatrix +``` + +The result is a list of the expression matrix and output with all genes that were not located on the Y chromosome. +The output is in the unified output data format. +The filtered expression matrix has the updated gene names as row names. + +The filtered expression matrix can be plotted as described above. Of course, self-written filtering functions can be integrated in the described workflow. + +### Filter for genes known in cancer pathways + +The filtered ouput and expression matrix with updated gene names can be further filtered to find the genes that are known in cancer pathways. The gene names of the filtered output and expression matrix are compared to the genes in KEGG's Cancer Pathway. +This can be accomplished by the usage of the 'filterForKnownInCancer()' function. + +```{r filterForKnownInCancer, eval=FALSE} +knownInCancerData <- filterForKnownInCancer(output = filteredOutput,expressionmatrix = filteredEm) +KnownInCancerOutput <- KnownInCancerData$Output +KnownInCancerEm <- KnownInCancerData$Expressionmatrix +``` + +The result is a list of the expression matrix and output with all genes that were found in KEGG's Cancer Pathway. +The output is in the unified output data format. +The filtered expression matrix has the updated gene names as row names. + +### Notes on the filtering functions + +The functions filter0sumGenes(), useSilvermantest(), filterRarelyExpressedGenes(),and useMclust(), useHdbscan(), useFlexmix() are used in the `processOneCancerGroup()` function which itself is used in the `getFilteredData()` function. Therefore, in a smaller workflow the `processOneCancerGroup()` function can be used. +The filtering of the algorithm's output can then be done manually. diff --git a/vignettes/WorkflowTCGAData.png b/vignettes/WorkflowTCGAData.png new file mode 100644 index 0000000..16e54ad Binary files /dev/null and b/vignettes/WorkflowTCGAData.png differ diff --git a/vignettes/Workflow_multimodalR.png b/vignettes/Workflow_multimodalR.png new file mode 100644 index 0000000..f929071 Binary files /dev/null and b/vignettes/Workflow_multimodalR.png differ diff --git a/vignettes/Workflow_realData.png b/vignettes/Workflow_realData.png new file mode 100644 index 0000000..857f1cc Binary files /dev/null and b/vignettes/Workflow_realData.png differ diff --git a/vignettes/getClassifications.png b/vignettes/getClassifications.png new file mode 100644 index 0000000..e9d04f6 Binary files /dev/null and b/vignettes/getClassifications.png differ diff --git a/vignettes/getStatistics.png b/vignettes/getStatistics.png new file mode 100644 index 0000000..daa3984 Binary files /dev/null and b/vignettes/getStatistics.png differ diff --git a/vignettes/multimodalR.Rmd b/vignettes/multimodalR.Rmd new file mode 100644 index 0000000..096b835 --- /dev/null +++ b/vignettes/multimodalR.Rmd @@ -0,0 +1,931 @@ +--- +title: "An introduction to the multimodalR package" +author: "Svenja Kristina Beck" +date: "`r Sys.Date()`" +#output: rmarkdown::word_document +output: + html_document: + toc: true + number_sections: true + theme: readable + highlight: haddock +# output: rmarkdown::word_document +# output: pdf_document +vignette: > + %\VignetteIndexEntry{An introduction to the multimodalR package} + %\VignetteEngine{knitr::rmarkdown} + \usepackage[utf8]{inputenc} +--- + + + + + + + + + + + + + + +# Introduction +Welcome to multimodalR! multimodalR is an R package to simulate +multimodal gene expression data, to evaluate different algorithms +detecting multimodality and to detect bimodality or multimodality +in gene expression data sets. +This vignette gives an overview and introduction to multimodalR's functionalities. +It was written in R Markdown, using the knitr package for production. +See help(package="multimodalR") for further details and references provided by citation("multimodalR"). + +# Installation +To install the most recent development version from Github use: + +```{r install-github, eval = FALSE} +# biocLite("loosolab/multimodalR", dependencies = TRUE,build_vignettes = TRUE) +``` + +# Quickstart + +Assuming you want to simulate gene expression data from scratch +there are two simple steps to create a simulated data set with multimodalR. +Here is an example: + +```{r quickstart,eval = FALSE} +# Load package +library(multimodalR) + +# Create parameter object +params <- newParams() + +# Simulate data and generating the validationData behind it +simulation <- simulateExpression(params = params) +validationData <- simulation$validationData +expression <- simulation$expressionData + +``` + +These steps will be explained in detail in the following sections but in summary +the first step generates a Params object containing the parameters for +the simulation. The second step takes those parameters and simulates a new data set +containing the simulated gene expressions. + +# The general background + +When observing gene expression data, mostly 3 different scenarios can be observed: + + 1. a unimodal distribution of the gene expression + 2. a multimodal (often bimodal) distribution of the gene expression, + where lesser and greater expression are observed + 3. a multimodal (often bimodal) distribution of the gene expression, + where a nonexistent expression (0, or nearly 0) + and higher expressions are observed + +For our simulation the first scenario is generated with a Gaussian distribution. +The second scenario is generated with multiple Gaussian distributions. +The third scenario is generated with a Gamma distribution and multiple Gaussian distributions. + +Examples for bimodal gene expression: + +* The gene expression of Plexin-B1 is bimodal in ErbB-2 over-expressing cells ([Worzfeld et al.]) + + +# Workflow + +The following section describes how multimodalR is used. The framework and all its properties are described and used in examples. + +![Workflow to simulate gene expression and evaluate which algorithm works best](Workflow_multimodalR.png){width=650px} + + +## Parameters +The parameters required for the gene expression simulation are briefly described here: + +* **Global parameters** + * `nGenes` - A list containing the numbers of genes to simulate for each scenario and modality. + * `nPatients` - The number of patients to simulate and + how many columns the simulated gene expression should have per gene. + * `seed` - Seed to use to generate random numbers. +* **Distribution parameters** + * `means` - A list containing the parameter range for the mean for each scenario and modality. + * `foldChanges` - A list containing the parameter range for the foldChanges between distributions for multimodal genes. Has to be one less than the modality that is simulated. + * `sd` - Parameters for generating the standard deviations via an Inverse Gaussian distribution + * `gamma` - Shape and rate parameter to generate the gamma distributions. + * `proportions` - Parameters for the allowed proportions of multimodal distributions. + +## The `Params` Object +All the parameters to simulate gene expression data +are stored in a `Params` object. +Let's create a new one and see what it looks like. + +```{r Params,eval = FALSE} +params <- newParams() +params +``` + +As well as telling us what type of object we have ("An object of class +`Params`") and showing us the values of the parameters +this output gives us some extra information. +We can see which parameters have been changed from their +default values (those in ALL CAPS). + +### `Params` object explained & Template +The names "1","2","3",... stand for the modality of the scenario. If you want to simulate gene expression with a modality > 3, it is very helpful to use the following template: + +```{r params-Template,eval = FALSE} +#template for highest modality >= 3 +params <- newParams( + nGenes=list( + "1"=c(300), + "2"=list(gauss = 150, gamma = 150), + "3"=list(gauss = 150, gamma = 150)#add more rows + ), + means=list( + "1"= c(2, 4), + "2"= c(2, 4), + "3"= c(2, 4) + ), + foldChanges = list( + "1"= NA, + "2"= list(gauss = c(2, 4), gamma = c(2, 4)), + "3"= list(gauss = list(c(2, 4), c(2, 4)), gamma = list(c(2, 4), c(2, 4))) + ), + sd = list( + "mu" = 0.61, + "lambda" = 2.21 + ), + gamma = list( + shape = 2, + rate = 2 + ), + proportions = c(10,20,30,40,50,60,70,80,90), + nPatients = c(200) + ) +``` +CAUTION: if you want to simulate only certain modalities (e.g. 2,4), you have to use blanks for the others: +```{r params-TemplateOnlyCertainModailities,eval = FALSE} +params <- newParams( + nGenes=list( + "1"= c(0), + "2"= list(gauss = 10, gamma = 10), + "3"= list(gauss = 0, gamma = 0), + "4"= list(gauss = 10, gamma = 10) + ), + means=list( + "1"= 0, + "2"= c(2, 4), + "3"= 0, + "4"= c(2,4) + ), + foldChanges = list( + "1"= NA, + "2"= list(gauss = c(2, 4), gamma = c(2, 4)), + "3"= NA, + "4"= list(gauss = list(c(2, 4),c(2,4),c(2,4)), gamma = list(c(2, 4),c(2,4),c(2,4))) + ), + sd = list( + "mu" = 0.61, + "lambda" = 2.21 + ), + gamma = list( + shape = 2, + rate = 2 + ), + proportions = c(10,20,30,40,50,60,70,80,90), + nPatients = c(200) + ) +``` + +### Getting and setting +If we want to look at a particular parameter, for example the number of genes to +simulate, we can extract it using the `getParam` function: + +```{r getParam,eval = FALSE} +getParam(params, "nGenes") +``` + +Alternatively, to give a parameter a new value we can use the `setParam` +function: + +```{r setParam,eval = FALSE} +params <- setParam(params, "nGenes", list("1" = 100,"2"=list("gauss"=150,"gamma"=200))) +getParam(params, "nGenes") +``` + +If we want to extract multiple parameters (as a list) or set multiple parameters +we can use the `getParams` or `setParams` functions: + +```{r getParams-setParams,eval = FALSE} +# Set multiple parameters at once (using a list) +params <- setParams(params, + update = list("nGenes" = list("1" = 100,"2"=list("gauss"=150,"gamma"=200)), + "nPatients" = 300)) +# Extract multiple parameters as a list +getParams(params, c("nGenes", "gamma")) +# Set multiple parameters at once (using additional arguments) +params <- setParams(params, + "nGenes" = list("1" = 100,"2"=list("gauss"=150,"gamma"=200)), + "nPatients" = 300) +params +``` + +The parameters which have changed are now shown in ALL CAPS to indicate that they have +been changed from the default. + +We can also set parameters directly when we call `newParams`: + +```{r newParams-set,eval = FALSE} +# # A basic example for simulating unimodal, bimodal and trimodal distributions +params <- newParams( + nGenes=list("1"=c(300), + "2"=list(gauss = 150, gamma = 150), + "3"=list(gauss = 150, gamma = 150)), + means=list("1"= c(2, 4), + "2"= c(2, 4), + "3"= c(2, 4)), + foldChanges = list("1"= NA, + "2"= list(gauss = c(2, 4), + gamma = c(2, 4)), + "3"= list(gauss = list(c(2, 4), c(2, 4)), + gamma = list(c(2, 4), c(2, 4)))), + nPatients=50) +params + +A Params object of class Params +Parameters can be 'Default' or 'NOT DEFAULT'. + +Global: + NGENES + 10, list(gauss = 10, gamma = 10) + + MEANS + c(2, 4), c(2, 4) + + FOLDCHANGES + NA, list(gauss = c(2, 4), gamma = c(2, 4)) + + sd + 0.61, 2.21 + + Gamma + 2, 2 + + Proportions + 10, 20, 30, 40, 50, 60, 70, 80, 90 + + NPATIENTS + 50 + + SEED + 61414 + +``` + +To change the parameters, please consider the following table. +*Type* shows with which type the slot has to be filled, +*Usage* shows further information for the usage of the values, +*Default values* describes the default values used to generate a `Params` +object. +```{r params-table, eval=FALSE} + Type Usage Default values +nGenes list values: unimodal,bimodal (gauss,gamma),... 700, list(gauss = 150, gamma = 150) +means list values: unimodal,bimodal (gauss),... c(2, 4), c(2, 4) +foldChanges list values: unimodal,bimodal (gauss,gamma),... c(2, 4), list(gauss = c(2, 4), gamma = c(3, 5)) +sd list use values as µ,lambda 0.61, 2.21 +gamma list use values as shape,rate 2, 2 +proportions vector pick random value from vector c(10,20,30,40,50,60,70,80,90) +nPatients numeric - 100 +seed numeric - sample(1:1e6,1) + +``` + + + +## The gene expression simulation +Now that we looked at the Params Objects and saw which parameters it contains, +let's look at how multimodalR simulates gene expression data. + +The core of the simulation is the validationData, which is created in the simulation +process with the parameters from the created Params object. +As the result of the simulation process, the simulated gene expression matrix +is returned with the validationData in a list object. + +The simulation of a gene expression data set requires the parameters (which are +stored in the Params object) and the validationData behind the simulation (which was +not created yet). +The Params object was already created and if needed, default values were +changed. The validationData is automatically created when the simulate() command is +executed. + +### validationData +The structure of the validationData is the following: + + * each simulated gene has its own entry in the validationData and can be accessed with validationData$[GeneName] + * each entry contains the following information: + * modus - modality of this gene shown as a number (1 = unimodal, 2 = bimodal, ...) + * scenario - scenario of this gene, either ("unimodal-gaussian","multimodal-gaussian" or "multimodal-gamma+gaussian") + * means - means of this gene + * foldChanges - foldChanges between the means (NA for unimodal genes) + * sds - standard deviation of each mean + * proportions - proportions of the different distributions + * sizes - sizes of the different groups + * groups - patientNames are listed in the corresponding group + +To generate the validationData, a `Params` object is needed. The parameters from this +object are used for generating the validationData by randomizing values starting with a +generated `seed`. This is done by drawing values from the `Params` object. +Mean values are generated from a span of values, standard +deviations are generated by using an inversed Gaussian distribution from the package [LaplacesDemon].That is, +because over a large data set a inverseGaussian-like distribution of all +standard deviations was noticed. + +Because of those needed parameters the validationData looks for example like that: +```{r validationData-example,eval=FALSE} + validationData$Gene0001 +$modus +[1] 2 + +$scenario +[1] "multimodal-gaussian" + +$means +[1] 3.561552 7.309845 + +$foldChanges +[1] 2.052433 + +$sds +[1] 0.2826536 0.6026159 + +$proportions +[1] 20 80 + +$sizes +[1] 10 40 + +$groups +$groups[[1]] + [1] "Patient0001" "Patient0002" "Patient0003" "Patient0004" + [5] "Patient0005" "Patient0006" "Patient0007" "Patient0008" + [9] "Patient0009" "Patient0010" + +$groups[[2]] + [1] "Patient0011" "Patient0012" "Patient0013" "Patient0014" + [5] "Patient0015" "Patient0016" "Patient0017" "Patient0018" + [9] "Patient0019" "Patient0020" "Patient0021" "Patient0022" +[13] "Patient0023" "Patient0024" "Patient0025" "Patient0026" +[17] "Patient0027" "Patient0028" "Patient0029" "Patient0030" +[21] "Patient0031" "Patient0032" "Patient0033" "Patient0034" +[25] "Patient0035" "Patient0036" "Patient0037" "Patient0038" +[29] "Patient0039" "Patient0040" "Patient0041" "Patient0042" +[33] "Patient0043" "Patient0044" "Patient0045" "Patient0046" +[37] "Patient0047" "Patient0048" "Patient0049" "Patient0050" +``` + +To understand why all those parameters are needed, it is essential to +understand how the validationData is used to simulate the gene expression. + +### Simulation +With the validationData the gene expression matrix is simulated. +For each gene (each entry of the validationData) the parameters in the validationData are used to simulate +the gene expression. +How many columns the gene expression matrix will have depends on the number of +patients. Per patient one gene expression is simulated. +If a negative value was simulated, it is changed to be 0 in the simulation process, +as normally no negative gene expressions can be observed. +How the gene expression is simulated depends on the modus. + +The "modus" is the number of occurring distributions within one gene. + +####Modus "1" - Unimodal gaussian distribution +Gene expression for the unimodal gaussian distribution (modus "1") is +simulated from the mean and the standard deviation taken from the +validation data for this gene. The Gaussian distribution is calculated +with [rnorm]. The number of values that are simulated for each distribution +equals the number of Patients {"nPatients"}. + +All other modi consist of two different scenarios: +In the "gauss" scenario all occurring distributions are Gaussian distributions +and calculated with [rnorm]. +In the "gamma" scenario the first occuring distribution is a Gamma distribution +and is calculated with [rgamma]. +All other distributions are gaussian distributions and are calculated with [rnorm]. + +####Modus "2" or higher - Bimodal distributions or higher modalities + +For each distribution gene expressions of the multimodal genes are +automatically simulated. + +#####Scenario "gauss" +The different gaussian distributions are simulated with the corresponding mean +and a standard deviation and number of values taken from the validation data +for this gene. + +#####Scenario "gamma" +The gamma distribution is simulated with the gamma shape and rate values from +the params object. The other gaussian distributions are simulated with the values +from the validation data. + +####Exemplary Gene Expression Matrix +After using the `simulateExpression()` command, the gene expression matrix looks like +this: +```{r expression-sample, eval=FALSE} + Patient0001 Patient0002 Patient0003 Patient0004 Patient0005 +Gene0001 3.9674492 2.80396473 3.78980664 4.095237721 4.00587345 +Gene0002 4.1045561 2.26303081 3.75889335 3.522737760 3.02592372 +Gene0003 2.4582632 2.22295497 3.17911283 2.947560965 2.74938887 +Gene0004 5.9032577 5.95497476 4.75760279 5.161985499 3.70910216 +Gene0005 3.7431061 3.68041039 3.62509463 3.740929320 3.66194851 +Gene0006 3.0784338 1.79963109 2.16929818 1.160848158 3.49503635 +Gene0007 0.7993267 2.21868266 2.62367680 0.152856110 1.58145211 +Gene0008 2.0018416 2.22588333 2.90464907 3.049822819 2.28450055 +Gene0009 2.2986108 2.08720943 2.02138626 2.407369792 2.00220259 +Gene0010 2.1793609 2.85387502 2.23943177 2.423610586 2.35072765 +``` + +### Executing the simulation +As the `Params`object can be generated before starting the +simulation, the `simulateExpression()` command can be used in two different ways: + 1. with a created params object (recommended) + 2. without a created params object (not recommended) + +The validationData is automatically generated when using the `simulateExpression()` command. +The `simulateExpression()` command returns a list object, which contains the created +validationData and the simulated expressionData. + +###Simulating the gene expression + +#### 1. With a created params object(recommended) +It is recommended to create a `Params` object before using the `simulateExpression()` +command, because then you can change the parameters and have the params object +saved in your global environment. +```{r simulateExpression-with-params,eval = FALSE} +#with messages printed to console +params <- newParams() +simulation <- simulateExpression(params = params) + +#without messages printed to console +simulation <- simulateExpression(params = params,verbose = FALSE) + +#storing the validation data and the simulated gene expression data as +#separate data.frames + +validationData <- simulation$validationData +expression <- simulation$expressionData +``` + +#### 2. Without a created params object (not recommended) +If the simulation should be executed with default parameters only (or with +only certain parameters changed) and you do not mind, having no separate +`Params`object in your global environment to be able to look up the different +parameters (especially in the case that you changed them), you can use this +commands, though it is not recommended, as it is useful to have a `Params` +object to look up the parameters behind the validationData. +In this case, without a created `Params`object the validationData is not saved as an +Rdata object, so there is no way of keeping track of changes to your params object. +```{r simulateExpression-without-params,eval = FALSE} +#with default parameters and messages printed to console +params <- newParams() +simulation <- simulateExpression(params = newParams()) + +#with one changed parameter and messages not printed to console +simulation <- simulateExpression(params = newParams("seed"=23),verbose = FALSE) + +#with some changed parameters and messages printed to console +simulation <- simulateExpression(params = newParams("seed"=23,"nPatients"=200)) + + +#storing the validation data and the simulated gene expression data as +#a list and a data.frame + +validationData <- simulation$validationData +expression <- simulation$expressionData + +``` + +## The unified output data format +The output data format is a unified data format, that is needed to work with +and use all of the evaluation functions. The output of all implemented +algorithms matches the unified data format. +The unified data format has the following structure: + +The data format is structured in three parts: + +* the Genes section, +* the Clinical Data section and +* expression matrix section + + +The different parts are accessible with: +```{r accessing-the-different-sections, eval=FALSE} +mclustOutput$Genes[1:10] #mclustOutput$Genes for all genes +mclustOutput$ClinicalData +mclustOutput$Expressionmatrix + + +``` + +The Genes section has many parts. For each gene a separate section is generated. +Each gene section contains the following information, which were calculated +by using one of the algorithms: + +* $modus - the modus is the number that equals the found number of occurring distributions +within one gene (the modality) +* $means - the means section contains the calculated mean values for each modus +* $sds - the sds section contains the calculated standard deviation values for each modus +* $sizes - the sizes section contains the calculated size of each group +* $groups - the groups section contains the patients that are assigned to the different groups + +The Genes can be accessed by using: +```{r accessing-a-specific-gene-section, eval=FALSE} +mclustOutput$Genes$Gene0001 + +#Accessing a specific section within the genes is possible by using the $ tags + +mclustOutput$Genes$Gene0001$modus +mclustOutput$Genes$Gene0001$means +mclustOutput$Genes$Gene0001$sds +mclustOutput$Genes$Gene0001$sizes +mclustOutput$Genes$Gene0001$groups + +output +$Genes +$Genes$Gene0001 + +$Genes$Gene0001$modus +[1] 2 + +$Genes$Gene0001$means +[1] 3.613995 7.221655 + +$Genes$Gene0001$sds +[1] 0.1851148 0.5368126 + +$Genes$Gene0001$sizes +[1] 10 40 + +$Genes$Gene0001$groups +$Genes$Gene0001$groups[[1]] + [1] "Patient0001" "Patient0002" "Patient0003" "Patient0004" + [5] "Patient0005" "Patient0006" "Patient0007" "Patient0008" + [9] "Patient0009" "Patient0010" + +$Genes$Gene0001$groups[[2]] + [1] "Patient0011" "Patient0012" "Patient0013" "Patient0014" + [5] "Patient0015" "Patient0016" "Patient0017" "Patient0018" + [9] "Patient0019" "Patient0020" "Patient0021" "Patient0022" +[13] "Patient0023" "Patient0024" "Patient0025" "Patient0026" +[17] "Patient0027" "Patient0028" "Patient0029" "Patient0030" +[21] "Patient0031" "Patient0032" "Patient0033" "Patient0034" +[25] "Patient0035" "Patient0036" "Patient0037" "Patient0038" +[29] "Patient0039" "Patient0040" "Patient0041" "Patient0042" +[33] "Patient0043" "Patient0044" "Patient0045" "Patient0046" +[37] "Patient0047" "Patient0048" "Patient0049" "Patient0050" + +$ClinicalData +[1] "mnt/data/clinicalData.Rdata" + +$Expressionmatrix +[1] "mnt/data/expression.Rdata" +``` + +If this package is used with an algorithm which was not implemented in this package, +the output has to be changed to match this unified data format. +Then, all the evaluation functions will work with the output of not implemented algorithms. + +### Requirements for implementing new algorithms + +The output of algorithms that are implemented in multimodalR have to be in the unified output data format. +Whether the output fulfils all requirements of the unified output data format can be easily checked +with the 'isValidUnifiedOuputDataFormat()' function. + +```{r isValidUnifiedOutputDataFormat, eval=FALSE} +isValidUnifiedOutputDataFormat(output = output,expressionmatrix = expression) +``` + +The requirements are listed as the structure of the unified output data format is constructed: + +* output + * should be of type 'list' + * should hold the 'Genes','ClinicalData' and 'Expressionmatrix' section + * should have a length of 3 + * ClinicalData + * should be of type 'character' + * should have a length of 1 + * ExpressionMatrix + * should be of type 'character' + * should have a length of 1 + * Genes + * should be of type 'list' + * should have a length of nrow(expressionmatrix) + + * EACH GENE + * modus + * should be of type 'integer' + * means + * should be of type 'double' + * should have a lenth of modus + * sds + * should be of type 'double' + * should have a lenth of modus + * sizes + * should be of type 'integer' + * should have a lenth of modus + * sum should equal ncol(expressionmatrix) + * groups + * should be of type 'list' and should have a lenth of modus + * EACH GROUP + * should be of type 'character' + * size of each group should correspond to sizes[[groupnumber]] + +If all of those requirements are fulfilled, the function should return 'TRUE', if any of the requirements is not fulfilled, the function returns 'FALSE'. The messages printed to the console will help to determine which requirements are not fulfilled yet. + + +## Usage of the algorithms + +In this package three algorithms that can be used for detecting multimodality, +are implemented. + +### mclust +Mclust uses a finite Gaussian mixture modeling fitted via EM algorithm for +model-based clustering, classification, and density estimation, including +Bayesian regularization and dimension reduction. +For more information look at the vignette of [mclust] "A quick tour of mclust" + + +#### mclust References +Scrucca, L. et al. (2016) ‘mclust 5: Clustering, Classification and Density Estimation Using Gaussian Finite Mixture Models’, +The R journal, 8(1), pp. 289–317. + +C. Fraley and A. E. Raftery (2002). Model-based clustering, discriminant +analysis, and density estimation. Journal of the American Statistical +Association 97:611:631. + +### HDBSCAN +The HDBSCAN (Hierarchical Density-based +spatial clustering of applications with noise) clustering algorithm +computes the hierarchical cluster tree representing density estimates +along with the stability-based flat cluster extraction proposed by Campello et +al. (2013). HDBSCAN essentially computes the hierarchy of all DBSCAN* +clusterings, and then uses a stability-based extraction method to find optimal +cuts in the hierarchy, thus producing a flat solution. +For more information look at the way [HDBSCAN] works or use the documentation +at [HDBSCANdocs] + +#### HDBSCAN References +Campello, R.J.G.B., Moulavi, D. and Sander, J. (2013) ‘Density-Based Clustering Based on Hierarchical Density Estimates’, +in Pei, J. et al. (eds.) Advances in knowledge discovery and data mining: 17th Pacific Asia Conference, +PAKDD 2013, Gold Coast, Australia, April 14-17, 2013 ; proceedings. +(Lecture Notes in Computer Science, 7819 : Lecture notes in artificial intelligence). Berlin: Springer, pp. 160–172 + +Ester, M. et al. (1996) ‘A Density-based Algorithm for Discovering Clusters in Large Spatial Databases with Noise’, +Proceedings of the Second International Conference on Knowledge Discovery and Data Mining: AAAI Press, +pp. 226–231. Available at: http://dl.acm.org/citation.cfm?id=3001460.3001507. + +McInnes, L., Healy, J. and Astels, S. (2016) How HDBSCAN Works. +Available at: https://hdbscan.readthedocs.io/en/latest/how_hdbscan_works.html. + +McInnes, L., Healy, J. and Astels, S. (2017) ‘hdbscan: Hierarchical density based clustering’, +The Journal of Open Source Software, 2(11), p. 205. doi: 10.21105/joss.00205 + +McInnes, L. and Healy, J. (2017) ‘Accelerated Hierarchical Density Based Clustering’, +2017 IEEE International Conference on Data Mining Workshops (ICDMW), +pp. 33–42. doi: 10.1109/ICDMW.2017.12 + +### flexmix +FlexMix implements a general framework for finite mixtures of regression +models. Parameter estimation is performed using the EM algorithm: the E-step +is implemented by flexmix, while the user can specify the M-step. +For more information look at the vignette of [flexmix] "flexmix-intro" + +#### flexmix References +Friedrich Leisch. FlexMix: A general framework for finite mixture models and +latent class regression in R. Journal of Statistical Software, 11(8), 2004. +doi:10.18637/jss.v011.i08 + +Bettina Gruen and Friedrich Leisch. Fitting finite mixtures of generalized +linear regressions in R. Computational Statistics & Data Analysis, 51(11), +5247-5252, 2007. doi:10.1016/j.csda.2006.08.014 + +Bettina Gruen and Friedrich Leisch. FlexMix Version 2: Finite mixtures with +concomitant variables and varying and constant parameters Journal of +Statistical Software, 28(4), 1-35, 2008. doi:10.18637/jss.v028.i04 + +### Using the different algorithms +To use the different algorithms, it is only needed to have created a simulated +expression data frame (which can be done with the `simulateExpression()` command). +Of course, it is also possible to use the different algorithms with a data-set +that was created elsewhere, if it has the same structure used in this packag, then +it is useful to type in the path to the expression data and clinical data. +To use this package for detecting bimodality in your data set, we recommend +using the **not clear yet** command, because our tests with a stratified +data set (included as the test-data set (**not yet**)) have shown that this +algorithm is the most accurate. +The output of all implemented algorithms was unified and overall shows the +same structure. +In the following it is shown, how to use the different implemented algorithms +and how the output should roughly look like. + +####Using mclust +To use the mclust algorithm on your expression data frame (simulated or not) +and gain information which genes are bimodal use the useMclust() command: +```{r useMclust,eval = FALSE} +#use mclust with the expression data frame expression and messages printed to the console +mclustOutput <- useMclust(expression = expression) + +#use mclust with the expression data frame expression and messages not printed to the console +mclustOutput <- useMclust(expression = expression,verbose = FALSE) + +``` + +####Using HDBSCAN +To use the HDBSCAN algorithm on your expression data frame (simulated or not) +and gain information which genes are bimodal use the useHdbscan() command. +Using HDBSCAN also needs the component `minClusterSize`, which acts as a +minimum cluster size to detect: +```{r useHdbscan,eval = FALSE} +#use HDBSCAN with the expression data frame expression +hdbscanOutput <- useHdbscan(expression = expression,minClusterSize = 10) + +``` + +####Using FlexMix: +To use the FlexMix algorithm on your expression data frame (simulated or not) +and gain information which genes are bimodal use the useFlexmix() command: +```{r useFlexmix,eval = FALSE} +#use FlexMix with the expression data frame expression +flexmixOutput <- useFlexmix(expression = expression,maxModality = 2,reps = 1) + +``` + +## Evaluating the algorithms output +After using one or more of the implemented algorithms on the gene expression +matrix, the packages functions to evaluate which genes are bimodal can be +used. All outputs can be used with the same evaluation functions. +Using the `evaluateAlgorithmOutput()` command is the first step for using the +`getStatistics()` function. +```{r evaluateAlgorithmOutput,eval = FALSE} +#evaluating for bimodal genes after using useMclust() +mclustEvaluation <- evaluateAlgorithmOutput(modality= 2,algorithmOutput = mclustOutput,validationData = validationData, maxDifference = 10) + +#evaluating for bimodal genes after using useHdbscan() +hdbscanEvaluation <- evaluateAlgorithmOutput(modality= 2,algorithmOutput = hdbscanOutput,validationData = validationData, maxDifference = 10) + +#evaluating for bimodal genes after using useFlexmix() +flexmixEvaluation <- evaluateAlgorithmOutput(modality= 2,algorithmOutput = flexmixOutput,validationData = validationData, maxDifference = 10) + +#the evaluation output looks like this: +head(mclustEvaluation) +``` + +After using the algorithms and using the `evaluateAlgorithmOutput` command, all +requirements to create the statistics and classifications were met. + +### Statistics +The `getStatistics()` function calculates the percentage of values that were +correctly (*TRUE*) and not correctly (*FALSE*) estimated by one or more of the +algorithms. To get this statistic, execute the following code: +```{r getStatistic(),eval = FALSE} +#getStatistic of one evaluation +getStatistics(evaluations = list("mclust"=mclustEvaluation)) + +#getStatistic of two or more evaluations +getStatistics(evaluations = list("mclust"=mclustEvaluation,"HDBSCAN"=hdbscanEvaluation)) + +``` + + +### Classifications +To get the classification of correctly and not correctly estimated scenarios, +the `getClassifications()` function can be used. + +#### Reading the output + * FN (FALSE Negatives) - validationData: modality, estimated scenario: not bimodal + (unimodal or more than bimodal) + * FP (FALSE Positives) - validationData: unimodal, estimated scenario: bimodal + * TN (TRUE Negatives) - validationData: unimodal, estimated scenario: unimodal + * TP (TRUE Positives) - validationData: bimodal, estimated scenario: bimodal + +#### Using the `getClassifications()` command +```{r getClassifications(),eval = FALSE} +#get the classification of one algorithmOutput +getClassifications(modality = 2,validationData = validationData,outputs = list("mclust"=mclustOutput)) + +#get the classification of two or more algorithms +getClassifications(modality = 2,validationData = validationData, +outputs = list("mclust"=mclustOutput,"HDBSCAN"=hdbscanOutput)) + +``` + +## Plotting + +###Plotting the gene expression + +In order to plot the gene expression of all genes, four components should be +specified: + + * `expression` - The gene expression matrix that should be plotted + * `plotsPerPage` - The number of genes that are to be plotted on the same + page. This number has to be a divisor of the number of `GenesToPlot`. + Please consider that too many plots on one page make the plots unreadable! + * `GenesToPlot` - The number of genes to be plotted. Should be the + number of genes. + * `plotName` - The name under which a pdf file with the plots will be stored + in your folder. The pdf must not be added in the plotName! + +```{r plotExpression(),eval=FALSE} +#This will create a pdf called "expression.pdf" with a plot of each gene on a separate page. +plotExpression(expression = expression,plotsPerPage = 1,GenesToPlot = 1000,plotName = "expression") + +#This will create a pdf called "expression200.pdf" with the first 200 genes plotted with two plots on each page. +plotExpression(expression = expression,plotsPerPage = 2,GenesToPlot = 200,plotName = "expression200") +``` + +The plotted gene expression will look like this: +![Example picture for how the plotted output will look like.](plotExpression.png) + +###Plotting the gene expression and output + +The outputs of all algorithms producing output with the unified output format can be plotted. +In order to plot the gene expression with the output, four components should be +specified: + + * `expression` - The gene expression matrix that should be plotted + * `algorithmOutput` - The algorithm output that should be plotted + * `name` - The name under which a pdf file with the plots will be stored + in your folder. The pdf must not be added in the plotName! + * `GenesToPlot` - The number of genes to be plotted. Should be the + number of genes. + +```{r plotOutput, eval=FALSE} +#Plotting one algorithms output +## This function call will create a pdf called "mclustOutput.pdf", where the expression for each gene and the estimated expression of mclust is plotted. +## The expression is plotted in black, the algorithms output is plotted in red. +plotOutput(expression = expression,algorithmOutput = mclustOutput,name = "mclustOutput",GenesToPlot = nrow(expression)) +``` + +The plotted output with the gene expression will look like this: +![Example picture for how the plotted output will look like. Black: gene expression, red: estimated distribution](PlottingExample.png) + +###Plotting the statistics + +The produced statistics can be plotted. +In order to plot the statistics, two components should be +specified: + + * `statistics` - The statistics that should be plotted + * `name` - The name under which a pdf file with the plots will be stored + in your folder. The pdf must not be added in the plotName! + +```{r plotStatistics, eval=FALSE} +# This function call will create a pdf called "statistics.pdf",where the statistics are plotted. + +plotStatistics(statistics = statistics,name = "statistics") +``` + +The plotted statistics will look like this: +![Example picture for how the plotted statistics will look like.](getStatistics.png) + +###Plotting the classifications + +The produced classifications can be plotted. +In order to plot the classifications, two components should be +specified: + + * `classifications` - The classifications that should be plotted + * `name` - The name under which a pdf file with the plots will be stored + in your folder. The pdf must not be added in the plotName! + +```{r plotClassifications, eval=FALSE} +# This function call will create a pdf called "classifications.pdf",where the classifications are plotted. + +plotClassifications(classifications = classifications,name = "classifications") +``` + +The plotted classifications will look like this: +![Example picture for how the plotted classification will look like.](getClassifications.png) + +# Citing multimodalR + +If you use multimodalR in your work please cite our paper: + +```{r citation,eval = FALSE} +citation("multimodalR") +``` + +# Session information {-} + +```{r sessionInfo} +sessionInfo() +``` + + + +[Worzfeld et al.]: https://doi.org/10.1172/JCI60568 +[rnorm]: https://stat.ethz.ch/R-manual/R-devel/library/stats/html/Normal.html +[rgamma]: https://stat.ethz.ch/R-manual/R-devel/library/stats/html/GammaDist.html +[mclust]: http://127.0.0.1:40225/help/library/mclust/doc/mclust.html +[hdbscan]: http://hdbscan.readthedocs.io/en/latest/how_hdbscan_works.html +[HDBSCANdocs]: http://hdbscan.readthedocs.io/ +[flexmix]: https://cran.r-project.org/web/packages/flexmix/vignettes/flexmix-intro.pdf +[LaplacesDemon]: https://www.rdocumentation.org/packages/LaplacesDemon/versions/16.0.1/topics/dist.Inverse.Gamma diff --git a/vignettes/plotExpression.png b/vignettes/plotExpression.png new file mode 100644 index 0000000..c081f70 Binary files /dev/null and b/vignettes/plotExpression.png differ