Skip to content
Permalink
master
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
#' UseMclust
#'
#' Use the mclust algorithm for generating output to evaluate the
#' classification (unimodal,bimodal or other) of genes.
#'
#' @param expression (Simulated) Gene Expression data frame which genes shall
#' be classified as unimodal, bimodal or other
#' @param clusterNumbers span of the clusters to be calculated at maximum
#' @param verbose logical. Whether to print progress messages
#' @param parallel logical. Whether to calculate the mclust models in parallel
#' @param pathToClinicalData If real data is used the path
#' to the clinical data can be stored here
#' @param pathToExpressionmatrix If real data is used the path
#' to the gene expression matrix can be stored here
#' @return The formatted output of the mclust algorithm
#'
#' @examples
#' \dontrun{
#' params <- newParams(nGenes = list("1"=10,"2"=list("gauss"= 10,"gamma"= 10)))
#' simulation <- simulateExpression(params= params,verbose=FALSE)
#' expression <- simulation$expressionData
#' #with messages printed to console
#' mclustOutput <- useMclust(expression = expression)
#' #without messages printed to console, at maximum 3 clusters
#' mclustOutput <- useMclust(expression = expression, clusterNumbers = c(1:3),
#' verbose = FALSE, parallel = FALSE)
#' }
#' @import mclust
#' @importFrom parallel detectCores makeCluster clusterExport clusterEvalQ parLapply stopCluster
#' @export
#'
useMclust <- function(expression,clusterNumbers = NULL,verbose = TRUE,parallel = FALSE,pathToClinicalData="pathTo/clinicalData",pathToExpressionmatrix="pathTo/expressionmatrix") {
if (verbose) {message("mclust-models are calculated...")}
if(is.null(clusterNumbers)){
if(parallel==FALSE){
models <- lapply(1:nrow(expression), function(i) {
mclust::Mclust((expression[i,]),modelNames=c("V"),verbose = FALSE) })
}
if(parallel==TRUE){
n.cores = parallel::detectCores()-1
cl = parallel::makeCluster(n.cores)
parallel::clusterExport(cl, "expression")
parallel::clusterEvalQ(cl, library(mclust))
models <- parallel::parLapply(cl, 1:nrow(expression), function(i) {
mclust::Mclust((expression[i,]), modelNames=c("V")) })
}
}else{
if(parallel==FALSE){
models <- lapply(1:nrow(expression), function(i) {
mclust::Mclust((expression[i,]),modelNames=c("V"),verbose = FALSE,G = clusterNumbers) })
}
if(parallel==TRUE){
n.cores = parallel::detectCores()-1
cl = parallel::makeCluster(n.cores)
parallel::clusterExport(cl, "expression")
parallel::clusterEvalQ(cl, library(mclust))
models <- parallel::parLapply(cl, 1:nrow(expression), function(i) {
mclust::Mclust((expression[i,]), modelNames=c("V"),G = clusterNumbers) })
}
}
if(parallel==TRUE){parallel::stopCluster(cl)}
names(models) = rownames(expression)
patients <- names(expression)
mclustOutput <- mclustClassification(models = models,patients = patients,pathToClinicalData,pathToExpressionmatrix)
cleanOutput <- (lapply(1:length(mclustOutput$Genes),function(i){
#corrects genes with an empty group to the correct modus, means, sds, sizes and groups
isNull <- unlist(lapply(1:length(mclustOutput$Genes[[i]]$groups),function(j){
if(is.null(unlist(mclustOutput$Genes[[i]]$groups[j]))||length(unlist(mclustOutput$Genes[[i]]$groups[j]))==0){j}
}))
if(!is.null(isNull)){
modus <- mclustOutput$Genes[[i]]$modus -1
means <- mclustOutput$Genes[[i]]$means[-isNull]
sds <- mclustOutput$Genes[[i]]$sds[-isNull]
groups <- plyr::compact(mclustOutput$Genes[[i]]$groups)
groups <- groups[lapply(groups,length)>0]
sizes <- unlist(lapply(1:length(groups),function(j){
length(unlist(expression[i,groups[[j]]]))
}))
}else{
modus <- mclustOutput$Genes[[i]]$modus
means <- mclustOutput$Genes[[i]]$means
sds <- mclustOutput$Genes[[i]]$sds
sizes <- mclustOutput$Genes[[i]]$sizes
groups <- mclustOutput$Genes[[i]]$groups
}
if(length(groups)!=modus){
modus <- length(groups)
means <- unlist(lapply(1:length(groups),function(j){
mean(unlist(expression[i,groups[[j]]]))
}))
sizes <- unlist(lapply(1:length(groups),function(j){
length(groups[[j]])
}))
sds <- unlist(lapply(1:length(groups),function(j){
sd(unlist(expression[i,groups[[j]]]))
}))
}
mclustOutput <- list("modus"=as.integer(modus),"means"=means,"sds"=sds,"sizes"=as.integer(sizes),"groups"=groups)
mclustOutput
}))
names(cleanOutput) <- names(mclustOutput$Genes)
mclustOutput <- list("Genes"=cleanOutput,
"ClinicalData"=mclustOutput$ClinicalData,
"Expressionmatrix"=mclustOutput$Expressionmatrix)
return(mclustOutput)
}
#' mclustClassification
#'
#' Using the output of the mclust algorithm for generating output in a unified
#' format to evaluate the classification (unimodal,bimodal or other) of genes.
#'
#' @param models Unformatted output of the Mclust function from the mclust
#' package of the genes from the gene expression data frame
#' @param patients List of the patients
#' @param pathToClinicalData If real data is used the path
#' to the clinical data can be stored here
#' @param pathToExpressionmatrix If real data is used the path
#' to the gene expression matrix can be stored here
#'
#' @return The formatted output of the mclust algorithm
#'
mclustClassification <- function(models,patients,pathToClinicalData,pathToExpressionmatrix){
genes <- names(models)
outputs <- lapply(1:length(genes),function(i){
classification <- unname(models[[i]]$classification)
groups <- lapply(1:max(classification),function(j){
patients[(which(classification==j))]
})
proportions <- models[[i]]$parameters$pro
modus <- length(models[[i]]$parameters$mean)
output <- list(
"modus" = modus,
"means" = unname(models[[i]]$parameters$mean),
"sds" = sqrt(models[[i]]$parameters$variance$sigmasq),
"sizes" = unname(table(classification[])),
"groups" = groups
)
})
names(outputs)<- genes
mclustOutput <- list(Genes = outputs,"ClinicalData"=pathToClinicalData,"Expressionmatrix"=pathToExpressionmatrix)
return(mclustOutput)
}
#' UseHdbscan
#'
#' Use the HDBSCAN algorithm for generating output to evaluate the
#' classification (unimodal,bimodal or other) of genes.
#' @details
#' For details on how HDBSCAN works and
#' for details on what parameters with ** do,
#' see http://hdbscan.readthedocs.io/en/latest/
#' @param expression (Simulated) Gene Expression data frame which genes shall
#' be classified as unimodal, bimodal or other
#' @param minClusterSize ** Integer; The minimum number of samples
#' in a group for that group to be considered a cluster
#' @param minSamples ** Integer; The number of samples
#' in a neighborhood for a point to be considered as a core point.
#' This includes the point itself. If NULL: defaults to the min_cluster_size.
#' @param verbose logical. Whether to print progress messages
#' @param pathToClinicalData If real data is used the path
#' to the clinical data can be stored here
#' @param pathToExpressionmatrix If real data is used the path
#' to the gene expression matrix can be stored here
#'
#' @return hdbscanOutput The formatted output of the HDBSCAN clustering
#'
#' @examples
#' \dontrun{
#' params <- newParams(nGenes = list("1"=10,"2"=list("gauss"= 10,"gamma"= 10)))
#' simulation <- simulateExpression(params= params,verbose=FALSE)
#' expression <- simulation$expressionData
#' #with messages printed to console
#' hdbscanOutput <- useHdbscan(expression = expression)
#' #with specified minPts parameter
#' hdbscanOutput <- useHdbscan(expression = expression, minClusterSize = 15)
#' #without messages printed to console
#' hdbscanOutput <- useHdbscan(expression = expression, verbose = FALSE)
#' }
#' @importFrom reticulate import
#' @importFrom plyr compact
#' @export
useHdbscan <- function(expression,minClusterSize = 10,minSamples=NULL,verbose = TRUE,pathToClinicalData="pathTo/clinicalData",pathToExpressionmatrix="pathTo/expressionmatrix"){
if(is.null(minSamples)){
minSamples=minClusterSize
}
if(verbose){message("HDBSCAN-clusters are calculated...")}
py.hdbscan <- reticulate::import("hdbscan")
py.numpy <- reticulate::import("numpy")
hdbscan <- py.hdbscan$HDBSCAN(min_cluster_size = as.integer(minClusterSize),min_samples = as.integer(minSamples),allow_single_cluster = TRUE)
genes <- rownames(expression)
patients <- names(expression)
outputs <- lapply(1:nrow(expression),function(i){
expr <- t(expression[i,])
clustering <- data.frame(expr,cluster=(hdbscan$fit_predict(as.matrix(unlist(expr)))))
means <- plyr::compact(unname(by(as.numeric(expression[i,]),INDICES = clustering$cluster,
FUN = mean)))
sds <- plyr::compact(unname(by(as.numeric(expression[i,]),INDICES = clustering$cluster,
FUN = sd)))
classification <- clustering$cluster
if(length(means)>1){
classificationMinDistance <- unlist(lapply(1:ncol(expression), function(j){
if(classification[[j]]==-1){
difference <- abs(means[2:length(table(classification))]-expression[i,j])
minDif <- min(difference)
which(difference == minDif)-1
}else{
classification[j]
}
}))
means <- plyr::compact(unname(by(as.numeric(expression[i,]),INDICES = classificationMinDistance,
FUN = mean)))
sds <- plyr::compact(unname(by(as.numeric(expression[i,]),INDICES = classificationMinDistance,
FUN = sd)))
if(length(means)>1){
groups <- lapply(0:(length(means)-1),function(j){
patients[(which(classificationMinDistance==j))]
})
}else{
groups <- list(patients)
}
}else{
groups <- list(patients)
classificationMinDistance <- classification
}
modus <- "modal"
if((max(classification))==0||(max(classification))==-1){
modus <- 1
}else{
modus <- max(classification)+1
}
output <- list(
"modus" = as.integer(modus),
"means" = means,
"sds" = sds,
"sizes" = unname(table(classificationMinDistance)),
"groups" = groups
)
})
names(outputs)<- genes
hdbscanOutput <- list(Genes = outputs,"ClinicalData"=pathToClinicalData,"Expressionmatrix"=pathToExpressionmatrix)
cleanOutput <- (lapply(1:length(hdbscanOutput$Genes),function(i){
#corrects genes with an empty group to the correct modus, means, sds, sizes and groups
isNull <- unlist(lapply(1:length(hdbscanOutput$Genes[[i]]$groups),function(j){
if(is.null(unlist(hdbscanOutput$Genes[[i]]$groups[j]))||length(unlist(hdbscanOutput$Genes[[i]]$groups[j]))==0){j}
}))
if(!is.null(isNull)){
modus <- hdbscanOutput$Genes[[i]]$modus -1
means <- hdbscanOutput$Genes[[i]]$means[-isNull]
sds <- hdbscanOutput$Genes[[i]]$sds[-isNull]
groups <- plyr::compact(hdbscanOutput$Genes[[i]]$groups)
groups <- groups[lapply(groups,length)>0]
sizes <- unlist(lapply(1:length(groups),function(j){
length(unlist(expression[i,groups[[j]]]))
}))
}else{
modus <- hdbscanOutput$Genes[[i]]$modus
means <- hdbscanOutput$Genes[[i]]$means
sds <- hdbscanOutput$Genes[[i]]$sds
sizes <- hdbscanOutput$Genes[[i]]$sizes
groups <- hdbscanOutput$Genes[[i]]$groups
}
hdbscanOutput <- list("modus"=as.integer(modus),"means"=means,"sds"=sds,"sizes"=as.integer(sizes),"groups"=groups)
hdbscanOutput
}))
names(cleanOutput) <- names(hdbscanOutput$Genes)
hdbscanOutput <- list("Genes"=cleanOutput,
"ClinicalData"=hdbscanOutput$ClinicalData,
"Expressionmatrix"=hdbscanOutput$Expressionmatrix)
return(hdbscanOutput)
}
#' UseFlexmix
#'
#' Use the FlexMix algorithm to create models and to find the best fitting
#' model to evaluate the classification (unimodal,bimodal or other) of genes.
#'
#' @param expression (Simulated) Gene Expression data frame which genes shall
#' be classified as unimodal, bimodal or other
#' @param maxModality numerical. maximum modality for that models will be calculated
#' @param verbose logical. Whether to print progress messages
#' @param parallel logical. Whether to calculate the models in parallel
#' @param pathToClinicalData If real data is used the path
#' to the clinical data can be stored here
#' @param pathToExpressionmatrix If real data is used the path
#' to the gene expression matrix can be stored here
#' @return flexmixOutput The formatted output of the best fitting models from flexmix
#'
#' @examples
#' \dontrun{
#' params <- newParams(nGenes = list("1"=75,"2"=list("gauss"= 75,"gamma"= 75)))
#' simulation <- simulateExpression(params= params,verbose=FALSE)
#' expression <- simulation$expressionData
#' #with messages printed to console
#' flexmixOutput <- useFlexmix(expression = expression, maxModality = 2)
#' #without messages printed to console
#' flexmixOutput <- useFlexmix(expression = expression, maxModality = 2, verbose = FALSE)
#' }
#' @import flexmix
#' @importFrom doParallel registerDoParallel
#' @importFrom parallel detectCores makeCluster stopCluster
#' @importFrom foreach foreach "%dopar%"
#' @importFrom plyr compact
#' @importFrom stats BIC
#' @export
#'
useFlexmix <- function(expression,maxModality=3,verbose = TRUE,parallel=FALSE,pathToClinicalData="pathTo/clinicalData",pathToExpressionmatrix="pathTo/expressionmatrix"){
'%dopar%' <- foreach::`%dopar%`
if (verbose){message("flexmix models for gaussian and gamma mixture models are calculated and compared...")}
genes <- rownames(expression)
patients <- names(expression)
expression2 <- (lapply(1:nrow(expression),function(i){
unlist(lapply(1:length(expression),function(j){
if((expression[i,j]>0)){
c(expression[i,j])
}else{
c(1)
}
}))
}))
expression2 <- data.frame(do.call('rbind',expression2))
colnames(expression2) <- colnames(expression)
rownames(expression2) <- rownames(expression)
if(parallel){
modelGauss <- flexmix::FLXMCnorm1()
modelGamma <- flexmix::FLXMCdist1(dist = "gamma")
nCores = parallel::detectCores()-1
cl <- parallel::makeCluster(nCores)
doParallel::registerDoParallel(cl)
outputs <- foreach::foreach(i = 1:nrow(expression),.packages = c("flexmix")) %dopar% {
flexModelGammaAll <- NULL
flexModelGaussAll <- NULL
flexModelGaussAll <- plyr::compact(lapply(1:maxModality,function(j){
modeled <- try(flexmix::flexmix(t(expression[i,])~1,model = modelGauss,k=j),silent = TRUE)
if(class(modeled)!="try-error"){
modeled
}
}))
if(length(flexModelGaussAll)>0){
BICModelGauss <- unlist(lapply(1:length(flexModelGaussAll),function(j){
stats::BIC(flexModelGaussAll[[j]])
}))
bestBICGauss <- unique(min(BICModelGauss))
bestModelGauss <- flexModelGaussAll[[min(which(BICModelGauss==bestBICGauss))]]
}
flexModelGammaAll <- plyr::compact(lapply(1:maxModality,function(j){
modeled <- try(flexmix::flexmix(t(expression[i,])~1,model = modelGamma,k=j),silent = TRUE)
if(class(modeled)!="try-error"){
modeled
}
}))
if(length(flexModelGammaAll)>0){
BICModelGamma <- unlist(lapply(1:length(flexModelGammaAll),function(j){
stats::BIC(flexModelGammaAll[[j]])
}))
bestBICGamma <- unique(min(BICModelGamma))
bestModelGamma <- flexModelGammaAll[[min(which(BICModelGamma==bestBICGamma))]]
}
if(length(flexModelGaussAll)==0||length(flexModelGammaAll)==0){
if(length(flexModelGaussAll)==0&&length(flexModelGammaAll)==0){
stop('no model fit')
}else{
if(length(flexModelGammaAll)==0){
bestModel <- "gauss"
}else{
bestModel <- "gamma"
}
}
}else{
bestModel <- c()
if(bestBICGauss<bestBICGamma){
bestModel <- "gauss"
}else{
bestModel <- "gamma"
}
}
means <- sds <- sizes <- classification <- groups <- modus <- c()
if(bestModel=="gauss"){
means <- unname(flexmix::parameters(bestModelGauss)[1,])
sds <- unname(flexmix::parameters(bestModelGauss)[2,])
sds <- sds[order(means,decreasing = FALSE)]
means <- sort(means,decreasing = FALSE)
sizes <- unname(bestModelGauss@size)
classification <- unname(bestModelGauss@cluster)
groups <- lapply(1:max(classification),function(j){
patients[(which(classification==j))]
})
modus <- length(means)
}else{
sizes <- unname(bestModelGamma@size)
classification <- unname(bestModelGamma@cluster)
groups <- lapply(1:max(classification),function(j){
patients[(which(classification==j))]
})
shape <- unname(flexmix::parameters(bestModelGamma)[1,])
rate <- unname(flexmix::parameters(bestModelGamma)[2,])
means <- shape/rate
sds <- sqrt(shape/(rate*rate))
sds <- sds[order(means,decreasing = FALSE)]
means <- sort(means,decreasing = FALSE)
modus <- length(means)
}
output <- list(
"modus" = modus,
"means" = means,
"sds" = sds,
"sizes" = sizes,
"groups" = groups
)
}
if(parallel==TRUE){parallel::stopCluster(cl)}
}else{
modelGauss <- flexmix::FLXMCnorm1()
modelGamma <- flexmix::FLXMCdist1(dist = "gamma")
outputs <- lapply(1:nrow(expression),function(i){
flexModelGammaAll <- NULL
flexModelGaussAll <- NULL
flexModelGaussAll <- plyr::compact(lapply(1:maxModality,function(j){
modeled <- try(flexmix::flexmix(t(expression[i,])~1,model = modelGauss,k=j),silent = TRUE)
if(class(modeled)!="try-error"){
modeled
}
}))
if(length(flexModelGaussAll)>0){
BICModelGauss <- unlist(lapply(1:length(flexModelGaussAll),function(j){
stats::BIC(flexModelGaussAll[[j]])
}))
bestBICGauss <- unique(min(BICModelGauss))
bestModelGauss <- flexModelGaussAll[[min(which(BICModelGauss==bestBICGauss))]]
}
flexModelGammaAll <- plyr::compact(lapply(1:maxModality,function(j){
modeled <- try(flexmix::flexmix(t(expression[i,])~1,model = modelGamma,k=j),silent = TRUE)
if(class(modeled)!="try-error"){
modeled
}
}))
if(length(flexModelGammaAll)>0){
BICModelGamma <- unlist(lapply(1:length(flexModelGammaAll),function(j){
stats::BIC(flexModelGammaAll[[j]])
}))
bestBICGamma <- unique(min(BICModelGamma))
bestModelGamma <- flexModelGammaAll[[min(which(BICModelGamma==bestBICGamma))]]
}
if(length(flexModelGaussAll)==0||length(flexModelGammaAll)==0){
if(length(flexModelGaussAll)==0&&length(flexModelGammaAll)==0){
stop('no model fit')
}else{
if(length(flexModelGammaAll)==0){
bestModel <- "gauss"
}else{
bestModel <- "gamma"
}
}
}else{
bestModel <- c()
if(bestBICGauss<bestBICGamma){
bestModel <- "gauss"
}else{
bestModel <- "gamma"
}
}
means <- sds <- sizes <- classification <- groups <- modus <- c()
if(bestModel=="gauss"){
means <- unname(flexmix::parameters(bestModelGauss)[1,])
sds <- unname(flexmix::parameters(bestModelGauss)[2,])
sds <- sds[order(means,decreasing = FALSE)]
means <- sort(means,decreasing = FALSE)
sizes <- unname(bestModelGauss@size)
classification <- unname(bestModelGauss@cluster)
groups <- lapply(1:max(classification),function(j){
patients[(which(classification==j))]
})
modus <- length(means)
}else{
sizes <- unname(bestModelGamma@size)
classification <- unname(bestModelGamma@cluster)
groups <- lapply(1:max(classification),function(j){
patients[(which(classification==j))]
})
shape <- unname(flexmix::parameters(bestModelGamma)[1,])
rate <- unname(flexmix::parameters(bestModelGamma)[2,])
means <- shape/rate
sds <- sqrt(shape/(rate*rate))
sds <- sds[order(means,decreasing = FALSE)]
means <- sort(means,decreasing = FALSE)
modus <- length(means)
}
output <- list(
"modus" = modus,
"means" = means,
"sds" = sds,
"sizes" = sizes,
"groups" = groups
)
})
}
names(outputs)<- genes
flexmixOutput <- list(Genes = outputs,"ClinicalData"=pathToClinicalData,"Expressionmatrix"=pathToExpressionmatrix)
cleanOutput <- (lapply(1:length(flexmixOutput$Genes),function(i){
#corrects genes with an empty group to the correct modus, means, sds, sizes and groups
isNull <- unlist(lapply(1:length(flexmixOutput$Genes[[i]]$groups),function(j){
if(is.null(unlist(flexmixOutput$Genes[[i]]$groups[j]))||length(unlist(flexmixOutput$Genes[[i]]$groups[j]))==0){j}
}))
if(!is.null(isNull)){
modus <- flexmixOutput$Genes[[i]]$modus -1
means <- flexmixOutput$Genes[[i]]$means[-isNull]
sds <- flexmixOutput$Genes[[i]]$sds[-isNull]
groups <- plyr::compact(flexmixOutput$Genes[[i]]$groups)
groups <- groups[lapply(groups,length)>0]
sizes <- unlist(lapply(1:length(groups),function(j){
length(unlist(expression[i,groups[[j]]]))
}))
}else{
modus <- flexmixOutput$Genes[[i]]$modus
means <- flexmixOutput$Genes[[i]]$means
sds <- flexmixOutput$Genes[[i]]$sds
sizes <- flexmixOutput$Genes[[i]]$sizes
groups <- flexmixOutput$Genes[[i]]$groups
}
flexmixOutput <- list("modus"=as.integer(modus),"means"=means,"sds"=sds,"sizes"=as.integer(sizes),"groups"=groups)
flexmixOutput
}))
names(cleanOutput) <- names(flexmixOutput$Genes)
flexmixOutput <- list("Genes"=cleanOutput,
"ClinicalData"=flexmixOutput$ClinicalData,
"Expressionmatrix"=flexmixOutput$Expressionmatrix)
return(flexmixOutput)
}