Skip to content
Permalink
7a26db24b2
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
@sbeck
Latest commit 5ca5931 Aug 30, 2018 History
Checked and updated all functions
Added multiple filtering functions
Added functions to process TCGA data
Added data to exclude genes on allosomes
Checked & updated vignettes
Changed Package name to multimodalR
Added function to check for valid unified output data format
Added function to plot Survival Curves for TCGA data
Added function to check for known bimodal genes in output
First Complete Version of multimodalR
2 contributors

Users who have contributed to this file

@sbeck @jenzopr
285 lines (270 sloc) 11.8 KB
#' Simulation
#'
#' Simulate gene expression for the different simulation scenarios: Unimodal,
#' multimodal with gaussian distributions, multimodal starting with a gamma distribution and followed by gaussian distributions
#'
#' @param params Params object containing simulation parameters for all scenarios.
#' @param verbose logical. Whether to print progress messages
#' @param ... any additional parameter settings to override what is provided in \code{params}.
#'
#' @details
#' The 'modus' is the number of occurring distributions within one gene.
#' Modus '1' - Unimodal gaussian distribution
#' Gene expression for the unimodal gaussian distribution (scenario '1') is
#' simulated from the mean and the standard deviation taken from the validation data
#' for this gene. The number of values that are simulated for each
#' distribution equals the number of Patients {'nPatients'}:
#'
#' All other modi consist of two different scenarios:
#' In the 'gauss' scenario all occurring distributions are gaussian distributions.
#' In the 'gamma' scenario the first occuring distribution is a gamma distribution.
#' All other distributions are gaussian distributions.
#'
#' Modus '2' or higher - Bimodal distributions or higher modalities
#'
#' Gene expressions of the multimodal genes are
#' simulated automatically for each distribution.
#' Scenario 'gauss'
#' The different gaussian distributions are simulated with the corresponding mean
#' and a standard deviation and number of values taken from the validation data
#' for this gene.
#'
#' Scenario 'gamma'
#' The gamma distribution is simulated with the gamma shape and rate values from
#' the params object. The other gaussian distributions are simulated with the values
#' from the validation data.
#'
#' @return A named list with two components: 'validationData' and 'expressionData'
#'
#' @examples
#' \dontrun{
#' simulation <- simulateExpression(params = newParams())
#' # Override default parameters
#' # Not recommended, better use setParams; See \code{\link{Params}}
#' simulation <- simulateExpression(nGenes = list("1"=10,
#' "2"=list("gauss"= 10,"gamma"= 10)), nPatients = 50)
#' # without messages printed to the console
#' params <- newParams()
#' simulation <- simulateExpression(params = params,verbose = FALSE)
#' }
#' @export
#' @importFrom checkmate assertClass assertString assertCharacter assertList
#' @importFrom stats dnorm rgamma rnorm runif sd
#'
simulateExpression <- function(params = newParams(), verbose = TRUE,...) {
checkmate::assertClass(params, "Params")
params <- setParams(params, ...)
seed <- getParam(params, "seed")
set.seed(seed)
valData <- createValidationData(params = params, verbose = verbose)
nPatients <- getParam(params,"nPatients")
numGenes <- sum(unlist(getParam(params,name = "nGenes")))
if (verbose){message("Drawing gene expression values...")}
simNotAdjusted <- lapply(1:numGenes,function(i){
if(valData[[i]]$scenario=="unimodal-gaussian"){
expression <- stats::rnorm(n = nPatients,mean = valData[[i]]$means,sd = valData[[i]]$sds)
}else if(valData[[i]]$scenario=="multimodal-gaussian"){
expression <- unlist(lapply(1:valData[[i]]$modus,function(j){
if((sum(valData[[i]]$sizes)-valData[[i]]$sizes[j]+ceiling(valData[[i]]$sizes[j]))<=nPatients){
expression <- stats::rnorm(n = ceiling(valData[[i]]$sizes[j]),mean = valData[[i]]$means[j],sd = valData[[i]]$sds[j])
}else{
expression <- stats::rnorm(n = (valData[[i]]$sizes[j]),mean = valData[[i]]$means[j],sd = valData[[i]]$sds[j])
}
}))
}else if(valData[[i]]$scenario=="multimodal-gamma+gaussian"){
expression <- unlist(lapply(1:valData[[i]]$modus,function(j){
if(j==1){
if((sum(valData[[i]]$sizes)-valData[[i]]$sizes[j]+ceiling(valData[[i]]$sizes[j]))<=nPatients){
expression <- stats::rgamma(n = ceiling(valData[[i]]$sizes[j]),shape = params@gamma$shape,rate = params@gamma$rate)
}else{
expression <- stats::rgamma(n = (valData[[i]]$sizes[j]),shape = params@gamma$shape,rate = params@gamma$rate)
}
}else{
if((sum(valData[[i]]$sizes)-valData[[i]]$sizes[j]+ceiling(valData[[i]]$sizes[j]))<=nPatients){
expression <- stats::rnorm(n = ceiling(valData[[i]]$sizes[j]),mean = valData[[i]]$means[j],sd = valData[[i]]$sds[j])
}else{
expression <- stats::rnorm(n = (valData[[i]]$sizes[j]),mean = valData[[i]]$means[j],sd = valData[[i]]$sds[j])
}
}
}))
}
expression
})
simNotAdjusted <- data.frame(do.call('rbind',simNotAdjusted))
sim <- lapply(1:nrow(simNotAdjusted),function(i){
values <- unlist(lapply(1:ncol(simNotAdjusted),function(j){
if(simNotAdjusted[i,j]<0){
0
}else{
simNotAdjusted[i,j]
}
}))
})
sim <- data.frame(do.call('rbind',sim))
genes <- names(valData)
patients <- sprintf("Patient%04d", 1:nPatients)
rownames(sim) <- genes
colnames(sim) <- patients
if(verbose){message("Finished.")}
return(list(validationData = valData, expressionData = sim))
}
#' Creating the validation data for algorithms.
#'
#' The validation data is generated by using the `Params` object and the parameters
#' that are stored in it. It is generated automatically when the
#' simulateExpression() function is executed
#' @param params Params object containing simulation parameters for all scenarios.
#' @param verbose logical. Whether to print progress messages
#'
#' @return A data.frame with validation data, from which expression values will be simulated.
#'
#' @details
#' For changing the parameters, please consider the following:
#'
#' Type shows with which type the slot has to be filled,
#' Usage shows further information for the usage of the values,
#' Default values describes the default values used to generate a `Params`
#' object.
#'
#' Parameter Type Default values
#'
#' nGenes: list, Default value: list("1"= 700, "2" = list("gauss"=150,"gamma"=150)),
#' means: list, Default value: list("1"=c(2,4),"2" = c(2,4)),
#' foldChanges: list, Default value: list("1" = c(2,4),"2" = list("gauss"= c(2,4),"gamma" = c(3,5))),
#' sd: list, Default value: list("mu"= 0.61,"lambda"=2.21),
#' gamma: list, Default value: list("shape" = 2, "rate" = 2),
#' proportions: vector, Default value: c(10,20,30,40,50,60,70,80,90),
#' nPatients: numeric, Default value: 200,
#' seed: numeric, Default value: sample(1:1e6, 1)
#'
#' @importFrom LaplacesDemon rinvgaussian
#'
createValidationData <- function(params = newParams(),verbose = TRUE){
numGenes <- sum(unlist(getParam(params,name = "nGenes")))
if (verbose) { message("Drawing parameters for expression simulation...") }
genes <- sprintf("Gene%04d",1:numGenes)
reps <- c()
reps <- unlist(lapply(1:length(unlist(params@nGenes)),function(i){
if(i == 1){
reps <- rep(x = i,times = params@nGenes$`1`)
}else{
if(i%%2==0){
reps <- c(reps,rep(x = i, times = sum(unlist(eval(parse(text=(paste0("params@nGenes$`",floor(i/2)+1,"`$gauss"))))))))
}else{
reps <- c(reps,rep(x = i, times = sum(unlist(eval(parse(text=(paste0("params@nGenes$`",floor(i/2)+1,"`$gamma"))))))))
}
}
}))
scenarios <- sample(x = reps,size = numGenes,replace = FALSE)
drawGeneParams <- function(nPatients,modality, proportions,means=NULL,sdParms,foldChanges = NULL,gammaParms = NULL){
ranMean1 <- c()
ranSd1 <- c()
if(!is.null(gammaParms)){
ranMean1 <- gammaParms$shape/gammaParms$rate
ranSd1 <- sqrt(gammaParms$shape/(gammaParms$rate^2))
}else{
ranMean1 <- stats::runif(1, min = means[1], max = means[2])
ranSd1 <- LaplacesDemon::rinvgaussian(n = 1, mu = sdParms$mu, lambda = sdParms$lambda)
}
ranProps <- 100
ranFCs <- NA
means <- c()
ranSd <- c()
if(modality >= 2){
repProp <- rep(proportions,modality)
combs <- t(utils::combn(x = repProp,m = modality,simplify=TRUE))
combinations <- unlist(lapply(1:nrow(combs),function(i){
if(sum(combs[i,])==100){
i
}
}))
selectedCombs <- combs[combinations,]
combsSorted <- lapply(1:nrow(selectedCombs),function(i){
sort(selectedCombs[i,])
})
combs <- do.call('rbind',combsSorted)
uniqueCombs <- unique.array(combs)
sampleProp <- sample(1:nrow(uniqueCombs),1,replace = FALSE)
ranProps <- sample(uniqueCombs[sampleProp,])
ranSd<- unlist(lapply(2:modality,function(i){
eval(parse(text = paste(paste0("ranSd",i),
"<- ",
LaplacesDemon::rinvgaussian(n = 1, mu = sdParms$mu, lambda = sdParms$lambda))))
}))
if(modality==2){
ranFCs <- unlist(lapply(1:1,function(i){
eval(parse(text = paste(paste0("ranFC",i),
"<-",(stats::runif(1, min = foldChanges[1], max = foldChanges[2])))))
}))
}else{
ranFCs <- unlist(lapply(1:(length(foldChanges)),function(i){
if(modality ==2){
eval(parse(text = paste(paste0("ranFC",i),
"<-",(stats::runif(1, min = foldChanges[1], max = foldChanges[2])))))
}else{
eval(parse(text = paste(paste0("ranFC",i),
"<-",(stats::runif(1, min = foldChanges[[i]][1], max = foldChanges[[i]][2])))))
}
}))
}
eval(parse(text = paste(paste0("ranMean",2:modality),
"<-",NA)))
for(i in 1:modality){
if(i!=1){
eval(parse(text = paste(paste0("ranMean",(i)),"<-",paste0("ranMean",(i-1)),"*",paste0("ranFCs[",(i-1),"]"))))
}
}
}
sds <- c(ranSd1,ranSd)
means <- unlist(lapply(1:modality,function(i){
eval(parse(text = paste(paste0("ranMean",(i)))))
}))
sizes <- (ranProps/100)*nPatients
patients <- 1
groups <- lapply(1:length(sizes),function(i){
if(i==1){
sprintf("Patient%04d",1:sizes[i])
}else{
sprintf("Patient%04d",((sum(sizes[1:(i-1)])+1):(sizes[i]+sum(sizes[1:(i-1)]))))
}
})
scenario <-c()
if(modality==1){
scenario = "unimodal-gaussian"
}else if(modality>1&&is.null(gammaParms)){
scenario = "multimodal-gaussian"
}else{
scenario <- "multimodal-gamma+gaussian"
}
return(list("modus"=modality,"scenario"=scenario,"means"=means,"foldChanges"=ranFCs,"sds"=sds,"proportions"=ranProps,"sizes"=sizes,"groups"=groups))
}
valData <- lapply(1:length(scenarios), function(i) {
if(scenarios[i]==1){
gP <- drawGeneParams(nPatients = params@nPatients,
modality = 1,
proportions = params@proportions,
means = params@means$`1`,
sdParms = params@sd)
}
if((scenarios[i]>1)&&(scenarios[i]%%2==0)){
gP <- drawGeneParams(nPatients = params@nPatients,
modality = (floor(scenarios[i]/2))+1,
proportions = params@proportions,
means = eval(parse(text = paste(paste0("params@means$`",((floor(scenarios[i]/2))+1),"`")))),
sdParms = params@sd,
foldChanges = eval(parse(text = paste(paste0("params@foldChanges$`",((floor(scenarios[i]/2))+1),"`$gauss")))))
}
if((scenarios[i]>1)&&(scenarios[i]%%2!=0)){
gP <- drawGeneParams(nPatients = params@nPatients,
modality = (floor(scenarios[i]/2))+1,
proportions = params@proportions,
sdParms = params@sd,
foldChanges = eval(parse(text = paste(paste0("params@foldChanges$`",((floor(scenarios[i]/2))+1),"`$gamma")))),
gammaParms = params@gamma)
}
return(gP)
})
names(valData)<- genes
if(verbose){ message("Finished creating validation data.") }
return(valData)
}