bimodalR to multimodalR

Checked and updated all functions Added multiple filtering functions Added functions to process TCGA data Added data to exclude genes on allosomes Checked & updated vignettes Changed Package name to multimodalR Added function to check for valid unified output data format Added function to plot Survival Curves for TCGA data Added function to check for known bimodal genes in output First Complete Version of multimodalR
loosolab · Aug 30, 2018 · 5ca5931 · 5ca5931
1 parent 8d34ba3
commit 5ca5931
Show file tree

Hide file tree

Showing 69 changed files with 4,286 additions and 368 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -1,2 +1,3 @@
 ^.*\.Rproj$
 ^\.Rproj\.user$
+^data-raw$
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,9 +1,9 @@
-Package: bimodalR
-Title: Simulation of bimodal gene expression and evaluation of algorithms for bimodality detection
+Package: multimodalR
+Title: Simulation of multimodal gene expression, evaluation of algorithms detecting multimodality in gene expression data sets, and detection of multimodal genes in gene expression data sets
 Version: 0.0.0.9000
-Authors@R: c(person("Svenja", "Beck", email = "svenja-kristina.beck@mpi-bn.mpg.de", role = c("aut", "cre")),
+Authors@R: c(person("Svenja Kristina", "Beck", email = "svenja-kristina.beck@mpi-bn.mpg.de", role = c("aut", "cre")),
              person("Jens", "Preussner", email = "jens.preussner@mpi-bn.mpg.de", role = c("aut")))
-Description: What the package does (one paragraph).
+Description: multimodalR is an R package to simulate multimodal gene expression data, to evaluate different algorithms detecting multimodality and to detect bimodality or multimodality in data sets.
 Depends: R (>= 3.4.2)
 License: MIT + file LICENSE
 Encoding: UTF-8
@@ -17,8 +17,22 @@ Imports: LaplacesDemon,
     flexmix,
     plyr,
     mclust,
-    diptest
-Suggests: genefilter,
-    knitr,
+    diptest,
+    ggplot2,
+    reshape2,
+    biomaRt,
+    doParallel,
+    foreach,
+    reticulate,
+    stats,
+    dplyr,
+    magrittr,
+    stringr,
+    survival,
+    ggkm,
+    KEGGREST,
+    cowplot,
+    utils
+Suggests: knitr,
     rmarkdown
 VignetteBuilder: knitr
diff --git a/NAMESPACE b/NAMESPACE
@@ -1,40 +1,106 @@
 # Generated by roxygen2: do not edit by hand
 
-export(DipTestOverP)
-export(SilvermanOverP)
+export(cleanOutput)
 export(evaluateAlgorithmOutput)
+export(filter0sumGenes)
+export(filterForFoldChange)
+export(filterForKnownInCancer)
+export(filterForMean)
+export(filterForXChromosomeGenes)
+export(filterForYChromosomeGenes)
+export(filterMultimodalGenes)
+export(filterRarelyExpressedGenes)
 export(getClassifications)
+export(getFilteredData)
 export(getParam)
 export(getParams)
 export(getStatistics)
+export(isValidUnifiedOutputDataFormat)
 export(newParams)
+export(plotClassifications)
+export(plotExpression)
+export(plotOutput)
+export(plotStatistics)
+export(plotSurvivalCurves)
+export(processOneCancerGroup)
 export(setParam)
 export(setParams)
 export(simulateExpression)
+export(toCancerDataGroups)
+export(updateGeneNames)
+export(useDiptest)
 export(useFlexmix)
 export(useHdbscan)
 export(useMclust)
+export(useSilvermantest)
 exportClasses(Params)
-import(dbscan)
-import(diptest)
 import(flexmix)
 import(mclust)
-import(silvermantest)
+importFrom(KEGGREST,keggGet)
 importFrom(LaplacesDemon,rinvgaussian)
+importFrom(biomaRt,getBM)
+importFrom(biomaRt,useMart)
 importFrom(checkmate,assertCharacter)
 importFrom(checkmate,assertClass)
 importFrom(checkmate,assertList)
 importFrom(checkmate,assertString)
+importFrom(cowplot,add_sub)
+importFrom(cowplot,ggdraw)
+importFrom(diptest,dip.test)
+importFrom(doParallel,registerDoParallel)
+importFrom(dplyr,mutate)
+importFrom(dplyr,select)
+importFrom(foreach,"%dopar%")
+importFrom(foreach,foreach)
+importFrom(ggkm,geom_km)
+importFrom(ggplot2,aes)
+importFrom(ggplot2,coord_fixed)
+importFrom(ggplot2,element_text)
+importFrom(ggplot2,facet_wrap)
+importFrom(ggplot2,geom_bar)
+importFrom(ggplot2,geom_density)
+importFrom(ggplot2,ggplot)
+importFrom(ggplot2,ggtitle)
+importFrom(ggplot2,labs)
+importFrom(ggplot2,scale_color_discrete)
+importFrom(ggplot2,scale_fill_manual)
+importFrom(ggplot2,stat_function)
+importFrom(ggplot2,theme)
+importFrom(ggplot2,theme_bw)
+importFrom(ggplot2,xlab)
+importFrom(ggplot2,ylab)
+importFrom(grDevices,dev.off)
+importFrom(grDevices,grey.colors)
+importFrom(grDevices,pdf)
+importFrom(graphics,hist)
+importFrom(graphics,par)
+importFrom(magrittr,"%>%")
 importFrom(methods,"slot<-")
 importFrom(methods,new)
 importFrom(methods,rbind2)
 importFrom(methods,slot)
 importFrom(methods,slotNames)
 importFrom(methods,validObject)
+importFrom(parallel,clusterEvalQ)
+importFrom(parallel,clusterExport)
+importFrom(parallel,detectCores)
+importFrom(parallel,makeCluster)
+importFrom(parallel,parLapply)
+importFrom(parallel,stopCluster)
 importFrom(plyr,compact)
+importFrom(reshape2,melt)
+importFrom(reticulate,import)
+importFrom(silvermantest,silverman.test)
+importFrom(stats,BIC)
+importFrom(stats,as.formula)
 importFrom(stats,dnorm)
+importFrom(stats,pchisq)
 importFrom(stats,rgamma)
 importFrom(stats,rnorm)
 importFrom(stats,runif)
 importFrom(stats,sd)
+importFrom(stringr,str_replace_all)
+importFrom(survival,Surv)
+importFrom(survival,survdiff)
+importFrom(utils,combn)
 importFrom(utils,head)
diff --git a/R/TCGAFunctions.R b/R/TCGAFunctions.R
@@ -0,0 +1,213 @@
+#TCGA Routine functions
+
+#' toCancerDataGroups
+#' Sorts the TCGA expression data with the metadata into groups of different cancer types
+#' @details The groupsData that is returned consists of the different diagnosis groups
+#' (http://www.icd10data.com/ICD10CM/Codes/C00-D49)
+#' and the expression matrix of each of the diagnosis groups
+#' @param cancerDataExpression TCGA cancer expression data
+#' @param metadata TCGA cancer metadata
+#' @param verbose logical. Whether to print progress messages
+#'
+#' @export
+toCancerDataGroups <- function(cancerDataExpression, metadata,
+                               verbose = TRUE) {
+  if (verbose) {
+    message("Filtering 0 sum rows...")
+  }
+  cancerDataCleared <- filter0sumGenes(cancerData = cancerDataExpression,parallel = TRUE)
+  if (verbose) {
+    message("Splitting the data sets into different cancer groups...")
+  }
+  icd10 <- unique(unlist(lapply(1:length(metadata$diagnosis), function(i) {
+    unlist(strsplit(metadata$diagnosis[i], split = "[.]"))[1]
+  })))
+  patientIds <- colnames(cancerDataCleared)
+  caseIdsUnique <- make.unique(names = as.character(metadata$caseID))
+  diagnosis <- unlist(lapply(1:length(patientIds), function(i) {
+    diagnosis <- metadata$diagnosis[(which(caseIdsUnique == patientIds[i]))]
+    names(diagnosis) <- patientIds[i]
+    diagnosis
+  }))
+  diagnosisGroups <- unique(unlist(lapply(1:length(diagnosis), function(i) {
+    unlist(strsplit(diagnosis[i], split = "[.]"))[1]
+  })))
+  shortDiagnosis <- (unlist(lapply(1:length(diagnosis), function(i) {
+    unlist(strsplit(diagnosis[i], split = "[.]"))[1]
+  })))
+  patientsInGroups <- (lapply(1:length(diagnosisGroups), function(i) {
+    names(diagnosis[which(shortDiagnosis == diagnosisGroups[i])])
+  }))
+  groupsDataSets <- lapply(1:length(patientsInGroups), function(i) {
+    expr <- lapply(1:length(patientsInGroups[[i]]), function(j) {
+      cancerDataCleared[, which(colnames(cancerDataCleared) == patientsInGroups[[i]][j])]
+    })
+    expression <- do.call("cbind", expr)
+    rownames(expression) <- rownames(cancerDataCleared)
+    colnames(expression) <- patientsInGroups[[i]]
+    expression
+  })
+  return(list(diagnosisGroups = diagnosisGroups, groupsExpressionDataSets = groupsDataSets))
+}
+
+#' processOneCancerGroup
+#' Processes one cancer group
+#' @details Processes one cancer group by a) filtering 0 sum rows,
+#' b) using Silverman's test for unimodality,
+#' c) filtering genes with <2% of patients that have expression values greater 0,
+#' d) use useMclust on cancer data,
+#' e) creating list of output and filtered expression matrix
+#' @param cancerData TCGA cancer expression data of one cancer group
+#' @param minExpressedPercentage integer specifying the percentage of patients that should at least be expressed for the gene to be analysed
+#' @param algorithm "mclust", "hdbscan" or "flexmix". Defines which algorithm should be used to process the cancer data
+#' @param maxModality An integer specifying the highest modality to calculate models in with mclust and flexmix
+#' @param minClusterSize Integer; The minimum number of samples
+#' in a group for that group to be considered a cluster in hdbscan
+#' @param minSamples Integer for hdbscan; The number of samples
+#' in a neighborhood for a point to be considered as a core point.
+#' This includes the point itself. If NULL: defaults to the min_cluster_size.
+#' @param pathToClinicalData path to the clinical data
+#' @param pathToExpressionmatrix path to the expression matrix
+#' @param SilvermanP The p-value that is used to reject Silvermans Test for unimodality
+#' (given by k=1 using the Hall/York adjustments)
+#' @param verbose logical. Whether to print progress messages
+#'
+#' @export
+processOneCancerGroup <- function(cancerData,minExpressedPercentage=2,algorithm="mclust", maxModality=3,
+                                  minClusterSize=50,minSamples=NULL,
+                                  pathToClinicalData = "pathToClinicalData",
+                                  pathToExpressionmatrix = "pathToExpressionmatrix",
+                                  SilvermanP = 0.05, verbose = TRUE) {
+  if (verbose) {
+    message("Filtering 0 sum rows...")
+  }
+  clearedData <- filter0sumGenes(cancerData = cancerData,parallel = TRUE)
+  if (verbose) {
+    message("Using SilvermanOverP...")
+  }
+  possibles <- useSilvermantest(clearedData =clearedData,parallel = TRUE,SilvermanP = SilvermanP)
+  if (verbose) {
+    message("Filter genes with only a few patients (<minExpressedPercentage) with expression greater 0...")
+  }
+  toBeProcessed <- filterRarelyExpressedGenes(possibles = possibles,minExpressedPercentage = minExpressedPercentage,parallel = TRUE)
+  if (verbose) {
+    message("UseMclust on the filtered data set...")
+  }
+  Sys.time()
+  output <- c()
+  if(algorithm=="mclust"){
+    output <- useMclust(expression = toBeProcessed, clusterNumbers = c(1:maxModality),verbose = FALSE, parallel = T,pathToClinicalData = pathToClinicalData,pathToExpressionmatrix = pathToExpressionmatrix)
+  }
+  if(algorithm=="hdbscan"){
+    output <- useHdbscan(expression = toBeProcessed,minClusterSize = minClusterSize,minSamples = minSamples,verbose = FALSE,pathToClinicalData = pathToClinicalData,pathToExpressionmatrix = pathToExpressionmatrix)
+  }
+  if(algorithm=="flexmix"){
+    output <- useFlexmix(expression = toBeProcessed,maxModality = maxModality,verbose = FALSE,parallel = T,pathToClinicalData = pathToClinicalData,pathToExpressionmatrix = pathToExpressionmatrix)
+  }
+
+  listProcessedBC <- unlist(lapply(1:length(output$Genes),function(i){
+    which(rownames(toBeProcessed)==names(output$Genes)[i])
+  }))
+  processedData <- toBeProcessed[listProcessedBC,]
+  if (verbose) {
+    message("Return output and expression matrix...")
+  }
+  return(list("Output"= output,"Expressionmatrix"=processedData))
+}
+
+#' getFilteredData
+#' Processes one cancer group with filtering steps
+#' @details Processes one cancer group by
+#' a) filtering 0 sum rows,
+#' b) using Silverman's test for unimodality,
+#' c) filtering genes with <2% of patients that have expression values greater 0,
+#' d) use useMclust on cancer data,
+#' e) cleaning output of useMclust
+#' f) filtering for multimodal genes
+#' g) filtering for existing minimum mean of any mean of the multimodal genes
+#' h) filtering for existing minimum fold change between any adjacent means of multimodal genes
+#' i) getting gene names from Ensembl
+#' j) returning a list of the filtered output and the filtered expression matrix
+#' @param oneCancerGroupExpressionmatrix TCGA cancer expression data of one cancer group
+#' @param algorithm "mclust", "hdbscan" or "flexmix". Defines which algorithm should be used to process the cancer data
+#' @param maxModality An integer specifying the highest modality to calculate models in with mclust and flexmix
+#' @param minClusterSize Integer; The minimum number of samples
+#' in a group for that group to be considered a cluster in hdbscan
+#' @param minSamples Integer for hdbscan; The number of samples
+#' in a neighborhood for a point to be considered as a core point.
+#' This includes the point itself. If NULL: defaults to the min_cluster_size.
+#' @param pathToClinicalData path to the clinical data
+#' @param pathToExpressionmatrix path to the expression matrix
+#' @param minExpressedPercentage integer specifying the percentage of patients that should at least be expressed for the gene to be analysed
+#' @param SilvermanP The p-value that is used to reject Silvermans Test for unimodality
+#' (given by k=1 using the Hall/York adjustments)
+#' @param minPercentage minimum percentage of groups.
+#' Groups with lower percentages will be sorted into the other groups
+#' @param minMean minimum mean that has to exist in any mean of the multimodal genes
+#' @param minFoldChange minimum fold change that has to exist between any adjacent means of multimodal genes
+#' @param coresToUse The number of cores to use for parallel computing of cleanOutput() if parallel is TRUE
+#' @param parallel logical. Whether cleanOutput() shall be computed in parallel
+#' @param updateToEnsembleGeneNames logical. Whether to convert Ensemble gene ids to gene names
+#' @param verbose logical. Whether to print progress messages
+#' @export
+getFilteredData <- function(oneCancerGroupExpressionmatrix,
+                            maxModality=3,algorithm="mclust",minClusterSize=50,minSamples=NULL,
+                            pathToClinicalData = "pathToClinicalData",
+                            pathToExpressionmatrix ="pathToExpressionmatrix",
+                            minExpressedPercentage=2,
+                            SilvermanP=0.05,
+                            minPercentage=10,
+                            minMean=2,
+                            minFoldChange=2,
+                            coresToUse=6,
+                            parallel=FALSE,
+                            updateToEnsembleGeneNames=FALSE,
+                            verbose=TRUE){
+  if(verbose){message("Filtering 0 sum rows...")}
+  if(verbose){message("Using Silverman's test for unimodality...")}
+  if(verbose){message("Filtering genes with <2% of patients that have expression values greater 0...")}
+  if(verbose){message("Use useMclust on cancer data... ")}
+  processedCancerData <- processOneCancerGroup(cancerData = oneCancerGroupExpressionmatrix,minExpressedPercentage = minExpressedPercentage,
+                                               algorithm = algorithm,maxModality = maxModality,
+                                               minClusterSize = minClusterSize,minSamples = minSamples,
+                                               pathToClinicalData = pathToClinicalData,
+                                               pathToExpressionmatrix = pathToExpressionmatrix,
+                                               SilvermanP = SilvermanP,
+                                               verbose = FALSE)
+  processedOutput <- processedCancerData$Output
+  processedEm <- processedCancerData$Expressionmatrix
+  if(verbose){message(
+    "Cleaning the output of useMclust")}
+  cleanedOutput <- cleanOutput(cancerGroupOutput = processedOutput,cancerGroupExpressionmatrix = processedEm,
+                               minPercentage = minPercentage,coresToUse = coresToUse,parallel = parallel)
+  if(verbose){message(
+    "Filtering for multimodal genes")}
+  multimodalData <- filterMultimodalGenes(cleanedOutput = cleanedOutput,
+                                          processedExpressionmatrix = processedEm)
+  multimodalOutput <- multimodalData$Output
+  multimodalEm <- multimodalData$Expressionmatrix
+  if(verbose){message(
+    "Filtering for existing minimum mean of any mean of the multimodal genes")}
+  filteredMeanData <- filterForMean(multimodalExpressionmatrix = multimodalEm,
+                                    multimodalOutput = multimodalOutput,
+                                    minMean = minMean)
+  filteredMeanOutput <- filteredMeanData$Output
+  filteredMeanEm <- filteredMeanData$Expressionmatrix
+  if(verbose){message(
+    "Filtering for existing minimum fold change between any adjacent means of multimodal genes")}
+  filteredFCData <- filterForFoldChange(filteredMeanExpressionmatrix =filteredMeanEm,
+                                        filteredMeanOutput = filteredMeanOutput,
+                                        minFoldChange = minFoldChange)
+  filteredFCOutput <- filteredFCData$Output
+  filteredFCEm <- filteredFCData$Expressionmatrix
+  if(updateToEnsembleGeneNames){
+    if(verbose){message("Getting gene names from Ensembl")}
+    updatedNamesData <- updateGeneNames(filteredExpressionmatrix = filteredFCEm,filteredOutput=filteredFCOutput)
+    filteredOutput <- updatedNamesData$Output
+    filteredEm <- updatedNamesData$Expressionmatrix
+  }else{
+    filteredOutput <- filteredFCOutput
+    filteredEm <- filteredFCEm
+  }
+  return(list("Output"=filteredOutput,"Expressionmatrix"=filteredEm))
+}