Skip to content

Commit

Permalink
bimodalR to multimodalR
Browse files Browse the repository at this point in the history
Checked and updated all functions
Added multiple filtering functions
Added functions to process TCGA data
Added data to exclude genes on allosomes
Checked & updated vignettes
Changed Package name to multimodalR
Added function to check for valid unified output data format
Added function to plot Survival Curves for TCGA data
Added function to check for known bimodal genes in output
First Complete Version of multimodalR
  • Loading branch information
sbeck committed Aug 30, 2018
1 parent 8d34ba3 commit 5ca5931
Show file tree
Hide file tree
Showing 69 changed files with 4,286 additions and 368 deletions.
1 change: 1 addition & 0 deletions .Rbuildignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
^.*\.Rproj$
^\.Rproj\.user$
^data-raw$
28 changes: 21 additions & 7 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
Package: bimodalR
Title: Simulation of bimodal gene expression and evaluation of algorithms for bimodality detection
Package: multimodalR
Title: Simulation of multimodal gene expression, evaluation of algorithms detecting multimodality in gene expression data sets, and detection of multimodal genes in gene expression data sets
Version: 0.0.0.9000
Authors@R: c(person("Svenja", "Beck", email = "svenja-kristina.beck@mpi-bn.mpg.de", role = c("aut", "cre")),
Authors@R: c(person("Svenja Kristina", "Beck", email = "svenja-kristina.beck@mpi-bn.mpg.de", role = c("aut", "cre")),
person("Jens", "Preussner", email = "jens.preussner@mpi-bn.mpg.de", role = c("aut")))
Description: What the package does (one paragraph).
Description: multimodalR is an R package to simulate multimodal gene expression data, to evaluate different algorithms detecting multimodality and to detect bimodality or multimodality in data sets.
Depends: R (>= 3.4.2)
License: MIT + file LICENSE
Encoding: UTF-8
Expand All @@ -17,8 +17,22 @@ Imports: LaplacesDemon,
flexmix,
plyr,
mclust,
diptest
Suggests: genefilter,
knitr,
diptest,
ggplot2,
reshape2,
biomaRt,
doParallel,
foreach,
reticulate,
stats,
dplyr,
magrittr,
stringr,
survival,
ggkm,
KEGGREST,
cowplot,
utils
Suggests: knitr,
rmarkdown
VignetteBuilder: knitr
76 changes: 71 additions & 5 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,40 +1,106 @@
# Generated by roxygen2: do not edit by hand

export(DipTestOverP)
export(SilvermanOverP)
export(cleanOutput)
export(evaluateAlgorithmOutput)
export(filter0sumGenes)
export(filterForFoldChange)
export(filterForKnownInCancer)
export(filterForMean)
export(filterForXChromosomeGenes)
export(filterForYChromosomeGenes)
export(filterMultimodalGenes)
export(filterRarelyExpressedGenes)
export(getClassifications)
export(getFilteredData)
export(getParam)
export(getParams)
export(getStatistics)
export(isValidUnifiedOutputDataFormat)
export(newParams)
export(plotClassifications)
export(plotExpression)
export(plotOutput)
export(plotStatistics)
export(plotSurvivalCurves)
export(processOneCancerGroup)
export(setParam)
export(setParams)
export(simulateExpression)
export(toCancerDataGroups)
export(updateGeneNames)
export(useDiptest)
export(useFlexmix)
export(useHdbscan)
export(useMclust)
export(useSilvermantest)
exportClasses(Params)
import(dbscan)
import(diptest)
import(flexmix)
import(mclust)
import(silvermantest)
importFrom(KEGGREST,keggGet)
importFrom(LaplacesDemon,rinvgaussian)
importFrom(biomaRt,getBM)
importFrom(biomaRt,useMart)
importFrom(checkmate,assertCharacter)
importFrom(checkmate,assertClass)
importFrom(checkmate,assertList)
importFrom(checkmate,assertString)
importFrom(cowplot,add_sub)
importFrom(cowplot,ggdraw)
importFrom(diptest,dip.test)
importFrom(doParallel,registerDoParallel)
importFrom(dplyr,mutate)
importFrom(dplyr,select)
importFrom(foreach,"%dopar%")
importFrom(foreach,foreach)
importFrom(ggkm,geom_km)
importFrom(ggplot2,aes)
importFrom(ggplot2,coord_fixed)
importFrom(ggplot2,element_text)
importFrom(ggplot2,facet_wrap)
importFrom(ggplot2,geom_bar)
importFrom(ggplot2,geom_density)
importFrom(ggplot2,ggplot)
importFrom(ggplot2,ggtitle)
importFrom(ggplot2,labs)
importFrom(ggplot2,scale_color_discrete)
importFrom(ggplot2,scale_fill_manual)
importFrom(ggplot2,stat_function)
importFrom(ggplot2,theme)
importFrom(ggplot2,theme_bw)
importFrom(ggplot2,xlab)
importFrom(ggplot2,ylab)
importFrom(grDevices,dev.off)
importFrom(grDevices,grey.colors)
importFrom(grDevices,pdf)
importFrom(graphics,hist)
importFrom(graphics,par)
importFrom(magrittr,"%>%")
importFrom(methods,"slot<-")
importFrom(methods,new)
importFrom(methods,rbind2)
importFrom(methods,slot)
importFrom(methods,slotNames)
importFrom(methods,validObject)
importFrom(parallel,clusterEvalQ)
importFrom(parallel,clusterExport)
importFrom(parallel,detectCores)
importFrom(parallel,makeCluster)
importFrom(parallel,parLapply)
importFrom(parallel,stopCluster)
importFrom(plyr,compact)
importFrom(reshape2,melt)
importFrom(reticulate,import)
importFrom(silvermantest,silverman.test)
importFrom(stats,BIC)
importFrom(stats,as.formula)
importFrom(stats,dnorm)
importFrom(stats,pchisq)
importFrom(stats,rgamma)
importFrom(stats,rnorm)
importFrom(stats,runif)
importFrom(stats,sd)
importFrom(stringr,str_replace_all)
importFrom(survival,Surv)
importFrom(survival,survdiff)
importFrom(utils,combn)
importFrom(utils,head)
213 changes: 213 additions & 0 deletions R/TCGAFunctions.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,213 @@
#TCGA Routine functions

#' toCancerDataGroups
#' Sorts the TCGA expression data with the metadata into groups of different cancer types
#' @details The groupsData that is returned consists of the different diagnosis groups
#' (http://www.icd10data.com/ICD10CM/Codes/C00-D49)
#' and the expression matrix of each of the diagnosis groups
#' @param cancerDataExpression TCGA cancer expression data
#' @param metadata TCGA cancer metadata
#' @param verbose logical. Whether to print progress messages
#'
#' @export
toCancerDataGroups <- function(cancerDataExpression, metadata,
verbose = TRUE) {
if (verbose) {
message("Filtering 0 sum rows...")
}
cancerDataCleared <- filter0sumGenes(cancerData = cancerDataExpression,parallel = TRUE)
if (verbose) {
message("Splitting the data sets into different cancer groups...")
}
icd10 <- unique(unlist(lapply(1:length(metadata$diagnosis), function(i) {
unlist(strsplit(metadata$diagnosis[i], split = "[.]"))[1]
})))
patientIds <- colnames(cancerDataCleared)
caseIdsUnique <- make.unique(names = as.character(metadata$caseID))
diagnosis <- unlist(lapply(1:length(patientIds), function(i) {
diagnosis <- metadata$diagnosis[(which(caseIdsUnique == patientIds[i]))]
names(diagnosis) <- patientIds[i]
diagnosis
}))
diagnosisGroups <- unique(unlist(lapply(1:length(diagnosis), function(i) {
unlist(strsplit(diagnosis[i], split = "[.]"))[1]
})))
shortDiagnosis <- (unlist(lapply(1:length(diagnosis), function(i) {
unlist(strsplit(diagnosis[i], split = "[.]"))[1]
})))
patientsInGroups <- (lapply(1:length(diagnosisGroups), function(i) {
names(diagnosis[which(shortDiagnosis == diagnosisGroups[i])])
}))
groupsDataSets <- lapply(1:length(patientsInGroups), function(i) {
expr <- lapply(1:length(patientsInGroups[[i]]), function(j) {
cancerDataCleared[, which(colnames(cancerDataCleared) == patientsInGroups[[i]][j])]
})
expression <- do.call("cbind", expr)
rownames(expression) <- rownames(cancerDataCleared)
colnames(expression) <- patientsInGroups[[i]]
expression
})
return(list(diagnosisGroups = diagnosisGroups, groupsExpressionDataSets = groupsDataSets))
}

#' processOneCancerGroup
#' Processes one cancer group
#' @details Processes one cancer group by a) filtering 0 sum rows,
#' b) using Silverman's test for unimodality,
#' c) filtering genes with <2% of patients that have expression values greater 0,
#' d) use useMclust on cancer data,
#' e) creating list of output and filtered expression matrix
#' @param cancerData TCGA cancer expression data of one cancer group
#' @param minExpressedPercentage integer specifying the percentage of patients that should at least be expressed for the gene to be analysed
#' @param algorithm "mclust", "hdbscan" or "flexmix". Defines which algorithm should be used to process the cancer data
#' @param maxModality An integer specifying the highest modality to calculate models in with mclust and flexmix
#' @param minClusterSize Integer; The minimum number of samples
#' in a group for that group to be considered a cluster in hdbscan
#' @param minSamples Integer for hdbscan; The number of samples
#' in a neighborhood for a point to be considered as a core point.
#' This includes the point itself. If NULL: defaults to the min_cluster_size.
#' @param pathToClinicalData path to the clinical data
#' @param pathToExpressionmatrix path to the expression matrix
#' @param SilvermanP The p-value that is used to reject Silvermans Test for unimodality
#' (given by k=1 using the Hall/York adjustments)
#' @param verbose logical. Whether to print progress messages
#'
#' @export
processOneCancerGroup <- function(cancerData,minExpressedPercentage=2,algorithm="mclust", maxModality=3,
minClusterSize=50,minSamples=NULL,
pathToClinicalData = "pathToClinicalData",
pathToExpressionmatrix = "pathToExpressionmatrix",
SilvermanP = 0.05, verbose = TRUE) {
if (verbose) {
message("Filtering 0 sum rows...")
}
clearedData <- filter0sumGenes(cancerData = cancerData,parallel = TRUE)
if (verbose) {
message("Using SilvermanOverP...")
}
possibles <- useSilvermantest(clearedData =clearedData,parallel = TRUE,SilvermanP = SilvermanP)
if (verbose) {
message("Filter genes with only a few patients (<minExpressedPercentage) with expression greater 0...")
}
toBeProcessed <- filterRarelyExpressedGenes(possibles = possibles,minExpressedPercentage = minExpressedPercentage,parallel = TRUE)
if (verbose) {
message("UseMclust on the filtered data set...")
}
Sys.time()
output <- c()
if(algorithm=="mclust"){
output <- useMclust(expression = toBeProcessed, clusterNumbers = c(1:maxModality),verbose = FALSE, parallel = T,pathToClinicalData = pathToClinicalData,pathToExpressionmatrix = pathToExpressionmatrix)
}
if(algorithm=="hdbscan"){
output <- useHdbscan(expression = toBeProcessed,minClusterSize = minClusterSize,minSamples = minSamples,verbose = FALSE,pathToClinicalData = pathToClinicalData,pathToExpressionmatrix = pathToExpressionmatrix)
}
if(algorithm=="flexmix"){
output <- useFlexmix(expression = toBeProcessed,maxModality = maxModality,verbose = FALSE,parallel = T,pathToClinicalData = pathToClinicalData,pathToExpressionmatrix = pathToExpressionmatrix)
}

listProcessedBC <- unlist(lapply(1:length(output$Genes),function(i){
which(rownames(toBeProcessed)==names(output$Genes)[i])
}))
processedData <- toBeProcessed[listProcessedBC,]
if (verbose) {
message("Return output and expression matrix...")
}
return(list("Output"= output,"Expressionmatrix"=processedData))
}

#' getFilteredData
#' Processes one cancer group with filtering steps
#' @details Processes one cancer group by
#' a) filtering 0 sum rows,
#' b) using Silverman's test for unimodality,
#' c) filtering genes with <2% of patients that have expression values greater 0,
#' d) use useMclust on cancer data,
#' e) cleaning output of useMclust
#' f) filtering for multimodal genes
#' g) filtering for existing minimum mean of any mean of the multimodal genes
#' h) filtering for existing minimum fold change between any adjacent means of multimodal genes
#' i) getting gene names from Ensembl
#' j) returning a list of the filtered output and the filtered expression matrix
#' @param oneCancerGroupExpressionmatrix TCGA cancer expression data of one cancer group
#' @param algorithm "mclust", "hdbscan" or "flexmix". Defines which algorithm should be used to process the cancer data
#' @param maxModality An integer specifying the highest modality to calculate models in with mclust and flexmix
#' @param minClusterSize Integer; The minimum number of samples
#' in a group for that group to be considered a cluster in hdbscan
#' @param minSamples Integer for hdbscan; The number of samples
#' in a neighborhood for a point to be considered as a core point.
#' This includes the point itself. If NULL: defaults to the min_cluster_size.
#' @param pathToClinicalData path to the clinical data
#' @param pathToExpressionmatrix path to the expression matrix
#' @param minExpressedPercentage integer specifying the percentage of patients that should at least be expressed for the gene to be analysed
#' @param SilvermanP The p-value that is used to reject Silvermans Test for unimodality
#' (given by k=1 using the Hall/York adjustments)
#' @param minPercentage minimum percentage of groups.
#' Groups with lower percentages will be sorted into the other groups
#' @param minMean minimum mean that has to exist in any mean of the multimodal genes
#' @param minFoldChange minimum fold change that has to exist between any adjacent means of multimodal genes
#' @param coresToUse The number of cores to use for parallel computing of cleanOutput() if parallel is TRUE
#' @param parallel logical. Whether cleanOutput() shall be computed in parallel
#' @param updateToEnsembleGeneNames logical. Whether to convert Ensemble gene ids to gene names
#' @param verbose logical. Whether to print progress messages
#' @export
getFilteredData <- function(oneCancerGroupExpressionmatrix,
maxModality=3,algorithm="mclust",minClusterSize=50,minSamples=NULL,
pathToClinicalData = "pathToClinicalData",
pathToExpressionmatrix ="pathToExpressionmatrix",
minExpressedPercentage=2,
SilvermanP=0.05,
minPercentage=10,
minMean=2,
minFoldChange=2,
coresToUse=6,
parallel=FALSE,
updateToEnsembleGeneNames=FALSE,
verbose=TRUE){
if(verbose){message("Filtering 0 sum rows...")}
if(verbose){message("Using Silverman's test for unimodality...")}
if(verbose){message("Filtering genes with <2% of patients that have expression values greater 0...")}
if(verbose){message("Use useMclust on cancer data... ")}
processedCancerData <- processOneCancerGroup(cancerData = oneCancerGroupExpressionmatrix,minExpressedPercentage = minExpressedPercentage,
algorithm = algorithm,maxModality = maxModality,
minClusterSize = minClusterSize,minSamples = minSamples,
pathToClinicalData = pathToClinicalData,
pathToExpressionmatrix = pathToExpressionmatrix,
SilvermanP = SilvermanP,
verbose = FALSE)
processedOutput <- processedCancerData$Output
processedEm <- processedCancerData$Expressionmatrix
if(verbose){message(
"Cleaning the output of useMclust")}
cleanedOutput <- cleanOutput(cancerGroupOutput = processedOutput,cancerGroupExpressionmatrix = processedEm,
minPercentage = minPercentage,coresToUse = coresToUse,parallel = parallel)
if(verbose){message(
"Filtering for multimodal genes")}
multimodalData <- filterMultimodalGenes(cleanedOutput = cleanedOutput,
processedExpressionmatrix = processedEm)
multimodalOutput <- multimodalData$Output
multimodalEm <- multimodalData$Expressionmatrix
if(verbose){message(
"Filtering for existing minimum mean of any mean of the multimodal genes")}
filteredMeanData <- filterForMean(multimodalExpressionmatrix = multimodalEm,
multimodalOutput = multimodalOutput,
minMean = minMean)
filteredMeanOutput <- filteredMeanData$Output
filteredMeanEm <- filteredMeanData$Expressionmatrix
if(verbose){message(
"Filtering for existing minimum fold change between any adjacent means of multimodal genes")}
filteredFCData <- filterForFoldChange(filteredMeanExpressionmatrix =filteredMeanEm,
filteredMeanOutput = filteredMeanOutput,
minFoldChange = minFoldChange)
filteredFCOutput <- filteredFCData$Output
filteredFCEm <- filteredFCData$Expressionmatrix
if(updateToEnsembleGeneNames){
if(verbose){message("Getting gene names from Ensembl")}
updatedNamesData <- updateGeneNames(filteredExpressionmatrix = filteredFCEm,filteredOutput=filteredFCOutput)
filteredOutput <- updatedNamesData$Output
filteredEm <- updatedNamesData$Expressionmatrix
}else{
filteredOutput <- filteredFCOutput
filteredEm <- filteredFCEm
}
return(list("Output"=filteredOutput,"Expressionmatrix"=filteredEm))
}
Loading

0 comments on commit 5ca5931

Please sign in to comment.