Skip to content
Permalink
master
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
#TCGA Routine functions
#' toCancerDataGroups
#' Sorts the TCGA expression data with the metadata into groups of different cancer types
#' @details The groupsData that is returned consists of the different diagnosis groups
#' (http://www.icd10data.com/ICD10CM/Codes/C00-D49)
#' and the expression matrix of each of the diagnosis groups
#' @param cancerDataExpression TCGA cancer expression data
#' @param metadata TCGA cancer metadata
#' @param verbose logical. Whether to print progress messages
#'
#' @export
toCancerDataGroups <- function(cancerDataExpression, metadata,
verbose = TRUE) {
if (verbose) {
message("Filtering 0 sum rows...")
}
cancerDataCleared <- filter0sumGenes(cancerData = cancerDataExpression,parallel = TRUE)
if (verbose) {
message("Splitting the data sets into different cancer groups...")
}
icd10 <- unique(unlist(lapply(1:length(metadata$diagnosis), function(i) {
unlist(strsplit(metadata$diagnosis[i], split = "[.]"))[1]
})))
patientIds <- colnames(cancerDataCleared)
caseIdsUnique <- make.unique(names = as.character(metadata$caseID))
diagnosis <- unlist(lapply(1:length(patientIds), function(i) {
diagnosis <- metadata$diagnosis[(which(caseIdsUnique == patientIds[i]))]
names(diagnosis) <- patientIds[i]
diagnosis
}))
diagnosisGroups <- unique(unlist(lapply(1:length(diagnosis), function(i) {
unlist(strsplit(diagnosis[i], split = "[.]"))[1]
})))
shortDiagnosis <- (unlist(lapply(1:length(diagnosis), function(i) {
unlist(strsplit(diagnosis[i], split = "[.]"))[1]
})))
patientsInGroups <- (lapply(1:length(diagnosisGroups), function(i) {
names(diagnosis[which(shortDiagnosis == diagnosisGroups[i])])
}))
groupsDataSets <- lapply(1:length(patientsInGroups), function(i) {
expr <- lapply(1:length(patientsInGroups[[i]]), function(j) {
cancerDataCleared[, which(colnames(cancerDataCleared) == patientsInGroups[[i]][j])]
})
expression <- do.call("cbind", expr)
rownames(expression) <- rownames(cancerDataCleared)
colnames(expression) <- patientsInGroups[[i]]
expression
})
return(list(diagnosisGroups = diagnosisGroups, groupsExpressionDataSets = groupsDataSets))
}
#' processOneCancerGroup
#' Processes one cancer group
#' @details Processes one cancer group by a) filtering 0 sum rows,
#' b) using Silverman's test for unimodality,
#' c) filtering genes with <2% of patients that have expression values greater 0,
#' d) use useMclust on cancer data,
#' e) creating list of output and filtered expression matrix
#' @param cancerData TCGA cancer expression data of one cancer group
#' @param minExpressedPercentage integer specifying the percentage of patients that should at least be expressed for the gene to be analysed
#' @param algorithm "mclust", "hdbscan" or "flexmix". Defines which algorithm should be used to process the cancer data
#' @param maxModality An integer specifying the highest modality to calculate models in with mclust and flexmix
#' @param minClusterSize Integer; The minimum number of samples
#' in a group for that group to be considered a cluster in hdbscan
#' @param minSamples Integer for hdbscan; The number of samples
#' in a neighborhood for a point to be considered as a core point.
#' This includes the point itself. If NULL: defaults to the min_cluster_size.
#' @param pathToClinicalData path to the clinical data
#' @param pathToExpressionmatrix path to the expression matrix
#' @param SilvermanP The p-value that is used to reject Silvermans Test for unimodality
#' (given by k=1 using the Hall/York adjustments)
#' @param verbose logical. Whether to print progress messages
#'
#' @export
processOneCancerGroup <- function(cancerData,minExpressedPercentage=2,algorithm="mclust", maxModality=3,
minClusterSize=50,minSamples=NULL,
pathToClinicalData = "pathToClinicalData",
pathToExpressionmatrix = "pathToExpressionmatrix",
SilvermanP = 0.05, verbose = TRUE) {
if (verbose) {
message("Filtering 0 sum rows...")
}
clearedData <- filter0sumGenes(cancerData = cancerData,parallel = TRUE)
if (verbose) {
message("Using SilvermanOverP...")
}
possibles <- useSilvermantest(clearedData =clearedData,parallel = TRUE,SilvermanP = SilvermanP)
if (verbose) {
message("Filter genes with only a few patients (<minExpressedPercentage) with expression greater 0...")
}
toBeProcessed <- filterRarelyExpressedGenes(possibles = possibles,minExpressedPercentage = minExpressedPercentage,parallel = TRUE)
if (verbose) {
message("UseMclust on the filtered data set...")
}
Sys.time()
output <- c()
if(algorithm=="mclust"){
output <- useMclust(expression = toBeProcessed, clusterNumbers = c(1:maxModality),verbose = FALSE, parallel = T,pathToClinicalData = pathToClinicalData,pathToExpressionmatrix = pathToExpressionmatrix)
}
if(algorithm=="hdbscan"){
output <- useHdbscan(expression = toBeProcessed,minClusterSize = minClusterSize,minSamples = minSamples,verbose = FALSE,pathToClinicalData = pathToClinicalData,pathToExpressionmatrix = pathToExpressionmatrix)
}
if(algorithm=="flexmix"){
output <- useFlexmix(expression = toBeProcessed,maxModality = maxModality,verbose = FALSE,parallel = T,pathToClinicalData = pathToClinicalData,pathToExpressionmatrix = pathToExpressionmatrix)
}
listProcessedBC <- unlist(lapply(1:length(output$Genes),function(i){
which(rownames(toBeProcessed)==names(output$Genes)[i])
}))
processedData <- toBeProcessed[listProcessedBC,]
if (verbose) {
message("Return output and expression matrix...")
}
return(list("Output"= output,"Expressionmatrix"=processedData))
}
#' getFilteredData
#' Processes one cancer group with filtering steps
#' @details Processes one cancer group by
#' a) filtering 0 sum rows,
#' b) using Silverman's test for unimodality,
#' c) filtering genes with <2% of patients that have expression values greater 0,
#' d) use useMclust on cancer data,
#' e) cleaning output of useMclust
#' f) filtering for multimodal genes
#' g) filtering for existing minimum mean of any mean of the multimodal genes
#' h) filtering for existing minimum fold change between any adjacent means of multimodal genes
#' i) getting gene names from Ensembl
#' j) returning a list of the filtered output and the filtered expression matrix
#' @param oneCancerGroupExpressionmatrix TCGA cancer expression data of one cancer group
#' @param algorithm "mclust", "hdbscan" or "flexmix". Defines which algorithm should be used to process the cancer data
#' @param maxModality An integer specifying the highest modality to calculate models in with mclust and flexmix
#' @param minClusterSize Integer; The minimum number of samples
#' in a group for that group to be considered a cluster in hdbscan
#' @param minSamples Integer for hdbscan; The number of samples
#' in a neighborhood for a point to be considered as a core point.
#' This includes the point itself. If NULL: defaults to the min_cluster_size.
#' @param pathToClinicalData path to the clinical data
#' @param pathToExpressionmatrix path to the expression matrix
#' @param minExpressedPercentage integer specifying the percentage of patients that should at least be expressed for the gene to be analysed
#' @param SilvermanP The p-value that is used to reject Silvermans Test for unimodality
#' (given by k=1 using the Hall/York adjustments)
#' @param minPercentage minimum percentage of groups.
#' Groups with lower percentages will be sorted into the other groups
#' @param minMean minimum mean that has to exist in any mean of the multimodal genes
#' @param minFoldChange minimum fold change that has to exist between any adjacent means of multimodal genes
#' @param coresToUse The number of cores to use for parallel computing of cleanOutput() if parallel is TRUE
#' @param parallel logical. Whether cleanOutput() shall be computed in parallel
#' @param updateToEnsembleGeneNames logical. Whether to convert Ensemble gene ids to gene names
#' @param verbose logical. Whether to print progress messages
#' @export
getFilteredData <- function(oneCancerGroupExpressionmatrix,
maxModality=3,algorithm="mclust",minClusterSize=50,minSamples=NULL,
pathToClinicalData = "pathToClinicalData",
pathToExpressionmatrix ="pathToExpressionmatrix",
minExpressedPercentage=2,
SilvermanP=0.05,
minPercentage=10,
minMean=2,
minFoldChange=2,
coresToUse=6,
parallel=FALSE,
updateToEnsembleGeneNames=FALSE,
verbose=TRUE){
if(verbose){message("Filtering 0 sum rows...")}
if(verbose){message("Using Silverman's test for unimodality...")}
if(verbose){message("Filtering genes with <2% of patients that have expression values greater 0...")}
if(verbose){message("Use useMclust on cancer data... ")}
processedCancerData <- processOneCancerGroup(cancerData = oneCancerGroupExpressionmatrix,minExpressedPercentage = minExpressedPercentage,
algorithm = algorithm,maxModality = maxModality,
minClusterSize = minClusterSize,minSamples = minSamples,
pathToClinicalData = pathToClinicalData,
pathToExpressionmatrix = pathToExpressionmatrix,
SilvermanP = SilvermanP,
verbose = FALSE)
processedOutput <- processedCancerData$Output
processedEm <- processedCancerData$Expressionmatrix
if(verbose){message(
"Cleaning the output of useMclust")}
cleanedOutput <- cleanOutput(cancerGroupOutput = processedOutput,cancerGroupExpressionmatrix = processedEm,
minPercentage = minPercentage,coresToUse = coresToUse,parallel = parallel)
if(verbose){message(
"Filtering for multimodal genes")}
multimodalData <- filterMultimodalGenes(cleanedOutput = cleanedOutput,
processedExpressionmatrix = processedEm)
multimodalOutput <- multimodalData$Output
multimodalEm <- multimodalData$Expressionmatrix
if(verbose){message(
"Filtering for existing minimum mean of any mean of the multimodal genes")}
filteredMeanData <- filterForMean(multimodalExpressionmatrix = multimodalEm,
multimodalOutput = multimodalOutput,
minMean = minMean)
filteredMeanOutput <- filteredMeanData$Output
filteredMeanEm <- filteredMeanData$Expressionmatrix
if(verbose){message(
"Filtering for existing minimum fold change between any adjacent means of multimodal genes")}
filteredFCData <- filterForFoldChange(filteredMeanExpressionmatrix =filteredMeanEm,
filteredMeanOutput = filteredMeanOutput,
minFoldChange = minFoldChange)
filteredFCOutput <- filteredFCData$Output
filteredFCEm <- filteredFCData$Expressionmatrix
if(updateToEnsembleGeneNames){
if(verbose){message("Getting gene names from Ensembl")}
updatedNamesData <- updateGeneNames(filteredExpressionmatrix = filteredFCEm,filteredOutput=filteredFCOutput)
filteredOutput <- updatedNamesData$Output
filteredEm <- updatedNamesData$Expressionmatrix
}else{
filteredOutput <- filteredFCOutput
filteredEm <- filteredFCEm
}
return(list("Output"=filteredOutput,"Expressionmatrix"=filteredEm))
}