Skip to content
Permalink
master
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
#' Filter 0-sum genes
#'
#' @param cancerData TCGA cancer expression data of one cancer group
#' @param parallel logical. Whether to calculate in parallel
#'
#' @return cleared data without 0 sum genes
#' @importFrom doParallel registerDoParallel
#' @importFrom parallel detectCores makeCluster stopCluster
#' @importFrom foreach foreach "%dopar%"
#' @importFrom plyr compact
#' @export
filter0sumGenes <- function(cancerData,parallel =TRUE){
'%dopar%' <- foreach::`%dopar%`
if(parallel){
cores <- parallel::detectCores() - 1
cl <- parallel::makeCluster(cores)
doParallel::registerDoParallel(cl)
i <- c()
notZeroSum <- unlist(plyr::compact(foreach::foreach(i = 1:nrow(cancerData)) %dopar% {
if (sum(cancerData[i, ]) != 0) {
i
}
}))
parallel::stopCluster(cl)
}else{
notZeroSum <- unlist(plyr::compact(lapply(1:nrow(cancerData),function(i){
if (sum(cancerData[i, ]) != 0) {
i
}
})))
}
clearedData <- cancerData[notZeroSum, ]
return(clearedData)
}
#' SilvermanOverP returns a filter function with bindings for P.
#'
#' @details The null hypothesis of a silvermantest is that the underlying
#' density has at most k modes (H0: number of modes <= k). By rejecting
#' this hypothesis (returned p-value is smaller than given level of
#' significance) one can verify that the underlying density has more
#' than k modes.
#' This function evaluates to TRUE if the Silverman Test's for unimodality (k=1)
#' can be rejected at the significance level p.
#' @param p The p-value that is used to reject Silvermans Test for unimodality (given by k=1 using the Hall/York adjustments)
#' @param R The number of bootstrap replicates for each test.
#' @param na.rm Whether NAs should be removed.
#'
#' @return A function with bindings for p and R.
#' @importFrom silvermantest silverman.test
#'
SilvermanOverP <- function(p, R = 100, na.rm = T) {
function(x) {
if (na.rm)
x <- x[!is.na(x)]
s1 <- silvermantest::silverman.test(x, k = 1, R = R, adjust = T)
s1@p_value < p
}
}
#' useSilvermantest uses the SilvermanOverP function and returns filtered data of genes for that SilvermanOverP was TRUE
#'
#' @param clearedData cleared TCGA cancer expression data of one cancer group without 0sum genes
#' @param parallel logical. Whether to calculate in parallel
#' @param SilvermanP The p-value that is used to reject Silvermans Test for unimodality (given by k=1 using the Hall/York adjustments)
#'
#' @return filtered gene expression data of possibly multimodal data
#' @importFrom parallel detectCores makeCluster stopCluster
#' @importFrom doParallel registerDoParallel
#' @importFrom plyr compact
#' @importFrom foreach foreach "%dopar%"
#' @export
useSilvermantest <- function(clearedData,parallel,SilvermanP=0.05){
'%dopar%' <- foreach::`%dopar%`
if(parallel){
cores <- parallel::detectCores() - 1
cl <- parallel::makeCluster(cores)
doParallel::registerDoParallel(cl)
i <- c()
filtered <- foreach::foreach(i = 1:nrow(clearedData),.export="SilvermanOverP") %dopar%
{
apply(clearedData[i, ], 1, FUN = SilvermanOverP(p = SilvermanP))
}
parallel::stopCluster(cl)
}else{
filtered <- lapply(1:nrow(clearedData),function(i){
apply(clearedData[i, ], 1, FUN = SilvermanOverP(p = SilvermanP))
})
}
mayBeBimodal <- unlist(lapply(1:length(filtered),function(i){
if (filtered[i] == TRUE) {
i
}
}))
mayBeBimodal <- unlist(plyr::compact(mayBeBimodal))
possibles <- clearedData[mayBeBimodal, ]
return(possibles)
}
#' DipTestOverP returns a filter function with bindings for P.
#' This function evaluates to TRUE if the Hartigans Dip Test for unimodality can be rejected at the significance level P.
#' @param p The P-value that is used to reject Hartigans Dip Test of unimodality
#' @param R The number of bootstrap replicates for each test.
#' @param na.rm Whether NAs should be removed.
#' @return A function with bindings for p and R.
#' @importFrom diptest dip.test
DipTestOverP <- function(p, R = 100, na.rm = T) {
function(x) {
if (na.rm)
x <- x[!is.na(x)]
h <- diptest::dip.test(x, simulate.p.value = T, B = R)
h$p.value < p
}
}
#' useDiptest uses the DipTestOverP function and returns filtered data of genes for that DipTestOverP was TRUE
#'
#' @param clearedData cleared TCGA cancer expression data of one cancer group without 0sum genes
#' @param parallel logical. Whether to calculate in parallel
#' @param DipTestP The p-value that is used to reject Hartigans Dip Test
#'
#' @return filtered gene expression data of possibly multimodal data
#' @importFrom parallel detectCores makeCluster stopCluster
#' @importFrom doParallel registerDoParallel
#' @importFrom plyr compact
#' @importFrom foreach foreach "%dopar%"
#' @export
useDiptest <- function(clearedData,parallel,DipTestP=0.05){
'%dopar%' <- foreach::`%dopar%`
if(parallel){
cores <- parallel::detectCores() - 1
cl <- parallel::makeCluster(cores)
doParallel::registerDoParallel(cl)
i <- c()
filtered <- foreach::foreach(i = 1:nrow(clearedData),.export = "DipTestOverP") %dopar%
{
apply(clearedData[i, ], 1, FUN = DipTestOverP(p = DipTestP))
}
parallel::stopCluster(cl)
}else{
filtered <- lapply(1:nrow(clearedData),function(i){
apply(clearedData[i, ], 1, FUN = DipTestOverP(p = DipTestP))
})
}
mayBeBimodal <- unlist(lapply(1:length(filtered),function(i){
if (filtered[i] == TRUE) {
i
}
}))
mayBeBimodal <- unlist(plyr::compact(mayBeBimodal))
possibles <- clearedData[mayBeBimodal, ]
return(possibles)
}
#' Filter rarely expressed genes
#'
#' @details filters out genes that are rarely expressed
#' (expressed by less than minPercentage of patients)
#' @param possibles cancerData of possible multimodal genes
#' @param minExpressedPercentage numerical. minimum percentage cutoff Value for
#' the minimum percentage of patients
#' @param parallel logical. Whether to calculate in parallel
#' @return cancerData of genes that are not rarely expressed
#' @importFrom parallel detectCores makeCluster stopCluster
#' @importFrom doParallel registerDoParallel
#' @importFrom foreach foreach "%dopar%"
#' @export
filterRarelyExpressedGenes<- function(possibles,minExpressedPercentage= 2,parallel=TRUE){
'%dopar%' <- foreach::`%dopar%`
if(parallel){
cores <- parallel::detectCores() - 1
cl <- parallel::makeCluster(cores)
doParallel::registerDoParallel(cl)
i <- c()
greaterZero <- unlist(foreach::foreach(i = 1:nrow(possibles)) %dopar%
{
areGreaterZero <- (table(unlist(possibles[i, ]) > 0)["TRUE"])
if (is.na(areGreaterZero)) {
areGreaterZero <- 0
}
if (unname(areGreaterZero > (length(possibles[i, ]) * minExpressedPercentage/100))) {
i
}
})
parallel::stopCluster(cl)
}else{
greaterZero <- unlist(lapply(1:nrow(possibles),function(i){
areGreaterZero <- (table(unlist(possibles[i, ]) > 0)["TRUE"])
if (is.na(areGreaterZero)) {
areGreaterZero <- 0
}
if (unname(areGreaterZero > (length(possibles[i, ]) * minExpressedPercentage/100))) {
i
}
}))
}
toBeProcessed <- possibles[greaterZero, ]
return(toBeProcessed)
}
#' cleanOutput
#' returns the cleaned output of algorithms
#' @details This function cleans up the output as empty groups will be deleted,
#' genes with groups < minPercentage of the sum of patients are corrected and
#' modus, means, sds, group sizes and patients belonging to each group will be updated
#' @param cancerGroupOutput output of one cancer group
#' @param cancerGroupExpressionmatrix expression matrix of this one cancer group
#' @param minPercentage minimum percentage of groups.
#' Groups with lower percentages will be sorted into the other groups
#' @param coresToUse The number of cores to use for parallel computing
#' @param parallel logical. Whether cleanOutput should be computed in parallel
#' @return cleaned output of algorithms
#' @importFrom parallel makeCluster stopCluster
#' @importFrom doParallel registerDoParallel
#' @importFrom plyr compact
#' @importFrom foreach foreach "%dopar%"
#' @export
cleanOutput <- function(cancerGroupOutput,cancerGroupExpressionmatrix,minPercentage=10,coresToUse = 20,parallel=TRUE){
'%dopar%' <- foreach::`%dopar%`
i <- NULL
cleanOutput <- (lapply(1:length(cancerGroupOutput$Genes),function(i){
#corrects genes with an empty group to the correct modus, means, sds, sizes and groups
isNull <- unlist(lapply(1:length(cancerGroupOutput$Genes[[i]]$groups),function(j){
if(is.null(unlist(cancerGroupOutput$Genes[[i]]$groups[j]))||length(unlist(cancerGroupOutput$Genes[[i]]$groups[j]))==0){j}
}))
if(!is.null(isNull)){
modus <- cancerGroupOutput$Genes[[i]]$modus -1
means <- cancerGroupOutput$Genes[[i]]$means[-isNull]
sds <- cancerGroupOutput$Genes[[i]]$sds[-isNull]
groups <- plyr::compact(cancerGroupOutput$Genes[[i]]$groups)
groups <- groups[lapply(groups,length)>0]
sizes <- unlist(lapply(1:length(groups),function(j){
length(unlist(cancerGroupExpressionmatrix[i,groups[[j]]]))
}))
}else{
modus <- cancerGroupOutput$Genes[[i]]$modus
means <- cancerGroupOutput$Genes[[i]]$means
sds <- cancerGroupOutput$Genes[[i]]$sds
sizes <- cancerGroupOutput$Genes[[i]]$sizes
groups <- cancerGroupOutput$Genes[[i]]$groups
}
output <- list("modus"=as.integer(modus),"means"=means,"sds"=sds,"sizes"=as.integer(sizes),"groups"=groups)
output
}))
names(cleanOutput) <- names(cancerGroupOutput$Genes)
outputFiltered <- list("Genes"=cleanOutput,
"ClinicalData"=cancerGroupOutput$ClinicalData,
"Expressionmatrix"=cancerGroupOutput$Expressionmatrix)
##corrects genes with groups < minPercentage of patientsum --> patients are sorted to groups with nearest mean,
## sizes are then newly counted, groups are updated, as well as means, modus and sds
cores <- coresToUse
cleanedOutput <- c()
if(parallel){
cl <- parallel::makeCluster(cores)
doParallel::registerDoParallel(cl)
cleanedOutput <- (foreach::foreach(i= 1:length(outputFiltered$Genes)) %dopar% {
geneName <- names(outputFiltered$Genes)[i]
if(any(((outputFiltered$Genes[[geneName]]$sizes)/sum(outputFiltered$Genes[[geneName]]$sizes)*100)<minPercentage)){
modus <- outputFiltered$Genes[[geneName]]$modus
#toChange: group that has < 10% of patients
toChange <- which(((outputFiltered$Genes[[geneName]]$sizes)/sum(outputFiltered$Genes[[geneName]]$sizes)*100)<minPercentage)
while(length(toChange)>0){
changeNow <- which(outputFiltered$Genes[[geneName]]$sizes==min(outputFiltered$Genes[[geneName]]$sizes))
if(any(((outputFiltered$Genes[[geneName]]$sizes)/sum(outputFiltered$Genes[[geneName]]$sizes)*100)<minPercentage)){
if(changeNow==min(1:modus)||changeNow==max(1:modus)){
#changeNow is smallest mean
if(changeNow==min(1:modus)){
#mean>=3 --> first two groups are combined, rest remains unchanged
if(modus>=3){
sizes <- c(sum(outputFiltered$Genes[[geneName]]$sizes[1:2]),
outputFiltered$Genes[[geneName]]$sizes[3:modus])
groups <- list(unlist(c(outputFiltered$Genes[[geneName]]$groups[1],
outputFiltered$Genes[[geneName]]$groups[2])),
outputFiltered$Genes[[geneName]]$groups[3:modus])
means <- unlist(lapply(1:length(groups),function(j){
mean(unlist(cancerGroupExpressionmatrix[geneName,unlist(groups[j])]))
}))
sds <- unlist(lapply(1:length(groups),function(j){
sd(unlist(cancerGroupExpressionmatrix[geneName,unlist(groups[j])]))
}))
modus <- modus -1
}else{
#modus <=2 (==2 because unimodals can't have size numbers unequal to patient number)
#groups are combined
sizes <- c(sum(outputFiltered$Genes[[geneName]]$sizes[1:2]))
groups <- list(unlist(c(outputFiltered$Genes[[geneName]]$groups[1],
outputFiltered$Genes[[geneName]]$groups[2])))
means <- unlist(lapply(1:length(groups),function(j){
mean(unlist(cancerGroupExpressionmatrix[geneName,unlist(groups[j])]))
}))
sds <- unlist(lapply(1:length(groups),function(j){
sd(unlist(cancerGroupExpressionmatrix[geneName,unlist(groups[j])]))
}))
modus <- modus -1
}
}else if(changeNow==max(1:modus)&&modus>1){
#changeNow is greatest mean
if(modus>=3){
#highest two groups are combined (last 2 groups), groups 1 to modus-2 remain unchanged
sizes <- c(outputFiltered$Genes[[geneName]]$sizes[1:(modus-2)],
sum(outputFiltered$Genes[[geneName]]$sizes[(modus-1):(modus)]))
groups <- list(outputFiltered$Genes[[geneName]]$groups[1:(modus-2)],
unlist(c(outputFiltered$Genes[[geneName]]$groups[(modus-1)],
outputFiltered$Genes[[geneName]]$groups[modus])))
means <- unlist(lapply(1:length(groups),function(j){
mean(unlist(cancerGroupExpressionmatrix[geneName,unlist(groups[j])]))
}))
sds <- unlist(lapply(1:length(groups),function(j){
sd(unlist(cancerGroupExpressionmatrix[geneName,unlist(groups[j])]))
}))
modus <- modus -1
}else{
#modus <=2 (==2 because unimodals can't have size numbers unequal to patient number)
#groups are combined
sizes <- c(sum(outputFiltered$Genes[[geneName]]$sizes[1:2]))
groups <- list(unlist(c(outputFiltered$Genes[[geneName]]$groups[1],
outputFiltered$Genes[[geneName]]$groups[2])))
means <- unlist(lapply(1:length(groups),function(j){
mean(unlist(cancerGroupExpressionmatrix[geneName,unlist(groups[j])]))
}))
sds <- unlist(lapply(1:length(groups),function(j){
sd(unlist(cancerGroupExpressionmatrix[geneName,unlist(groups[j])]))
}))
modus <- modus -1
}
}
}else{
##changeNow is neither biggest nor smallest mean and
##modus is >=3 because if modus 2 it would have to be greatest/smallest
left <- changeNow -1
right <- changeNow +1
patientsToSort <- cancerGroupExpressionmatrix[geneName,unlist(outputFiltered$Genes[[geneName]]$groups[changeNow])]
means <- outputFiltered$Genes[[geneName]]$means
sorted <- unlist(lapply(1:length(patientsToSort),function(j){
if(abs(means[left]-patientsToSort[j])<abs(means[right]-patientsToSort[j])){
left
}else{right}
}))
#leftGroup contains all patients of group left to the changeNow,
#rightGroup contains all patients of group right to the changeNow,
##e.g. if change ==2; left=1, right=3;
leftGroup <- unlist(outputFiltered$Genes[[geneName]]$groups[left])
rightGroup <- unlist(outputFiltered$Genes[[geneName]]$groups[right])
for(j in 1:length(sorted)){
if(sorted[j]==left){
leftGroup <- c(leftGroup,names(patientsToSort[j]))
}
if(sorted[j]==right){
rightGroup <- c(rightGroup,names(patientsToSort[j]))
}
}
if(modus==3){
groups <- list(leftGroup,rightGroup)
sizes <- c(length(leftGroup),length(rightGroup))
means <- unlist(lapply(1:length(groups),function(j){
mean(unlist(cancerGroupExpressionmatrix[geneName,unlist(groups[j])]))
}))
sds <- unlist(lapply(1:length(groups),function(j){
sd(unlist(cancerGroupExpressionmatrix[geneName,unlist(groups[j])]))
}))
modus <- modus -1
}else if(modus==4){
if(left==1){
groups <- list(leftGroup,rightGroup,outputFiltered$Genes[[geneName]]$groups[4])
}else{
groups <- list(outputFiltered$Genes[[geneName]]$groups[1],leftGroup,rightGroup)
}
sizes <- unlist(lapply(1:length(groups),function(j){
length(unlist(groups[j]))
}))
means <- unlist(lapply(1:length(groups),function(j){
mean(unlist(cancerGroupExpressionmatrix[geneName,unlist(groups[j])]))
}))
sds <- unlist(lapply(1:length(groups),function(j){
sd(unlist(cancerGroupExpressionmatrix[geneName,unlist(groups[j])]))
}))
modus <- modus -1
}else{
if(left==1){
groups <- list(leftGroup,
rightGroup,
outputFiltered$Genes[[geneName]]$groups[(right+1):modus])
}
if(right==modus){
groups <- list(outputFiltered$Genes[[geneName]]$groups[1:(left-1)],
leftGroup,
rightGroup)
}else{
groups <- list(outputFiltered$Genes[[geneName]]$groups[1:(left-1)],
leftGroup,
rightGroup,
outputFiltered$Genes[[geneName]]$groups[(right+1):modus])
}
sizes <- unlist(lapply(1:length(groups),function(j){
length(unlist(groups[j]))
}))
means <- unlist(lapply(1:length(groups),function(j){
mean(unlist(cancerGroupExpressionmatrix[geneName,unlist(groups[j])]))
}))
sds <- unlist(lapply(1:length(groups),function(j){
sd(unlist(cancerGroupExpressionmatrix[geneName,unlist(groups[j])]))
}))
modus <- modus -1
}
}
}else{
modus <- outputFiltered$Genes[[geneName]]$modus
means <- outputFiltered$Genes[[geneName]]$means
sds <- outputFiltered$Genes[[geneName]]$sds
sizes <- outputFiltered$Genes[[geneName]]$sizes
groups <- outputFiltered$Genes[[geneName]]$groups
}
outputFiltered$Genes[[geneName]] <- list("modus"=as.integer(modus),"means"=means,"sds"=sds,"sizes"=as.integer(sizes),"groups"=groups)
toChange <- toChange[-which(toChange==changeNow)]
}
}else{
#gene does not have to be corrected
modus <- outputFiltered$Genes[[geneName]]$modus
means <- outputFiltered$Genes[[geneName]]$means
sds <- outputFiltered$Genes[[geneName]]$sds
sizes <- outputFiltered$Genes[[geneName]]$sizes
groups <- outputFiltered$Genes[[geneName]]$groups
}
output <- list("modus"=as.integer(modus),"means"=means,"sds"=sds,"sizes"=as.integer(sizes),"groups"=groups)
output
})
parallel::stopCluster(cl)
}else{
cleanedOutput <- lapply(1:length(outputFiltered$Genes),function(i){
geneName <- names(outputFiltered$Genes)[i]
if(any(((outputFiltered$Genes[[geneName]]$sizes)/sum(outputFiltered$Genes[[geneName]]$sizes)*100)<minPercentage)){
modus <- outputFiltered$Genes[[geneName]]$modus
#toChange: group that has < 10% of patients
toChange <- which(((outputFiltered$Genes[[geneName]]$sizes)/sum(outputFiltered$Genes[[geneName]]$sizes)*100)<minPercentage)
while(length(toChange)>0){
changeNow <- which(outputFiltered$Genes[[geneName]]$sizes==min(outputFiltered$Genes[[geneName]]$sizes))
if(any(((outputFiltered$Genes[[geneName]]$sizes)/sum(outputFiltered$Genes[[geneName]]$sizes)*100)<minPercentage)){
if(changeNow==min(1:modus)||changeNow==max(1:modus)){
#changeNow is smallest mean
if(changeNow==min(1:modus)){
#mean>=3 --> first two groups are combined, rest remains unchanged
if(modus>=3){
sizes <- c(sum(outputFiltered$Genes[[geneName]]$sizes[1:2]),
outputFiltered$Genes[[geneName]]$sizes[3:modus])
groups <- list(unlist(c(outputFiltered$Genes[[geneName]]$groups[1],
outputFiltered$Genes[[geneName]]$groups[2])),
outputFiltered$Genes[[geneName]]$groups[3:modus])
means <- unlist(lapply(1:length(groups),function(j){
mean(unlist(cancerGroupExpressionmatrix[geneName,unlist(groups[j])]))
}))
sds <- unlist(lapply(1:length(groups),function(j){
sd(unlist(cancerGroupExpressionmatrix[geneName,unlist(groups[j])]))
}))
modus <- modus -1
}else{
#modus <=2 (==2 because unimodals can't have size numbers unequal to patient number)
#groups are combined
sizes <- c(sum(outputFiltered$Genes[[geneName]]$sizes[1:2]))
groups <- list(unlist(c(outputFiltered$Genes[[geneName]]$groups[1],
outputFiltered$Genes[[geneName]]$groups[2])))
means <- unlist(lapply(1:length(groups),function(j){
mean(unlist(cancerGroupExpressionmatrix[geneName,unlist(groups[j])]))
}))
sds <- unlist(lapply(1:length(groups),function(j){
sd(unlist(cancerGroupExpressionmatrix[geneName,unlist(groups[j])]))
}))
modus <- modus -1
}
}else if(changeNow==max(1:modus)&&modus>1){
#changeNow is greatest mean
if(modus>=3){
#highest two groups are combined (last 2 groups), groups 1 to modus-2 remain unchanged
sizes <- c(outputFiltered$Genes[[geneName]]$sizes[1:(modus-2)],
sum(outputFiltered$Genes[[geneName]]$sizes[(modus-1):(modus)]))
groups <- list(outputFiltered$Genes[[geneName]]$groups[1:(modus-2)],
unlist(c(outputFiltered$Genes[[geneName]]$groups[(modus-1)],
outputFiltered$Genes[[geneName]]$groups[modus])))
means <- unlist(lapply(1:length(groups),function(j){
mean(unlist(cancerGroupExpressionmatrix[geneName,unlist(groups[j])]))
}))
sds <- unlist(lapply(1:length(groups),function(j){
sd(unlist(cancerGroupExpressionmatrix[geneName,unlist(groups[j])]))
}))
modus <- modus -1
}else{
#modus <=2 (==2 because unimodals can't have size numbers unequal to patient number)
#groups are combined
sizes <- c(sum(outputFiltered$Genes[[geneName]]$sizes[1:2]))
groups <- list(unlist(c(outputFiltered$Genes[[geneName]]$groups[1],
outputFiltered$Genes[[geneName]]$groups[2])))
means <- unlist(lapply(1:length(groups),function(j){
mean(unlist(cancerGroupExpressionmatrix[geneName,unlist(groups[j])]))
}))
sds <- unlist(lapply(1:length(groups),function(j){
sd(unlist(cancerGroupExpressionmatrix[geneName,unlist(groups[j])]))
}))
modus <- modus -1
}
}
}else{
##changeNow is neither biggest nor smallest mean and
##modus is >=3 because if modus 2 it would have to be greatest/smallest
left <- changeNow -1
right <- changeNow +1
patientsToSort <- cancerGroupExpressionmatrix[geneName,unlist(outputFiltered$Genes[[geneName]]$groups[changeNow])]
means <- outputFiltered$Genes[[geneName]]$means
sorted <- unlist(lapply(1:length(patientsToSort),function(j){
if(abs(means[left]-patientsToSort[j])<abs(means[right]-patientsToSort[j])){
left
}else{right}
}))
#leftGroup contains all patients of group left to the changeNow,
#rightGroup contains all patients of group right to the changeNow,
##e.g. if change ==2; left=1, right=3;
leftGroup <- unlist(outputFiltered$Genes[[geneName]]$groups[left])
rightGroup <- unlist(outputFiltered$Genes[[geneName]]$groups[right])
for(j in 1:length(sorted)){
if(sorted[j]==left){
leftGroup <- c(leftGroup,names(patientsToSort[j]))
}
if(sorted[j]==right){
rightGroup <- c(rightGroup,names(patientsToSort[j]))
}
}
if(modus==3){
groups <- list(leftGroup,rightGroup)
sizes <- c(length(leftGroup),length(rightGroup))
means <- unlist(lapply(1:length(groups),function(j){
mean(unlist(cancerGroupExpressionmatrix[geneName,unlist(groups[j])]))
}))
sds <- unlist(lapply(1:length(groups),function(j){
sd(unlist(cancerGroupExpressionmatrix[geneName,unlist(groups[j])]))
}))
modus <- modus -1
}else if(modus==4){
if(left==1){
groups <- list(leftGroup,rightGroup,outputFiltered$Genes[[geneName]]$groups[4])
}else{
groups <- list(outputFiltered$Genes[[geneName]]$groups[1],leftGroup,rightGroup)
}
sizes <- unlist(lapply(1:length(groups),function(j){
length(unlist(groups[j]))
}))
means <- unlist(lapply(1:length(groups),function(j){
mean(unlist(cancerGroupExpressionmatrix[geneName,unlist(groups[j])]))
}))
sds <- unlist(lapply(1:length(groups),function(j){
sd(unlist(cancerGroupExpressionmatrix[geneName,unlist(groups[j])]))
}))
modus <- modus -1
}else{
if(left==1){
groups <- list(leftGroup,
rightGroup,
outputFiltered$Genes[[geneName]]$groups[(right+1):modus])
}
if(right==modus){
groups <- list(outputFiltered$Genes[[geneName]]$groups[1:(left-1)],
leftGroup,
rightGroup)
}else{
groups <- list(outputFiltered$Genes[[geneName]]$groups[1:(left-1)],
leftGroup,
rightGroup,
outputFiltered$Genes[[geneName]]$groups[(right+1):modus])
}
sizes <- unlist(lapply(1:length(groups),function(j){
length(unlist(groups[j]))
}))
means <- unlist(lapply(1:length(groups),function(j){
mean(unlist(cancerGroupExpressionmatrix[geneName,unlist(groups[j])]))
}))
sds <- unlist(lapply(1:length(groups),function(j){
sd(unlist(cancerGroupExpressionmatrix[geneName,unlist(groups[j])]))
}))
modus <- modus -1
}
}
}else{
modus <- outputFiltered$Genes[[geneName]]$modus
means <- outputFiltered$Genes[[geneName]]$means
sds <- outputFiltered$Genes[[geneName]]$sds
sizes <- outputFiltered$Genes[[geneName]]$sizes
groups <- outputFiltered$Genes[[geneName]]$groups
}
outputFiltered$Genes[[geneName]] <- list("modus"=as.integer(modus),"means"=means,"sds"=sds,"sizes"=as.integer(sizes),"groups"=groups)
toChange <- toChange[-which(toChange==changeNow)]
}
}else{
#gene does not have to be corrected
modus <- outputFiltered$Genes[[geneName]]$modus
means <- outputFiltered$Genes[[geneName]]$means
sds <- outputFiltered$Genes[[geneName]]$sds
sizes <- outputFiltered$Genes[[geneName]]$sizes
groups <- outputFiltered$Genes[[geneName]]$groups
}
output <- list("modus"=as.integer(modus),"means"=means,"sds"=sds,"sizes"=as.integer(sizes),"groups"=groups)
output
})
}
names(cleanedOutput)<-rownames(cancerGroupExpressionmatrix)[1:length(cancerGroupOutput$Genes)]
cleanedOutputGroup <- list("Genes"=cleanedOutput,
"ClinicalData"=outputFiltered$ClinicalData,
"Expressionmatrix"=outputFiltered$Expressionmatrix)
cleanOutput2 <- (lapply(1:length(cleanedOutputGroup$Genes),function(i){
#corrects genes with an empty group to the correct modus, means, sds, sizes and groups
isNull <- unlist(lapply(1:length(cleanedOutputGroup$Genes[[i]]$groups),function(j){
if(is.null(unlist(cleanedOutputGroup$Genes[[i]]$groups[j]))||length(unlist(cleanedOutputGroup$Genes[[i]]$groups[j]))==0){j}
}))
if(!is.null(isNull)){
modus <- cleanedOutputGroup$Genes[[i]]$modus -1
means <- cleanedOutputGroup$Genes[[i]]$means[-isNull]
sds <- cleanedOutputGroup$Genes[[i]]$sds[-isNull]
groups <- plyr::compact(cleanedOutputGroup$Genes[[i]]$groups)
groups <- groups[lapply(groups,length)>0]
sizes <- unlist(lapply(1:length(groups),function(j){
length(unlist(cancerGroupExpressionmatrix[i,groups[[j]]]))
}))
}else{
modus <- cleanedOutputGroup$Genes[[i]]$modus
means <- cleanedOutputGroup$Genes[[i]]$means
sds <- cleanedOutputGroup$Genes[[i]]$sds
sizes <- cleanedOutputGroup$Genes[[i]]$sizes
groups <- cleanedOutputGroup$Genes[[i]]$groups
}
groups <- lapply(1:length(groups),function(j){
unlist(groups[j])
})
output <- list("modus"=as.integer(modus),"means"=means,"sds"=sds,"sizes"=as.integer(sizes),"groups"=groups)
output
}))
names(cleanOutput2) <- names(cleanedOutputGroup$Genes)
output <- list("Genes"=cleanOutput2,
"ClinicalData"=cleanedOutputGroup$ClinicalData,
"Expressionmatrix"=cleanedOutputGroup$Expressionmatrix)
return(output)
}
#' filterMultimodalGenes
#' returns the output and expression matrix of multimodal genes
#' @details This function returns a list of the output and expression matrix of multimodal genes
#' @param cleanedOutput cleaned output of one cancer group
#' @param processedExpressionmatrix expression matrix of the processed cancer group
#' @return list of the output and expression matrix of multimodal genes
#' @export
filterMultimodalGenes <- function(cleanedOutput,processedExpressionmatrix){
modus <- unlist(lapply(1:length(cleanedOutput$Genes),function(i){
cleanedOutput$Genes[[i]]$modus
}))
groupedByModus <- lapply(1:max(modus),function(i){
unlist(lapply(1:length(modus),function(j){
if(modus[[j]]==i){
j
}
}))
})
multimodalModi <- groupedByModus[2:length(groupedByModus)]
multimodalOutput<- list("Genes"= cleanedOutput$Genes[unlist(multimodalModi)],
"ClinicalData"=cleanedOutput$ClinicalData,
"Expressionmatrix"=cleanedOutput$Expressionmatrix)
multimodalEm<- processedExpressionmatrix[(unlist(names(multimodalOutput$Genes))),]
return(list("Output"=multimodalOutput,"Expressionmatrix"=multimodalEm))
}
#' filterForMean
#' returns the output and expression matrix of filtered multimodal genes
#' @details This function returns a list of the output and expression matrix of filtered multimodal genes
#' @param multimodalOutput cleaned output of multimodal genes of one cancer group
#' @param multimodalExpressionmatrix expression matrix of the multimodal genes of one cancer group
#' @param minMean minimum mean that has to exist in any mean of the multimodal genes
#' @return list of the output and expression matrix of filtered multimodal genes
#' @export
filterForMean <- function(multimodalOutput,multimodalExpressionmatrix,minMean = 2){
filtered <-(unlist(lapply(1:length(multimodalOutput$Genes),function(i){
geneName <- names(multimodalOutput$Genes)[i]
if(any(unlist(multimodalOutput$Genes[[i]]$means)>minMean)){
geneName}
})))
filteredOutput <- list("Genes"= multimodalOutput$Genes[unlist(filtered)],
"ClinicalData"=multimodalOutput$ClinicalData,
"Expressionmatrix"=multimodalOutput$Expressionmatrix)
filteredExpressionmatrix<- multimodalExpressionmatrix[(unlist(filtered)),]
return(list("Output"=filteredOutput,"Expressionmatrix"=filteredExpressionmatrix))
}
#' filterForFoldChange
#' returns the output and expression matrix of filtered multimodal genes
#' @details This function returns a list of the output and expression matrix of filtered multimodal genes
#' @param filteredMeanOutput output that was filtered for mean of multimodal genes of one cancer group
#' @param filteredMeanExpressionmatrix expression matrix that was filtered for mean of the multimodal genes of one cancer group
#' @param minFoldChange minimum fold change that has to exist between two adjacent means of the multimodal genes
#' @return list of the output and expression matrix of filtered multimodal genes
#' @export
filterForFoldChange <- function(filteredMeanOutput,filteredMeanExpressionmatrix,minFoldChange=2){
filtered <-(unlist(lapply(1:length(filteredMeanOutput$Genes),function(i){
geneName <- names(filteredMeanOutput$Genes)[i]
if(any(diff(unlist(filteredMeanOutput$Genes[[i]]$means))>minFoldChange)){
geneName}
})))
filteredOutput <- list("Genes"= filteredMeanOutput$Genes[unlist(filtered)],
"ClinicalData"=filteredMeanOutput$ClinicalData,
"Expressionmatrix"=filteredMeanOutput$Expressionmatrix)
filteredExpressionmatrix<- filteredMeanExpressionmatrix[(unlist(filtered)),]
return(list("Output"=filteredOutput,"Expressionmatrix"=filteredExpressionmatrix))
}
#' updateGeneNames
#' returns the output and expression matrix with the updated gene names
#' @details returns the output and expression matrix with the updated gene names of the given output and expression matrix
#' the gene names are loaded from ensembl via the biomaRt package
#' @details This function returns a list of the output and expression matrix of filtered multimodal genes
#' @param filteredOutput output of multimodal genes of one cancer group that was filtered twice
#' @param filteredExpressionmatrix expression matrix of the multimodal genes of one cancer group that was filtered twice
#' @return list of the output and expression matrix with updated gene names taken from Ensembl
#' @importFrom biomaRt useMart getBM
#' @export
updateGeneNames <- function(filteredOutput,filteredExpressionmatrix){
namesMulti <-(unlist(lapply(1:length(filteredOutput$Genes),function(i){
unlist(strsplit(names(filteredOutput$Genes)[i],split = "[.]"))[1]
})))
mart = biomaRt::useMart(biomart="ensembl", dataset="hsapiens_gene_ensembl",host = "www.ensembl.org")
genes = lapply(1:length(namesMulti),function(i){
geneName <- biomaRt::getBM(attributes = c("ensembl_gene_id","external_gene_name"),
filters = "ensembl_gene_id",
values = namesMulti[i],
mart = mart,
quote = "\\")
if(nrow(geneName)==0){
geneName <- data.frame("ensembl_gene_id" =namesMulti[i],"external_gene_name"=namesMulti[i])
}
geneName
})
genes2 <- do.call('rbind',genes)
geneNames <- rownames(filteredExpressionmatrix)<- make.unique(unlist(genes2[,2]))
names(filteredOutput$Genes) <- rownames((filteredExpressionmatrix))
return(list("Output"=filteredOutput,"Expressionmatrix"=filteredExpressionmatrix))
}
#' Filter output and expressionmatrix for genes that are known in KEGG's "Pathways in Cancer"
#' returns the filtered output and expression matrix
#' @details needs the updated Ensembl Gene Names
#' @param output output of multimodal genes of one cancer group
#' @param expressionmatrix expression matrix of the multimodal genes of one cancer group
#' @return list of the output and expression matrix of genes that were found in KEGG's "Pathways in cancer"
#' @importFrom KEGGREST keggGet
#' @importFrom plyr compact
#' @export
filterForKnownInCancer <- function(output,expressionmatrix){
query <- KEGGREST::keggGet(c("hsa05200"))
geneEntries <- unlist(plyr::compact(lapply(1:length(query[[1]]$GENE),function(i){
if(i%%2==0){
entry <- query[[1]]$GENE[[i]]
strsplit(x = entry,split = ";")[[1]][[1]]
}
})))
geneNames <- names(output$Genes)
foundInCancerPathways <- unlist(plyr::compact(lapply(1:length(geneNames),function(i){
if(any(geneEntries==geneNames[[i]])){
geneNames[[i]]
}
})))
outputFoundInCancerPathways <- list("Genes"=output$Genes[foundInCancerPathways],"ClinicalData"=output$ClinicalData,"Expressionmatrix"=output$Expressionmatrix)
emFoundInCancerPathways <- expressionmatrix[foundInCancerPathways,]
return(list("Output"=outputFoundInCancerPathways,"Expressionmatrix"=emFoundInCancerPathways))
}
#' Filter output and expressionmatrix for genes that are on the X Chromosome of the Human Genome
#' returns filtered output and expression matrix without genes on the X chromosome
#' @details needs the updated Ensembl Gene Names; uses gencode.v28.annotation.gtf
#' @param output output of multimodal genes of one cancer group
#' @param expressionmatrix expression matrix of the multimodal genes of one cancer group
#' @return list of the output and expression matrix of genes that were not found on the X Chromosome
#' @importFrom plyr compact
#' @export
filterForXChromosomeGenes <- function(output,expressionmatrix){
geneNames <- names(output$Genes)
notXChromGenes <- unlist(plyr::compact(lapply(1:length(geneNames),function(i){
if(!any(geneNamesX==geneNames[[i]])){
geneNames[[i]]
}
})))
filteredOutput <- list("Genes"= output$Genes[unlist(notXChromGenes)],
"ClinicalData"=output$ClinicalData,
"Expressionmatrix"=output$Expressionmatrix)
filteredExpressionmatrix<- expressionmatrix[(unlist(notXChromGenes)),]
return(list("Output"=filteredOutput,"Expressionmatrix"=filteredExpressionmatrix))
}
#' Filter output and expressionmatrix for genes that are on the Y Chromosome of the Human Genome
#' returns filtered output and expression matrix without genes on the Y chromosome
#' @details needs the updated Ensembl Gene Names; uses gencode.v28.annotation.gtf
#' @param output output of multimodal genes of one cancer group
#' @param expressionmatrix expression matrix of the multimodal genes of one cancer group
#' @return list of the output and expression matrix of genes that were not found on the Y Chromosome
#' @importFrom plyr compact
#' @export
filterForYChromosomeGenes <- function(output,expressionmatrix){
geneNames <- names(output$Genes)
notYChromGenes <- unlist(plyr::compact(lapply(1:length(geneNames),function(i){
if(!any(geneNamesY==geneNames[[i]])){
geneNames[[i]]
}
})))
filteredOutput <- list("Genes"= output$Genes[unlist(notYChromGenes)],
"ClinicalData"=output$ClinicalData,
"Expressionmatrix"=output$Expressionmatrix)
filteredExpressionmatrix<- expressionmatrix[(unlist(notYChromGenes)),]
return(list("Output"=filteredOutput,"Expressionmatrix"=filteredExpressionmatrix))
}