Skip to content
Permalink
837de1d39b
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
244 lines (233 sloc) 13.9 KB
#' Use glmnet
#' @description This function uses glmnet to do variable selection within a classification of a response variable.
#' @param data A data frame or SummarizedExperiment.
#' @param assay_classification Integer - number of assay that contains the classification made by mclust
#' @param classification Matrix - single named vector or matrix of the classification of the samples
#' @param model_matrix Matrix - a model matrix as for example generated by /link [make_model_matrix]
#' @param col_names Character - set of column names to be used as predictors in regression
#' @param gene_names Character - vector of gene names to apply glmnet to
#' @param output_genewise Logical - Genewise results as list, otherwise a list with glm and cv results with all genes
#' @param use_glm Do a signle glmnet fit?
#' @param use_cv Logical - should nested cross validation be used to estimate error?
#' @param fast_cv Logical - faster calc because entries with zero coefficients are skipped.
#' @param return_plot Logical - if set to true, the output is a list of the Regression results and a correlation plot with AUC plot.
#' @param seed Integer - Number for set.seed
#' @param ... Parameters for glmnet
#' @param verbose Logical - Show Progress of function, not in parallel
#' @param return_plot Logical - If set to true, the output is a list of the Regression results and a correlation plot with AUC plot.
#' @param exclude character vector - factors/columns to be excluded. You dont want to include factors which are more like observatory variables as opposed to possible explanatory variables in the analysis.
#' @details Best use for summarized experiments. Output genewise T is better.
#' @examples
#' @import SummarizedExperiment
#' @importFrom tibble rownames_to_column as_tibble
#' @import magrittr
#' @export
use_glmnet <- function(data = NULL, assay_classification = NULL, classification = NULL, model_matrix = NULL, col_names = NULL, gene_names = NULL, use_glm = T, use_cv = T, fast_cv = F, stratify_class = F, type_measure = "class", type.multinomial = "grouped",
alpha = 0.5, verbose = T, parallel = T, no_cluster = NULL, seed = NULL, repeats = 1, k_outer_loop = 10, k_inner_loop = 5, standardize = F, return_both = T, output_genewise = T, ... ) {
#test for mode: data = null ? --> then only classification and model_matrix oder
###################### 1. Mise en Place ###########################################################
###### 1.1 test class of data if not NULL ######
if (!is.null(data)) {
if (!is.null(gene_names)) {
data <- data[gene_names,] #subset data by names
}
if (is_class_SE(data)) {
if (!is.null(assay_classification) && is.null(classification)) {
classification <- assay(data, assay_classification) #get classification matrix from Summarized Experiment
}
df <- colData(data) %>% data.frame(.)
} else {
df <- data.frame(data)
}
}
###### 1.2 make model matrix ######
if (is.null(model_matrix)) {
check_data(data = data, variables = col_names) #throw error if something is missing
if (verbose) print("Building model matrix from data")
model_matrix <- mmRmeta::make_model_matrix(df, predictors = col_names, impute_na = T, verbose = verbose)
} else if (!is.matrix(model_matrix)) {
warning("Model Matrix is not class 'matrix'. Trying to change it")
model_matrix <- as.matrix(model_matrix)
}
##############################################################################################
############ 2. Test if classification is either 1d or 2d apply test accordingly #############
if (is.matrix(classification)) {
if (1 %in% dim(classification) || is.null(dim(classification))) { #matrix has only one row
intersectNames <- intersect(rownames(model_matrix), names(classification)) %>% unique()
model_matrix <- model_matrix[intersectNames, ]
classification <- unlist(data.frame(classification), recursive = F) #change to vector, keep names
classification <- classification[intersectNames]
}
} else if (is.vector(classification) || is.factor(classification)) {
intersectNames <- intersect(rownames(model_matrix), names(classification)) %>% unique()
model_matrix <- model_matrix[intersectNames, ] #added this 0306
classification <- classification[intersectNames]
} else {
intersectNames <- intersect(rownames(model_matrix), names(classification)) %>% unique()
model_matrix <- model_matrix[intersectNames, ]
classification <- classification[, intersectNames]
}
####### if classification is only a vector #######
if (is.vector(classification) || is.factor(classification)) {
if (use_glm) {
if (verbose) { print("Processing glmnet!") }
glmFit <- mmRmeta::use_cvglmnet(model_matrix = model_matrix, stratify = stratify_class, response = classification, alpha = alpha, seed = seed, return_both = return_both, standardize = standardize, type_measure = type_measure, ... )
}
if (use_cv) {
if (verbose) { print("Processing nested cross validation") }
cv_glmFit <- use_nested_cvglmnet(model_matrix = model_matrix, response = classification, alpha = alpha, stratify = stratify_class, repeats = repeats, standardize = standardize, type_measure = type_measure,
k_outer_loop = k_outer_loop, k_inner_loop = k_inner_loop, seed = seed, ... )
}
######### if classification is a matrix ########
} else {
if (parallel) {
cl <- mmRmeta::get_no_cores(no_cluster)
doParallel::registerDoParallel(cl)
#normal glmnet
if (use_glm) {
if (verbose) { print("Processing parallel glmnet") }
glmFit <- foreach(i = seq_len(nrow(classification)), .packages = c("glmnet"), .verbose = verbose) %dopar% { #übergeben model.matrix
modality <- classification[i, ] #classification as factor/vector, throw error if no rownames in classMatrix
fittedModel <- mmRmeta::use_cvglmnet(model_matrix = model_matrix, response = modality, stratify = stratify_class, alpha = alpha, seed = seed, return_both = return_both, standardize = standardize, type_measure = type_measure, ... )
}
names(glmFit) <- rownames(classification)
}
#nested cross validation
if (use_cv) {
if (verbose) { print("Processing parallel cross-validation") }
#with faster option. needs glmFit to function
if (fast_cv && exists(to_string(glmFit))) { #test if fit is there
predictedList <- mmRmeta::calc_prediction(glmFit, model_matrix)
aucList <- mmRmeta::calc_auc(predictedList, classification, model_matrix)
names(aucList) <- names(glmFit)
cv_glmFit <- foreach(j = seq_len(nrow(classification)), .packages = c("glmnet"), .verbose = verbose, .export = c("aucList")) %:% when(aucList[[j]] > 0.5) %dopar% {
modality <- classification[j, ] #classification as factor/vector, throw error if no rownames in classMatrix
tryCatch({ mmRmeta::use_nested_cvglmnet(response = modality, model_matrix = model_matrix, alpha = alpha, stratify = stratify_class, standardize = standardize,
repeats = repeats, k_outer_loop = k_outer_loop, k_inner_loop = k_inner_loop, seed = seed, type_measure = type_measure, ... )
})}
#without fast cv
} else {
cv_glmFit <- foreach(j = seq_len(nrow(classification)), .packages = c("glmnet"), .verbose = verbose) %dopar% {
modality <- classification[j, ] #classification as factor/vector, throw error if no rownames in classMatrix
tryCatch({mmRmeta::use_nested_cvglmnet(response = modality, model_matrix = model_matrix, alpha = alpha, stratify = stratify_class, standardize = standardize,
repeats = repeats, k_outer_loop = k_outer_loop, k_inner_loop = k_inner_loop, seed = seed, type_measure = type_measure, ...)
})}
}
names(cv_glmFit) <- rownames(classification)
}
on.exit(parallel::stopCluster(cl))
###### no parallel computation ######
} else {
if (use_glm) {
if (verbose) { print("Processing glmnet") }
glmFit <- apply(classification, MARGIN = 1, function(x) mmRmeta::use_cvglmnet(model_matrix = model_matrix, stratify = stratify_class, response = x, type_measure = type_measure, standardize = standardize, alpha = alpha, seed = seed, return_both = return_both))
}
if (use_cv) {
if (verbose) { print("Processing cross-validation") }
#fast cv, checks simple prediciton of glmFit
if (fast_cv) {
if (exists("glmFit")) {
predictedList <- mmRmeta::calc_prediction(glmFit, model_matrix)
aucList <- mmRmeta::calc_auc(predictedList, classification, model_matrix)
names(aucList) <- names(glmFit)
}
#cv_glmFit <- apply(classification, MARGIN = 1, function(x) mmRmeta::use_nested_cvglmnet(response = x, model_matrix = model_matrix, alpha = alpha, stratify = stratify_class,
# repeats = repeats, k_outer_loop = k_outer_loop, k_inner_loop = k_inner_loop, seed = seed))
cv_glmFit <- lapply(aucList, function(x) { ifelse(x <= 0.5, 0.5, mmRmeta::use_nested_cvglmnet(response = classification[names(x),], model_matrix = model_matrix, alpha = alpha, stratify = stratify_class,
repeats = repeats, type_measure = type_measure, k_outer_loop = k_outer_loop, k_inner_loop = k_inner_loop, seed = seed, standardize = standardize, ... ))})
#without fast option
} else {
cv_glmFit <- apply(classification, MARGIN = 1, function(x) mmRmeta::use_nested_cvglmnet(response = x, model_matrix = model_matrix, alpha = alpha, stratify = stratify_class,
repeats = repeats, k_outer_loop = k_outer_loop,type_measure = type_measure, k_inner_loop = k_inner_loop, seed = seed, standardize = standardize, ... ))
}
}
}
}
#final <- list(glmnet = glmFit, cv = cvFit)
if (use_glm && use_cv && fast_cv) {
return(list(fit = glmFit, cv = cv_glmFit, auc = aucList))
} else if (use_glm && use_cv && !fast_cv) {
return(list(fit = glmFit, cv = cv_glmFit))
} else if (!use_cv && use_glm) {
return(glmFit)
} else {
return(cv_glmFit)
}
}
#apply(data, 1, function(x) {ifelse(any(x == 0), NA, length(unique(x)))})
#lapply(dflist, function(x) if(nrow(x)%%2==1) "ODD" else x)
#lapply(aucList, function(x) { ifelse(x <= 0.5), 0.5, mmRmeta::use_nested_cvglmnet(response = classification[names(x),], model_matrix = model_matrix, alpha = alpha, stratify = stratify_class,
#repeats = repeats, k_outer_loop = k_outer_loop, k_inner_loop = k_inner_loop, seed = seed)})
#https://www.r-bloggers.com/parallel-computing-exercises-foreach-and-doparallel-part-2/ parallel
#find out what parallelization is faster, of the length in nrow() oder in samples ncol()
# error when fast cv because not same length of list
#' Check Data
#' @description Short function to check unavailable data for use_glmnet_new
#' @param data A data frame or SummarizedExperiment.
#' @param variables Either a matrix containing response vector or an integer corresponding to the index of the assay in the SE containing the matrix.
check_data <- function(data = NULL, variables = NULL) {
if (is.null(data)) {
error("You need to supply data from where the model matrix is made from when 'make_model' is TRUE.")
}
if (is.null(variables)) {
error("You need to specify variables in the data to make a model matrix when 'make_model' is TRUE.")
}
}
#' Has Zeor Coefficients
#' @description Check if model has only coefficients with value = 0
#' @param glmnet_result A data frame or SummarizedExperiment.
#' @export
has_zero_coef <- function(glmnet_result) {
if (is.list(glmnet_result)) {
has_zero <- lapply(glmnet_result, function(x) sum(coef(x) != 0)) %>% unlist()
} else {
has_zero <- sum(coef(glmnet_result) != 0)
}
return(has_zero)
}
#' Get Predictors
#' @description This function uses glmnet to do variable selection within a classification of a response variable.
#' @param model_matrix Model matrix
#' @param data Either data frame like object, SE, or character vector
#' @importFrom rlang is_empty
#' @export
get_predictors <- function(model_matrix, data) {
#simplest case predictors already exist
if (is.character(data)) {
colNames <- data
} else if (is_class_SE(data)) {
colNames <- colnames(colData(data))
} else if (!is.null(dim(data))) { #array like object
colNames <- colnames(data)
}
isEmpty <- sapply(colNames, function(x) grep(x, colnames(model_matrix))) %>% sapply(., function(y) rlang::is_empty(y))
predictors <- colNames[!isEmpty]
#predictors <- names(matchedList)
return(predictors)
}
#' Glmnet List Transformaation
#' @description When using use_glmnet_new, the obtained results are two lists (fit, cv). Combine the two lists genewise.
#' @param glmnet_result Result of function use_glmnet_new
#' @param gene_names Character - gene names
#' @param index_fit Integer - index of results of glm fit in list
#' @param index_cv Integer - index of results of cross validation in list
#' @importFrom rlist list.zip
#' @export
combine_list <- function(glmnet_result, gene_names = NULL, index_fit = 1, index_cv = 2) {
if (!is.null(gene_names)) {
combinedList <- vector(mode = "list", length = gene_names)
names(combinedList) <- gene_names
} else {
combinedList <- vector(mode = "list", length = length(glmnet_result[[1]]))
names(combinedList) <- names(glmnet_result[[1]])
}
cv <- lapply(glmnet_result[[index_cv]], function(x) unlist(x, recursive = F))
combined <- rlist::list.zip(glmnet_result[[1]], cv) #combine two lists stepwise
iter <- 0
for (i in (combined)) {
iter <- iter + 1
names(i) <- c("fit", "c")
combinedList[[iter]] <- i
}
return(combinedList)
}