Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
mmRmeta/R/preprocess_coldata.R
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
457 lines (435 sloc)
22.7 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#' Process Coldata/Clinical Data | |
#' @description This function applies changes to the colData/clinical data of a SummarizedExperiment. | |
#' @param summarized_experiment SummarizedExperiment object or a simple data frame | |
#' @param exception Character - Names of columns that should not be dropped. | |
#' @examples | |
#' lungMeta <- reorder_all(lungMeta, c("tumor_stage", "race"), 10) | |
#' @return Returns the data frame with filtered/ordered columns. | |
#' @export | |
preprocess_coldata <- function(summarized_experiment, to_lower = T, unnest = T, exception = NULL, ..., verbose = F) { | |
if (verbose) cat("Factorizing columns") | |
summarized_experiment <- mmRmeta::factorize_columns(summarized_experiment, unnest = unnest, exception = exception, ...) | |
if (verbose) cat("Filtering NA values") | |
summarized_experiment <- mmRmeta::filter_columns_with_na(summarized_experiment, to_lower = to_lower, unnest = unnest) | |
if (verbose) cat("Filtering single levels") | |
summarized_experiment <- mmRmeta::filter_single_level(summarized_experiment, unnest = unnest, exception = exception) | |
return(summarized_experiment) | |
} #all functions need unnest --> some functions dont work with DataFrames | |
#Error in xi == xj : comparison of these types is not implemented --> works after merge_duplicated | |
#' Set Character Columns to Factor | |
#' @description This function changes the columns to factors, excludes columns by name via grep function. | |
#' @param summarized_experiment SummarizedExperiment object or a simple data frame | |
#' @param unnest Logical - Should the data be unnested by mmRmeta::unnest_dataframe() | |
#' @param keep_unique Logical - Should columns with unique entries not changed to factors (Because they might be IDs) | |
#' @param exception Character - Column names or part of them that are not factorized | |
#' @param max_levels Integer - Maximum number of levels allowed for a character column to be changed to a factor column | |
#' @param factorize_logical Logical - Should all logical columns be changed to factors? | |
#' @details Be aware, even if columns should have unique values because they are IDs doesnt mean that the data set follows the rule. When a character column consist of more than max_levels unique values then this columns | |
#' is not changed to a factor column. Also logical columns are changed to factors. | |
#' @return Returns the data frame with filtered/ordered columns. | |
#' @import SummarizedExperiment | |
#' @importFrom Biobase isUnique | |
#' @export | |
factorize_columns <- function(summarized_experiment, unnest = T, keep_unique = TRUE, exception = NULL, max_levels = 100, factorize_logical = T) { | |
#dont change unique character values -> because probably IDs | |
#dont change certain column names, ID, | |
#for SummarizedExperiment | |
if (is_class_SE(summarized_experiment)) { | |
columnData <- data.frame(colData(summarized_experiment)) | |
#for data frames, uses any if class = data.table and data.frame | |
} else if (any(class(summarized_experiment) %in% c("data.frame", "DataFrame"))) { | |
columnData <- summarized_experiment | |
} #else throw error maybe | |
if (unnest) { | |
columnData <- mmRmeta::unnest_dataframe(columnData) | |
} | |
matches <- double() | |
numberRows <- nrow(columnData) | |
colNames <- colnames(columnData) | |
#deal with exception | |
for (col in exception) { | |
matches <- c(matches, do.call(grep, args = list(pattern = col, x = colNames))) #match colnames with exception, combine results | |
matches <- unique(matches) #unique column indices | |
} | |
#deal with columns with unique values and with columns with too many factor levels | |
for (index in 1:length(colNames)) { | |
if (sum(Biobase::isUnique(columnData[[index]]), na.rm = TRUE) == numberRows) { | |
matches <- c(matches, index) | |
} else if (!is.null(max_levels)) { | |
if (sum(Biobase::isUnique(columnData[[index]]), na.rm = TRUE) > max_levels) { | |
matches <- c(matches, index) | |
} | |
} | |
} | |
matches <- unique(matches) #only unique indices, those matches are excluded from factorization | |
#now actually factorize the columns | |
for (columnIndex in 1:length(colNames)) { | |
#check if column index doesnt match the exception (index/integer/column number) and if column is a character | |
if (!(columnIndex %in% matches) && class(columnData[[columnIndex]]) == "character") { #NOT index in matching indices | |
columnData[[columnIndex]] <- as.factor(columnData[[columnIndex]]) | |
} else if (factorize_logical && class(columnData[[columnIndex]]) == "logical") { | |
columnData[[columnIndex]] <- as.factor(columnData[[columnIndex]]) | |
} | |
} | |
if (class(summarized_experiment) %in% c("RangedSummarizedExperiment", "SummarizedExperiment")) { | |
columnData <- as(columnData, "DataFrame") | |
colData(summarized_experiment) <- columnData #assign to colData | |
return(summarized_experiment) | |
} else { | |
return(columnData) | |
} | |
} #add exception to be integers for column index | |
#' Delete Colums with only NA | |
#' @description Sometimes actual NA values are indicated as other strings (e.g. "not reported"). This function converts these strings to NA and ultimately deletes any column consisting only of NA values. | |
#' Additonally, you can drop columns by their name or parts of it via a grep command. Automatically uses \link{drop_unused_levels}. Be careful that you input unique matches for >keep<. | |
#' @param summarized_experiment Data frame | |
#' @param unnest Logical - Unnest data.frame / colData | |
#' @param value A value or character that should be converted to NA. | |
#' @param percentage Double - Allowed percentage of NA values in column. Columns above that value are dropped. E.g 0.5 for 50$ | |
#' @param col_char character or character vector - columns are dropped if the grep matches | |
#' @param keep character or character vector - exceptions that wont be dropped from the data frame if they match (i.e. "case_id" when you set col_char as "_id") | |
#' @param verbose Logical - print how many colums were NA | |
#' @importFrom magrittr %>% | |
#' @examples | |
#' df <- data.frame(num1 = c(1,2,3,4), num2= c(5,6,7, "not reported")) | |
#' df <- drop_columns_with_na(df, "not reported") | |
#' @details Be careful when | |
#' @return Returns the filtered data frame | |
#' @export | |
filter_columns_with_na <- function(summarized_experiment, to_lower = T, unnest = T, value = "not reported", percentage = NULL, col_char = NULL, keep = NULL, verbose = T) { | |
if (is_class_SE(summarized_experiment)) { | |
columnData <- colData(summarized_experiment) #colData to df | |
} else { | |
columnData <- summarized_experiment | |
} | |
if (unnest) { | |
columnData <- mmRmeta::unnest_dataframe(columnData) | |
} | |
if (to_lower) { | |
names <- rownames(columnData) | |
columnData <- lapply(columnData, function(x) { | |
if (is.factor(x)) return(tolower(x)) | |
else return(x) | |
}) %>% data.frame() | |
rownames(columnData) <- names | |
} | |
columnData <- dplyr::na_if(columnData, value) %>% dplyr::na_if(., "") #changes all occurences of >value< to NA, Default is "not reported" | |
columnData <- columnData %>% dplyr::select_if(~!all(is.na(.))) #deletes columns with only NA values | |
columnData <- mmRmeta::drop_unused_levels(columnData) #drop unused factor levels | |
if (!is.null(percentage)) { #drop columns with more than >percentage< NA obseervations | |
keepMatch <- numeric() | |
sumNA <- colSums(is.na(columnData))/nrow(columnData) %>% round(., digits = 2) | |
isAbove <- sumNA >= percentage | |
if (!is.null(keep)) { | |
for (i in 1:length(keep)) { | |
if (!is.null(keep)) keepMatch <- append(keepMatch, grep(pattern = keep[i], names(columnData))) #get index for colums to be saved, only if >keep< is not null | |
keepMatch <- unique(keepMatch) | |
} | |
keepMatch <- unique(keepMatch) | |
isAbove <- isAbove %notin% keepMatch | |
} | |
columnData[isAbove] <- NULL | |
} | |
if (!is.null(col_char)) { | |
keepMatch <- numeric() | |
for (i in 1:length(keep)) { | |
if (!is.null(keep)) keepMatch <- append(keepMatch, grep(pattern = keep[i], names(columnData))) #get index for colums to be saved, only if >keep< is not null | |
keepMatch <- unique(keepMatch) | |
} | |
dropMatch <- numeric() #empty vector for match indices | |
for (j in 1:length(col_char)) { | |
dropMatch <- append(dropMatch, grep(pattern = col_char[j], names(columnData))) | |
dropMatch <- unique(dropMatch) #unique indices, throws error otherwise | |
dropMatch <- dropMatch[!dropMatch %in% keepMatch] #removes indices which should be saved | |
} | |
columnData[dropMatch] <- NULL #delete matched columns | |
} | |
#either return data frame or SE | |
#if (verbose) { | |
# cat("Deleted following columns": setdiff(colnames(colData(summarized_experiment)), colnames(columnData))) | |
#} | |
if (is_class_SE(summarized_experiment)) { | |
colData(summarized_experiment) <- as(columnData, "DataFrame") #need this format to sucessfully subset colData | |
return(summarized_experiment) | |
} else { | |
return(columnData) | |
} | |
} | |
#error when verbose | |
#make a new example for grep | |
#' Drop unused factor levels | |
#' @description This function drops all unused levels from all factors in a data frame. | |
#' @param summarized_experiment Data frame | |
#' @examples | |
#' lungMeta <- drop_unused_levels(lungMeta) | |
#' @return Returns the altered data frame. | |
#' @export | |
drop_unused_levels <- function(summarized_experiment, verbose = T) { | |
if (is_class_SE(summarized_experiment)) { | |
columnData <- colData(summarized_experiment) #colData to df | |
} else { | |
columnData <- summarized_experiment | |
} | |
for (column in colnames(df)) { | |
if (is.factor(df[[column]])) { | |
df[[column]] <- factor(df[[column]]) | |
} | |
} | |
if (is_class_SE(summarized_experiment)) { | |
colData(summarized_experiment) <- as(columnData, "DataFrame") #need this format to sucessfully subset colData | |
return(summarized_experiment) | |
} else { | |
return(columnData) | |
} | |
} | |
#' Drop Binary Factors With Few Counts | |
#' @description This function drops binary factors that have a level with very feq counts. | |
#' @param summarized_experiment Data frame or SE | |
#' @param min_counts Integer - minimum amount of counts in factor level | |
#' @param exception Character - column names of eceptions if they would fall below threshold. Uses grep | |
#' @param verbose Logical - prints names of dropped columns | |
#' @return Returns the input object with dropped columns | |
#' @export | |
filter_small_binary <- function(summarized_experiment, min_counts = 10, exception = NULL, verbose = T) { | |
if (is_class_SE(summarized_experiment)) { | |
columnData <- data.frame(colData(summarized_experiment)) #colData to df | |
} else { | |
columnData <- data.frame(summarized_experiment) | |
} | |
matches <- double() | |
if (!is.null(exception)) { | |
for (col in exception) { | |
matches <- c(matches, do.call(grep, args = list(pattern = col, x = colnames(columnData)))) #match colnames with exception, combine results | |
matches <- unique(matches) #unique column indices | |
names <- colnames(columnData)[matches] | |
} | |
} else names <- NULL | |
#list of tables if binary factor, else NA | |
binaryTable <- lapply(columnData, function(x) { | |
if (is.factor(x) && length(levels(x)) == 2) { | |
table(x) | |
} else (return(NA)) | |
}) | |
#vector if tables have min counts, only column name + TRUE or FALSe in vector | |
belowCounts <- lapply(binaryTable, function(x) { | |
ifelse(!is.na(x), return(any(x < min_counts)), return(FALSE)) | |
}) %>% unlist() %>% .[.] | |
#delete columns with binary factos with factor level below min_counts | |
if (!is.null(names)) { #exclude exceptions, | |
belowCounts <- dplyr::setdiff(names(belowCounts), names) | |
columnData <- dplyr::select(columnData, -dplyr::matches(belowCounts)) | |
} else { | |
columnData <- dplyr::select(columnData, -dplyr::matches(names(belowCounts))) | |
} | |
#verbose message | |
if (verbose) { | |
if (is.logical(belowCounts)) { | |
cat("Deleted columns:", names(belowCounts)) | |
} else { | |
cat("Deleted columns:", belowCounts) | |
} | |
} | |
#return SE or data frame | |
if (is_class_SE(summarized_experiment)) { | |
colData(summarized_experiment) <- as(columnData, "DataFrame") #need this format to sucessfully subset colData | |
return(summarized_experiment) | |
} else { | |
return(columnData) | |
} | |
} | |
#' Filter for Factor Level | |
#' @description Filter for factor levels in a column. | |
#' @import SummarizedExperiment | |
#' @param summarized_experiment SummarizedExperiment object or a simple data frame | |
#' @param column Character - Column name | |
#' @param value Character - Factor level that should be filtered for | |
#' @param distinct Logical - If TRUE filters where all variables in "value" occur. For example: value is c("alive", "female") --> only alive females are kept - only works with filter_at | |
#' @param verbose Logical - print results | |
#' @return Returns same SE or data frame | |
#' @export | |
#refactor whole dataframe | |
filter_for_factor <- function(summarized_experiment, column = NULL, column_value, distinct = T, verbose = T) { | |
#if is.null) {} | |
if (mmRmeta::is_class_SE(summarized_experiment)) { | |
colData <- colData(summarized_experiment) %>% data.frame() %>% tibble::rownames_to_column("rowname") | |
} else if (class(summarized_experiment) %in% c("data.frame", "DataFrame")) { | |
colData <- summarized_experiment %>% tibble::rownames_to_column("rowname") | |
} | |
if (!is.null(column)) { | |
if (distinct) { | |
filteredData <- dplyr::filter_at(colData, vars(column), all_vars(. %in% column_value)) #takes intersection with "and" (&) | |
} else { | |
filteredData <- dplyr::filter_at(colData, vars(column), any_vars(. %in% column_value)) #takes union with "or" (|) | |
} | |
#no specific column stated | |
} else { | |
#if (distinct) { for now not working | |
# filteredData <- dplyr::filter_all(colData, all_vars(. %in% column_value)) | |
# } else { | |
filteredData <- dplyr::filter_all(colData, any_vars(. %in% column_value)) | |
#} | |
} | |
if (verbose) { | |
deleted <- nrow(colData) - nrow(filteredData) | |
cat(deleted, "samples where deleted from the colData.") | |
} | |
rowNames <- filteredData$rowname | |
summarized_experiment <- summarized_experiment[, rowNames] | |
return(summarized_experiment) | |
} | |
#' Filter out Factor level | |
#' @description Filter out entries with certain factor levels. | |
#' @param summarized_experiment SummarizedExperiment object or a simple data frame | |
#' @param column Character - column name | |
#' @param value Character - factor level that should be filtered out | |
#' @param distinct Logical - if TRUE filter out where all variables in "value" occur. For example: value is c("alive", "female") --> only alive females are filtered out | |
#' @param verbose Logical - state how many entries were deleted | |
#' @return Returns same SE or data frame | |
#' @import SummarizedExperiment | |
#' @importFrom dplyr filter_all filter_at any_vars all_vars | |
#' @importFrom tibble rownames_to_column | |
#' @export | |
filter_out_factor <- function(summarized_experiment, column = NULL, column_value, distinct = T) { | |
#if is.null) {} | |
if (mmRmeta::is_class_SE(summarized_experiment)) { | |
colData <- unnest_dataframe(colData(summarized_experiment)) %>% tibble::rownames_to_column("rowname") | |
} else if (class(summarized_experiment) %in% c("data.frame", "DataFrame")) { | |
colData <- summarized_experiment %>% tibble::rownames_to_column("rowname") | |
} | |
if (!is.null(column) && distinct) { | |
filteredData <- dplyr::filter_at(colData, vars(column), all_vars(. %in% column_value)) #takes intersection with "and" (&) | |
} else if (!is.null(column) && isFALSE(distinct)) { | |
filteredData <- dplyr::filter_at(colData, vars(column), any_vars(. %in% column_value)) #takes union with "or" (|) | |
} else { | |
filteredData <- dplyr::filter_all(colData, any_vars(. %in% column_value)) | |
} | |
rowNames <- filteredData$rowname | |
summarized_experiment <- summarized_experiment[, rowNames] | |
if (verbose) { | |
println(nrow(colData) - nrow(filteredData), "samples where deleted from the colData.") | |
} | |
return(summarized_experiment) | |
} | |
#' Filter Single Level Factors | |
#' @description Function to filter out columns containing only single level factors. | |
#' @param summarized_experiment SummarizedExperiment object or a simple data frame | |
#' @param exception Character - Names of columns one wantes to keep despite having only 1 level | |
#' @param unnest Logical - Should the data be unnested (meaning having lists in them or if the data comes directl from a summarized experiment) | |
#' @param factorize Logical - Should thr | |
#' @param ... further variables for mmRmeta::factorize_columns | |
#' @import SummarizedExperiment | |
#' @return Retunrs a DataFrame or a SummarizedExperiment | |
#' @export | |
filter_single_level <- function(summarized_experiment, exception = NULL, unnest = T, factorize = F, verbose = T) { | |
if (class(summarized_experiment) %in% c("RangedSummarizedExperiment", "SummarizedExperiment")) { | |
columnData <- colData(summarized_experiment) | |
} else if (any(class(summarized_experiment) %in% c("data.frame", "DataFrame"))) { | |
columnData <- summarized_experiment | |
} | |
if (unnest) { | |
columnData <- mmRmeta::unnest_dataframe(columnData) | |
} | |
if (factorize) { | |
columnData <- mmRmeta::factorize_columns(columnData, ... ) | |
} | |
cnames <- colnames(columnData) | |
savedColumns <- columnData %>% dplyr::select_if(~!is.factor(.)) %>% cbind(., columnData[, exception]) #save all columns that are no factors and are set in exception | |
columnData <- columnData %>% dplyr::select_if(~!length(levels(.)) <= 1) | |
columnData <- cbind(savedColumns, columnData)#assign class "DataFrame" to object | |
if (verbose) { | |
setnames <- dplyr::setdiff(cnames, colnames(columnData)) | |
cat("Deleted columns:", setnames) | |
} | |
if (class(summarized_experiment) %in% c("RangedSummarizedExperiment", "SummarizedExperiment")) { | |
columnData <- as(columnData, "DataFrame") | |
colData(summarized_experiment) <- columnData #assign to colData | |
return(summarized_experiment) | |
} else { | |
return(columnData) | |
} | |
} | |
#' Uses reorder.column on all eligible columns | |
#' @description This function removes factor levels with less counts than a set value of every column. Additionally, it reorders the factor levels depending on the counts. | |
#' @param summarized_experiment A large list created prior by multimodalR | |
#' @param exception Character - One or more column names that should not be processed by this function | |
#' @param threshold Integer - remove >= counts of the factor level | |
#' @param unnest Logical - Unnest data? | |
#' @import SummarizedExperiment | |
#' @examples | |
#' lungMeta <- reorder_all(lungMeta, c("tumor_stage", "race"), 10) | |
#' @return Returns the data frame with filtered/ordered columns. | |
#' @export | |
reorder_all <- function(summarized_experiment, exception = NULL, unnest = F, threshold = 0) { | |
if (class(summarized_experiment) %in% c("RangedSummarizedExperiment", "SummarizedExperiment")) { | |
columnData <- colData(summarized_experiment) %>% data.frame() %>% tibble::rownames_to_column("rowname") | |
} else if (class(summarized_experiment) %in% c("data.frame", "DataFrame")) { | |
columnData <- summarized_experiment %>% tibble::rownames_to_column("rowname") | |
} | |
if (unnest) { | |
columnData <- mmRmeta::unnest_dataframe(columnData) | |
} | |
columnNames <- names(columnData) | |
if (!is.null(exception)) { | |
columnNames <- columnNames[!c(columnNames %in% exception)] #remove column names given in exception | |
} | |
for (names in columnNames) { | |
if (is.factor(columnData[[names]]) ) { #only reorder factors | |
columnData <- mmRmeta::reorder_column(summarized_experiment = columnData, column = names, threshold = threshold) #use reorder.columns on given column names | |
} | |
} | |
if (class(summarized_experiment) %in% c("RangedSummarizedExperiment", "SummarizedExperiment")) { | |
sampleNames <- columnData$rowname | |
summarized_experiment <- summarized_experiment[, sampleNames] #first need to subset SE if we filtered out some rows in colData | |
columnData <- tibble::column_to_rownames(columnData, "rowname") # | |
columnData <- as(columnData, "DataFrame") #switch class of data frame | |
colData(summarized_experiment) <- columnData | |
return(summarized_experiment) | |
} else { | |
return(columnData) | |
} | |
} | |
#' Remove small factor levels and reorder | |
#' @description This function removes factor levels with counts less than a set value. Additionally, it reorders the factor levels depending on the counts. | |
#' This may be necessary later on to get ordered labels when plotting. | |
#' @param summarized_experiment A large list created prior by multimodalR. | |
#' @param column Character - column name | |
#' @param threshold Integer - threshold of counts; any factor level with equal or less counts are discarded | |
#' @import SummarizedExperiment | |
#' @import dplyr | |
#' @import rlang | |
#' @importFrom tibble rownames_to_column | |
#' @examples | |
#' lungMeta <- reorder.colums(lungMeta, "primary_diagnosis", 10) #keeps factor levels with >= 10 counts. | |
#' @return Returns the same large list with changed column names of the expressionmatrix. | |
#' @export | |
reorder_column <- function(summarized_experiment, column, threshold = 0) { | |
if (class(summarized_experiment) %in% c("RangedSummarizedExperiment", "SummarizedExperiment")) { | |
columnData <- unnest_dataframe(colData(summarized_experiment)) | |
} else if (class(summarized_experiment) %in% c("data.frame", "DataFrame")) { | |
columnData <- summarized_experiment | |
} | |
if (!"rowname" %in% colnames(columnData)) { | |
columnData <- tibble::rownames_to_column(columnData, "rowname") #add rowname if it doesnt exist | |
} | |
if (is.factor(columnData[[column]])) { | |
arrangedCounts <- columnData %>% dplyr::group_by_(column) %>% dplyr::summarize(n = n()) | |
arrangedCounts <- dplyr::arrange(arrangedCounts, desc(n)) #order by descending counts | |
columnData[[column]] <- factor(columnData[[column]], levels = arrangedCounts[[column]]) #reorder factor in data frame | |
factorCount <- plyr::count(columnData[[column]]) %>% dplyr::filter(freq >= threshold) #removes counts less than the value of removes | |
factorCount <- factorCount[,1] | |
columnSymbol <- rlang::sym(column) | |
columnData <- dplyr::filter(columnData, !!(columnSymbol) %in% factorCount) # !! is the newer version of rlang::UQ() which unquotes the variable | |
columnData[[column]] <- factor(columnData[[column]]) | |
} else { | |
warning(column, " is no factor, no changes applied") | |
} | |
if (mmRmeta::is_class_SE(summarized_experiment)) { | |
sampleNames <- columnData$rowname | |
summarized_experiment <- summarized_experiment[, sampleNames] #first need to subset SE if we filtered out some rows in colData | |
columnData <- tibble::column_to_rownames(columnData, "rowname") # | |
columnData <- as(columnData, "DataFrame") #switch class of data frame | |
colData(summarized_experiment) <- columnData | |
} else { | |
return(columnData) | |
} | |
} |