Skip to content
Permalink
837de1d39b
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
457 lines (435 sloc) 22.7 KB
#' Process Coldata/Clinical Data
#' @description This function applies changes to the colData/clinical data of a SummarizedExperiment.
#' @param summarized_experiment SummarizedExperiment object or a simple data frame
#' @param exception Character - Names of columns that should not be dropped.
#' @examples
#' lungMeta <- reorder_all(lungMeta, c("tumor_stage", "race"), 10)
#' @return Returns the data frame with filtered/ordered columns.
#' @export
preprocess_coldata <- function(summarized_experiment, to_lower = T, unnest = T, exception = NULL, ..., verbose = F) {
if (verbose) cat("Factorizing columns")
summarized_experiment <- mmRmeta::factorize_columns(summarized_experiment, unnest = unnest, exception = exception, ...)
if (verbose) cat("Filtering NA values")
summarized_experiment <- mmRmeta::filter_columns_with_na(summarized_experiment, to_lower = to_lower, unnest = unnest)
if (verbose) cat("Filtering single levels")
summarized_experiment <- mmRmeta::filter_single_level(summarized_experiment, unnest = unnest, exception = exception)
return(summarized_experiment)
} #all functions need unnest --> some functions dont work with DataFrames
#Error in xi == xj : comparison of these types is not implemented --> works after merge_duplicated
#' Set Character Columns to Factor
#' @description This function changes the columns to factors, excludes columns by name via grep function.
#' @param summarized_experiment SummarizedExperiment object or a simple data frame
#' @param unnest Logical - Should the data be unnested by mmRmeta::unnest_dataframe()
#' @param keep_unique Logical - Should columns with unique entries not changed to factors (Because they might be IDs)
#' @param exception Character - Column names or part of them that are not factorized
#' @param max_levels Integer - Maximum number of levels allowed for a character column to be changed to a factor column
#' @param factorize_logical Logical - Should all logical columns be changed to factors?
#' @details Be aware, even if columns should have unique values because they are IDs doesnt mean that the data set follows the rule. When a character column consist of more than max_levels unique values then this columns
#' is not changed to a factor column. Also logical columns are changed to factors.
#' @return Returns the data frame with filtered/ordered columns.
#' @import SummarizedExperiment
#' @importFrom Biobase isUnique
#' @export
factorize_columns <- function(summarized_experiment, unnest = T, keep_unique = TRUE, exception = NULL, max_levels = 100, factorize_logical = T) {
#dont change unique character values -> because probably IDs
#dont change certain column names, ID,
#for SummarizedExperiment
if (is_class_SE(summarized_experiment)) {
columnData <- data.frame(colData(summarized_experiment))
#for data frames, uses any if class = data.table and data.frame
} else if (any(class(summarized_experiment) %in% c("data.frame", "DataFrame"))) {
columnData <- summarized_experiment
} #else throw error maybe
if (unnest) {
columnData <- mmRmeta::unnest_dataframe(columnData)
}
matches <- double()
numberRows <- nrow(columnData)
colNames <- colnames(columnData)
#deal with exception
for (col in exception) {
matches <- c(matches, do.call(grep, args = list(pattern = col, x = colNames))) #match colnames with exception, combine results
matches <- unique(matches) #unique column indices
}
#deal with columns with unique values and with columns with too many factor levels
for (index in 1:length(colNames)) {
if (sum(Biobase::isUnique(columnData[[index]]), na.rm = TRUE) == numberRows) {
matches <- c(matches, index)
} else if (!is.null(max_levels)) {
if (sum(Biobase::isUnique(columnData[[index]]), na.rm = TRUE) > max_levels) {
matches <- c(matches, index)
}
}
}
matches <- unique(matches) #only unique indices, those matches are excluded from factorization
#now actually factorize the columns
for (columnIndex in 1:length(colNames)) {
#check if column index doesnt match the exception (index/integer/column number) and if column is a character
if (!(columnIndex %in% matches) && class(columnData[[columnIndex]]) == "character") { #NOT index in matching indices
columnData[[columnIndex]] <- as.factor(columnData[[columnIndex]])
} else if (factorize_logical && class(columnData[[columnIndex]]) == "logical") {
columnData[[columnIndex]] <- as.factor(columnData[[columnIndex]])
}
}
if (class(summarized_experiment) %in% c("RangedSummarizedExperiment", "SummarizedExperiment")) {
columnData <- as(columnData, "DataFrame")
colData(summarized_experiment) <- columnData #assign to colData
return(summarized_experiment)
} else {
return(columnData)
}
} #add exception to be integers for column index
#' Delete Colums with only NA
#' @description Sometimes actual NA values are indicated as other strings (e.g. "not reported"). This function converts these strings to NA and ultimately deletes any column consisting only of NA values.
#' Additonally, you can drop columns by their name or parts of it via a grep command. Automatically uses \link{drop_unused_levels}. Be careful that you input unique matches for >keep<.
#' @param summarized_experiment Data frame
#' @param unnest Logical - Unnest data.frame / colData
#' @param value A value or character that should be converted to NA.
#' @param percentage Double - Allowed percentage of NA values in column. Columns above that value are dropped. E.g 0.5 for 50$
#' @param col_char character or character vector - columns are dropped if the grep matches
#' @param keep character or character vector - exceptions that wont be dropped from the data frame if they match (i.e. "case_id" when you set col_char as "_id")
#' @param verbose Logical - print how many colums were NA
#' @importFrom magrittr %>%
#' @examples
#' df <- data.frame(num1 = c(1,2,3,4), num2= c(5,6,7, "not reported"))
#' df <- drop_columns_with_na(df, "not reported")
#' @details Be careful when
#' @return Returns the filtered data frame
#' @export
filter_columns_with_na <- function(summarized_experiment, to_lower = T, unnest = T, value = "not reported", percentage = NULL, col_char = NULL, keep = NULL, verbose = T) {
if (is_class_SE(summarized_experiment)) {
columnData <- colData(summarized_experiment) #colData to df
} else {
columnData <- summarized_experiment
}
if (unnest) {
columnData <- mmRmeta::unnest_dataframe(columnData)
}
if (to_lower) {
names <- rownames(columnData)
columnData <- lapply(columnData, function(x) {
if (is.factor(x)) return(tolower(x))
else return(x)
}) %>% data.frame()
rownames(columnData) <- names
}
columnData <- dplyr::na_if(columnData, value) %>% dplyr::na_if(., "") #changes all occurences of >value< to NA, Default is "not reported"
columnData <- columnData %>% dplyr::select_if(~!all(is.na(.))) #deletes columns with only NA values
columnData <- mmRmeta::drop_unused_levels(columnData) #drop unused factor levels
if (!is.null(percentage)) { #drop columns with more than >percentage< NA obseervations
keepMatch <- numeric()
sumNA <- colSums(is.na(columnData))/nrow(columnData) %>% round(., digits = 2)
isAbove <- sumNA >= percentage
if (!is.null(keep)) {
for (i in 1:length(keep)) {
if (!is.null(keep)) keepMatch <- append(keepMatch, grep(pattern = keep[i], names(columnData))) #get index for colums to be saved, only if >keep< is not null
keepMatch <- unique(keepMatch)
}
keepMatch <- unique(keepMatch)
isAbove <- isAbove %notin% keepMatch
}
columnData[isAbove] <- NULL
}
if (!is.null(col_char)) {
keepMatch <- numeric()
for (i in 1:length(keep)) {
if (!is.null(keep)) keepMatch <- append(keepMatch, grep(pattern = keep[i], names(columnData))) #get index for colums to be saved, only if >keep< is not null
keepMatch <- unique(keepMatch)
}
dropMatch <- numeric() #empty vector for match indices
for (j in 1:length(col_char)) {
dropMatch <- append(dropMatch, grep(pattern = col_char[j], names(columnData)))
dropMatch <- unique(dropMatch) #unique indices, throws error otherwise
dropMatch <- dropMatch[!dropMatch %in% keepMatch] #removes indices which should be saved
}
columnData[dropMatch] <- NULL #delete matched columns
}
#either return data frame or SE
#if (verbose) {
# cat("Deleted following columns": setdiff(colnames(colData(summarized_experiment)), colnames(columnData)))
#}
if (is_class_SE(summarized_experiment)) {
colData(summarized_experiment) <- as(columnData, "DataFrame") #need this format to sucessfully subset colData
return(summarized_experiment)
} else {
return(columnData)
}
}
#error when verbose
#make a new example for grep
#' Drop unused factor levels
#' @description This function drops all unused levels from all factors in a data frame.
#' @param summarized_experiment Data frame
#' @examples
#' lungMeta <- drop_unused_levels(lungMeta)
#' @return Returns the altered data frame.
#' @export
drop_unused_levels <- function(summarized_experiment, verbose = T) {
if (is_class_SE(summarized_experiment)) {
columnData <- colData(summarized_experiment) #colData to df
} else {
columnData <- summarized_experiment
}
for (column in colnames(df)) {
if (is.factor(df[[column]])) {
df[[column]] <- factor(df[[column]])
}
}
if (is_class_SE(summarized_experiment)) {
colData(summarized_experiment) <- as(columnData, "DataFrame") #need this format to sucessfully subset colData
return(summarized_experiment)
} else {
return(columnData)
}
}
#' Drop Binary Factors With Few Counts
#' @description This function drops binary factors that have a level with very feq counts.
#' @param summarized_experiment Data frame or SE
#' @param min_counts Integer - minimum amount of counts in factor level
#' @param exception Character - column names of eceptions if they would fall below threshold. Uses grep
#' @param verbose Logical - prints names of dropped columns
#' @return Returns the input object with dropped columns
#' @export
filter_small_binary <- function(summarized_experiment, min_counts = 10, exception = NULL, verbose = T) {
if (is_class_SE(summarized_experiment)) {
columnData <- data.frame(colData(summarized_experiment)) #colData to df
} else {
columnData <- data.frame(summarized_experiment)
}
matches <- double()
if (!is.null(exception)) {
for (col in exception) {
matches <- c(matches, do.call(grep, args = list(pattern = col, x = colnames(columnData)))) #match colnames with exception, combine results
matches <- unique(matches) #unique column indices
names <- colnames(columnData)[matches]
}
} else names <- NULL
#list of tables if binary factor, else NA
binaryTable <- lapply(columnData, function(x) {
if (is.factor(x) && length(levels(x)) == 2) {
table(x)
} else (return(NA))
})
#vector if tables have min counts, only column name + TRUE or FALSe in vector
belowCounts <- lapply(binaryTable, function(x) {
ifelse(!is.na(x), return(any(x < min_counts)), return(FALSE))
}) %>% unlist() %>% .[.]
#delete columns with binary factos with factor level below min_counts
if (!is.null(names)) { #exclude exceptions,
belowCounts <- dplyr::setdiff(names(belowCounts), names)
columnData <- dplyr::select(columnData, -dplyr::matches(belowCounts))
} else {
columnData <- dplyr::select(columnData, -dplyr::matches(names(belowCounts)))
}
#verbose message
if (verbose) {
if (is.logical(belowCounts)) {
cat("Deleted columns:", names(belowCounts))
} else {
cat("Deleted columns:", belowCounts)
}
}
#return SE or data frame
if (is_class_SE(summarized_experiment)) {
colData(summarized_experiment) <- as(columnData, "DataFrame") #need this format to sucessfully subset colData
return(summarized_experiment)
} else {
return(columnData)
}
}
#' Filter for Factor Level
#' @description Filter for factor levels in a column.
#' @import SummarizedExperiment
#' @param summarized_experiment SummarizedExperiment object or a simple data frame
#' @param column Character - Column name
#' @param value Character - Factor level that should be filtered for
#' @param distinct Logical - If TRUE filters where all variables in "value" occur. For example: value is c("alive", "female") --> only alive females are kept - only works with filter_at
#' @param verbose Logical - print results
#' @return Returns same SE or data frame
#' @export
#refactor whole dataframe
filter_for_factor <- function(summarized_experiment, column = NULL, column_value, distinct = T, verbose = T) {
#if is.null) {}
if (mmRmeta::is_class_SE(summarized_experiment)) {
colData <- colData(summarized_experiment) %>% data.frame() %>% tibble::rownames_to_column("rowname")
} else if (class(summarized_experiment) %in% c("data.frame", "DataFrame")) {
colData <- summarized_experiment %>% tibble::rownames_to_column("rowname")
}
if (!is.null(column)) {
if (distinct) {
filteredData <- dplyr::filter_at(colData, vars(column), all_vars(. %in% column_value)) #takes intersection with "and" (&)
} else {
filteredData <- dplyr::filter_at(colData, vars(column), any_vars(. %in% column_value)) #takes union with "or" (|)
}
#no specific column stated
} else {
#if (distinct) { for now not working
# filteredData <- dplyr::filter_all(colData, all_vars(. %in% column_value))
# } else {
filteredData <- dplyr::filter_all(colData, any_vars(. %in% column_value))
#}
}
if (verbose) {
deleted <- nrow(colData) - nrow(filteredData)
cat(deleted, "samples where deleted from the colData.")
}
rowNames <- filteredData$rowname
summarized_experiment <- summarized_experiment[, rowNames]
return(summarized_experiment)
}
#' Filter out Factor level
#' @description Filter out entries with certain factor levels.
#' @param summarized_experiment SummarizedExperiment object or a simple data frame
#' @param column Character - column name
#' @param value Character - factor level that should be filtered out
#' @param distinct Logical - if TRUE filter out where all variables in "value" occur. For example: value is c("alive", "female") --> only alive females are filtered out
#' @param verbose Logical - state how many entries were deleted
#' @return Returns same SE or data frame
#' @import SummarizedExperiment
#' @importFrom dplyr filter_all filter_at any_vars all_vars
#' @importFrom tibble rownames_to_column
#' @export
filter_out_factor <- function(summarized_experiment, column = NULL, column_value, distinct = T) {
#if is.null) {}
if (mmRmeta::is_class_SE(summarized_experiment)) {
colData <- unnest_dataframe(colData(summarized_experiment)) %>% tibble::rownames_to_column("rowname")
} else if (class(summarized_experiment) %in% c("data.frame", "DataFrame")) {
colData <- summarized_experiment %>% tibble::rownames_to_column("rowname")
}
if (!is.null(column) && distinct) {
filteredData <- dplyr::filter_at(colData, vars(column), all_vars(. %in% column_value)) #takes intersection with "and" (&)
} else if (!is.null(column) && isFALSE(distinct)) {
filteredData <- dplyr::filter_at(colData, vars(column), any_vars(. %in% column_value)) #takes union with "or" (|)
} else {
filteredData <- dplyr::filter_all(colData, any_vars(. %in% column_value))
}
rowNames <- filteredData$rowname
summarized_experiment <- summarized_experiment[, rowNames]
if (verbose) {
println(nrow(colData) - nrow(filteredData), "samples where deleted from the colData.")
}
return(summarized_experiment)
}
#' Filter Single Level Factors
#' @description Function to filter out columns containing only single level factors.
#' @param summarized_experiment SummarizedExperiment object or a simple data frame
#' @param exception Character - Names of columns one wantes to keep despite having only 1 level
#' @param unnest Logical - Should the data be unnested (meaning having lists in them or if the data comes directl from a summarized experiment)
#' @param factorize Logical - Should thr
#' @param ... further variables for mmRmeta::factorize_columns
#' @import SummarizedExperiment
#' @return Retunrs a DataFrame or a SummarizedExperiment
#' @export
filter_single_level <- function(summarized_experiment, exception = NULL, unnest = T, factorize = F, verbose = T) {
if (class(summarized_experiment) %in% c("RangedSummarizedExperiment", "SummarizedExperiment")) {
columnData <- colData(summarized_experiment)
} else if (any(class(summarized_experiment) %in% c("data.frame", "DataFrame"))) {
columnData <- summarized_experiment
}
if (unnest) {
columnData <- mmRmeta::unnest_dataframe(columnData)
}
if (factorize) {
columnData <- mmRmeta::factorize_columns(columnData, ... )
}
cnames <- colnames(columnData)
savedColumns <- columnData %>% dplyr::select_if(~!is.factor(.)) %>% cbind(., columnData[, exception]) #save all columns that are no factors and are set in exception
columnData <- columnData %>% dplyr::select_if(~!length(levels(.)) <= 1)
columnData <- cbind(savedColumns, columnData)#assign class "DataFrame" to object
if (verbose) {
setnames <- dplyr::setdiff(cnames, colnames(columnData))
cat("Deleted columns:", setnames)
}
if (class(summarized_experiment) %in% c("RangedSummarizedExperiment", "SummarizedExperiment")) {
columnData <- as(columnData, "DataFrame")
colData(summarized_experiment) <- columnData #assign to colData
return(summarized_experiment)
} else {
return(columnData)
}
}
#' Uses reorder.column on all eligible columns
#' @description This function removes factor levels with less counts than a set value of every column. Additionally, it reorders the factor levels depending on the counts.
#' @param summarized_experiment A large list created prior by multimodalR
#' @param exception Character - One or more column names that should not be processed by this function
#' @param threshold Integer - remove >= counts of the factor level
#' @param unnest Logical - Unnest data?
#' @import SummarizedExperiment
#' @examples
#' lungMeta <- reorder_all(lungMeta, c("tumor_stage", "race"), 10)
#' @return Returns the data frame with filtered/ordered columns.
#' @export
reorder_all <- function(summarized_experiment, exception = NULL, unnest = F, threshold = 0) {
if (class(summarized_experiment) %in% c("RangedSummarizedExperiment", "SummarizedExperiment")) {
columnData <- colData(summarized_experiment) %>% data.frame() %>% tibble::rownames_to_column("rowname")
} else if (class(summarized_experiment) %in% c("data.frame", "DataFrame")) {
columnData <- summarized_experiment %>% tibble::rownames_to_column("rowname")
}
if (unnest) {
columnData <- mmRmeta::unnest_dataframe(columnData)
}
columnNames <- names(columnData)
if (!is.null(exception)) {
columnNames <- columnNames[!c(columnNames %in% exception)] #remove column names given in exception
}
for (names in columnNames) {
if (is.factor(columnData[[names]]) ) { #only reorder factors
columnData <- mmRmeta::reorder_column(summarized_experiment = columnData, column = names, threshold = threshold) #use reorder.columns on given column names
}
}
if (class(summarized_experiment) %in% c("RangedSummarizedExperiment", "SummarizedExperiment")) {
sampleNames <- columnData$rowname
summarized_experiment <- summarized_experiment[, sampleNames] #first need to subset SE if we filtered out some rows in colData
columnData <- tibble::column_to_rownames(columnData, "rowname") #
columnData <- as(columnData, "DataFrame") #switch class of data frame
colData(summarized_experiment) <- columnData
return(summarized_experiment)
} else {
return(columnData)
}
}
#' Remove small factor levels and reorder
#' @description This function removes factor levels with counts less than a set value. Additionally, it reorders the factor levels depending on the counts.
#' This may be necessary later on to get ordered labels when plotting.
#' @param summarized_experiment A large list created prior by multimodalR.
#' @param column Character - column name
#' @param threshold Integer - threshold of counts; any factor level with equal or less counts are discarded
#' @import SummarizedExperiment
#' @import dplyr
#' @import rlang
#' @importFrom tibble rownames_to_column
#' @examples
#' lungMeta <- reorder.colums(lungMeta, "primary_diagnosis", 10) #keeps factor levels with >= 10 counts.
#' @return Returns the same large list with changed column names of the expressionmatrix.
#' @export
reorder_column <- function(summarized_experiment, column, threshold = 0) {
if (class(summarized_experiment) %in% c("RangedSummarizedExperiment", "SummarizedExperiment")) {
columnData <- unnest_dataframe(colData(summarized_experiment))
} else if (class(summarized_experiment) %in% c("data.frame", "DataFrame")) {
columnData <- summarized_experiment
}
if (!"rowname" %in% colnames(columnData)) {
columnData <- tibble::rownames_to_column(columnData, "rowname") #add rowname if it doesnt exist
}
if (is.factor(columnData[[column]])) {
arrangedCounts <- columnData %>% dplyr::group_by_(column) %>% dplyr::summarize(n = n())
arrangedCounts <- dplyr::arrange(arrangedCounts, desc(n)) #order by descending counts
columnData[[column]] <- factor(columnData[[column]], levels = arrangedCounts[[column]]) #reorder factor in data frame
factorCount <- plyr::count(columnData[[column]]) %>% dplyr::filter(freq >= threshold) #removes counts less than the value of removes
factorCount <- factorCount[,1]
columnSymbol <- rlang::sym(column)
columnData <- dplyr::filter(columnData, !!(columnSymbol) %in% factorCount) # !! is the newer version of rlang::UQ() which unquotes the variable
columnData[[column]] <- factor(columnData[[column]])
} else {
warning(column, " is no factor, no changes applied")
}
if (mmRmeta::is_class_SE(summarized_experiment)) {
sampleNames <- columnData$rowname
summarized_experiment <- summarized_experiment[, sampleNames] #first need to subset SE if we filtered out some rows in colData
columnData <- tibble::column_to_rownames(columnData, "rowname") #
columnData <- as(columnData, "DataFrame") #switch class of data frame
colData(summarized_experiment) <- columnData
} else {
return(columnData)
}
}