diff --git a/DESCRIPTION b/DESCRIPTION index 27b5b46..b4aa4e6 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: wilson Type: Package Title: Web-Based Interactive Omics Visualization -Version: 2.0.3 +Version: 2.1.0 Authors@R: c( person("Hendrik", "Schultheis", email = "hendrik.schultheis@mpi-bn.mpg.de", role = c("aut", "cre")), person("Jens", "Preussner", email = "jens.preussner@mpi-bn.mpg.de", role = "aut"), diff --git a/NAMESPACE b/NAMESPACE index df0d7dd..355b8a2 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -32,6 +32,7 @@ export(pcaUI) export(scatterPlot) export(scatterPlotUI) export(set_logger) +export(tobias_parser) export(transformation) export(transformationUI) import(data.table) diff --git a/NEWS.md b/NEWS.md index 40007a3..b73fa15 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,5 @@ +# wilson 2.1.0 +- implemented tobias_parser # wilson 2.0.3 - reactive transformation parameter # wilson 2.0.2 diff --git a/R/clarion.R b/R/clarion.R index 235c95f..15e3e1a 100644 --- a/R/clarion.R +++ b/R/clarion.R @@ -117,6 +117,8 @@ Clarion <- R6::R6Class("Clarion", self$metadata <- metadata self$data <- data + if (validate) self$validate() + # coerce unique_id and name to character if (self$get_id() == self$get_name()) { cols <- self$get_id() @@ -124,8 +126,6 @@ Clarion <- R6::R6Class("Clarion", cols <- c(self$get_id(), self$get_name()) } self$data[, (cols) := lapply(.SD, as.character), .SDcols = cols] - - if (validate) self$validate() } ), private = list( diff --git a/R/parser.R b/R/parser.R index 475ecb1..9e38855 100644 --- a/R/parser.R +++ b/R/parser.R @@ -378,3 +378,217 @@ parser <- function(file, dec = ".") { return(Clarion$new(header = header, metadata = metadata, data = data)) } + +#' TOBIAS TFBS table to clarion parser +#' +#' Click \href{https://github.molgen.mpg.de/loosolab/TOBIAS}{here} for more information about TOBIAS. +#' +#' @param input Path to input table +#' @param output Output path. +#' @param filter_columns Either a vector of columnnames or a file containing one columnname per row. +#' @param filter_pattern Keep columns matching the given pattern. Uses parameter filter_columns for matching if set. In the case of no matches a warning will be issued and all columns will be used. +#' @param config Json file containing metadata information for all columns. Will use first occurence for duplicate column names. +#' @param omit_NA Logical whether all rows containing NA should be removed. +#' @param condition_names Vector of condition names. Default = NULL. Used to classify columns not provided in config. +#' @param condition_pattern Used to identify condition names by matching und removing given pattern with \code{\link[base]{grep}}. Ignored when condition_names is set. +#' @param in_field_delimiter Delimiter for multi value fields. Default = ','. +#' @param dec Decimal separator. Used in file reading and writing. +#' @param unique_id Whether the table contains an unique id column. If FALSE (default) will create one at first position. +#' @param ... Used as header information. +#' +#' @details During conversion the parser will try to use the given config (if provided) to create the \href{https://github.molgen.mpg.de/loosolab/wilson-apps/wiki/CLARION-Format}{Clarion} metadata. In the case of insufficient config information it will try to approximate by referencing condition names issuing warnings in the process. +#' @details Factor grouping (metadata factor columns) is currently not implemented! +#' +#' @export +tobias_parser <- function(input, output, filter_columns = NULL, filter_pattern = NULL, config = system.file("extdata", "tobias_config.json", package = "wilson"), omit_NA = FALSE, condition_names = NULL, condition_pattern = "_bound$", in_field_delimiter = ",", dec = ".", unique_id = FALSE, ...) { + ## filter data columns + # check if filter columns is a file or a vector + if (!is.null(filter_columns) && file.exists(filter_columns)) { + select_columns <- scan(file = filter_columns, what = character()) + } else { + select_columns <- filter_columns + } + + # filter pattern + if (!is.null(filter_pattern)) { + # use file header if filter_columns is empty + if (is.null(select_columns)) { + # only read header + select_columns <- names(data.table::fread(input = input, header = TRUE, nrows = 0)) + } + + select_columns <- grep(pattern = filter_pattern, x = select_columns, value = TRUE) + + if (identical(select_columns, character(0))) { + warning("No column matches for given filter pattern! Proceeding with all columns.") + } + } + + ##### data + data <- data.table::fread(input, dec = dec, select = select_columns, header = TRUE) + + # omit na rows + if (omit_NA) { + data <- stats::na.omit(data) + } + + # create id column + if (!unique_id) { + data[, "id" := seq_len(nrow(data))] + # move id column to first position + new_order <- c("id", names(data)[ names(data) != "id"]) + data <- data[, new_order, with = FALSE] + } + + ##### metadata + metadata <- data.table::data.table(names(data)) + + # load config + if (!is.null(config)) { + config_file <- RJSONIO::fromJSON(config) + # get column names from config file + col_names <- vapply(X = config_file$meta, FUN.VALUE = character(1), FUN = function(x) { + x[["col_name"]] + }) + } else { + config_file <- NULL + } + + # identify conditions + if (is.null(condition_names)) { + conditions <- gsub(pattern = condition_pattern, replacement = "", x = grep(pattern = condition_pattern, x = metadata[[1]], value = TRUE)) + } else { + conditions <- condition_names + } + + + ## create metadata row by row + unique_id_fallback <- NULL + condition_pattern <- paste0(conditions, collapse = "|") + approx_columns <- NULL + + meta_rows <- lapply(metadata[[1]], function(x) { + # is column information provided in config? + if (!is.null(config_file) && is.element(x, col_names)) { + # uses first appearance in config file in case of duplicates + return(config_file$meta[[which.max(col_names == x)]][-1]) + } + + # if no information is provided via config try to guess meta-information + # utilize condition names to do so + approx_columns <<- c(approx_columns, x) + + ## level + # get distance of all conditions matched to x + # count number of exact substring matches + match_dist <- utils::adist(conditions, x) - nchar(x) + nchar(conditions) + count_matches <- sum(match_dist == 0) + + if (count_matches == 1) { + level <- "condition" + } else if (count_matches > 1) { + level <- "contrast" + } else { + level <- "feature" + } + + ## type + if (level == "feature") { + if (any(grepl(pattern = in_field_delimiter, x = data[[x]], fixed = TRUE))) { + type <- "array" + } else { + # define fallback unique_id in case none is defined through config + # this will redefine first unique feature with type category as unqiue_id + if (is.null(unique_id_fallback) && anyDuplicated(data[[x]]) == 0) { + unique_id_fallback <<- x + } + type <- "category" + } + } else { + if (!is.numeric(data[[x]])) { + type <- "array" + } else if (grepl(pattern = "fc|fold[\\._\\- ]?change", x = x, perl = TRUE, ignore.case = TRUE)) { + type <- "ratio" + } else if (grepl(pattern = "p[\\._\\- ]?val|p[\\._\\- ]?adj", x = x, perl = TRUE, ignore.case = TRUE)) { + type <- "probability" + } else { + type <- "score" + } + } + + ## label/ sub_label + label <- sub_label <- "" + label_parts <- unlist(strsplit(x = x, split = "_")) + + if (length(label_parts) == 1) { + label <- label_parts + } else if (length(label_parts) == 2) { + label <- label_parts[1] + sub_label <- label_parts[2] + } else if (length(label_parts) >= 3 && level == "contrast") { + # replace '_' with whitespace + condition_pattern <- gsub(pattern = "_", replacement = " ", x = condition_pattern, fixed = TRUE) + x <- gsub(pattern = "_", replacement = " ", x = x, fixed = TRUE) + # get first condition using all identified conditions as pattern + first_condition <- gsub(pattern = paste0("(^", condition_pattern, ").*"), replacement = "\\1", x = x) + # strip first condition + stripped_condition <- substring(x, first = nchar(first_condition) + 2) # + 1 because parameter is inclusive and + 1 for whitespace + # get new first condition + second_condition <- gsub(pattern = paste0("(^", condition_pattern, ").*"), replacement = "\\1", x = stripped_condition) + + label <- paste0(first_condition, "|", second_condition) + sub_label <- substring(stripped_condition, first = nchar(second_condition) + 2) # + 1 because parameter is inclusive and + 1 for whitespace + } else { + label <- paste0(label_parts[-length(label_parts)], collapse = " ") + sub_label <- label_parts[length(label_parts)] + } + + return(c(level, type, label, sub_label)) + }) + + # meta approximation warning + if (!is.null(approx_columns)) { + warning("Missing information in config! Tried guessing meta-information for ", paste0(approx_columns, collapse = ", ")) + } + + # list of vectors (rows) to matrix + meta_matrix <- do.call(rbind, meta_rows) + metadata <- cbind(metadata, meta_matrix) + names(metadata) <- c("key", "level", "type", "label", "sub_label") + + # set unique_id fallback + if (!any(metadata[["type"]] == "unique_id")) { + metadata[key == unique_id_fallback, "type"] <- "unique_id" + } + + ##### header + header <- c( + list( + format = "Clarion", + version = "1.0"), + list(...) + ) + + # add delimiter if necessary + if (any(metadata[["type"]] == "array")) { + header <- append(x = header, values = list(delimiter = in_field_delimiter), after = 2) + } + + ##### validate + # create clarion object for validation + clarion <- Clarion$new(header = header, metadata = metadata, data = data) + + ##### write + # TODO implement and use clarion write function + # write clarion + # header + flat_header <- data.table::data.table(paste0("!", names(clarion$header), "=", clarion$header)) + data.table::fwrite(x = flat_header, file = output, col.names = FALSE, sep = "\t", dec = dec) + # metadata + # add '#' + names(clarion$metadata)[1] <- paste0("#", names(clarion$metadata)[1]) + clarion$metadata[, names(clarion$metadata)[1] := paste0("#", clarion$metadata[[1]])] + data.table::fwrite(x = clarion$metadata, file = output, col.names = TRUE, sep = "\t", append = TRUE, quote = FALSE, dec = dec) + # data + data.table::fwrite(x = clarion$data, file = output, col.names = TRUE, sep = "\t", append = TRUE, dec = dec) +} diff --git a/inst/extdata/tobias_config.json b/inst/extdata/tobias_config.json new file mode 100644 index 0000000..f8e050a --- /dev/null +++ b/inst/extdata/tobias_config.json @@ -0,0 +1,516 @@ +{ + "meta": [ + { + "col_name": "id", + "level": "feature", + "type": "unique_id", + "label": "id", + "sublabel": "" + }, + { + "col_name": "TFBS_chr", + "level": "feature", + "type": "category", + "label": "chr", + "sublabel": "TBFS" + }, + { + "col_name": "TFBS_start", + "level": "feature", + "type": "category", + "label": "start", + "sublabel": "TFBS" + }, + { + "col_name": "TFBS_end", + "level": "feature", + "type": "category", + "label": "end", + "sublabel": "TFBS" + }, + { + "col_name": "TFBS_name", + "level": "feature", + "type": "category", + "label": "name", + "sublabel": "TFBS" + }, + { + "col_name": "TFBS_score", + "level": "feature", + "type": "category", + "label": "score", + "sublabel": "TFBS" + }, + { + "col_name": "TFBS_strand", + "level": "feature", + "type": "category", + "label": "strand", + "sublabel": "TFBS" + }, + { + "col_name": "peak_chr", + "level": "feature", + "type": "category", + "label": "chr", + "sublabel": "peak" + }, + { + "col_name": "peak_start", + "level": "feature", + "type": "category", + "label": "start", + "sublabel": "peak" + }, + { + "col_name": "peak_end", + "level": "feature", + "type": "category", + "label": "end", + "sublabel": "peak" + }, + { + "col_name": "peak_id", + "level": "feature", + "type": "array", + "label": "id", + "sublabel": "peak" + }, + { + "col_name": "feature", + "level": "feature", + "type": "category", + "label": "feature", + "sublabel": "" + }, + { + "col_name": "feat_start", + "level": "feature", + "type": "category", + "label": "start", + "sublabel": "feature" + }, + { + "col_name": "feat_end", + "level": "feature", + "type": "category", + "label": "end", + "sublabel": "feature" + }, + { + "col_name": "feat_strand", + "level": "feature", + "type": "category", + "label": "strand", + "sublabel": "feature" + }, + { + "col_name": "feat_anchor", + "level": "feature", + "type": "category", + "label": "anchor", + "sublabel": "feature" + }, + { + "col_name": "distance", + "level": "feature", + "type": "category", + "label": "distance", + "sublabel": "" + }, + { + "col_name": "gene_biotype", + "level": "feature", + "type": "category", + "label": "biotype", + "sublabel": "gene" + }, + { + "col_name": "gene_id", + "level": "feature", + "type": "category", + "label": "id", + "sublabel": "gene" + }, + { + "col_name": "gene_name", + "level": "feature", + "type": "category", + "label": "name", + "sublabel": "gene" + }, + { + "col_name": "2C_score", + "level": "condition", + "type": "score", + "label": "2C", + "sublabel": "score" + }, + { + "col_name": "8C_score", + "level": "condition", + "type": "score", + "label": "8C", + "sublabel": "score" + }, + { + "col_name": "ICM_score", + "level": "condition", + "type": "score", + "label": "ICM", + "sublabel": "score" + }, + { + "col_name": "naive_hESC_score", + "level": "condition", + "type": "score", + "label": "naive hESC", + "sublabel": "score" + }, + { + "col_name": "hESC_score", + "level": "condition", + "type": "score", + "label": "hESC", + "sublabel": "score" + }, + { + "col_name": "2C_bound", + "level": "condition", + "type": "score", + "label": "2C", + "sublabel": "bound" + }, + { + "col_name": "8C_bound", + "level": "condition", + "type": "score", + "label": "8C", + "sublabel": "bound" + }, + { + "col_name": "ICM_bound", + "level": "condition", + "type": "score", + "label": "ICM", + "sublabel": "bound" + }, + { + "col_name": "naive_hESC_bound", + "level": "condition", + "type": "score", + "label": "naive hESC", + "sublabel": "bound" + }, + { + "col_name": "hESC_bound", + "level": "condition", + "type": "score", + "label": "hESC", + "sublabel": "bound" + }, + { + "col_name": "2C_8C_log2fc", + "level": "contrast", + "type": "ratio", + "label": "2C|8C", + "sublabel": "log2fc" + }, + { + "col_name": "2C_ICM_log2fc", + "level": "contrast", + "type": "ratio", + "label": "2C|ICM", + "sublabel": "log2fc" + }, + { + "col_name": "2C_naive_hESC_log2fc", + "level": "contrast", + "type": "ratio", + "label": "2C|naive hESC", + "sublabel": "log2fc" + }, + { + "col_name": "2C_hESC_log2fc", + "level": "contrast", + "type": "ratio", + "label": "2C|hESC", + "sublabel": "log2fc" + }, + { + "col_name": "8C_ICM_log2fc", + "level": "contrast", + "type": "ratio", + "label": "8C|ICM", + "sublabel": "log2fc" + }, + { + "col_name": "8C_naive_hESC_log2fc", + "level": "contrast", + "type": "ratio", + "label": "8C|naive hESC", + "sublabel": "log2fc" + }, + { + "col_name": "8C_hESC_log2fc", + "level": "contrast", + "type": "ratio", + "label": "8C|hESC", + "sublabel": "log2fc" + }, + { + "col_name": "ICM_naive_hESC_log2fc", + "level": "contrast", + "type": "ratio", + "label": "ICM|naive hESC", + "sublabel": "log2fc" + }, + { + "col_name": "ICM_hESC_log2fc", + "level": "contrast", + "type": "ratio", + "label": "ICM|hESC", + "sublabel": "log2fc" + }, + { + "col_name": "naive_hESC_hESC_log2fc", + "level": "contrast", + "type": "ratio", + "label": "naive hESC|hESC", + "sublabel": "log2fc" + }, + // overview columns + { + "col_name": "TF_name", + "level": "feature", + "type": "unique_id", + "label": "TF name", + "sublabel": "" + }, + { + "col_name": "cluster", + "level": "feature", + "type": "category", + "label": "cluster", + "sublabel": "" + }, + { + "col_name": "total_tfbs", + "level": "feature", + "type": "category", + "label": "total TFBS", + "sublabel": "" + }, + { + "col_name": "2C_mean_score", + "level": "condition", + "type": "score", + "label": "2C", + "sublabel": "mean" + }, + { + "col_name": "2C_bound", + "level": "condition", + "type": "score", + "label": "2C", + "sublabel": "bound" + }, + { + "col_name": "8C_mean_score", + "level": "condition", + "type": "score", + "label": "8C", + "sublabel": "mean" + }, + { + "col_name": "8C_bound", + "level": "condition", + "type": "score", + "label": "8C", + "sublabel": "bound" + }, + { + "col_name": "ICM_mean_score", + "level": "condition", + "type": "score", + "label": "ICM", + "sublabel": "mean" + }, + { + "col_name": "ICM_bound", + "level": "condition", + "type": "score", + "label": "ICM", + "sublabel": "bound" + }, + { + "col_name": "naive_hESC_mean_score", + "level": "condition", + "type": "score", + "label": "naive_hESC", + "sublabel": "mean" + }, + { + "col_name": "naive_hESC_bound", + "level": "condition", + "type": "score", + "label": "naive_hESC", + "sublabel": "bound" + }, + { + "col_name": "hESC_mean_score", + "level": "condition", + "type": "score", + "label": "hESC", + "sublabel": "mean" + }, + { + "col_name": "hESC_bound", + "level": "condition", + "type": "score", + "label": "hESC", + "sublabel": "bound" + }, + { + "col_name": "2C_8C_change", + "level": "contrast", + "type": "ratio", + "label": "2C|8C", + "sublabel": "change" + }, + { + "col_name": "2C_8C_pvalue", + "level": "contrast", + "type": "probability", + "label": "2C|8C", + "sublabel": "p-value" + }, + { + "col_name": "2C_ICM_change", + "level": "contrast", + "type": "ratio", + "label": "2C|ICM", + "sublabel": "change" + }, + { + "col_name": "2C_ICM_pvalue", + "level": "contrast", + "type": "probability", + "label": "2C|ICM", + "sublabel": "p-value" + }, + { + "col_name": "2C_naive_hESC_change", + "level": "contrast", + "type": "ratio", + "label": "2C|naive hESC", + "sublabel": "change" + }, + { + "col_name": "2C_naive_hESC_pvalue", + "level": "contrast", + "type": "probability", + "label": "2C|naive hESC", + "sublabel": "p-value" + }, + { + "col_name": "2C_hESC_change", + "level": "contrast", + "type": "ratio", + "label": "2C|hESC", + "sublabel": "change" + }, + { + "col_name": "2C_hESC_pvalue", + "level": "contrast", + "type": "probability", + "label": "2C|hESC", + "sublabel": "p-value" + }, + { + "col_name": "8C_ICM_change", + "level": "contrast", + "type": "ratio", + "label": "8C|ICM", + "sublabel": "change" + }, + { + "col_name": "8C_ICM_pvalue", + "level": "contrast", + "type": "probability", + "label": "8C|ICM", + "sublabel": "p-value" + }, + { + "col_name": "8C_naive_hESC_change", + "level": "contrast", + "type": "ratio", + "label": "8C|naive hESC", + "sublabel": "change" + }, + { + "col_name": "8C_naive_hESC_pvalue", + "level": "contrast", + "type": "probability", + "label": "8C|naive hESC", + "sublabel": "p-value" + }, + { + "col_name": "8C_hESC_change", + "level": "contrast", + "type": "ratio", + "label": "8C|hESC", + "sublabel": "change" + }, + { + "col_name": "8C_hESC_pvalue", + "level": "contrast", + "type": "probability", + "label": "8C|hESC", + "sublabel": "p-value" + }, + { + "col_name": "ICM_naive_hESC_change", + "level": "contrast", + "type": "ratio", + "label": "ICM|naive hESC", + "sublabel": "change" + }, + { + "col_name": "ICM_naive_hESC_pvalue", + "level": "contrast", + "type": "probability", + "label": "ICM|naive hESC", + "sublabel": "p-value" + }, + { + "col_name": "ICM_hESC_change", + "level": "contrast", + "type": "ratio", + "label": "ICM|hESC", + "sublabel": "change" + }, + { + "col_name": "ICM_hESC_pvalue", + "level": "contrast", + "type": "probability", + "label": "ICM|hESC", + "sublabel": "p-value" + }, + { + "col_name": "naive_hESC_hESC_change", + "level": "contrast", + "type": "ratio", + "label": "naive hESC|hESC", + "sublabel": "p-value" + }, + { + "col_name": "naive_hESC_hESC_pvalue", + "level": "contrast", + "type": "probability", + "label": "naive hESC|hESC", + "sublabel": "change" + } + ] +} \ No newline at end of file diff --git a/man/tobias_parser.Rd b/man/tobias_parser.Rd new file mode 100644 index 0000000..c7cafdc --- /dev/null +++ b/man/tobias_parser.Rd @@ -0,0 +1,45 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/parser.R +\name{tobias_parser} +\alias{tobias_parser} +\title{TOBIAS TFBS table to clarion parser} +\usage{ +tobias_parser(input, output, filter_columns = NULL, + filter_pattern = NULL, config = system.file("extdata", + "tobias_config.json", package = "wilson"), omit_NA = FALSE, + condition_names = NULL, condition_pattern = "_bound$", + in_field_delimiter = ",", dec = ".", unique_id = FALSE, ...) +} +\arguments{ +\item{input}{Path to input table} + +\item{output}{Output path.} + +\item{filter_columns}{Either a vector of columnnames or a file containing one columnname per row.} + +\item{filter_pattern}{Keep columns matching the given pattern. Uses parameter filter_columns for matching if set. In the case of no matches a warning will be issued and all columns will be used.} + +\item{config}{Json file containing metadata information for all columns. Will use first occurence for duplicate column names.} + +\item{omit_NA}{Logical whether all rows containing NA should be removed.} + +\item{condition_names}{Vector of condition names. Default = NULL. Used to classify columns not provided in config.} + +\item{condition_pattern}{Used to identify condition names by matching und removing given pattern with \code{\link[base]{grep}}. Ignored when condition_names is set.} + +\item{in_field_delimiter}{Delimiter for multi value fields. Default = ','.} + +\item{dec}{Decimal separator. Used in file reading and writing.} + +\item{unique_id}{Whether the table contains an unique id column. If FALSE (default) will create one at first position.} + +\item{...}{Used as header information.} +} +\description{ +Click \href{https://github.molgen.mpg.de/loosolab/TOBIAS}{here} for more information about TOBIAS. +} +\details{ +During conversion the parser will try to use the given config (if provided) to create the \href{https://github.molgen.mpg.de/loosolab/wilson-apps/wiki/CLARION-Format}{Clarion} metadata. In the case of insufficient config information it will try to approximate by referencing condition names issuing warnings in the process. + +Factor grouping (metadata factor columns) is currently not implemented! +}