This repository has been archived by the owner. It is now read-only.

Tobias #31
Merged
18 commits merged on Mar 19, 2019
Changes from all commits
Commits (18)
63b8bc8
parser: implemented first half of metadata
HendrikSchultheis Mar 7, 2019
69e3913
implemented tobias_converter (needs refactoring)
HendrikSchultheis Mar 7, 2019
9baee27
omit leading whitespace for contrast sub_labels
HendrikSchultheis Mar 7, 2019
cfddb18
clarion: fixed bug caused by too late object validation
HendrikSchultheis Mar 8, 2019
cb49e55
added tobias config file
HendrikSchultheis Mar 8, 2019
2d81483
tobias_parser: added column specific filtering; use condition names a…
HendrikSchultheis Mar 8, 2019
7badae9
tobias_parser: added details
HendrikSchultheis Mar 8, 2019
7dd7d3a
updated DESCRIPTION, NAMESPACE, NEWS.md
HendrikSchultheis Mar 8, 2019
e5fc83c
tobias_parser: fixed devtools::check() warnings/notes
HendrikSchultheis Mar 8, 2019
5db0610
tobias_parser: do not combine filter_columns with table columns for p…
HendrikSchultheis Mar 13, 2019
af0c3d9
tobias_parser: added missing probability type
HendrikSchultheis Mar 13, 2019
9178e7f
tobias_parser: implemented unique_id parameter; print single warning …
HendrikSchultheis Mar 13, 2019
75d0252
tobias_parser: docu update
HendrikSchultheis Mar 13, 2019
30ecb5e
tobias_parser: which.min -> which.max (get first TRUE)
HendrikSchultheis Mar 13, 2019
48840c8
add overview columns to config
HendrikSchultheis Mar 13, 2019
fe30336
updated tobias_parser.md
HendrikSchultheis Mar 13, 2019
27c9eff
fixed version, new feature = minor upgrade (2.1.0)
HendrikSchultheis Mar 14, 2019
89c4bd3
Merge branch 'master' into tobias
HendrikSchultheis Mar 19, 2019
2 changes: 1 addition & 1 deletion DESCRIPTION
@@ -1,7 +1,7 @@
Package: wilson
Type: Package
Title: Web-Based Interactive Omics Visualization
Version: 2.0.3
Version: 2.1.0
Authors@R: c(
person("Hendrik", "Schultheis", email = "hendrik.schultheis@mpi-bn.mpg.de", role = c("aut", "cre")),
person("Jens", "Preussner", email = "jens.preussner@mpi-bn.mpg.de", role = "aut"),
1 change: 1 addition & 0 deletions NAMESPACE
@@ -32,6 +32,7 @@ export(pcaUI)
export(scatterPlot)
export(scatterPlotUI)
export(set_logger)
export(tobias_parser)
export(transformation)
export(transformationUI)
import(data.table)
2 changes: 2 additions & 0 deletions NEWS.md
@@ -1,3 +1,5 @@
# wilson 2.1.0
- implemented tobias_parser
# wilson 2.0.3
- reactive transformation parameter
# wilson 2.0.2
4 changes: 2 additions & 2 deletions R/clarion.R
@@ -117,15 +117,15 @@ Clarion <- R6::R6Class("Clarion",
self$metadata <- metadata
self$data <- data

if (validate) self$validate()

# coerce unique_id and name to character
if (self$get_id() == self$get_name()) {
cols <- self$get_id()
} else {
cols <- c(self$get_id(), self$get_name())
}
self$data[, (cols) := lapply(.SD, as.character), .SDcols = cols]

if (validate) self$validate()
}
),
private = list(
214 changes: 214 additions & 0 deletions R/parser.R
@@ -378,3 +378,217 @@ parser <- function(file, dec = ".") {

return(Clarion$new(header = header, metadata = metadata, data = data))
}

#' TOBIAS TFBS table to clarion parser
#'
#' Click \href{https://github.molgen.mpg.de/loosolab/TOBIAS}{here} for more information about TOBIAS.
#'
#' @param input Path to input table
#' @param output Output path.
#' @param filter_columns Either a vector of column names or a file containing one column name per row.
#' @param filter_pattern Keep columns matching the given pattern. If filter_columns is set, the pattern is matched against those columns, otherwise against the file header. If there are no matches, a warning is issued and all columns are used.
#' @param config JSON file containing metadata information for all columns. The first occurrence is used for duplicate column names.
#' @param omit_NA Logical; whether all rows containing NA should be removed.
#' @param condition_names Vector of condition names. Default = NULL. Used to classify columns not provided in config.
#' @param condition_pattern Used to identify condition names by matching and removing the given pattern with \code{\link[base]{grep}}. Ignored when condition_names is set.
#' @param in_field_delimiter Delimiter for multi value fields. Default = ','.
#' @param dec Decimal separator. Used in file reading and writing.
#' @param unique_id Whether the table contains a unique id column. If FALSE (default), one is created at the first position.
#' @param ... Additional arguments used as header information.
#'
#' @details During conversion the parser uses the given config (if provided) to create the \href{https://github.molgen.mpg.de/loosolab/wilson-apps/wiki/CLARION-Format}{Clarion} metadata. If the config information is insufficient, it approximates the metadata by referencing the condition names, issuing warnings in the process.
#' @details Factor grouping (metadata factor columns) is currently not implemented!
#'
#' @export
tobias_parser <- function(input, output, filter_columns = NULL, filter_pattern = NULL, config = system.file("extdata", "tobias_config.json", package = "wilson"), omit_NA = FALSE, condition_names = NULL, condition_pattern = "_bound$", in_field_delimiter = ",", dec = ".", unique_id = FALSE, ...) {
## filter data columns
# check if filter columns is a file or a vector
if (!is.null(filter_columns) && file.exists(filter_columns)) {
select_columns <- scan(file = filter_columns, what = character())
} else {
select_columns <- filter_columns
}

# filter pattern
if (!is.null(filter_pattern)) {
# use file header if filter_columns is empty
if (is.null(select_columns)) {
# only read header
select_columns <- names(data.table::fread(input = input, header = TRUE, nrows = 0))
}

select_columns <- grep(pattern = filter_pattern, x = select_columns, value = TRUE)

if (identical(select_columns, character(0))) {
warning("No column matches for given filter pattern! Proceeding with all columns.")
}
}

##### data
data <- data.table::fread(input, dec = dec, select = select_columns, header = TRUE)

# omit na rows
if (omit_NA) {
data <- stats::na.omit(data)
}

# create id column
if (!unique_id) {
data[, "id" := seq_len(nrow(data))]
# move id column to first position
new_order <- c("id", names(data)[ names(data) != "id"])
data <- data[, new_order, with = FALSE]
}

##### metadata
metadata <- data.table::data.table(names(data))

# load config
if (!is.null(config)) {
config_file <- RJSONIO::fromJSON(config)
# get column names from config file
col_names <- vapply(X = config_file$meta, FUN.VALUE = character(1), FUN = function(x) {
x[["col_name"]]
})
} else {
config_file <- NULL
}

# identify conditions
if (is.null(condition_names)) {
conditions <- gsub(pattern = condition_pattern, replacement = "", x = grep(pattern = condition_pattern, x = metadata[[1]], value = TRUE))
} else {
conditions <- condition_names
}


## create metadata row by row
unique_id_fallback <- NULL
condition_pattern <- paste0(conditions, collapse = "|")
approx_columns <- NULL

meta_rows <- lapply(metadata[[1]], function(x) {
# is column information provided in config?
if (!is.null(config_file) && is.element(x, col_names)) {
# uses first appearance in config file in case of duplicates
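# which.max on the logical vector returns the index of the first TRUE (first match);
# [-1] drops the col_name entry, keeping level, type, label and sub_label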
return(config_file$meta[[which.max(col_names == x)]][-1])
}

# if no information is provided via config try to guess meta-information
# utilize condition names to do so
approx_columns <<- c(approx_columns, x)

## level
# get distance of all conditions matched to x
# count number of exact substring matches
match_dist <- utils::adist(conditions, x) - nchar(x) + nchar(conditions)
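# match_dist is 0 when the condition name is fully contained in x (only insertions are needed to turn the condition into x)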
count_matches <- sum(match_dist == 0)

if (count_matches == 1) {
level <- "condition"
} else if (count_matches > 1) {
level <- "contrast"
} else {
level <- "feature"
}

## type
if (level == "feature") {
if (any(grepl(pattern = in_field_delimiter, x = data[[x]], fixed = TRUE))) {
type <- "array"
} else {
# define fallback unique_id in case none is defined through config
# this will redefine the first unique feature of type category as unique_id
if (is.null(unique_id_fallback) && anyDuplicated(data[[x]]) == 0) {
unique_id_fallback <<- x
}
type <- "category"
}
} else {
if (!is.numeric(data[[x]])) {
type <- "array"
} else if (grepl(pattern = "fc|fold[\\._\\- ]?change", x = x, perl = TRUE, ignore.case = TRUE)) {
type <- "ratio"
} else if (grepl(pattern = "p[\\._\\- ]?val|p[\\._\\- ]?adj", x = x, perl = TRUE, ignore.case = TRUE)) {
type <- "probability"
} else {
type <- "score"
}
}

## label/ sub_label
label <- sub_label <- ""
label_parts <- unlist(strsplit(x = x, split = "_"))

if (length(label_parts) == 1) {
label <- label_parts
} else if (length(label_parts) == 2) {
label <- label_parts[1]
sub_label <- label_parts[2]
} else if (length(label_parts) >= 3 && level == "contrast") {
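# e.g. a hypothetical column 'condA_condB_pvalue' (with identified conditions condA and condB)
# would yield label 'condA|condB' and sub_label 'pvalue'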
# replace '_' with whitespace
condition_pattern <- gsub(pattern = "_", replacement = " ", x = condition_pattern, fixed = TRUE)
x <- gsub(pattern = "_", replacement = " ", x = x, fixed = TRUE)
# get first condition using all identified conditions as pattern
first_condition <- gsub(pattern = paste0("(^", condition_pattern, ").*"), replacement = "\\1", x = x)
# strip first condition
stripped_condition <- substring(x, first = nchar(first_condition) + 2) # + 1 because parameter is inclusive and + 1 for whitespace
# get new first condition
second_condition <- gsub(pattern = paste0("(^", condition_pattern, ").*"), replacement = "\\1", x = stripped_condition)

label <- paste0(first_condition, "|", second_condition)
sub_label <- substring(stripped_condition, first = nchar(second_condition) + 2) # + 1 because parameter is inclusive and + 1 for whitespace
} else {
label <- paste0(label_parts[-length(label_parts)], collapse = " ")
sub_label <- label_parts[length(label_parts)]
}

return(c(level, type, label, sub_label))
})

# meta approximation warning
if (!is.null(approx_columns)) {
warning("Missing information in config! Tried guessing meta-information for ", paste0(approx_columns, collapse = ", "))
}

# list of vectors (rows) to matrix
meta_matrix <- do.call(rbind, meta_rows)
metadata <- cbind(metadata, meta_matrix)
names(metadata) <- c("key", "level", "type", "label", "sub_label")

# set unique_id fallback
if (!any(metadata[["type"]] == "unique_id")) {
metadata[key == unique_id_fallback, "type"] <- "unique_id"
}

##### header
header <- c(
list(
format = "Clarion",
version = "1.0"),
list(...)
)

# add delimiter if necessary
if (any(metadata[["type"]] == "array")) {
header <- append(x = header, values = list(delimiter = in_field_delimiter), after = 2)
}

##### validate
# create clarion object for validation
clarion <- Clarion$new(header = header, metadata = metadata, data = data)

##### write
# TODO implement and use clarion write function
# write clarion
# header
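# each header entry is written on its own line in the form '!name=value'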
flat_header <- data.table::data.table(paste0("!", names(clarion$header), "=", clarion$header))
data.table::fwrite(x = flat_header, file = output, col.names = FALSE, sep = "\t", dec = dec)
# metadata
# add '#'
names(clarion$metadata)[1] <- paste0("#", names(clarion$metadata)[1])
clarion$metadata[, names(clarion$metadata)[1] := paste0("#", clarion$metadata[[1]])]
data.table::fwrite(x = clarion$metadata, file = output, col.names = TRUE, sep = "\t", append = TRUE, quote = FALSE, dec = dec)
# data
data.table::fwrite(x = clarion$data, file = output, col.names = TRUE, sep = "\t", append = TRUE, dec = dec)
}
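
Usage note (not part of this pull request): a minimal sketch of calling the new parser. The input and output paths are hypothetical; config and condition_pattern fall back to the package defaults documented above.

library(wilson)

tobias_parser(
  input = "TFBS_table.tsv",        # TOBIAS TFBS output table (hypothetical path)
  output = "TFBS_table.clarion",   # destination Clarion file (hypothetical path)
  omit_NA = TRUE,                  # drop rows containing NA values
  condition_pattern = "_bound$"    # derive condition names from '*_bound' columns (package default)
)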