This repository has been archived by the owner. It is now read-only.
Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
wilson/R/parser.R
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
380 lines (335 sloc)
15.2 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#' Converting MaxQuant Output file proteinGroups.txt to CLARION format | |
#' by creating a headline of metadata for each column | |
#' | |
#' List with columns of reduced version (see config.json file) | |
#' If you only want the samples of a specific keyword write: column;exp | |
#' For example: | |
#' You got: | |
#' Intensity | |
#' Intensity 'experiment_name' | |
#' Do you want both add "Intensity" to the list. | |
#' Do you only want the sample add "Intensity;exp" to the list | |
#' Anything else like 'Intensity;ex' or 'Intensity;' results in writing both. | |
#' Only works if there are samples of that type. If not, column does not show up in file | |
#' | |
#' @author Rene Wiegandt | |
#' @param proteinGroups_in path of proteinGroup.txt file | |
#' @param summary_in path of belonging summary.txt file | |
#' @param outfile path of full CLARION output file | |
#' @param outfile_reduced path of reduced CLARION output file | |
#' @param config path of config file (containing information about metadata) | |
#' @param delimiter delimiter (Default = ;) | |
#' @param format pre-header information about format (optional) | |
#' @param version pre-header information about version (optional) | |
#' @param experiment_id pre-header information about experiment id (optional) | |
#' | |
#' @return TRUE on success | |
#' | |
#' @export | |
parse_MaxQuant <- function(proteinGroups_in, summary_in, outfile, outfile_reduced, config = system.file("extdata", "parser_MaxQuant_config.json", package = "wilson"), delimiter = ";", format = NULL, version = NULL, experiment_id = NULL){ | |
if (missing(proteinGroups_in)) { | |
stop("The proteinGroups file was not given") | |
} | |
if (missing(summary_in)) { | |
stop("The summary file was not given") | |
} | |
if (missing(outfile)) { | |
stop("The output file was not given") | |
} | |
if (missing(outfile_reduced)) { | |
stop("The output_reduced file was not given") | |
} | |
# parsers json config file | |
# @param config path of config file | |
# @return data.table with metadata | |
get_meta_from_config <- function(meta_config) { | |
dr <- lapply(meta_config$meta, function(r) { | |
data.table::data.table("col_name" = r$col_name, | |
"level" = r$level, | |
"type" = r$type, | |
"label" = r$label, | |
"sublabel" = r$sublabel | |
) | |
}) | |
do.call("rbind", dr) | |
} | |
# Get the type of the sample | |
# Cutting of the experiment name and checking in given lists if left column name is in one of those lists | |
# Depending on in which list it is returns the type | |
# @param exp_name experiment name | |
# @param scores,ratios,category,ary vectors with Strings | |
# @return String type of given column | |
get_sample_type <- function(name, scores, ratios, prob, category, ary) { | |
name_split <- strsplit(name, " ") | |
if (grepl("[[:digit:]]{1,2}", utils::tail(name_split[[1]], 1))) { | |
name <- paste(utils::head(name_split[[1]], -1), collapse = " ") | |
} | |
if (name %in% scores) return("score") | |
if (name %in% ratios) return("ratio") | |
if (name %in% prob) return("probability") | |
if (name %in% category) return("category") | |
if (name %in% ary) return("array") | |
return("unknown") | |
} | |
# Get the level of the sample column | |
# Checking for keywords inside of the column name | |
# Each keyword is given one level | |
# @param col_head column head | |
# @return String level of given column | |
get_sample_level <- function(col_head, isSample, full_list) { | |
# Get the level of all 'sample' columns. | |
# Default: level is "sample" | |
if (grepl("Ratio", col_head, perl = TRUE)) { | |
if (grepl("type", col_head, perl = TRUE)) return("feature") | |
return("contrast") | |
} | |
if (grepl("type", col_head, perl = TRUE)) return("feature") | |
if (grepl("Fraction [[:digit:]]{1,2}", col_head, perl = TRUE)) return("condition") | |
if (isSample) return("sample") | |
if (col_head %in% full_list) return("contrast") | |
return("unknown") | |
} | |
# Get label and sublabel of remaining columns | |
# remaining columns <- columns which are not in meta or a sample column | |
# @param col_head column head | |
# @return list with label and sublabel | |
get_remaining_labeling <- function(col_head) { | |
col_head_split <- strsplit(col_head, " ") | |
if (length(col_head_split) > 1) { | |
sublabel <- col_head_split[[1]][length(col_head_split[[1]])] | |
if (grepl(sublabel, "[%]", fixed = TRUE)) { | |
label <- paste(utils::head(col_head_split[[1]], -2), collapse = " ") | |
sublabel <- paste(utils::tail(col_head_split[[1]], 2), collapse = " ") | |
} else { | |
label <- paste(utils::head(col_head_split[[1]], -1), collapse = " ") | |
sublabel <- col_head_split[[1]][length(col_head_split[[1]])] | |
} | |
return(list(label, sublabel)) | |
} | |
return(list(col_head, "")) | |
} | |
# Get list of all column names with experiment name, which are in the reduced version | |
# @param meta_full data table with metadata | |
# @param reduced_list raw list of all columns of every quantification method, which need to in the reduced version | |
# @param exp_names list of experiment names | |
# @param col_names list of all column names | |
# @return list: 1 <- list of all reduced column names, 2 <- columns which are in the raw list but not in the final list | |
get_reduced_version <- function(meta_full, reduced_list, exp_names, col_names) { | |
red_split <- strsplit(reduced_list, ";") | |
red_exp_list <- unlist(lapply(red_split, function(split_name) { | |
# if length == 2 entry of reduced list is column_name;exp | |
# only adding column_name + experiment name to the list | |
if (length(split_name) == 2) { | |
red_col_name <- lapply(exp_names, function(exp_name) { | |
rcn <- paste(split_name[1], exp_name) | |
# special case for Sequence Coverage exp_name [%] | |
if (!(rcn %in% col_names) && (paste(rcn, "[%]") %in% col_names)) { | |
rcn <- paste(rcn, "[%]") | |
} | |
return(rcn) | |
}) | |
} else { | |
# adding both column name + experiment_name and column name without experiment name to list | |
red_col_name <- lapply(1:(length(exp_names) + 1), function(i) { | |
if (i <= length(exp_names)) { | |
if (paste(split_name, exp_names[i]) %in% col_names) { | |
return(paste(split_name, exp_names[i])) | |
} | |
} else { | |
return(split_name) | |
} | |
}) | |
} | |
})) | |
overlap <- setdiff(red_exp_list, col_names) | |
red_exp_list <- red_exp_list[!(red_exp_list %in% overlap)] | |
return(list(red_exp_list, overlap)) | |
} | |
# writing clarion file | |
# @param meta data table with meta data | |
# @param out output file | |
# @param format format | |
# @param version version number | |
# @param exp_id experiment id | |
# @param pGroups data table protein groups file | |
write_clarion_file <- function(meta, out, format, version, exp_id, pGroups, delimiter) { | |
to_append <- FALSE | |
if (!missing(format)) { | |
write(paste0("!format=", format), file = out, append = to_append) | |
to_append <- TRUE | |
} | |
if (!missing(version)) { | |
write(paste0("!version=", version), file = out, append = to_append) | |
to_append <- TRUE | |
} | |
if (!missing(exp_id)) { | |
write(paste0("!experiment_id=", exp_id), file = out, append = to_append) | |
to_append <- TRUE | |
} | |
write(paste0("!delimiter=", delimiter), file = out, append = to_append) | |
write("#key\tfactor1\tlevel\ttype\tlabel\tsub_label", file = out, append = TRUE) | |
data.table::fwrite(meta, file = out, sep = "\t", append = TRUE, col.names = FALSE, quote = FALSE) | |
data.table::fwrite(pGroups, file = out, sep = "\t", append = TRUE, col.names = TRUE, quote = FALSE) | |
} | |
# reading files in data tables | |
proteinGroups <- data.table::fread(proteinGroups_in, header = TRUE, quote = "") | |
summary_file <- data.table::fread(summary_in, header = TRUE) | |
meta_config <- tryCatch({ | |
rjson::fromJSON(file = config) | |
}, error = function(cond) { | |
stop("Could not read config file") | |
}, warning = function(w) { | |
stop("Could not read config file") | |
}) | |
# getting experiment names | |
if ("Experiment" %in% colnames(summary_file)) { | |
exp_names <- unique(summary_file[Experiment != "", Experiment]) | |
} else { | |
stop("wrong format on summary file: column \'Experiment\' misssing") | |
} | |
meta <- get_meta_from_config(meta_config = meta_config) | |
sample_scores <- meta_config$type_scores | |
sample_ratios <- meta_config$type_ratios | |
sample_probability <- meta_config$type_probability | |
sample_category <- meta_config$type_category | |
sample_ary <- meta_config$type_array | |
reduced_list <- meta_config$reduced_list | |
full_sample_list <- c(sample_scores, sample_ratios, sample_probability, sample_category, sample_ary) | |
if (is.null(reduced_list)) { | |
stop("reduced_list is missing in config file") | |
} | |
# get column names | |
col_names <- colnames(proteinGroups) | |
# delete rows from meta, which are not in col_names | |
meta_trim <- meta[col_name %in% col_names] | |
# get metadata for each sample column | |
# append rows to data table with metadata | |
samples_list <- lapply(col_names, function(col_head) { | |
unlist(lapply(exp_names, function(name) { | |
name_brackets <- paste0("\\Q", name) | |
exp_regex <- paste0("\\Q ", name) | |
sample_description <- strsplit(col_head, exp_regex) | |
if (length(grep(name_brackets, col_head, perl = TRUE)) == 1 ) { # Does column name contains experiment name? | |
de <- data.table::data.table("col_name" = c(col_head), | |
"level" = c(get_sample_level(col_head = col_head, isSample = TRUE, full_list = full_sample_list)), | |
"type" = c(get_sample_type(name = sample_description[[1]][1], scores = sample_scores, | |
ratios = sample_ratios, prob = sample_probability, category = sample_category, | |
ary = sample_ary)), | |
"label" = c(name), | |
"sublabel" = c(trimws(gsub(name_brackets, "", col_head), "r")) | |
) | |
return(de) | |
} | |
})) | |
}) | |
samples <- do.call("rbind", Filter(Negate(is.null), samples_list)) | |
meta_half <- rbind(meta_trim, samples) | |
# get metadata for each remaining column(columsn which are specific for a certain quantification methode) | |
# append rows to data table with metadata | |
remaining_list <- lapply(col_names, function(col_head) { | |
if (!(col_head %in% meta_half[["col_name"]])) { | |
label_sublabel <- get_remaining_labeling(col_head = col_head) | |
de2 <- data.table::data.table("col_name" = c(col_head), | |
"level" = c(get_sample_level(col_head = col_head, isSample = FALSE, full_list = full_sample_list)), | |
"type" = c(get_sample_type(name = col_head, scores = sample_scores, ratios = sample_ratios, | |
prob = sample_probability, category = sample_category, ary = sample_ary)), | |
"label" = c(label_sublabel[1]), | |
"sublabel" = c(label_sublabel[2]) | |
) | |
return(de2) | |
} | |
}) | |
remaining <- do.call("rbind", Filter(Negate(is.null), remaining_list)) | |
meta_full <- rbind(meta_half, remaining) | |
meta_full$col_name <- sub("^", "#", meta_full$col_name) | |
# add column factor 1 and reorder the columns | |
meta_full$factor1 <- "" | |
meta_full <- meta_full[, c("col_name", "factor1", "level", "type", "label", "sublabel")] | |
# get data.table with reduced metadata | |
reduced <- get_reduced_version(meta_full = meta_full, reduced_list = reduced_list, | |
exp_names = exp_names, col_names = col_names) | |
meta_reduced <- meta_full[meta_full$col_name %in% sub("^", "#", reduced[[1]]), ] | |
# if there are unknown columns the user gets a warning with all unknown columns | |
# unknown columns wont be writen in the meta data header | |
if (nrow(meta_full[level == "unknown"]) > 0) { | |
meta_warn <- gsub("#", "", meta_full[level == "unknown", col_name]) | |
warning("Following columns are unknown and have been removed (Check JSON config file: 'meta'): ", paste(meta_warn, collapse = ", ")) | |
meta_full <- meta_full[level != "unknown"] | |
} | |
# if there are unknown column types the user gets a warning with all columns with unknown type | |
# columns with unknown types wont be writen in the meta data header | |
if (nrow(meta_full[type == "unknown"]) > 0) { | |
meta_warn <- gsub("#", "", meta_full[type == "unknown", col_name]) | |
warning("Due to unknown type follwing columns were removed (Check JSON config file: 'type_X'): ", paste(meta_warn, collapse = ", ")) | |
meta_full <- meta_full[type != "unknown"] | |
} | |
# writing advanced CLARION file | |
write_clarion_file(meta = meta_full, out = outfile, format = format, | |
version = version, exp_id = experiment_id, pGroups = proteinGroups, delimiter = delimiter) | |
# writing reduced CLARION file | |
write_clarion_file(meta = meta_reduced, out = outfile_reduced, format = format, | |
version = version, exp_id = experiment_id, pGroups = proteinGroups, delimiter = delimiter) | |
return(TRUE) | |
} | |
#' Method to parse input file. | |
#' | |
#' @param file Path to file that needs parsing. | |
#' @param dec The decimal separator. See \code{\link[data.table]{fread}}. | |
#' | |
#' @return Clarion object. See \code{\link[wilson]{Clarion}} | |
#' | |
#' @import data.table | |
#' | |
#' @export | |
parser <- function(file, dec = ".") { | |
message(paste("Parsing file:", file)) | |
# number of rows for each part | |
con <- file(file, open = "r") | |
num_header <- 0 | |
num_metadata <- 0 | |
tryCatch(expr = { | |
while (TRUE) { | |
line <- readLines(con = con, n = 1) | |
if (grepl("^!", line, perl = TRUE)) { | |
num_header <- num_header + 1 | |
} else if (grepl("^#", line, perl = TRUE)) { | |
num_metadata <- num_metadata + 1 | |
} else { | |
break | |
} | |
} | |
}, finally = { | |
close(con = con) | |
}) | |
### parse header | |
if (num_header > 0) { | |
header <- data.table::fread(input = file, fill = TRUE, header = FALSE, dec = dec, nrows = num_header, integer64 = "double") | |
# cut of leading ! | |
header <- as.list(gsub("^!", "", header[[1]])) | |
# make named list | |
header_names <- gsub("=.*$", "", header, perl = TRUE) | |
header <- as.list(gsub("^.*?=", "", header, perl = TRUE)) | |
names(header) <- header_names | |
# remove quotes from delimiter | |
if (!is.null(header$delimiter) && grepl(header$delimiter, pattern = '^".*"$', perl = TRUE)) { | |
header$delimiter <- substr(header$delimiter, start = 2, stop = nchar(header$delimiter) - 1) | |
} | |
} else { | |
header <- NULL | |
} | |
### parse metadata | |
metadata <- data.table::fread(input = file, skip = num_header, header = FALSE, nrows = num_metadata, fill = TRUE, dec = dec, integer64 = "double") | |
# cut off leading # | |
metadata[, names(metadata)[1] := gsub("^#", "", metadata[[1]])] | |
# set first line as header | |
names(metadata) <- as.character(metadata[1]) | |
# remove empty columns | |
metadata <- metadata[, which(unlist(lapply(metadata, function(x) !all(is.na(x) || x == "")))), with = FALSE] | |
# delete first line | |
metadata <- metadata[-1] | |
### parse data | |
data <- data.table::fread(input = file, header = TRUE, skip = num_header + num_metadata, fill = FALSE, dec = dec, integer64 = "double") | |
data.table::setindexv(metadata, names(metadata)[1]) | |
data.table::setindexv(data, names(data)[1]) | |
return(Clarion$new(header = header, metadata = metadata, data = data)) | |
} |