From 5a7c84eb969aed0d111ba320a0b15310c275508d Mon Sep 17 00:00:00 2001 From: Schultheis Date: Wed, 19 Dec 2018 17:11:46 +0100 Subject: [PATCH] automatically detect and keep column names if provided --- bin/reduce_sequence.R | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/bin/reduce_sequence.R b/bin/reduce_sequence.R index d13f899..f9e8340 100644 --- a/bin/reduce_sequence.R +++ b/bin/reduce_sequence.R @@ -32,6 +32,8 @@ opt <- parse_args(opt_parser) #' @param min_seq_length Remove sequences below this length. Defaults to the maximum value of motif and kmer and can not be lower. #' @param motif_occurence Define how many motifs are expected per sequence. This value is used during kmer cutoff calculation. Default = 1 meaning that there should be approximately one motif per sequence. #' +#' @details If there is a header supplied other then the default data.table nameing scheme ('V1', 'V2', etc.) it will be kept. +#' reduce_sequence <- function(input, kmer = 10, motif = 10, output = "reduced.bed", threads = NULL, clean = TRUE, minoverlap_kmer = kmer - 1, minoverlap_motif = ceiling(motif / 2), min_seq_length = max(c(motif, kmer)), motif_occurence = 1) { if (system("which jellyfish", ignore.stdout = TRUE) != 0) { stop("Required program jellyfish not found! Please check whether it is installed.") @@ -49,7 +51,17 @@ reduce_sequence <- function(input, kmer = 10, motif = 10, output = "reduced.bed" message("Loading bed...") # load bed # columns: chr, start, end, name, ..., sequence - bed_table <- data.table::fread(input = input, header = FALSE) + bed_table <- data.table::fread(input = input) + + # check for header and save it if provided + default_col_names <- grepl(pattern = "^V+\\d$", names(bed_table), perl = TRUE) + if (!any(default_col_names)) { + keep_col_names <- TRUE + col_names <- names(bed_table) + } else { + keep_col_names <- FALSE + } + names(bed_table)[1:4] <- c("chr", "start", "end", "name") names(bed_table)[ncol(bed_table)] <- "sequence" # index @@ -123,18 +135,23 @@ reduce_sequence <- function(input, kmer = 10, motif = 10, output = "reduced.bed" merged[, sequence := stringr::str_sub(sequence, relative_start, relative_end)] # bed files count from 0 - merged[, data.table::`:=`(relative_start = relative_start - 1, relative_end = relative_end - 1)] + merged[, `:=`(relative_start = relative_start - 1, relative_end = relative_end - 1)] # change start end location - merged[, data.table::`:=`(start = start + relative_start, end = start + relative_end)] + merged[, `:=`(start = start + relative_start, end = start + relative_end)] # clean table - merged[, data.table::`:=`(relative_start = NULL, relative_end = NULL, width = NULL)] + merged[, `:=`(relative_start = NULL, relative_end = NULL, width = NULL)] if (clean) { file.remove(fasta_file, count_output_binary, mer_count_table) } - data.table::fwrite(merged, file = output, sep = "\t", col.names = FALSE) + # keep provided column names + if (keep_col_names) { + names(merged) <- col_names + } + + data.table::fwrite(merged, file = output, sep = "\t", col.names = keep_col_names) } #' Predict how many interesting kmer are possible for the given data.