automatically detect and keep column names if provided

loosolab · Dec 19, 2018 · 5a7c84e · 5a7c84e
1 parent 4c16f6f
commit 5a7c84e
Showing 1 changed file with 22 additions and 5 deletions.
diff --git a/bin/reduce_sequence.R b/bin/reduce_sequence.R
@@ -32,6 +32,8 @@ opt <- parse_args(opt_parser)
 #' @param min_seq_length Remove sequences below this length. Defaults to the maximum value of motif and kmer and can not be lower.
 #' @param motif_occurence Define how many motifs are expected per sequence. This value is used during kmer cutoff calculation. Default = 1 meaning that there should be approximately one motif per sequence.
 #' 
+#' @details If there is a header supplied other then the default data.table nameing scheme ('V1', 'V2', etc.) it will be kept.
+#' 
 reduce_sequence <- function(input, kmer = 10, motif = 10, output = "reduced.bed", threads = NULL, clean = TRUE, minoverlap_kmer = kmer - 1, minoverlap_motif = ceiling(motif / 2), min_seq_length = max(c(motif, kmer)), motif_occurence = 1) {
   if (system("which jellyfish", ignore.stdout = TRUE) != 0) {
     stop("Required program jellyfish not found! Please check whether it is installed.")
@@ -49,7 +51,17 @@ reduce_sequence <- function(input, kmer = 10, motif = 10, output = "reduced.bed"
   message("Loading bed...")
   # load bed
   # columns: chr, start, end, name, ..., sequence
-  bed_table <- data.table::fread(input = input, header = FALSE)
+  bed_table <- data.table::fread(input = input)
+
+  # check for header and save it if provided
+  default_col_names <- grepl(pattern = "^V+\\d$", names(bed_table), perl = TRUE)
+  if (!any(default_col_names)) {
+    keep_col_names <- TRUE
+    col_names <- names(bed_table)
+  } else {
+    keep_col_names <- FALSE
+  }
+
   names(bed_table)[1:4] <- c("chr", "start", "end", "name")
   names(bed_table)[ncol(bed_table)] <- "sequence"
   # index
@@ -123,18 +135,23 @@ reduce_sequence <- function(input, kmer = 10, motif = 10, output = "reduced.bed"
   merged[, sequence := stringr::str_sub(sequence, relative_start, relative_end)]  
 
   # bed files count from 0
-  merged[, data.table::`:=`(relative_start = relative_start - 1, relative_end = relative_end - 1)]
+  merged[, `:=`(relative_start = relative_start - 1, relative_end = relative_end - 1)]
   # change start end location
-  merged[, data.table::`:=`(start = start + relative_start, end = start + relative_end)]
+  merged[, `:=`(start = start + relative_start, end = start + relative_end)]
 
   # clean table
-  merged[, data.table::`:=`(relative_start = NULL, relative_end = NULL, width = NULL)]
+  merged[, `:=`(relative_start = NULL, relative_end = NULL, width = NULL)]
 
   if (clean) {
     file.remove(fasta_file, count_output_binary, mer_count_table)
   }
 
-  data.table::fwrite(merged, file = output, sep = "\t", col.names = FALSE)
+  # keep provided column names
+  if (keep_col_names) {
+    names(merged) <- col_names
+  }
+
+  data.table::fwrite(merged, file = output, sep = "\t", col.names = keep_col_names)
 }
 
 #' Predict how many interesting kmer are possible for the given data.