Skip to content

Commit

Permalink
automatically detect and keep column names if provided
Browse files Browse the repository at this point in the history
  • Loading branch information
HendrikSchultheis committed Dec 19, 2018
1 parent 4c16f6f commit 5a7c84e
Showing 1 changed file with 22 additions and 5 deletions.
27 changes: 22 additions & 5 deletions bin/reduce_sequence.R
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ opt <- parse_args(opt_parser)
#' @param min_seq_length Remove sequences below this length. Defaults to the maximum value of motif and kmer and can not be lower.
#' @param motif_occurence Define how many motifs are expected per sequence. This value is used during kmer cutoff calculation. Default = 1 meaning that there should be approximately one motif per sequence.
#'
#' @details If there is a header supplied other then the default data.table nameing scheme ('V1', 'V2', etc.) it will be kept.
#'
reduce_sequence <- function(input, kmer = 10, motif = 10, output = "reduced.bed", threads = NULL, clean = TRUE, minoverlap_kmer = kmer - 1, minoverlap_motif = ceiling(motif / 2), min_seq_length = max(c(motif, kmer)), motif_occurence = 1) {
if (system("which jellyfish", ignore.stdout = TRUE) != 0) {
stop("Required program jellyfish not found! Please check whether it is installed.")
Expand All @@ -49,7 +51,17 @@ reduce_sequence <- function(input, kmer = 10, motif = 10, output = "reduced.bed"
message("Loading bed...")
# load bed
# columns: chr, start, end, name, ..., sequence
bed_table <- data.table::fread(input = input, header = FALSE)
bed_table <- data.table::fread(input = input)

# check for header and save it if provided
default_col_names <- grepl(pattern = "^V+\\d$", names(bed_table), perl = TRUE)
if (!any(default_col_names)) {
keep_col_names <- TRUE
col_names <- names(bed_table)
} else {
keep_col_names <- FALSE
}

names(bed_table)[1:4] <- c("chr", "start", "end", "name")
names(bed_table)[ncol(bed_table)] <- "sequence"
# index
Expand Down Expand Up @@ -123,18 +135,23 @@ reduce_sequence <- function(input, kmer = 10, motif = 10, output = "reduced.bed"
merged[, sequence := stringr::str_sub(sequence, relative_start, relative_end)]

# bed files count from 0
merged[, data.table::`:=`(relative_start = relative_start - 1, relative_end = relative_end - 1)]
merged[, `:=`(relative_start = relative_start - 1, relative_end = relative_end - 1)]
# change start end location
merged[, data.table::`:=`(start = start + relative_start, end = start + relative_end)]
merged[, `:=`(start = start + relative_start, end = start + relative_end)]

# clean table
merged[, data.table::`:=`(relative_start = NULL, relative_end = NULL, width = NULL)]
merged[, `:=`(relative_start = NULL, relative_end = NULL, width = NULL)]

if (clean) {
file.remove(fasta_file, count_output_binary, mer_count_table)
}

data.table::fwrite(merged, file = output, sep = "\t", col.names = FALSE)
# keep provided column names
if (keep_col_names) {
names(merged) <- col_names
}

data.table::fwrite(merged, file = output, sep = "\t", col.names = keep_col_names)
}

#' Predict how many interesting kmer are possible for the given data.
Expand Down

0 comments on commit 5a7c84e

Please sign in to comment.