From 3b4eb716346595879f8f3436d34de3dea07c4a70 Mon Sep 17 00:00:00 2001 From: Schultheis Date: Thu, 6 Dec 2018 15:23:37 +0100 Subject: [PATCH 1/7] print number of reduced kmer --- bin/reduce_bed.R | 1 + 1 file changed, 1 insertion(+) diff --git a/bin/reduce_bed.R b/bin/reduce_bed.R index 2f8e8a2..6aeccbf 100644 --- a/bin/reduce_bed.R +++ b/bin/reduce_bed.R @@ -98,6 +98,7 @@ reduce_bed <- function(input, kmer = 10, motif = 10, output = "reduced.bed", thr # reduce kmer reduced_kmer <- reduce_kmer(kmer = kmer_counts, significant = keep_hits) + message("Reduced kmer to first most frequent ", nrow(reduced_kmer), " out of ", nrow(kmer_counts), ".") message("Find kmer in sequences.") # find k-mer in sequences From 5ee211392b3ae7f5ad9c103fcc56ceb3df664024 Mon Sep 17 00:00:00 2001 From: Schultheis Date: Mon, 7 Jan 2019 09:45:42 +0100 Subject: [PATCH 2/7] show help if called without arguments; require = quiet --- bin/2.1_clustering/reduce_sequence.R | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/bin/2.1_clustering/reduce_sequence.R b/bin/2.1_clustering/reduce_sequence.R index e5abe33..e1d88d9 100644 --- a/bin/2.1_clustering/reduce_sequence.R +++ b/bin/2.1_clustering/reduce_sequence.R @@ -1,5 +1,5 @@ #! /bin/Rscript -if (!require(optparse)) install.packages("optparse"); library(optparse) +if (!require(optparse, quietly = TRUE)) install.packages("optparse"); library(optparse) option_list <- list( make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input bed-file. Last column must be sequences.", metavar = "character"), @@ -281,7 +281,13 @@ find_kmer_regions <- function(bed, kmer_counts, minoverlap = 1 , threads = NULL) if (!interactive()) { # show apply progressbar pbo <- pbapply::pboptions(type = "timer") - # remove last parameter (help param) - params <- opt[-length(opt)] - do.call(reduce_sequence, args = params) + + # show help if called without arguments + if (length(commandArgs(trailingOnly = TRUE)) <= 0) { + print_help(opt_parser) + } else { + # remove last parameter (help param) + params <- opt[-length(opt)] + do.call(reduce_sequence, args = params) + } } From dd46d81f43aeb213261d78b0527ffd5cdc5b1e3b Mon Sep 17 00:00:00 2001 From: Schultheis Date: Mon, 7 Jan 2019 11:47:29 +0100 Subject: [PATCH 3/7] show help if called without arguments; require = quiet --- bin/2.1_clustering/cdhit_wrapper.R | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/bin/2.1_clustering/cdhit_wrapper.R b/bin/2.1_clustering/cdhit_wrapper.R index dead7b7..408503e 100644 --- a/bin/2.1_clustering/cdhit_wrapper.R +++ b/bin/2.1_clustering/cdhit_wrapper.R @@ -1,5 +1,5 @@ #! /bin/Rscript -if (!require(optparse)) install.packages("optparse"); library(optparse) +if (!require(optparse, quietly = TRUE)) install.packages("optparse"); library(optparse) option_list <- list( make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input bed-file. Fourth column is expected to contain names, last column must be sequences.", metavar = "character"), @@ -166,6 +166,12 @@ cdhitest <- function(input, identity = 0.8, coverage = 8, output = "cluster.bed" # call function with given parameter if not in interactive context (e.g. run from shell) if (!interactive()) { # remove last parameter (help param) - params <- opt[-length(opt)] - do.call(cdhitest, args = params) + + # show help if called without arguments + if (length(commandArgs(trailingOnly = TRUE)) <= 0) { + print_help(opt_parser) + } else { + params <- opt[-length(opt)] + do.call(cdhitest, args = params) + } } From ffbf2946cbdb0542c16b1f7d92dcadfa33590587 Mon Sep 17 00:00:00 2001 From: Schultheis Date: Tue, 8 Jan 2019 09:59:09 +0100 Subject: [PATCH 4/7] Basic parameter checks added --- bin/2.1_clustering/reduce_sequence.R | 37 ++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/bin/2.1_clustering/reduce_sequence.R b/bin/2.1_clustering/reduce_sequence.R index e1d88d9..cc57871 100644 --- a/bin/2.1_clustering/reduce_sequence.R +++ b/bin/2.1_clustering/reduce_sequence.R @@ -38,6 +38,7 @@ opt <- parse_args(opt_parser) #' @author Hendrik Schultheis #' reduce_sequence <- function(input, kmer = 10, motif = 10, output = "reduced.bed", threads = NULL, clean = TRUE, minoverlap_kmer = kmer - 1, minoverlap_motif = ceiling(motif / 2), min_seq_length = max(c(motif, kmer)), motif_occurrence = 1) { + # parameter checks if (system("which jellyfish", ignore.stdout = TRUE) != 0) { stop("Required program jellyfish not found! Please check whether it is installed.") } @@ -46,6 +47,42 @@ reduce_sequence <- function(input, kmer = 10, motif = 10, output = "reduced.bed" stop("No input specified! Please forward a valid bed-file.") } + if (!file.exists(input)) { + stop("File ", input, " does not exist!") + } + + if (!is.numeric(kmer) || kmer != as.integer(kmer) || kmer <= 0) { + stop("K-mer has to be a positive integer above 0.") + } + + if (!is.numeric(motif) || motif != as.integer(motif) || motif <= 0) { + stop("Motif has to be a positive integer above 0.") + } + + if (!is.numeric(threads) || threads != as.integer(threads) || threads < 0) { + stop("Threads has to be a positive integer (0 or greater).") + } + + if (!is.logical(clean)) { + stop("Parameter clean has to be a boolean value.") + } + + if (!is.numeric(minoverlap_kmer) || minoverlap_kmer != as.integer(minoverlap_kmer) || minoverlap_kmer <= 0) { + stop("Minoverlap_kmer has to be a positive integer above 0.") + } + + if (!is.numeric(minoverlap_motif) || minoverlap_motif != as.integer(minoverlap_motif) || minoverlap_motif <= 0) { + stop("Minoverlap_motif has to be a positive integer above 0.") + } + + if (!is.numeric(min_seq_length) || min_seq_length != as.integer(min_seq_length) || min_seq_length <= 0) { + stop("Min_seq_length hat to be a positive integer above 0.") + } + + if (!is.numeric(motif_occurrence) || motif_occurrence < 0 || motif_occurrence > 1) { # TODO remove motif_occurence > 1. See TODO of find_kmer_regions below. + stop("Motif_occurence has to be a numeric value above 0 and can not be greater than 1.") + } + # get number of available cores if (threads == 0) { threads <- parallel::detectCores() From 961e0ec038c1674afcd7b7dd2924c349bdc90457 Mon Sep 17 00:00:00 2001 From: Schultheis Date: Tue, 8 Jan 2019 14:17:04 +0100 Subject: [PATCH 5/7] Added basic parameter checks. --- bin/2.1_clustering/cdhit_wrapper.R | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/bin/2.1_clustering/cdhit_wrapper.R b/bin/2.1_clustering/cdhit_wrapper.R index 408503e..aedddc4 100644 --- a/bin/2.1_clustering/cdhit_wrapper.R +++ b/bin/2.1_clustering/cdhit_wrapper.R @@ -74,6 +74,7 @@ opt <- parse_args(opt_parser) #' @author Hendrik Schultheis #' cdhitest <- function(input, identity = 0.8, coverage = 8, output = "cluster.bed", clean = TRUE, threads = 1, global = 0, band_width = 20, memory = 800, word_length = 3, throw_away_sequences = 5, length_dif_cutoff_shorter_p = 0, length_dif_cutoff_shorter_n = 999999, alignment_coverage_longer_p = 0, alignment_coverage_longer_n = 99999999, alignment_coverage_shorter_p = 0, alignment_coverage_shorter_n = 99999999, max_unmatched_longer_p = 1, max_unmatched_shorter_p = 1, max_unmatched_both_n = 99999999, fast_cluster = 1, strand = 0, match = 2, mismatch = -2, gap = -6, gap_ext = -1, sort_cluster_by_size = 1) { + # parameter checks if (system("which cd-hit-est", ignore.stdout = FALSE) != 0) { stop("Required program CD-HIT not found! Please check whether it is installed.") } @@ -82,6 +83,14 @@ cdhitest <- function(input, identity = 0.8, coverage = 8, output = "cluster.bed" stop("No input specified! Please forward a valid bed-file.") } + if (!file.exists(input)) { + stop("File ", input, " does not exist!") + } + + if (!is.logical(clean)) { + stop("Parameter clean has to be a boolean value.") + } + message("Loading bed.") # load bed if necessary if (!data.table::is.data.table(input)) { From 39f485662165275ba1c98c8aa00c9083f3eb2d2b Mon Sep 17 00:00:00 2001 From: Schultheis Date: Wed, 9 Jan 2019 15:25:11 +0100 Subject: [PATCH 6/7] Parameter summary added; fixed wrong default for threads parameter --- bin/2.1_clustering/reduce_sequence.R | 43 ++++++++++++++++++++++++++-- 1 file changed, 41 insertions(+), 2 deletions(-) diff --git a/bin/2.1_clustering/reduce_sequence.R b/bin/2.1_clustering/reduce_sequence.R index cc57871..15d99ae 100644 --- a/bin/2.1_clustering/reduce_sequence.R +++ b/bin/2.1_clustering/reduce_sequence.R @@ -11,7 +11,8 @@ option_list <- list( make_option(opt_str = c("-s", "--min_seq_length"), default = NULL, help = "Remove sequences below this length. Defaults to the maximum value of motif and k-mer and can not be lower.", metavar = "integer", type = "integer"), make_option(opt_str = c("-n", "--minoverlap_kmer"), default = NULL, help = "Minimum required overlap between k-mer. Used to create reduced sequence ranges out of merged k-mer. Can not be greater than k-mer length. Default = kmer - 1", metavar = "integer", type = "integer"), make_option(opt_str = c("-v", "--minoverlap_motif"), default = NULL, help = "Minimum required overlap between motif and k-mer to consider k-mer significant. Used for k-mer cutoff calculation. Can not be greater than motif and k-mer length. Default = ceiling(motif / 2)", metavar = "integer", type = "integer"), - make_option(opt_str = c("-f", "--motif_occurrence"), default = 1, help = "Define how many motifs are expected per sequence. This value is used during k-mer cutoff calculation. Default = %default meaning that there should be approximately one motif per sequence.", metavar = "double") + make_option(opt_str = c("-f", "--motif_occurrence"), default = 1, help = "Define how many motifs are expected per sequence. This value is used during k-mer cutoff calculation. Default = %default meaning that there should be approximately one motif per sequence.", metavar = "double"), + make_option(opt_str = c("--summary"), default = NULL, help = "Filepath where a small summary file of the run should be written. Default = %default", metavar = "character") ) opt_parser <- OptionParser(option_list = option_list, @@ -32,12 +33,13 @@ opt <- parse_args(opt_parser) #' @param minoverlap_motif Minimum required overlap between motif and k-mer to consider k-mer significant. Used for k-mer cutoff calculation. Can not be greater than motif and k-mer length. Default = ceiling(motif / 2) #' @param min_seq_length Remove sequences below this length. Defaults to the maximum value of motif and k-mer and can not be lower. #' @param motif_occurrence Define how many motifs are expected per sequence. This value is used during k-mer cutoff calculation. Default = 1 meaning that there should be approximately one motif per sequence. +#' @param summary Filepath where a small summary file of the run should be written. #' #' @details If there is a header supplied other then the default data.table naming scheme ('V1', 'V2', etc.) it will be kept. #' #' @author Hendrik Schultheis #' -reduce_sequence <- function(input, kmer = 10, motif = 10, output = "reduced.bed", threads = NULL, clean = TRUE, minoverlap_kmer = kmer - 1, minoverlap_motif = ceiling(motif / 2), min_seq_length = max(c(motif, kmer)), motif_occurrence = 1) { +reduce_sequence <- function(input, kmer = 10, motif = 10, output = "reduced.bed", threads = 1, clean = TRUE, minoverlap_kmer = kmer - 1, minoverlap_motif = ceiling(motif / 2), min_seq_length = max(c(motif, kmer)), motif_occurrence = 1, summary = NULL) { # parameter checks if (system("which jellyfish", ignore.stdout = TRUE) != 0) { stop("Required program jellyfish not found! Please check whether it is installed.") @@ -187,6 +189,43 @@ reduce_sequence <- function(input, kmer = 10, motif = 10, output = "reduced.bed" file.remove(fasta_file, count_output_binary, mer_count_table) } + # write summary + if (!is.null(summary)) { + # script/ function call + file_name <- sub("--file=", "", commandArgs()[grep("--file=", commandArgs())]) + call <- ifelse(interactive(), deparse(match.call()), + paste("Rscript", basename(file_name), paste(commandArgs(trailingOnly = TRUE), collapse = " ")) + ) + + # sequence length summary + summary_input <- summary(nchar(bed_table[["sequence"]])) + summary_output <- summary(nchar(merged[["sequence"]])) + + # sequences removed + seq_to_small <- total_rows - nrow(bed_table) + no_hit_in_assembly <- nrow(bed_table) - nrow(merged) + total_removed <- seq_to_small + no_hit_in_assembly + + summary_data <- c( + "Call:", + paste0("\t", call), + "", + "Sequence length information:", + paste0("\t\t", paste0(names(summary_input), collapse = "\t")), + paste("\tInput", paste0(summary_input, collapse = "\t"), sep = "\t"), + paste("\tOutput", paste0(summary_output, collapse = "\t"), sep = "\t"), + "Total sequences:", + paste("\tInput", total_rows), + paste("\tOutput", nrow(merged)), + "Removed sequences:", + paste("\tBelow minimum size", seq_to_small), + paste("\tNo hit in assembly", no_hit_in_assembly), + paste("\tTotal", total_removed) + ) + + write(x = summary_data, file = summary) + } + # keep provided column names if (keep_col_names) { names(merged) <- col_names From 53b0857b4ab2125d8583b74dbfb78fe5578f5381 Mon Sep 17 00:00:00 2001 From: Schultheis Date: Wed, 9 Jan 2019 16:05:35 +0100 Subject: [PATCH 7/7] Parameter summary added --- bin/2.1_clustering/cdhit_wrapper.R | 34 ++++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/bin/2.1_clustering/cdhit_wrapper.R b/bin/2.1_clustering/cdhit_wrapper.R index aedddc4..ebf7667 100644 --- a/bin/2.1_clustering/cdhit_wrapper.R +++ b/bin/2.1_clustering/cdhit_wrapper.R @@ -29,7 +29,8 @@ option_list <- list( make_option(opt_str = c("--mismatch"), default = -2, help = "Mismatch score. Default = %default (CD-HIT parameter)", metavar = "integer"), make_option(opt_str = c("--gap"), default = -6, help = "Gap score. Default = %default (CD-HIT parameter)", metavar = "integer"), make_option(opt_str = c("--gap_ext"), default = -1, help = "Gap extension score. Default = %default (CD-HIT parameter)", metavar = "integer"), - make_option(opt_str = c("-x", "--sort_cluster_by_size"), default = 1, help = "Either sort cluster by decreasing length (= 0) or by decreasing size (= 1). Default = %default (CD-HIT parameter: -sc)", metavar = "integer") + make_option(opt_str = c("-x", "--sort_cluster_by_size"), default = 1, help = "Either sort cluster by decreasing length (= 0) or by decreasing size (= 1). Default = %default (CD-HIT parameter: -sc)", metavar = "integer"), + make_option(opt_str = c("--summary"), default = NULL, help = "Filepath where a small summary file of the run should be written. Default = %default", metavar = "character") ) opt_parser <- OptionParser(option_list = option_list, @@ -68,12 +69,13 @@ opt <- parse_args(opt_parser) #' @param gap Gap score. Default = -6 (CD-HIT parameter) #' @param gap_ext Gap extension score. Default = -1 (CD-HIT parameter) #' @param sort_cluster_by_size Either sort cluster by decreasing length (= 0) or by decreasing size (= 1). Default = 1 (CD-HIT parameter) +#' @param summary Filepath where a small summary file of the run should be written. #' #' @details If there is a header supplied other then the default data.table naming scheme ('V1', 'V2', etc.) it will be kept and extended. #' #' @author Hendrik Schultheis #' -cdhitest <- function(input, identity = 0.8, coverage = 8, output = "cluster.bed", clean = TRUE, threads = 1, global = 0, band_width = 20, memory = 800, word_length = 3, throw_away_sequences = 5, length_dif_cutoff_shorter_p = 0, length_dif_cutoff_shorter_n = 999999, alignment_coverage_longer_p = 0, alignment_coverage_longer_n = 99999999, alignment_coverage_shorter_p = 0, alignment_coverage_shorter_n = 99999999, max_unmatched_longer_p = 1, max_unmatched_shorter_p = 1, max_unmatched_both_n = 99999999, fast_cluster = 1, strand = 0, match = 2, mismatch = -2, gap = -6, gap_ext = -1, sort_cluster_by_size = 1) { +cdhitest <- function(input, identity = 0.8, coverage = 8, output = "cluster.bed", clean = TRUE, threads = 1, global = 0, band_width = 20, memory = 800, word_length = 3, throw_away_sequences = 5, length_dif_cutoff_shorter_p = 0, length_dif_cutoff_shorter_n = 999999, alignment_coverage_longer_p = 0, alignment_coverage_longer_n = 99999999, alignment_coverage_shorter_p = 0, alignment_coverage_shorter_n = 99999999, max_unmatched_longer_p = 1, max_unmatched_shorter_p = 1, max_unmatched_both_n = 99999999, fast_cluster = 1, strand = 0, match = 2, mismatch = -2, gap = -6, gap_ext = -1, sort_cluster_by_size = 1, summary = NULL) { # parameter checks if (system("which cd-hit-est", ignore.stdout = FALSE) != 0) { stop("Required program CD-HIT not found! Please check whether it is installed.") @@ -169,6 +171,34 @@ cdhitest <- function(input, identity = 0.8, coverage = 8, output = "cluster.bed" file.remove(fasta_file, paste0(cdhit_output, ".clstr"), cdhit_output, cluster_file) } + # write summary + if (!is.null(summary)) { + # script/ function call + file_name <- sub("--file=", "", commandArgs()[grep("--file=", commandArgs())]) + call <- ifelse(interactive(), deparse(match.call()), + paste("Rscript", basename(file_name), paste(commandArgs(trailingOnly = TRUE), collapse = " ")) + ) + + total_cluster <- length(unique(result[[ncol(result)]])) + + # cluster size table + # columns: cluster_id, size + cluster_table <- data.table::as.data.table(table(result[[ncol(result)]])) + names(cluster_table) <- c("cluster_id", "size") + + summary_data <- c( + "Call:", + paste0("\t", call), + "", + "Cluster:", + paste("\tTotal", total_cluster), + "" + ) + + write(x = summary_data, file = summary) + data.table::fwrite(x = cluster_table, file = summary, append = TRUE, sep = "\t", col.names = TRUE) + } + data.table::fwrite(x = result, file = output, sep = "\t", col.names = keep_col_names) }