Merge pull request #43 from loosolab/cluster

Cluster
loosolab · Jan 9, 2019 · 9ab7116 · 9ab7116
2 parents 81c6fcd + 53b0857
commit 9ab7116
Show file tree

Hide file tree

Showing 2 changed files with 139 additions and 11 deletions.
diff --git a/bin/2.1_clustering/cdhit_wrapper.R b/bin/2.1_clustering/cdhit_wrapper.R
@@ -1,5 +1,5 @@
 #! /bin/Rscript
-if (!require(optparse)) install.packages("optparse"); library(optparse)
+if (!require(optparse, quietly = TRUE)) install.packages("optparse"); library(optparse)
 
 option_list <- list(
   make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input bed-file. Fourth column is expected to contain names, last column must be sequences.", metavar = "character"),
@@ -29,7 +29,8 @@ option_list <- list(
   make_option(opt_str = c("--mismatch"), default = -2, help = "Mismatch score. Default = %default (CD-HIT parameter)", metavar = "integer"),
   make_option(opt_str = c("--gap"), default = -6, help = "Gap score. Default = %default (CD-HIT parameter)", metavar = "integer"),
   make_option(opt_str = c("--gap_ext"), default = -1, help = "Gap extension score. Default = %default (CD-HIT parameter)", metavar = "integer"),
-  make_option(opt_str = c("-x", "--sort_cluster_by_size"), default = 1, help = "Either sort cluster by decreasing length (= 0) or by decreasing size (= 1). Default = %default (CD-HIT parameter: -sc)", metavar = "integer")
+  make_option(opt_str = c("-x", "--sort_cluster_by_size"), default = 1, help = "Either sort cluster by decreasing length (= 0) or by decreasing size (= 1). Default = %default (CD-HIT parameter: -sc)", metavar = "integer"),
+  make_option(opt_str = c("--summary"), default = NULL, help = "Filepath where a small summary file of the run should be written. Default = %default", metavar = "character")
 )
 
 opt_parser <- OptionParser(option_list = option_list, 
@@ -68,12 +69,14 @@ opt <- parse_args(opt_parser)
 #' @param gap Gap score. Default = -6 (CD-HIT parameter)
 #' @param gap_ext Gap extension score. Default = -1 (CD-HIT parameter)
 #' @param sort_cluster_by_size Either sort cluster by decreasing length (= 0) or by decreasing size (= 1). Default = 1 (CD-HIT parameter)
+#' @param summary Filepath where a small summary file of the run should be written.
 #' 
 #' @details If there is a header supplied other then the default data.table naming scheme ('V1', 'V2', etc.) it will be kept and extended.
 #' 
 #' @author Hendrik Schultheis <Hendrik.Schultheis@@mpi-bn.mpg.de>
 #' 
-cdhitest <- function(input, identity = 0.8, coverage = 8, output = "cluster.bed", clean = TRUE, threads = 1, global = 0, band_width = 20, memory = 800, word_length = 3, throw_away_sequences = 5, length_dif_cutoff_shorter_p = 0, length_dif_cutoff_shorter_n = 999999, alignment_coverage_longer_p = 0, alignment_coverage_longer_n = 99999999, alignment_coverage_shorter_p = 0, alignment_coverage_shorter_n = 99999999, max_unmatched_longer_p = 1, max_unmatched_shorter_p = 1, max_unmatched_both_n = 99999999, fast_cluster = 1, strand = 0, match = 2, mismatch = -2, gap = -6, gap_ext = -1, sort_cluster_by_size = 1) {
+cdhitest <- function(input, identity = 0.8, coverage = 8, output = "cluster.bed", clean = TRUE, threads = 1, global = 0, band_width = 20, memory = 800, word_length = 3, throw_away_sequences = 5, length_dif_cutoff_shorter_p = 0, length_dif_cutoff_shorter_n = 999999, alignment_coverage_longer_p = 0, alignment_coverage_longer_n = 99999999, alignment_coverage_shorter_p = 0, alignment_coverage_shorter_n = 99999999, max_unmatched_longer_p = 1, max_unmatched_shorter_p = 1, max_unmatched_both_n = 99999999, fast_cluster = 1, strand = 0, match = 2, mismatch = -2, gap = -6, gap_ext = -1, sort_cluster_by_size = 1, summary = NULL) {
+  # parameter checks
   if (system("which cd-hit-est", ignore.stdout = FALSE) != 0) {
     stop("Required program CD-HIT not found! Please check whether it is installed.")
   }
@@ -82,6 +85,14 @@ cdhitest <- function(input, identity = 0.8, coverage = 8, output = "cluster.bed"
     stop("No input specified! Please forward a valid bed-file.")
   }
 
+  if (!file.exists(input)) {
+    stop("File ", input, " does not exist!")
+  }
+
+  if (!is.logical(clean)) {
+    stop("Parameter clean has to be a boolean value.")
+  }
+
   message("Loading bed.")
   # load bed if necessary
   if (!data.table::is.data.table(input)) {
@@ -160,12 +171,46 @@ cdhitest <- function(input, identity = 0.8, coverage = 8, output = "cluster.bed"
     file.remove(fasta_file, paste0(cdhit_output, ".clstr"), cdhit_output, cluster_file)
   }
 
+  # write summary
+  if (!is.null(summary)) {
+    # script/ function call
+    file_name <- sub("--file=", "", commandArgs()[grep("--file=", commandArgs())])
+    call <- ifelse(interactive(), deparse(match.call()),
+                   paste("Rscript", basename(file_name), paste(commandArgs(trailingOnly = TRUE), collapse = " "))
+    )
+
+    total_cluster <- length(unique(result[[ncol(result)]]))
+
+    # cluster size table
+    # columns: cluster_id, size
+    cluster_table <- data.table::as.data.table(table(result[[ncol(result)]]))
+    names(cluster_table) <- c("cluster_id", "size")
+
+    summary_data <- c(
+      "Call:",
+      paste0("\t", call),
+      "",
+      "Cluster:",
+      paste("\tTotal", total_cluster),
+      ""
+    )
+
+    write(x = summary_data, file = summary)
+    data.table::fwrite(x = cluster_table, file = summary, append = TRUE, sep = "\t", col.names = TRUE)
+  }
+
   data.table::fwrite(x = result, file = output, sep = "\t", col.names = keep_col_names)
 }
 
 # call function with given parameter if not in interactive context (e.g. run from shell)
 if (!interactive()) {
   # remove last parameter (help param)
-  params <- opt[-length(opt)]
-  do.call(cdhitest, args = params)
+
+  # show help if called without arguments
+  if (length(commandArgs(trailingOnly = TRUE)) <= 0) {
+    print_help(opt_parser)
+  } else {
+    params <- opt[-length(opt)]
+    do.call(cdhitest, args = params)
+  }
 }
diff --git a/bin/2.1_clustering/reduce_sequence.R b/bin/2.1_clustering/reduce_sequence.R
@@ -1,5 +1,5 @@
 #! /bin/Rscript
-if (!require(optparse)) install.packages("optparse"); library(optparse)
+if (!require(optparse, quietly = TRUE)) install.packages("optparse"); library(optparse)
 
 option_list <- list(
   make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input bed-file. Last column must be sequences.", metavar = "character"),
@@ -11,7 +11,8 @@ option_list <- list(
   make_option(opt_str = c("-s", "--min_seq_length"), default = NULL, help = "Remove sequences below this length. Defaults to the maximum value of motif and k-mer and can not be lower.", metavar = "integer", type = "integer"),
   make_option(opt_str = c("-n", "--minoverlap_kmer"), default = NULL, help = "Minimum required overlap between k-mer. Used to create reduced sequence ranges out of merged k-mer. Can not be greater than k-mer length. Default = kmer - 1", metavar = "integer", type = "integer"),
   make_option(opt_str = c("-v", "--minoverlap_motif"), default = NULL, help = "Minimum required overlap between motif and k-mer to consider k-mer significant. Used for k-mer cutoff calculation. Can not be greater than motif and k-mer length. Default = ceiling(motif / 2)", metavar = "integer", type = "integer"),
-  make_option(opt_str = c("-f", "--motif_occurrence"), default = 1, help = "Define how many motifs are expected per sequence. This value is used during k-mer cutoff calculation. Default = %default meaning that there should be approximately one motif per sequence.", metavar = "double")
+  make_option(opt_str = c("-f", "--motif_occurrence"), default = 1, help = "Define how many motifs are expected per sequence. This value is used during k-mer cutoff calculation. Default = %default meaning that there should be approximately one motif per sequence.", metavar = "double"),
+  make_option(opt_str = c("--summary"), default = NULL, help = "Filepath where a small summary file of the run should be written. Default = %default", metavar = "character")
 )
 
 opt_parser <- OptionParser(option_list = option_list, 
@@ -32,12 +33,14 @@ opt <- parse_args(opt_parser)
 #' @param minoverlap_motif Minimum required overlap between motif and k-mer to consider k-mer significant. Used for k-mer cutoff calculation. Can not be greater than motif and k-mer length. Default = ceiling(motif / 2)
 #' @param min_seq_length Remove sequences below this length. Defaults to the maximum value of motif and k-mer and can not be lower.
 #' @param motif_occurrence Define how many motifs are expected per sequence. This value is used during k-mer cutoff calculation. Default = 1 meaning that there should be approximately one motif per sequence.
+#' @param summary Filepath where a small summary file of the run should be written.
 #' 
 #' @details If there is a header supplied other then the default data.table naming scheme ('V1', 'V2', etc.) it will be kept.
 #' 
 #' @author Hendrik Schultheis <Hendrik.Schultheis@@mpi-bn.mpg.de>
 #' 
-reduce_sequence <- function(input, kmer = 10, motif = 10, output = "reduced.bed", threads = NULL, clean = TRUE, minoverlap_kmer = kmer - 1, minoverlap_motif = ceiling(motif / 2), min_seq_length = max(c(motif, kmer)), motif_occurrence = 1) {
+reduce_sequence <- function(input, kmer = 10, motif = 10, output = "reduced.bed", threads = 1, clean = TRUE, minoverlap_kmer = kmer - 1, minoverlap_motif = ceiling(motif / 2), min_seq_length = max(c(motif, kmer)), motif_occurrence = 1, summary = NULL) {
+  # parameter checks
   if (system("which jellyfish", ignore.stdout = TRUE) != 0) {
     stop("Required program jellyfish not found! Please check whether it is installed.")
   }
@@ -46,6 +49,42 @@ reduce_sequence <- function(input, kmer = 10, motif = 10, output = "reduced.bed"
     stop("No input specified! Please forward a valid bed-file.")
   }
 
+  if (!file.exists(input)) {
+    stop("File ", input, " does not exist!")
+  }
+
+  if (!is.numeric(kmer) || kmer != as.integer(kmer) || kmer <= 0) {
+    stop("K-mer has to be a positive integer above 0.")
+  }
+
+  if (!is.numeric(motif) || motif != as.integer(motif) || motif <= 0) {
+    stop("Motif has to be a positive integer above 0.")
+  }
+
+  if (!is.numeric(threads) || threads != as.integer(threads) || threads < 0) {
+    stop("Threads has to be a positive integer (0 or greater).")
+  }
+
+  if (!is.logical(clean)) {
+    stop("Parameter clean has to be a boolean value.")
+  }
+
+  if (!is.numeric(minoverlap_kmer) || minoverlap_kmer != as.integer(minoverlap_kmer) || minoverlap_kmer <= 0) {
+    stop("Minoverlap_kmer has to be a positive integer above 0.")
+  }
+
+  if (!is.numeric(minoverlap_motif) || minoverlap_motif != as.integer(minoverlap_motif) || minoverlap_motif <= 0) {
+    stop("Minoverlap_motif has to be a positive integer above 0.")
+  }
+
+  if (!is.numeric(min_seq_length) || min_seq_length != as.integer(min_seq_length) || min_seq_length <= 0) {
+    stop("Min_seq_length hat to be a positive integer above 0.")
+  }
+
+  if (!is.numeric(motif_occurrence) || motif_occurrence < 0 || motif_occurrence > 1) { # TODO remove motif_occurence > 1. See TODO of find_kmer_regions below.
+    stop("Motif_occurence has to be a numeric value above 0 and can not be greater than 1.")
+  }
+
   # get number of available cores
   if (threads == 0) {
     threads <- parallel::detectCores()
@@ -119,6 +158,7 @@ reduce_sequence <- function(input, kmer = 10, motif = 10, output = "reduced.bed"
 
   # reduce k-mer
   reduced_kmer <- reduce_kmer(kmer = kmer_counts, significant = keep_hits)
+  message("Reduced kmer to first most frequent ", nrow(reduced_kmer), " out of ", nrow(kmer_counts), ".")
 
   message("Find k-mer in sequences.")
   # find k-mer in sequences
@@ -149,6 +189,43 @@ reduce_sequence <- function(input, kmer = 10, motif = 10, output = "reduced.bed"
     file.remove(fasta_file, count_output_binary, mer_count_table)
   }
 
+  # write summary
+  if (!is.null(summary)) {
+    # script/ function call
+    file_name <- sub("--file=", "", commandArgs()[grep("--file=", commandArgs())])
+    call <- ifelse(interactive(), deparse(match.call()),
+                        paste("Rscript", basename(file_name), paste(commandArgs(trailingOnly = TRUE), collapse = " "))
+                        )
+
+    # sequence length summary
+    summary_input <- summary(nchar(bed_table[["sequence"]]))
+    summary_output <- summary(nchar(merged[["sequence"]]))
+
+    # sequences removed
+    seq_to_small <- total_rows - nrow(bed_table)
+    no_hit_in_assembly <- nrow(bed_table) - nrow(merged)
+    total_removed <- seq_to_small + no_hit_in_assembly
+
+    summary_data <- c(
+      "Call:",
+      paste0("\t", call),
+      "",
+      "Sequence length information:",
+      paste0("\t\t", paste0(names(summary_input), collapse = "\t")),
+      paste("\tInput", paste0(summary_input, collapse = "\t"), sep = "\t"),
+      paste("\tOutput", paste0(summary_output, collapse = "\t"), sep = "\t"),
+      "Total sequences:",
+      paste("\tInput", total_rows),
+      paste("\tOutput", nrow(merged)),
+      "Removed sequences:",
+      paste("\tBelow minimum size", seq_to_small),
+      paste("\tNo hit in assembly", no_hit_in_assembly),
+      paste("\tTotal", total_removed)
+    )
+
+    write(x = summary_data, file = summary)
+  }
+
   # keep provided column names
   if (keep_col_names) {
     names(merged) <- col_names
@@ -280,7 +357,13 @@ find_kmer_regions <- function(bed, kmer_counts, minoverlap = 1 , threads = NULL)
 if (!interactive()) {
   # show apply progressbar
   pbo <- pbapply::pboptions(type = "timer")
-  # remove last parameter (help param)
-  params <- opt[-length(opt)]
-  do.call(reduce_sequence, args = params)  
+
+  # show help if called without arguments
+  if (length(commandArgs(trailingOnly = TRUE)) <= 0) {
+    print_help(opt_parser)
+  } else {
+    # remove last parameter (help param)
+    params <- opt[-length(opt)]
+    do.call(reduce_sequence, args = params)      
+  }
 }