Skip to content

Commit

Permalink
Parameter summary added
Browse files Browse the repository at this point in the history
  • Loading branch information
HendrikSchultheis committed Jan 9, 2019
1 parent 39f4856 commit 53b0857
Showing 1 changed file with 32 additions and 2 deletions.
34 changes: 32 additions & 2 deletions bin/2.1_clustering/cdhit_wrapper.R
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,8 @@ option_list <- list(
make_option(opt_str = c("--mismatch"), default = -2, help = "Mismatch score. Default = %default (CD-HIT parameter)", metavar = "integer"),
make_option(opt_str = c("--gap"), default = -6, help = "Gap score. Default = %default (CD-HIT parameter)", metavar = "integer"),
make_option(opt_str = c("--gap_ext"), default = -1, help = "Gap extension score. Default = %default (CD-HIT parameter)", metavar = "integer"),
make_option(opt_str = c("-x", "--sort_cluster_by_size"), default = 1, help = "Either sort cluster by decreasing length (= 0) or by decreasing size (= 1). Default = %default (CD-HIT parameter: -sc)", metavar = "integer")
make_option(opt_str = c("-x", "--sort_cluster_by_size"), default = 1, help = "Either sort cluster by decreasing length (= 0) or by decreasing size (= 1). Default = %default (CD-HIT parameter: -sc)", metavar = "integer"),
make_option(opt_str = c("--summary"), default = NULL, help = "Filepath where a small summary file of the run should be written. Default = %default", metavar = "character")
)

opt_parser <- OptionParser(option_list = option_list,
Expand Down Expand Up @@ -68,12 +69,13 @@ opt <- parse_args(opt_parser)
#' @param gap Gap score. Default = -6 (CD-HIT parameter)
#' @param gap_ext Gap extension score. Default = -1 (CD-HIT parameter)
#' @param sort_cluster_by_size Either sort cluster by decreasing length (= 0) or by decreasing size (= 1). Default = 1 (CD-HIT parameter)
#' @param summary Filepath where a small summary file of the run should be written.
#'
#' @details If there is a header supplied other then the default data.table naming scheme ('V1', 'V2', etc.) it will be kept and extended.
#'
#' @author Hendrik Schultheis <Hendrik.Schultheis@@mpi-bn.mpg.de>
#'
cdhitest <- function(input, identity = 0.8, coverage = 8, output = "cluster.bed", clean = TRUE, threads = 1, global = 0, band_width = 20, memory = 800, word_length = 3, throw_away_sequences = 5, length_dif_cutoff_shorter_p = 0, length_dif_cutoff_shorter_n = 999999, alignment_coverage_longer_p = 0, alignment_coverage_longer_n = 99999999, alignment_coverage_shorter_p = 0, alignment_coverage_shorter_n = 99999999, max_unmatched_longer_p = 1, max_unmatched_shorter_p = 1, max_unmatched_both_n = 99999999, fast_cluster = 1, strand = 0, match = 2, mismatch = -2, gap = -6, gap_ext = -1, sort_cluster_by_size = 1) {
cdhitest <- function(input, identity = 0.8, coverage = 8, output = "cluster.bed", clean = TRUE, threads = 1, global = 0, band_width = 20, memory = 800, word_length = 3, throw_away_sequences = 5, length_dif_cutoff_shorter_p = 0, length_dif_cutoff_shorter_n = 999999, alignment_coverage_longer_p = 0, alignment_coverage_longer_n = 99999999, alignment_coverage_shorter_p = 0, alignment_coverage_shorter_n = 99999999, max_unmatched_longer_p = 1, max_unmatched_shorter_p = 1, max_unmatched_both_n = 99999999, fast_cluster = 1, strand = 0, match = 2, mismatch = -2, gap = -6, gap_ext = -1, sort_cluster_by_size = 1, summary = NULL) {
# parameter checks
if (system("which cd-hit-est", ignore.stdout = FALSE) != 0) {
stop("Required program CD-HIT not found! Please check whether it is installed.")
Expand Down Expand Up @@ -169,6 +171,34 @@ cdhitest <- function(input, identity = 0.8, coverage = 8, output = "cluster.bed"
file.remove(fasta_file, paste0(cdhit_output, ".clstr"), cdhit_output, cluster_file)
}

# write summary
if (!is.null(summary)) {
# script/ function call
file_name <- sub("--file=", "", commandArgs()[grep("--file=", commandArgs())])
call <- ifelse(interactive(), deparse(match.call()),
paste("Rscript", basename(file_name), paste(commandArgs(trailingOnly = TRUE), collapse = " "))
)

total_cluster <- length(unique(result[[ncol(result)]]))

# cluster size table
# columns: cluster_id, size
cluster_table <- data.table::as.data.table(table(result[[ncol(result)]]))
names(cluster_table) <- c("cluster_id", "size")

summary_data <- c(
"Call:",
paste0("\t", call),
"",
"Cluster:",
paste("\tTotal", total_cluster),
""
)

write(x = summary_data, file = summary)
data.table::fwrite(x = cluster_table, file = summary, append = TRUE, sep = "\t", col.names = TRUE)
}

data.table::fwrite(x = result, file = output, sep = "\t", col.names = keep_col_names)
}

Expand Down

0 comments on commit 53b0857

Please sign in to comment.