Skip to content

Cluster #43

Merged
merged 9 commits into from
Jan 9, 2019
55 changes: 50 additions & 5 deletions bin/2.1_clustering/cdhit_wrapper.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#! /bin/Rscript
if (!require(optparse)) install.packages("optparse"); library(optparse)
if (!require(optparse, quietly = TRUE)) install.packages("optparse"); library(optparse)

option_list <- list(
make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input bed-file. Fourth column is expected to contain names, last column must be sequences.", metavar = "character"),
Expand Down Expand Up @@ -29,7 +29,8 @@ option_list <- list(
make_option(opt_str = c("--mismatch"), default = -2, help = "Mismatch score. Default = %default (CD-HIT parameter)", metavar = "integer"),
make_option(opt_str = c("--gap"), default = -6, help = "Gap score. Default = %default (CD-HIT parameter)", metavar = "integer"),
make_option(opt_str = c("--gap_ext"), default = -1, help = "Gap extension score. Default = %default (CD-HIT parameter)", metavar = "integer"),
make_option(opt_str = c("-x", "--sort_cluster_by_size"), default = 1, help = "Either sort cluster by decreasing length (= 0) or by decreasing size (= 1). Default = %default (CD-HIT parameter: -sc)", metavar = "integer")
make_option(opt_str = c("-x", "--sort_cluster_by_size"), default = 1, help = "Either sort cluster by decreasing length (= 0) or by decreasing size (= 1). Default = %default (CD-HIT parameter: -sc)", metavar = "integer"),
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is the difference between cluster length and cluster size?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Decreasing length will sort after their sequences length whereas size sorts after number of members per cluster.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah okay.

make_option(opt_str = c("--summary"), default = NULL, help = "Filepath where a small summary file of the run should be written. Default = %default", metavar = "character")
)

opt_parser <- OptionParser(option_list = option_list,
Expand Down Expand Up @@ -68,12 +69,14 @@ opt <- parse_args(opt_parser)
#' @param gap Gap score. Default = -6 (CD-HIT parameter)
#' @param gap_ext Gap extension score. Default = -1 (CD-HIT parameter)
#' @param sort_cluster_by_size Either sort cluster by decreasing length (= 0) or by decreasing size (= 1). Default = 1 (CD-HIT parameter)
#' @param summary Filepath where a small summary file of the run should be written.
#'
#' @details If there is a header supplied other then the default data.table naming scheme ('V1', 'V2', etc.) it will be kept and extended.
#'
#' @author Hendrik Schultheis <Hendrik.Schultheis@@mpi-bn.mpg.de>
#'
cdhitest <- function(input, identity = 0.8, coverage = 8, output = "cluster.bed", clean = TRUE, threads = 1, global = 0, band_width = 20, memory = 800, word_length = 3, throw_away_sequences = 5, length_dif_cutoff_shorter_p = 0, length_dif_cutoff_shorter_n = 999999, alignment_coverage_longer_p = 0, alignment_coverage_longer_n = 99999999, alignment_coverage_shorter_p = 0, alignment_coverage_shorter_n = 99999999, max_unmatched_longer_p = 1, max_unmatched_shorter_p = 1, max_unmatched_both_n = 99999999, fast_cluster = 1, strand = 0, match = 2, mismatch = -2, gap = -6, gap_ext = -1, sort_cluster_by_size = 1) {
cdhitest <- function(input, identity = 0.8, coverage = 8, output = "cluster.bed", clean = TRUE, threads = 1, global = 0, band_width = 20, memory = 800, word_length = 3, throw_away_sequences = 5, length_dif_cutoff_shorter_p = 0, length_dif_cutoff_shorter_n = 999999, alignment_coverage_longer_p = 0, alignment_coverage_longer_n = 99999999, alignment_coverage_shorter_p = 0, alignment_coverage_shorter_n = 99999999, max_unmatched_longer_p = 1, max_unmatched_shorter_p = 1, max_unmatched_both_n = 99999999, fast_cluster = 1, strand = 0, match = 2, mismatch = -2, gap = -6, gap_ext = -1, sort_cluster_by_size = 1, summary = NULL) {
# parameter checks
if (system("which cd-hit-est", ignore.stdout = FALSE) != 0) {
stop("Required program CD-HIT not found! Please check whether it is installed.")
}
Expand All @@ -82,6 +85,14 @@ cdhitest <- function(input, identity = 0.8, coverage = 8, output = "cluster.bed"
stop("No input specified! Please forward a valid bed-file.")
}

if (!file.exists(input)) {
stop("File ", input, " does not exist!")
}

if (!is.logical(clean)) {
stop("Parameter clean has to be a boolean value.")
}

message("Loading bed.")
# load bed if necessary
if (!data.table::is.data.table(input)) {
Expand Down Expand Up @@ -160,12 +171,46 @@ cdhitest <- function(input, identity = 0.8, coverage = 8, output = "cluster.bed"
file.remove(fasta_file, paste0(cdhit_output, ".clstr"), cdhit_output, cluster_file)
}

# write summary
if (!is.null(summary)) {
# script/ function call
file_name <- sub("--file=", "", commandArgs()[grep("--file=", commandArgs())])
call <- ifelse(interactive(), deparse(match.call()),
paste("Rscript", basename(file_name), paste(commandArgs(trailingOnly = TRUE), collapse = " "))
)

total_cluster <- length(unique(result[[ncol(result)]]))

# cluster size table
# columns: cluster_id, size
cluster_table <- data.table::as.data.table(table(result[[ncol(result)]]))
names(cluster_table) <- c("cluster_id", "size")

summary_data <- c(
"Call:",
paste0("\t", call),
"",
"Cluster:",
paste("\tTotal", total_cluster),
""
)

write(x = summary_data, file = summary)
data.table::fwrite(x = cluster_table, file = summary, append = TRUE, sep = "\t", col.names = TRUE)
}

data.table::fwrite(x = result, file = output, sep = "\t", col.names = keep_col_names)
}

# call function with given parameter if not in interactive context (e.g. run from shell)
if (!interactive()) {
# remove last parameter (help param)
params <- opt[-length(opt)]
do.call(cdhitest, args = params)

# show help if called without arguments
if (length(commandArgs(trailingOnly = TRUE)) <= 0) {
print_help(opt_parser)
} else {
params <- opt[-length(opt)]
do.call(cdhitest, args = params)
}
}
95 changes: 89 additions & 6 deletions bin/2.1_clustering/reduce_sequence.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#! /bin/Rscript
if (!require(optparse)) install.packages("optparse"); library(optparse)
if (!require(optparse, quietly = TRUE)) install.packages("optparse"); library(optparse)

option_list <- list(
make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input bed-file. Last column must be sequences.", metavar = "character"),
Expand All @@ -11,7 +11,8 @@ option_list <- list(
make_option(opt_str = c("-s", "--min_seq_length"), default = NULL, help = "Remove sequences below this length. Defaults to the maximum value of motif and k-mer and can not be lower.", metavar = "integer", type = "integer"),
make_option(opt_str = c("-n", "--minoverlap_kmer"), default = NULL, help = "Minimum required overlap between k-mer. Used to create reduced sequence ranges out of merged k-mer. Can not be greater than k-mer length. Default = kmer - 1", metavar = "integer", type = "integer"),
make_option(opt_str = c("-v", "--minoverlap_motif"), default = NULL, help = "Minimum required overlap between motif and k-mer to consider k-mer significant. Used for k-mer cutoff calculation. Can not be greater than motif and k-mer length. Default = ceiling(motif / 2)", metavar = "integer", type = "integer"),
make_option(opt_str = c("-f", "--motif_occurrence"), default = 1, help = "Define how many motifs are expected per sequence. This value is used during k-mer cutoff calculation. Default = %default meaning that there should be approximately one motif per sequence.", metavar = "double")
make_option(opt_str = c("-f", "--motif_occurrence"), default = 1, help = "Define how many motifs are expected per sequence. This value is used during k-mer cutoff calculation. Default = %default meaning that there should be approximately one motif per sequence.", metavar = "double"),
make_option(opt_str = c("--summary"), default = NULL, help = "Filepath where a small summary file of the run should be written. Default = %default", metavar = "character")
)

opt_parser <- OptionParser(option_list = option_list,
Expand All @@ -32,12 +33,14 @@ opt <- parse_args(opt_parser)
#' @param minoverlap_motif Minimum required overlap between motif and k-mer to consider k-mer significant. Used for k-mer cutoff calculation. Can not be greater than motif and k-mer length. Default = ceiling(motif / 2)
#' @param min_seq_length Remove sequences below this length. Defaults to the maximum value of motif and k-mer and can not be lower.
#' @param motif_occurrence Define how many motifs are expected per sequence. This value is used during k-mer cutoff calculation. Default = 1 meaning that there should be approximately one motif per sequence.
#' @param summary Filepath where a small summary file of the run should be written.
#'
#' @details If there is a header supplied other then the default data.table naming scheme ('V1', 'V2', etc.) it will be kept.
#'
#' @author Hendrik Schultheis <Hendrik.Schultheis@@mpi-bn.mpg.de>
#'
reduce_sequence <- function(input, kmer = 10, motif = 10, output = "reduced.bed", threads = NULL, clean = TRUE, minoverlap_kmer = kmer - 1, minoverlap_motif = ceiling(motif / 2), min_seq_length = max(c(motif, kmer)), motif_occurrence = 1) {
reduce_sequence <- function(input, kmer = 10, motif = 10, output = "reduced.bed", threads = 1, clean = TRUE, minoverlap_kmer = kmer - 1, minoverlap_motif = ceiling(motif / 2), min_seq_length = max(c(motif, kmer)), motif_occurrence = 1, summary = NULL) {
# parameter checks
if (system("which jellyfish", ignore.stdout = TRUE) != 0) {
stop("Required program jellyfish not found! Please check whether it is installed.")
}
Expand All @@ -46,6 +49,42 @@ reduce_sequence <- function(input, kmer = 10, motif = 10, output = "reduced.bed"
stop("No input specified! Please forward a valid bed-file.")
}

if (!file.exists(input)) {
stop("File ", input, " does not exist!")
}

if (!is.numeric(kmer) || kmer != as.integer(kmer) || kmer <= 0) {
stop("K-mer has to be a positive integer above 0.")
}

if (!is.numeric(motif) || motif != as.integer(motif) || motif <= 0) {
stop("Motif has to be a positive integer above 0.")
}

if (!is.numeric(threads) || threads != as.integer(threads) || threads < 0) {
stop("Threads has to be a positive integer (0 or greater).")
}

if (!is.logical(clean)) {
stop("Parameter clean has to be a boolean value.")
}

if (!is.numeric(minoverlap_kmer) || minoverlap_kmer != as.integer(minoverlap_kmer) || minoverlap_kmer <= 0) {
stop("Minoverlap_kmer has to be a positive integer above 0.")
}

if (!is.numeric(minoverlap_motif) || minoverlap_motif != as.integer(minoverlap_motif) || minoverlap_motif <= 0) {
stop("Minoverlap_motif has to be a positive integer above 0.")
}

if (!is.numeric(min_seq_length) || min_seq_length != as.integer(min_seq_length) || min_seq_length <= 0) {
stop("Min_seq_length hat to be a positive integer above 0.")
}

if (!is.numeric(motif_occurrence) || motif_occurrence < 0 || motif_occurrence > 1) { # TODO remove motif_occurence > 1. See TODO of find_kmer_regions below.
stop("Motif_occurence has to be a numeric value above 0 and can not be greater than 1.")
}

# get number of available cores
if (threads == 0) {
threads <- parallel::detectCores()
Expand Down Expand Up @@ -119,6 +158,7 @@ reduce_sequence <- function(input, kmer = 10, motif = 10, output = "reduced.bed"

# reduce k-mer
reduced_kmer <- reduce_kmer(kmer = kmer_counts, significant = keep_hits)
message("Reduced kmer to first most frequent ", nrow(reduced_kmer), " out of ", nrow(kmer_counts), ".")

message("Find k-mer in sequences.")
# find k-mer in sequences
Expand Down Expand Up @@ -149,6 +189,43 @@ reduce_sequence <- function(input, kmer = 10, motif = 10, output = "reduced.bed"
file.remove(fasta_file, count_output_binary, mer_count_table)
}

# write summary
if (!is.null(summary)) {
# script/ function call
file_name <- sub("--file=", "", commandArgs()[grep("--file=", commandArgs())])
call <- ifelse(interactive(), deparse(match.call()),
paste("Rscript", basename(file_name), paste(commandArgs(trailingOnly = TRUE), collapse = " "))
)

# sequence length summary
summary_input <- summary(nchar(bed_table[["sequence"]]))
summary_output <- summary(nchar(merged[["sequence"]]))

# sequences removed
seq_to_small <- total_rows - nrow(bed_table)
no_hit_in_assembly <- nrow(bed_table) - nrow(merged)
total_removed <- seq_to_small + no_hit_in_assembly

summary_data <- c(
"Call:",
paste0("\t", call),
"",
"Sequence length information:",
paste0("\t\t", paste0(names(summary_input), collapse = "\t")),
paste("\tInput", paste0(summary_input, collapse = "\t"), sep = "\t"),
paste("\tOutput", paste0(summary_output, collapse = "\t"), sep = "\t"),
"Total sequences:",
paste("\tInput", total_rows),
paste("\tOutput", nrow(merged)),
"Removed sequences:",
paste("\tBelow minimum size", seq_to_small),
paste("\tNo hit in assembly", no_hit_in_assembly),
paste("\tTotal", total_removed)
)

write(x = summary_data, file = summary)
}

# keep provided column names
if (keep_col_names) {
names(merged) <- col_names
Expand Down Expand Up @@ -280,7 +357,13 @@ find_kmer_regions <- function(bed, kmer_counts, minoverlap = 1 , threads = NULL)
if (!interactive()) {
# show apply progressbar
pbo <- pbapply::pboptions(type = "timer")
# remove last parameter (help param)
params <- opt[-length(opt)]
do.call(reduce_sequence, args = params)

# show help if called without arguments
if (length(commandArgs(trailingOnly = TRUE)) <= 0) {
print_help(opt_parser)
} else {
# remove last parameter (help param)
params <- opt[-length(opt)]
do.call(reduce_sequence, args = params)
}
}