Skip to content

Cluster #43

Merged
merged 9 commits into from
Jan 9, 2019
34 changes: 32 additions & 2 deletions bin/2.1_clustering/cdhit_wrapper.R
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,8 @@ option_list <- list(
make_option(opt_str = c("--mismatch"), default = -2, help = "Mismatch score. Default = %default (CD-HIT parameter)", metavar = "integer"),
make_option(opt_str = c("--gap"), default = -6, help = "Gap score. Default = %default (CD-HIT parameter)", metavar = "integer"),
make_option(opt_str = c("--gap_ext"), default = -1, help = "Gap extension score. Default = %default (CD-HIT parameter)", metavar = "integer"),
make_option(opt_str = c("-x", "--sort_cluster_by_size"), default = 1, help = "Either sort cluster by decreasing length (= 0) or by decreasing size (= 1). Default = %default (CD-HIT parameter: -sc)", metavar = "integer")
make_option(opt_str = c("-x", "--sort_cluster_by_size"), default = 1, help = "Either sort cluster by decreasing length (= 0) or by decreasing size (= 1). Default = %default (CD-HIT parameter: -sc)", metavar = "integer"),
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is the difference between cluster length and cluster size?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Decreasing length will sort after their sequences length whereas size sorts after number of members per cluster.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah okay.

make_option(opt_str = c("--summary"), default = NULL, help = "Filepath where a small summary file of the run should be written. Default = %default", metavar = "character")
)

opt_parser <- OptionParser(option_list = option_list,
Expand Down Expand Up @@ -68,12 +69,13 @@ opt <- parse_args(opt_parser)
#' @param gap Gap score. Default = -6 (CD-HIT parameter)
#' @param gap_ext Gap extension score. Default = -1 (CD-HIT parameter)
#' @param sort_cluster_by_size Either sort cluster by decreasing length (= 0) or by decreasing size (= 1). Default = 1 (CD-HIT parameter)
#' @param summary Filepath where a small summary file of the run should be written.
#'
#' @details If there is a header supplied other then the default data.table naming scheme ('V1', 'V2', etc.) it will be kept and extended.
#'
#' @author Hendrik Schultheis <Hendrik.Schultheis@@mpi-bn.mpg.de>
#'
cdhitest <- function(input, identity = 0.8, coverage = 8, output = "cluster.bed", clean = TRUE, threads = 1, global = 0, band_width = 20, memory = 800, word_length = 3, throw_away_sequences = 5, length_dif_cutoff_shorter_p = 0, length_dif_cutoff_shorter_n = 999999, alignment_coverage_longer_p = 0, alignment_coverage_longer_n = 99999999, alignment_coverage_shorter_p = 0, alignment_coverage_shorter_n = 99999999, max_unmatched_longer_p = 1, max_unmatched_shorter_p = 1, max_unmatched_both_n = 99999999, fast_cluster = 1, strand = 0, match = 2, mismatch = -2, gap = -6, gap_ext = -1, sort_cluster_by_size = 1) {
cdhitest <- function(input, identity = 0.8, coverage = 8, output = "cluster.bed", clean = TRUE, threads = 1, global = 0, band_width = 20, memory = 800, word_length = 3, throw_away_sequences = 5, length_dif_cutoff_shorter_p = 0, length_dif_cutoff_shorter_n = 999999, alignment_coverage_longer_p = 0, alignment_coverage_longer_n = 99999999, alignment_coverage_shorter_p = 0, alignment_coverage_shorter_n = 99999999, max_unmatched_longer_p = 1, max_unmatched_shorter_p = 1, max_unmatched_both_n = 99999999, fast_cluster = 1, strand = 0, match = 2, mismatch = -2, gap = -6, gap_ext = -1, sort_cluster_by_size = 1, summary = NULL) {
# parameter checks
if (system("which cd-hit-est", ignore.stdout = FALSE) != 0) {
stop("Required program CD-HIT not found! Please check whether it is installed.")
Expand Down Expand Up @@ -169,6 +171,34 @@ cdhitest <- function(input, identity = 0.8, coverage = 8, output = "cluster.bed"
file.remove(fasta_file, paste0(cdhit_output, ".clstr"), cdhit_output, cluster_file)
}

# write summary
if (!is.null(summary)) {
# script/ function call
file_name <- sub("--file=", "", commandArgs()[grep("--file=", commandArgs())])
call <- ifelse(interactive(), deparse(match.call()),
paste("Rscript", basename(file_name), paste(commandArgs(trailingOnly = TRUE), collapse = " "))
)

total_cluster <- length(unique(result[[ncol(result)]]))

# cluster size table
# columns: cluster_id, size
cluster_table <- data.table::as.data.table(table(result[[ncol(result)]]))
names(cluster_table) <- c("cluster_id", "size")

summary_data <- c(
"Call:",
paste0("\t", call),
"",
"Cluster:",
paste("\tTotal", total_cluster),
""
)

write(x = summary_data, file = summary)
data.table::fwrite(x = cluster_table, file = summary, append = TRUE, sep = "\t", col.names = TRUE)
}

data.table::fwrite(x = result, file = output, sep = "\t", col.names = keep_col_names)
}

Expand Down
43 changes: 41 additions & 2 deletions bin/2.1_clustering/reduce_sequence.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@ option_list <- list(
make_option(opt_str = c("-s", "--min_seq_length"), default = NULL, help = "Remove sequences below this length. Defaults to the maximum value of motif and k-mer and can not be lower.", metavar = "integer", type = "integer"),
make_option(opt_str = c("-n", "--minoverlap_kmer"), default = NULL, help = "Minimum required overlap between k-mer. Used to create reduced sequence ranges out of merged k-mer. Can not be greater than k-mer length. Default = kmer - 1", metavar = "integer", type = "integer"),
make_option(opt_str = c("-v", "--minoverlap_motif"), default = NULL, help = "Minimum required overlap between motif and k-mer to consider k-mer significant. Used for k-mer cutoff calculation. Can not be greater than motif and k-mer length. Default = ceiling(motif / 2)", metavar = "integer", type = "integer"),
make_option(opt_str = c("-f", "--motif_occurrence"), default = 1, help = "Define how many motifs are expected per sequence. This value is used during k-mer cutoff calculation. Default = %default meaning that there should be approximately one motif per sequence.", metavar = "double")
make_option(opt_str = c("-f", "--motif_occurrence"), default = 1, help = "Define how many motifs are expected per sequence. This value is used during k-mer cutoff calculation. Default = %default meaning that there should be approximately one motif per sequence.", metavar = "double"),
make_option(opt_str = c("--summary"), default = NULL, help = "Filepath where a small summary file of the run should be written. Default = %default", metavar = "character")
)

opt_parser <- OptionParser(option_list = option_list,
Expand All @@ -32,12 +33,13 @@ opt <- parse_args(opt_parser)
#' @param minoverlap_motif Minimum required overlap between motif and k-mer to consider k-mer significant. Used for k-mer cutoff calculation. Can not be greater than motif and k-mer length. Default = ceiling(motif / 2)
#' @param min_seq_length Remove sequences below this length. Defaults to the maximum value of motif and k-mer and can not be lower.
#' @param motif_occurrence Define how many motifs are expected per sequence. This value is used during k-mer cutoff calculation. Default = 1 meaning that there should be approximately one motif per sequence.
#' @param summary Filepath where a small summary file of the run should be written.
#'
#' @details If there is a header supplied other then the default data.table naming scheme ('V1', 'V2', etc.) it will be kept.
#'
#' @author Hendrik Schultheis <Hendrik.Schultheis@@mpi-bn.mpg.de>
#'
reduce_sequence <- function(input, kmer = 10, motif = 10, output = "reduced.bed", threads = NULL, clean = TRUE, minoverlap_kmer = kmer - 1, minoverlap_motif = ceiling(motif / 2), min_seq_length = max(c(motif, kmer)), motif_occurrence = 1) {
reduce_sequence <- function(input, kmer = 10, motif = 10, output = "reduced.bed", threads = 1, clean = TRUE, minoverlap_kmer = kmer - 1, minoverlap_motif = ceiling(motif / 2), min_seq_length = max(c(motif, kmer)), motif_occurrence = 1, summary = NULL) {
# parameter checks
if (system("which jellyfish", ignore.stdout = TRUE) != 0) {
stop("Required program jellyfish not found! Please check whether it is installed.")
Expand Down Expand Up @@ -187,6 +189,43 @@ reduce_sequence <- function(input, kmer = 10, motif = 10, output = "reduced.bed"
file.remove(fasta_file, count_output_binary, mer_count_table)
}

# write summary
if (!is.null(summary)) {
# script/ function call
file_name <- sub("--file=", "", commandArgs()[grep("--file=", commandArgs())])
call <- ifelse(interactive(), deparse(match.call()),
paste("Rscript", basename(file_name), paste(commandArgs(trailingOnly = TRUE), collapse = " "))
)

# sequence length summary
summary_input <- summary(nchar(bed_table[["sequence"]]))
summary_output <- summary(nchar(merged[["sequence"]]))

# sequences removed
seq_to_small <- total_rows - nrow(bed_table)
no_hit_in_assembly <- nrow(bed_table) - nrow(merged)
total_removed <- seq_to_small + no_hit_in_assembly

summary_data <- c(
"Call:",
paste0("\t", call),
"",
"Sequence length information:",
paste0("\t\t", paste0(names(summary_input), collapse = "\t")),
paste("\tInput", paste0(summary_input, collapse = "\t"), sep = "\t"),
paste("\tOutput", paste0(summary_output, collapse = "\t"), sep = "\t"),
"Total sequences:",
paste("\tInput", total_rows),
paste("\tOutput", nrow(merged)),
"Removed sequences:",
paste("\tBelow minimum size", seq_to_small),
paste("\tNo hit in assembly", no_hit_in_assembly),
paste("\tTotal", total_removed)
)

write(x = summary_data, file = summary)
}

# keep provided column names
if (keep_col_names) {
names(merged) <- col_names
Expand Down