diff --git a/bin/cdhit_wrapper.R b/bin/cdhit_wrapper.R index f26a71a..9c62564 100644 --- a/bin/cdhit_wrapper.R +++ b/bin/cdhit_wrapper.R @@ -3,53 +3,77 @@ library("optparse") option_list <- list( make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input bed-file. Fourth column is expected to contain names, last column must be sequences.", metavar = "character"), - make_option(opt_str = c("-c", "--identity"), default = 0.8, help = "Identity threshold. Default = %default", metavar = "double >= 0.8"), - make_option(opt_str = c("-A", "--coverage"), default = 8, help = "Minimal alignment length for both sequences in nucelotides. Default = %default", metavar = "integer"), + make_option(opt_str = c("-c", "--identity"), default = 0.8, help = "Identity threshold. Default = %default (CD-HIT parameter)", metavar = "double >= 0.8"), + make_option(opt_str = c("-A", "--coverage"), default = 8, help = "Minimal alignment length for both sequences in nucelotides. Default = %default (CD-HIT parameter)", metavar = "integer"), make_option(opt_str = c("-o", "--output"), default = "cluster.bed", help = "Output file same as input but with appended column of cluster numbers. Default = %default", metavar = "character"), make_option(opt_str = c("--clean"), default = TRUE, help = "Delete all temporary files. Default = %default", metavar = "logical"), - # make_option(opt_str = c("-G", "--global"), default = FALSE), - # make_option(opt_str = c("-b", "--band_width"), default = 20), - # make_option(opt_str = c("-M", "--memory"), default = 800), - make_option(opt_str = c("-T", "--threads"), default = 1)#, - # make_option(opt_str = c("-n", "--word_length"), default = 3), - # make_option(opt_str = c("-l", "--throw_away_sequences"), default = 5), + make_option(opt_str = c("-G", "--global"), default = 0, help = "Global sequence identity = 1, local = 0. Default = %default (CD-HIT parameter)", metavar = "integer"), + make_option(opt_str = c("-b", "--band_width"), default = 20, help = "Alignment band width. Default = %default (CD-HIT parameter)", metavar = "integer"), + make_option(opt_str = c("-M", "--memory"), default = 800, help = "Memory limit in MB. 0 for unlimited. Default = %default (CD-HIT parameter)", metavar = "integer"), + make_option(opt_str = c("-T", "--threads"), default = 1, help = "Number of threads. Default = %default (CD-HIT parameter)", metavar = "integer"), + make_option(opt_str = c("-n", "--word_length"), default = 3, help = "Default = %default (CD-HIT parameter)", metavar = "integer"), + make_option(opt_str = c("-l", "--throw_away_sequences"), default = 5, help = "Maximum length of sequences thrown away. Default = %default (CD-HIT parameter)", metavar = "integer"), # make_option(opt_str = c("-d", "--description")), # can not produce bed if this is != 0 - # make_option(opt_str = c("-s", "--length_dif_cutoff_shorter_p"), default = 0.0), - # make_option(opt_str = c("-S", "--length_dif_cutoff_shorter_n"), default = 999999), - # make_option(opt_str = c("-aL", "--alignment_coverage_longer_p"), default = 0.0), - # make_option(opt_str = c("-AL", "--alignment_coverage_longer_n"), default = 99999999), - # make_option(opt_str = c("-aS", "--alignment_coverage_shorter_p"), default = 0.0), - # make_option(opt_str = c("-AS", "--alignment_coverage_shorter_n"), default = 99999999), - # make_option(opt_str = c("-uL", "--max_unmatched_longer_p"), default = 1.0), - # make_option(opt_str = c("-uS", "--max_unmatched_shorter_p"), default = 99999999), - # make_option(opt_str = c("-U", "--max_unmatched_both_n"), default = 99999999), - # make_option(opt_str = c("-g", "--fast_cluster"), default = FALSE), - # make_option(opt_str = c("-r", "--ignore_strand"), default = TRUE), - # make_option(opt_str = c("--match"), default = 2), - # make_option(opt_str = c("--mismatch"), default = -2), - # make_option(opt_str = c("--gap"), default = -6), - # make_option(opt_str = c("--gap_ext"), default = -1), - # make_option(opt_str = c("-sc", "--sort_cluster_by_size"), default = TRUE) - # TODO more args + make_option(opt_str = c("-s", "--length_dif_cutoff_shorter_p"), default = 0.0, help = "Shorter sequences length must be at least x percent of longer sequenecs length. Default = %default (CD-HIT parameter)", metavar = "double"), + make_option(opt_str = c("-S", "--length_dif_cutoff_shorter_n"), default = 999999, help = "Length difference between sequences can not be higher than x nucleotides. Default = %default (CD-HIT parameter)", metavar = "integer"), + make_option(opt_str = c("-aL", "--alignment_coverage_longer_p"), default = 0.0, help = "Alignment must cover x percent of longer sequence. Default = %default (CD-HIT parameter)", metavar = "double"), + make_option(opt_str = c("-AL", "--alignment_coverage_longer_n"), default = 99999999, help = "There can not be more than x unaligned nucleotides on the longer sequence. Default = %default (CD-HIT parameter)", metavar = "integer"), + make_option(opt_str = c("-aS", "--alignment_coverage_shorter_p"), default = 0.0, help = "Alignment must cover x percent of shorter sequence. Default = %default (CD-HIT parameter)", metavar = "double"), + make_option(opt_str = c("-AS", "--alignment_coverage_shorter_n"), default = 99999999, help = "There can not be more than x unaligned nucleotides on the longer sequence. Default = %default (CD-HIT parameter)", metavar = "integer"), + make_option(opt_str = c("-uL", "--max_unmatched_longer_p"), default = 1.0, help = "Maximum unmatched percentage on longer sequence (excludes leading tailing gap). Default = %default (CD-HIT parameter)", metavar = "double"), + make_option(opt_str = c("-uS", "--max_unmatched_shorter_p"), default = 1.0, help = "Maximum unmatched percentage on shorter sequence (excludes leading tailing gap). Default = %default (CD-HIT parameter)", metavar = "double"), + make_option(opt_str = c("-U", "--max_unmatched_both_n"), default = 99999999, help = "Maximum unmatched nucleotied on both sequences (excludes leading tailing gap). Default = %default (CD-HIT parameter)", metavar = "integer"), + make_option(opt_str = c("-g", "--fast_cluster"), default = 1, help = "Cluster sequence in first cluster that meets threshold = 0 (fast). Or cluster in best Cluster = 1 (accurate). Default = %default (CD-HIT parameter)", metavar = "integer"), + make_option(opt_str = c("-r", "--ignore_strand"), default = 0, help = "Align +/+ & +/- (= 1). Or align only +/+ (= 0). Default = %default (CD-HIT parameter)", metavar = "integer"), + make_option(opt_str = c("--match"), default = 2, help = "Matching score. Default = %default (CD-HIT parameter)", metavar = "integer"), + make_option(opt_str = c("--mismatch"), default = -2, help = "Mismatch score. Default = %default (CD-HIT parameter)", metavar = "integer"), + make_option(opt_str = c("--gap"), default = -6, help = "Gap score. Default = %default (CD-HIT parameter)", metavar = "integer"), + make_option(opt_str = c("--gap_ext"), default = -1, help = "Gap extension score. Default = %default (CD-HIT parameter)", metavar = "integer"), + make_option(opt_str = c("-sc", "--sort_cluster_by_size"), default = 1, help = "Either sort cluster by decreasing length (= 0) or by decreasing size (= 1). Default = %default (CD-HIT parameter)", metavar = "integer") ) opt_parser <- OptionParser(option_list = option_list, - description = "CD-HIT-EST Wrapper function.") + description = "CD-HIT-EST Wrapper function. See https://github.com/weizhongli/cdhit/wiki for more information regarding CD-HITs parameters.") opt <- parse_args(opt_parser) -#' cd-hit wrapper +#' cd-hit-est wrapper +#' Receives bed with sequences in last column and writes bed with added cluster numbers. #' #' @param input Data.table or file in bed format (requires names in fourth column and sequences in last column). -#' @param identity Identity threshold. -#' @param coverage Minimal alignment length for both sequences in nucelotides. -#' @param output Clustered bedfile. Adds cluster number in last column (lower number = bigger cluster). -#' @param clean Clean up after run. -#' @param threads Number of threads to use in cd-hit (0 = all cpus). +#' @param identity Identity threshold. Default = 0.8 (CD-HIT parameter) +#' @param coverage Minimal alignment length for both sequences in nucelotides. Default = 8 (CD-HIT parameter) +#' @param output Clustered bedfile. Adds cluster number in last column (numbering depend on sort_by_cluster_size parameter). Default = cluster.bed +#' @param clean Clean up after run. Default = TRUE +#' @param threads Number of threads to use (0 = all cores). Default = 1 (CD-HIT parameter) +#' @param global Global sequence identity = 1, local = 0. Default = 0 (CD-HIT parameter) +#' @param band_width "Alignment band width. Default = 20 (CD-HIT parameter)" +#' @param memory Memory limit in MB. 0 for unlimited. Default = 800 (CD-HIT parameter) +#' @param word_length Default = 3 (CD-HIT parameter) +#' @param throw_away_sequences Maximum length of sequences thrown away. Default = %default (CD-HIT parameter) +#' @param length_dif_cutoff_shorter_p Shorter sequences length must be at least x percent of longer sequenecs length. Default = 0 (CD-HIT parameter) +#' @param length_dif_cutoff_shorter_n Length difference between sequences can not be higher than x nucleotides. Default = 999999 (CD-HIT parameter) +#' @param alignment_coverage_longer_p Alignment must cover x percent of longer sequence. Default = 0 (CD-HIT parameter) +#' @param alignment_coverage_longer_n There can not be more than x unaligned nucleotides on the longer sequence. Default = 99999999 (CD-HIT parameter) +#' @param alignment_coverage_shorter_p Alignment must cover x percent of shorter sequence. Default = 0 (CD-HIT parameter) +#' @param alignment_coverage_shorter_n There can not be more than x unaligned nucleotides on the longer sequence. Default = 99999999 (CD-HIT parameter) +#' @param max_unmatched_longer_p Maximum unmatched percentage on longer sequence (excludes leading tailing gap). Default = 1 (CD-HIT parameter) +#' @param max_unmatched_shorter_p Maximum unmatched percentage on shorter sequence (excludes leading tailing gap). Default = 1 (CD-HIT parameter) +#' @param max_unmatched_both_n Maximum unmatched nucleotied on both sequences (excludes leading tailing gap). Default = 99999999 (CD-HIT parameter) +#' @param fast_cluster Cluster sequence in first cluster that meets threshold = 0 (fast). Or cluster in best Cluster = 1 (accurate). Default = 1 (CD-HIT parameter) +#' @param ignore_strand Align +/+ & +/- (= 1). Or align only +/+ (= 0). Default = 0 (CD-HIT parameter) +#' @param match Matching score. Default = 2 (CD-HIT parameter) +#' @param mismatch Mismatch score. Default = -2 (CD-HIT parameter) +#' @param gap Gap score. Default = -6 (CD-HIT parameter) +#' @param gat_ext Gap extension score. Default = -1 (CD-HIT parameter) +#' @param sort_cluster_by_size Either sort cluster by decreasing length (= 0) or by decreasing size (= 1). Default = 1 (CD-HIT parameter) #' -#' TODO add all cdhit parameter (necessary ?) #' TODO check whether cdhit is installed -cdhitest <- function(input, identity = 0.8, coverage = 8, output = "cluster.bed", clean = TRUE, threads = 1) { +cdhitest <- function(input, identity = 0.8, coverage = 8, output = "cluster.bed", clean = TRUE, threads = 1, global = 0, band_width = 20, memory = 800, word_length = 3, throw_away_sequences = 5, length_dif_cutoff_shorter_p = 0, length_dif_cutoff_shorter_n = 999999, alignment_coverage_longer_p = 0, alignment_coverage_longer_n = 99999999, alignment_coverage_shorter_p = 0, alignment_coverage_shorter_n = 99999999, max_unmatched_longer_p = 1, max_unmatched_shorter_p = 1, max_unmatched_both_n = 99999999, fast_cluster = 1, ignore_strand = 0, match = 2, mismatch = -2, gap = -6, gap_ext = -1, sort_cluster_by_size = 1) { + if (missing(input)) { + stop("Input parameter missing!") + } + message("Loading bed.") # load bed if neccessary if (!data.table::is.data.table(input)) { @@ -58,6 +82,12 @@ cdhitest <- function(input, identity = 0.8, coverage = 8, output = "cluster.bed" bed_table <- input } + # check for duplicated names + if (anyDuplicated(bed_table[, "V4"])) { + warning("Found duplicated names. Making names unique.") + bed_table[, V4 := make.unique(V4)] + } + message("Convert to fasta.") ### convert bed to fasta # 4th column = name @@ -68,7 +98,33 @@ cdhitest <- function(input, identity = 0.8, coverage = 8, output = "cluster.bed" message("Clustering.") ### cd-hit-est cdhit_output <- "cdhit_output" - cdhit_call <- paste("cd-hit-est -i", fasta_file, "-o", cdhit_output, "-c", identity, "-A", coverage, "-T ", threads, "-G 0 -n 3 -g 1 -r 0 -l 5 -sc 1 -d 0") + cdhit_call <- paste("cd-hit-est -i", fasta_file, + "-o", cdhit_output, + "-c", identity, + "-A", coverage, + "-T", threads, + "-G", global, + "-b", band_width, + "-M", memory, + "-n", word_length, + "-l", throw_away_sequences, + "-s", length_dif_cutoff_shorter_p, + "-S", length_dif_cutoff_shorter_n, + "-aL", alignment_coverage_longer_p, + "-AL", alignment_coverage_longer_n, + "-aS", alignment_coverage_shorter_p, + "-AS", alignment_coverage_shorter_n, + "-uL", max_unmatched_longer_p, + "-uS", max_unmatched_shorter_p, + "-U", max_unmatched_both_n, + "-g", fast_cluster, + "-r", ignore_strand, + "-match", match, + "-mismatch", mismatch, + "-gap", gap, + "-gap-ext", gap_ext, + "-sc", sort_cluster_by_size, + "-d 0") system(command = cdhit_call, wait = TRUE)