#! /bin/Rscript if (!require(optparse, quietly = TRUE)) install.packages("optparse"); library(optparse) option_list <- list( make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input bed-file. Fourth column is expected to contain names, last column must be sequences.", metavar = "character"), make_option(opt_str = c("-c", "--identity"), default = 0.8, help = "Identity threshold. Default = %default (CD-HIT parameter)", metavar = "double >= 0.8"), make_option(opt_str = c("-A", "--coverage"), default = 8, help = "Minimal alignment length for both sequences in nucleotides. Default = %default (CD-HIT parameter)", metavar = "integer"), make_option(opt_str = c("-o", "--output"), default = "cluster.bed", help = "Output file same as input but with appended column of cluster numbers. Default = %default", metavar = "character"), make_option(opt_str = c("--clean"), default = TRUE, help = "Delete all temporary files. Default = %default", metavar = "logical"), make_option(opt_str = c("-G", "--global"), default = 0, help = "Global sequence identity = 1, local = 0. Default = %default (CD-HIT parameter)", metavar = "integer"), make_option(opt_str = c("-b", "--band_width"), default = 20, help = "Alignment band width. Default = %default (CD-HIT parameter)", metavar = "integer"), make_option(opt_str = c("-M", "--memory"), default = 800, help = "Memory limit in MB. 0 for unlimited. Default = %default (CD-HIT parameter)", metavar = "integer"), make_option(opt_str = c("-T", "--threads"), default = 1, help = "Number of threads. Default = %default (CD-HIT parameter)", metavar = "integer"), make_option(opt_str = c("-n", "--word_length"), default = 3, help = "Default = %default (CD-HIT parameter)", metavar = "integer"), make_option(opt_str = c("-l", "--throw_away_sequences"), default = 5, help = "Maximum length of sequences thrown away. Default = %default (CD-HIT parameter)", metavar = "integer"), # make_option(opt_str = c("-d", "--description")), # can not produce bed if this is != 0 make_option(opt_str = c("-s", "--length_dif_cutoff_shorter_p"), default = 0.0, help = "Shorter sequences length must be at least x percent of longer sequences length. Default = %default (CD-HIT parameter)", metavar = "double"), make_option(opt_str = c("-S", "--length_dif_cutoff_shorter_n"), default = 999999, help = "Length difference between sequences can not be higher than x nucleotides. Default = %default (CD-HIT parameter)", metavar = "integer"), make_option(opt_str = c("-e", "--alignment_coverage_longer_p"), default = 0.0, help = "Alignment must cover x percent of longer sequence. Default = %default (CD-HIT parameter: -aL)", metavar = "double"), make_option(opt_str = c("-E", "--alignment_coverage_longer_n"), default = 99999999, help = "There can not be more than x unaligned nucleotides on the longer sequence. Default = %default (CD-HIT parameter: -AL)", metavar = "integer"), make_option(opt_str = c("-f", "--alignment_coverage_shorter_p"), default = 0.0, help = "Alignment must cover x percent of shorter sequence. Default = %default (CD-HIT parameter: -aS)", metavar = "double"), make_option(opt_str = c("-F", "--alignment_coverage_shorter_n"), default = 99999999, help = "There can not be more than x unaligned nucleotides on the longer sequence. Default = %default (CD-HIT parameter: -AS)", metavar = "integer"), make_option(opt_str = c("-w", "--max_unmatched_longer_p"), default = 1.0, help = "Maximum unmatched percentage on longer sequence (excludes leading tailing gap). Default = %default (CD-HIT parameter: -uL)", metavar = "double"), make_option(opt_str = c("-W", "--max_unmatched_shorter_p"), default = 1.0, help = "Maximum unmatched percentage on shorter sequence (excludes leading tailing gap). Default = %default (CD-HIT parameter: -uS)", metavar = "double"), make_option(opt_str = c("-U", "--max_unmatched_both_n"), default = 99999999, help = "Maximum unmatched nucleotide on both sequences (excludes leading tailing gap). Default = %default (CD-HIT parameter)", metavar = "integer"), make_option(opt_str = c("-g", "--fast_cluster"), default = 1, help = "Cluster sequence in first cluster that meets threshold = 0 (fast). Or cluster in best Cluster = 1 (accurate). Default = %default (CD-HIT parameter)", metavar = "integer"), make_option(opt_str = c("-r", "--strand"), default = 0, help = "Align +/+ & +/- (= 1). Or align only +/+ (= 0). Default = %default (CD-HIT parameter)", metavar = "integer"), make_option(opt_str = c("--match"), default = 2, help = "Matching score. Default = %default (CD-HIT parameter)", metavar = "integer"), make_option(opt_str = c("--mismatch"), default = -2, help = "Mismatch score. Default = %default (CD-HIT parameter)", metavar = "integer"), make_option(opt_str = c("--gap"), default = -6, help = "Gap score. Default = %default (CD-HIT parameter)", metavar = "integer"), make_option(opt_str = c("--gap_ext"), default = -1, help = "Gap extension score. Default = %default (CD-HIT parameter)", metavar = "integer"), make_option(opt_str = c("-x", "--sort_cluster_by_size"), default = 1, help = "Either sort cluster by decreasing length (= 0) or by decreasing size (= 1). Default = %default (CD-HIT parameter: -sc)", metavar = "integer"), make_option(opt_str = c("--summary"), default = NULL, help = "Filepath where a small summary file of the run should be written. Default = %default", metavar = "character") ) opt_parser <- OptionParser(option_list = option_list, description = "CD-HIT-EST Wrapper function. See https://github.com/weizhongli/cdhit/wiki for more information regarding CD-HITs parameters.", epilogue = "Author: Hendrik Schultheis <Hendrik.Schultheis@mpi-bn.mpg.de>") opt <- parse_args(opt_parser) #' cd-hit-est wrapper #' Receives bed with sequences in last column and writes bed with added cluster numbers. #' #' @param input Data.table or file in bed format (requires names in fourth column and sequences in last column). #' @param identity Identity threshold. Default = 0.8 (CD-HIT parameter) #' @param coverage Minimal alignment length for both sequences in nucleotides. Default = 8 (CD-HIT parameter) #' @param output Clustered bed-file. Adds cluster number in last column (numbering depend on sort_by_cluster_size parameter). Default = cluster.bed #' @param clean Clean up after run. Default = TRUE #' @param threads Number of threads to use (0 = all cores). Default = 1 (CD-HIT parameter) #' @param global Global sequence identity = 1, local = 0. Default = 0 (CD-HIT parameter) #' @param band_width "Alignment band width. Default = 20 (CD-HIT parameter)" #' @param memory Memory limit in MB. 0 for unlimited. Default = 800 (CD-HIT parameter) #' @param word_length Default = 3 (CD-HIT parameter) #' @param throw_away_sequences Maximum length of sequences thrown away. Default = %default (CD-HIT parameter) #' @param length_dif_cutoff_shorter_p Shorter sequences length must be at least x percent of longer sequences length. Default = 0 (CD-HIT parameter) #' @param length_dif_cutoff_shorter_n Length difference between sequences can not be higher than x nucleotides. Default = 999999 (CD-HIT parameter) #' @param alignment_coverage_longer_p Alignment must cover x percent of longer sequence. Default = 0 (CD-HIT parameter) #' @param alignment_coverage_longer_n There can not be more than x unaligned nucleotides on the longer sequence. Default = 99999999 (CD-HIT parameter) #' @param alignment_coverage_shorter_p Alignment must cover x percent of shorter sequence. Default = 0 (CD-HIT parameter) #' @param alignment_coverage_shorter_n There can not be more than x unaligned nucleotides on the longer sequence. Default = 99999999 (CD-HIT parameter) #' @param max_unmatched_longer_p Maximum unmatched percentage on longer sequence (excludes leading tailing gap). Default = 1 (CD-HIT parameter) #' @param max_unmatched_shorter_p Maximum unmatched percentage on shorter sequence (excludes leading tailing gap). Default = 1 (CD-HIT parameter) #' @param max_unmatched_both_n Maximum unmatched nucleotide on both sequences (excludes leading tailing gap). Default = 99999999 (CD-HIT parameter) #' @param fast_cluster Cluster sequence in first cluster that meets threshold = 0 (fast). Or cluster in best Cluster = 1 (accurate). Default = 1 (CD-HIT parameter) #' @param strand Align +/+ & +/- (= 1). Or align only +/+ (= 0). Default = 0 (CD-HIT parameter) #' @param match Matching score. Default = 2 (CD-HIT parameter) #' @param mismatch Mismatch score. Default = -2 (CD-HIT parameter) #' @param gap Gap score. Default = -6 (CD-HIT parameter) #' @param gap_ext Gap extension score. Default = -1 (CD-HIT parameter) #' @param sort_cluster_by_size Either sort cluster by decreasing length (= 0) or by decreasing size (= 1). Default = 1 (CD-HIT parameter) #' @param summary Filepath where a small summary file of the run should be written. #' #' @details If there is a header supplied other then the default data.table naming scheme ('V1', 'V2', etc.) it will be kept and extended. #' #' @author Hendrik Schultheis <Hendrik.Schultheis@@mpi-bn.mpg.de> #' cdhitest <- function(input, identity = 0.8, coverage = 8, output = "cluster.bed", clean = TRUE, threads = 1, global = 0, band_width = 20, memory = 800, word_length = 3, throw_away_sequences = 5, length_dif_cutoff_shorter_p = 0, length_dif_cutoff_shorter_n = 999999, alignment_coverage_longer_p = 0, alignment_coverage_longer_n = 99999999, alignment_coverage_shorter_p = 0, alignment_coverage_shorter_n = 99999999, max_unmatched_longer_p = 1, max_unmatched_shorter_p = 1, max_unmatched_both_n = 99999999, fast_cluster = 1, strand = 0, match = 2, mismatch = -2, gap = -6, gap_ext = -1, sort_cluster_by_size = 1, summary = NULL) { # parameter checks if (system("which cd-hit-est", ignore.stdout = FALSE) != 0) { stop("Required program CD-HIT not found! Please check whether it is installed.") } if (missing(input)) { stop("No input specified! Please forward a valid bed-file.") } if (!file.exists(input)) { stop("File ", input, " does not exist!") } if (!is.logical(clean)) { stop("Parameter clean has to be a boolean value.") } message("Loading bed.") # load bed if necessary if (!data.table::is.data.table(input)) { bed_table <- data.table::fread(input = input) } else { bed_table <- input } # check if there are column names to keep them default_col_names <- grepl(pattern = "^V+\\d$", names(bed_table), perl = TRUE) keep_col_names <- ifelse(any(default_col_names), FALSE, TRUE) # check for duplicated names if (anyDuplicated(bed_table[, 4])) { warning("Found duplicated names. Making names unique.") bed_table[, 4 := make.unique(names(bed_table)[4])] } message("Convert to fasta.") ### convert bed to fasta # 4th column = name # last column = sequence fasta_file <- "converted_bed.fasta" seqinr::write.fasta(sequences = as.list(bed_table[[ncol(bed_table)]]), names = bed_table[[4]], as.string = TRUE, file.out = fasta_file) message("Clustering.") ### cd-hit-est cdhit_output <- "cdhit_output" cdhit_call <- paste("cd-hit-est -i", fasta_file, "-o", cdhit_output, "-c", identity, "-A", coverage, "-T", threads, "-G", global, "-b", band_width, "-M", memory, "-n", word_length, "-l", throw_away_sequences, "-s", length_dif_cutoff_shorter_p, "-S", length_dif_cutoff_shorter_n, "-aL", alignment_coverage_longer_p, "-AL", alignment_coverage_longer_n, "-aS", alignment_coverage_shorter_p, "-AS", alignment_coverage_shorter_n, "-uL", max_unmatched_longer_p, "-uS", max_unmatched_shorter_p, "-U", max_unmatched_both_n, "-g", fast_cluster, "-r", strand, "-match", match, "-mismatch", mismatch, "-gap", gap, "-gap-ext", gap_ext, "-sc", sort_cluster_by_size, "-d 0") system(command = cdhit_call, wait = TRUE) message("Formatting/ writing results.") # reformat cluster file # columns: id, clstr, clstr_size, length, clstr_rep, clstr_iden, clstr_cov cluster_file <- "reformated.clstr" cluster_call <- paste("clstr2txt.pl", paste0(cdhit_output, ".clstr"), ">", cluster_file) system(command = cluster_call, wait = TRUE) # load reformated file cluster <- data.table::fread(cluster_file)[, c("id", "clstr")] names(cluster) <- c("id", "cluster") ### add cluster to bed_table result <- merge(x = bed_table, y = cluster, by.x = names(bed_table)[4], by.y = "id", sort = FALSE)[, union(names(bed_table), names(cluster)[2]), with = FALSE] # delete files if (clean) { file.remove(fasta_file, paste0(cdhit_output, ".clstr"), cdhit_output, cluster_file) } # write summary if (!is.null(summary)) { # script/ function call file_name <- sub("--file=", "", commandArgs()[grep("--file=", commandArgs())]) call <- ifelse(interactive(), deparse(match.call()), paste("Rscript", basename(file_name), paste(commandArgs(trailingOnly = TRUE), collapse = " ")) ) total_cluster <- length(unique(result[[ncol(result)]])) # cluster size table # columns: cluster_id, size cluster_table <- data.table::as.data.table(table(result[[ncol(result)]])) names(cluster_table) <- c("cluster_id", "size") summary_data <- c( "Call:", paste0("\t", call), "", "Cluster:", paste("\tTotal", total_cluster), "" ) write(x = summary_data, file = summary) data.table::fwrite(x = cluster_table, file = summary, append = TRUE, sep = "\t", col.names = TRUE) } # cast start and end column to integer64 to prevent scientific notation e.g. 1e+10 # start and end are assumed to be at position 2 and 3 result[, c(2, 3) := lapply(.SD, bit64::as.integer64), .SDcols = c(2, 3)] data.table::fwrite(x = result, file = output, sep = "\t", col.names = keep_col_names) } # call function with given parameter if not in interactive context (e.g. run from shell) if (!interactive()) { # remove last parameter (help param) # show help if called without arguments if (length(commandArgs(trailingOnly = TRUE)) <= 0) { print_help(opt_parser) } else { params <- opt[-length(opt)] do.call(cdhitest, args = params) } }