Skip to content
Permalink
5acffac0c6
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
221 lines (193 sloc) 14.5 KB
#! /bin/Rscript
if (!require(optparse, quietly = TRUE)) install.packages("optparse"); library(optparse)
option_list <- list(
make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input bed-file. Fourth column is expected to contain names, last column must be sequences.", metavar = "character"),
make_option(opt_str = c("-c", "--identity"), default = 0.8, help = "Identity threshold. Default = %default (CD-HIT parameter)", metavar = "double >= 0.8"),
make_option(opt_str = c("-A", "--coverage"), default = 8, help = "Minimal alignment length for both sequences in nucleotides. Default = %default (CD-HIT parameter)", metavar = "integer"),
make_option(opt_str = c("-o", "--output"), default = "cluster.bed", help = "Output file same as input but with appended column of cluster numbers. Default = %default", metavar = "character"),
make_option(opt_str = c("--clean"), default = TRUE, help = "Delete all temporary files. Default = %default", metavar = "logical"),
make_option(opt_str = c("-G", "--global"), default = 0, help = "Global sequence identity = 1, local = 0. Default = %default (CD-HIT parameter)", metavar = "integer"),
make_option(opt_str = c("-b", "--band_width"), default = 20, help = "Alignment band width. Default = %default (CD-HIT parameter)", metavar = "integer"),
make_option(opt_str = c("-M", "--memory"), default = 800, help = "Memory limit in MB. 0 for unlimited. Default = %default (CD-HIT parameter)", metavar = "integer"),
make_option(opt_str = c("-T", "--threads"), default = 1, help = "Number of threads. Default = %default (CD-HIT parameter)", metavar = "integer"),
make_option(opt_str = c("-n", "--word_length"), default = 3, help = "Default = %default (CD-HIT parameter)", metavar = "integer"),
make_option(opt_str = c("-l", "--throw_away_sequences"), default = 5, help = "Maximum length of sequences thrown away. Default = %default (CD-HIT parameter)", metavar = "integer"),
# make_option(opt_str = c("-d", "--description")), # can not produce bed if this is != 0
make_option(opt_str = c("-s", "--length_dif_cutoff_shorter_p"), default = 0.0, help = "Shorter sequences length must be at least x percent of longer sequences length. Default = %default (CD-HIT parameter)", metavar = "double"),
make_option(opt_str = c("-S", "--length_dif_cutoff_shorter_n"), default = 999999, help = "Length difference between sequences can not be higher than x nucleotides. Default = %default (CD-HIT parameter)", metavar = "integer"),
make_option(opt_str = c("-e", "--alignment_coverage_longer_p"), default = 0.0, help = "Alignment must cover x percent of longer sequence. Default = %default (CD-HIT parameter: -aL)", metavar = "double"),
make_option(opt_str = c("-E", "--alignment_coverage_longer_n"), default = 99999999, help = "There can not be more than x unaligned nucleotides on the longer sequence. Default = %default (CD-HIT parameter: -AL)", metavar = "integer"),
make_option(opt_str = c("-f", "--alignment_coverage_shorter_p"), default = 0.0, help = "Alignment must cover x percent of shorter sequence. Default = %default (CD-HIT parameter: -aS)", metavar = "double"),
make_option(opt_str = c("-F", "--alignment_coverage_shorter_n"), default = 99999999, help = "There can not be more than x unaligned nucleotides on the longer sequence. Default = %default (CD-HIT parameter: -AS)", metavar = "integer"),
make_option(opt_str = c("-w", "--max_unmatched_longer_p"), default = 1.0, help = "Maximum unmatched percentage on longer sequence (excludes leading tailing gap). Default = %default (CD-HIT parameter: -uL)", metavar = "double"),
make_option(opt_str = c("-W", "--max_unmatched_shorter_p"), default = 1.0, help = "Maximum unmatched percentage on shorter sequence (excludes leading tailing gap). Default = %default (CD-HIT parameter: -uS)", metavar = "double"),
make_option(opt_str = c("-U", "--max_unmatched_both_n"), default = 99999999, help = "Maximum unmatched nucleotide on both sequences (excludes leading tailing gap). Default = %default (CD-HIT parameter)", metavar = "integer"),
make_option(opt_str = c("-g", "--fast_cluster"), default = 1, help = "Cluster sequence in first cluster that meets threshold = 0 (fast). Or cluster in best Cluster = 1 (accurate). Default = %default (CD-HIT parameter)", metavar = "integer"),
make_option(opt_str = c("-r", "--strand"), default = 0, help = "Align +/+ & +/- (= 1). Or align only +/+ (= 0). Default = %default (CD-HIT parameter)", metavar = "integer"),
make_option(opt_str = c("--match"), default = 2, help = "Matching score. Default = %default (CD-HIT parameter)", metavar = "integer"),
make_option(opt_str = c("--mismatch"), default = -2, help = "Mismatch score. Default = %default (CD-HIT parameter)", metavar = "integer"),
make_option(opt_str = c("--gap"), default = -6, help = "Gap score. Default = %default (CD-HIT parameter)", metavar = "integer"),
make_option(opt_str = c("--gap_ext"), default = -1, help = "Gap extension score. Default = %default (CD-HIT parameter)", metavar = "integer"),
make_option(opt_str = c("-x", "--sort_cluster_by_size"), default = 1, help = "Either sort cluster by decreasing length (= 0) or by decreasing size (= 1). Default = %default (CD-HIT parameter: -sc)", metavar = "integer"),
make_option(opt_str = c("--summary"), default = NULL, help = "Filepath where a small summary file of the run should be written. Default = %default", metavar = "character")
)
opt_parser <- OptionParser(option_list = option_list,
description = "CD-HIT-EST Wrapper function. See https://github.com/weizhongli/cdhit/wiki for more information regarding CD-HITs parameters.",
epilogue = "Author: Hendrik Schultheis <Hendrik.Schultheis@mpi-bn.mpg.de>")
opt <- parse_args(opt_parser)
#' cd-hit-est wrapper
#' Receives bed with sequences in last column and writes bed with added cluster numbers.
#'
#' @param input Data.table or file in bed format (requires names in fourth column and sequences in last column).
#' @param identity Identity threshold. Default = 0.8 (CD-HIT parameter)
#' @param coverage Minimal alignment length for both sequences in nucleotides. Default = 8 (CD-HIT parameter)
#' @param output Clustered bed-file. Adds cluster number in last column (numbering depend on sort_by_cluster_size parameter). Default = cluster.bed
#' @param clean Clean up after run. Default = TRUE
#' @param threads Number of threads to use (0 = all cores). Default = 1 (CD-HIT parameter)
#' @param global Global sequence identity = 1, local = 0. Default = 0 (CD-HIT parameter)
#' @param band_width "Alignment band width. Default = 20 (CD-HIT parameter)"
#' @param memory Memory limit in MB. 0 for unlimited. Default = 800 (CD-HIT parameter)
#' @param word_length Default = 3 (CD-HIT parameter)
#' @param throw_away_sequences Maximum length of sequences thrown away. Default = %default (CD-HIT parameter)
#' @param length_dif_cutoff_shorter_p Shorter sequences length must be at least x percent of longer sequences length. Default = 0 (CD-HIT parameter)
#' @param length_dif_cutoff_shorter_n Length difference between sequences can not be higher than x nucleotides. Default = 999999 (CD-HIT parameter)
#' @param alignment_coverage_longer_p Alignment must cover x percent of longer sequence. Default = 0 (CD-HIT parameter)
#' @param alignment_coverage_longer_n There can not be more than x unaligned nucleotides on the longer sequence. Default = 99999999 (CD-HIT parameter)
#' @param alignment_coverage_shorter_p Alignment must cover x percent of shorter sequence. Default = 0 (CD-HIT parameter)
#' @param alignment_coverage_shorter_n There can not be more than x unaligned nucleotides on the longer sequence. Default = 99999999 (CD-HIT parameter)
#' @param max_unmatched_longer_p Maximum unmatched percentage on longer sequence (excludes leading tailing gap). Default = 1 (CD-HIT parameter)
#' @param max_unmatched_shorter_p Maximum unmatched percentage on shorter sequence (excludes leading tailing gap). Default = 1 (CD-HIT parameter)
#' @param max_unmatched_both_n Maximum unmatched nucleotide on both sequences (excludes leading tailing gap). Default = 99999999 (CD-HIT parameter)
#' @param fast_cluster Cluster sequence in first cluster that meets threshold = 0 (fast). Or cluster in best Cluster = 1 (accurate). Default = 1 (CD-HIT parameter)
#' @param strand Align +/+ & +/- (= 1). Or align only +/+ (= 0). Default = 0 (CD-HIT parameter)
#' @param match Matching score. Default = 2 (CD-HIT parameter)
#' @param mismatch Mismatch score. Default = -2 (CD-HIT parameter)
#' @param gap Gap score. Default = -6 (CD-HIT parameter)
#' @param gap_ext Gap extension score. Default = -1 (CD-HIT parameter)
#' @param sort_cluster_by_size Either sort cluster by decreasing length (= 0) or by decreasing size (= 1). Default = 1 (CD-HIT parameter)
#' @param summary Filepath where a small summary file of the run should be written.
#'
#' @details If there is a header supplied other then the default data.table naming scheme ('V1', 'V2', etc.) it will be kept and extended.
#'
#' @author Hendrik Schultheis <Hendrik.Schultheis@@mpi-bn.mpg.de>
#'
cdhitest <- function(input, identity = 0.8, coverage = 8, output = "cluster.bed", clean = TRUE, threads = 1, global = 0, band_width = 20, memory = 800, word_length = 3, throw_away_sequences = 5, length_dif_cutoff_shorter_p = 0, length_dif_cutoff_shorter_n = 999999, alignment_coverage_longer_p = 0, alignment_coverage_longer_n = 99999999, alignment_coverage_shorter_p = 0, alignment_coverage_shorter_n = 99999999, max_unmatched_longer_p = 1, max_unmatched_shorter_p = 1, max_unmatched_both_n = 99999999, fast_cluster = 1, strand = 0, match = 2, mismatch = -2, gap = -6, gap_ext = -1, sort_cluster_by_size = 1, summary = NULL) {
# parameter checks
if (system("which cd-hit-est", ignore.stdout = FALSE) != 0) {
stop("Required program CD-HIT not found! Please check whether it is installed.")
}
if (missing(input)) {
stop("No input specified! Please forward a valid bed-file.")
}
if (!file.exists(input)) {
stop("File ", input, " does not exist!")
}
if (!is.logical(clean)) {
stop("Parameter clean has to be a boolean value.")
}
message("Loading bed.")
# load bed if necessary
if (!data.table::is.data.table(input)) {
bed_table <- data.table::fread(input = input)
} else {
bed_table <- input
}
# check if there are column names to keep them
default_col_names <- grepl(pattern = "^V+\\d$", names(bed_table), perl = TRUE)
keep_col_names <- ifelse(any(default_col_names), FALSE, TRUE)
# check for duplicated names
if (anyDuplicated(bed_table[, 4])) {
warning("Found duplicated names. Making names unique.")
bed_table[, 4 := make.unique(names(bed_table)[4])]
}
message("Convert to fasta.")
### convert bed to fasta
# 4th column = name
# last column = sequence
fasta_file <- "converted_bed.fasta"
seqinr::write.fasta(sequences = as.list(bed_table[[ncol(bed_table)]]), names = bed_table[[4]], as.string = TRUE, file.out = fasta_file)
message("Clustering.")
### cd-hit-est
cdhit_output <- "cdhit_output"
cdhit_call <- paste("cd-hit-est -i", fasta_file,
"-o", cdhit_output,
"-c", identity,
"-A", coverage,
"-T", threads,
"-G", global,
"-b", band_width,
"-M", memory,
"-n", word_length,
"-l", throw_away_sequences,
"-s", length_dif_cutoff_shorter_p,
"-S", length_dif_cutoff_shorter_n,
"-aL", alignment_coverage_longer_p,
"-AL", alignment_coverage_longer_n,
"-aS", alignment_coverage_shorter_p,
"-AS", alignment_coverage_shorter_n,
"-uL", max_unmatched_longer_p,
"-uS", max_unmatched_shorter_p,
"-U", max_unmatched_both_n,
"-g", fast_cluster,
"-r", strand,
"-match", match,
"-mismatch", mismatch,
"-gap", gap,
"-gap-ext", gap_ext,
"-sc", sort_cluster_by_size,
"-d 0")
system(command = cdhit_call, wait = TRUE)
message("Formatting/ writing results.")
# reformat cluster file
# columns: id, clstr, clstr_size, length, clstr_rep, clstr_iden, clstr_cov
cluster_file <- "reformated.clstr"
cluster_call <- paste("clstr2txt.pl", paste0(cdhit_output, ".clstr"), ">", cluster_file)
system(command = cluster_call, wait = TRUE)
# load reformated file
cluster <- data.table::fread(cluster_file)[, c("id", "clstr")]
names(cluster) <- c("id", "cluster")
### add cluster to bed_table
result <- merge(x = bed_table, y = cluster, by.x = names(bed_table)[4], by.y = "id", sort = FALSE)[, union(names(bed_table), names(cluster)[2]), with = FALSE]
# delete files
if (clean) {
file.remove(fasta_file, paste0(cdhit_output, ".clstr"), cdhit_output, cluster_file)
}
# write summary
if (!is.null(summary)) {
# script/ function call
file_name <- sub("--file=", "", commandArgs()[grep("--file=", commandArgs())])
call <- ifelse(interactive(), deparse(match.call()),
paste("Rscript", basename(file_name), paste(commandArgs(trailingOnly = TRUE), collapse = " "))
)
total_cluster <- length(unique(result[[ncol(result)]]))
# cluster size table
# columns: cluster_id, size
cluster_table <- data.table::as.data.table(table(result[[ncol(result)]]))
names(cluster_table) <- c("cluster_id", "size")
summary_data <- c(
"Call:",
paste0("\t", call),
"",
"Cluster:",
paste("\tTotal", total_cluster),
""
)
write(x = summary_data, file = summary)
data.table::fwrite(x = cluster_table, file = summary, append = TRUE, sep = "\t", col.names = TRUE)
}
# cast start and end column to integer64 to prevent scientific notation e.g. 1e+10
# start and end are assumed to be at position 2 and 3
result[, c(2, 3) := lapply(.SD, bit64::as.integer64), .SDcols = c(2, 3)]
data.table::fwrite(x = result, file = output, sep = "\t", col.names = keep_col_names)
}
# call function with given parameter if not in interactive context (e.g. run from shell)
if (!interactive()) {
# remove last parameter (help param)
# show help if called without arguments
if (length(commandArgs(trailingOnly = TRUE)) <= 0) {
print_help(opt_parser)
} else {
params <- opt[-length(opt)]
do.call(cdhitest, args = params)
}
}