Skip to content
Permalink
51d7efe989
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
158 lines (142 sloc) 12 KB
#! /bin/Rscript
library("optparse")
option_list <- list(
make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input bed-file. Fourth column is expected to contain names, last column must be sequences.", metavar = "character"),
make_option(opt_str = c("-c", "--identity"), default = 0.8, help = "Identity threshold. Default = %default (CD-HIT parameter)", metavar = "double >= 0.8"),
make_option(opt_str = c("-A", "--coverage"), default = 8, help = "Minimal alignment length for both sequences in nucelotides. Default = %default (CD-HIT parameter)", metavar = "integer"),
make_option(opt_str = c("-o", "--output"), default = "cluster.bed", help = "Output file same as input but with appended column of cluster numbers. Default = %default", metavar = "character"),
make_option(opt_str = c("--clean"), default = TRUE, help = "Delete all temporary files. Default = %default", metavar = "logical"),
make_option(opt_str = c("-G", "--global"), default = 0, help = "Global sequence identity = 1, local = 0. Default = %default (CD-HIT parameter)", metavar = "integer"),
make_option(opt_str = c("-b", "--band_width"), default = 20, help = "Alignment band width. Default = %default (CD-HIT parameter)", metavar = "integer"),
make_option(opt_str = c("-M", "--memory"), default = 800, help = "Memory limit in MB. 0 for unlimited. Default = %default (CD-HIT parameter)", metavar = "integer"),
make_option(opt_str = c("-T", "--threads"), default = 1, help = "Number of threads. Default = %default (CD-HIT parameter)", metavar = "integer"),
make_option(opt_str = c("-n", "--word_length"), default = 3, help = "Default = %default (CD-HIT parameter)", metavar = "integer"),
make_option(opt_str = c("-l", "--throw_away_sequences"), default = 5, help = "Maximum length of sequences thrown away. Default = %default (CD-HIT parameter)", metavar = "integer"),
# make_option(opt_str = c("-d", "--description")), # can not produce bed if this is != 0
make_option(opt_str = c("-s", "--length_dif_cutoff_shorter_p"), default = 0.0, help = "Shorter sequences length must be at least x percent of longer sequenecs length. Default = %default (CD-HIT parameter)", metavar = "double"),
make_option(opt_str = c("-S", "--length_dif_cutoff_shorter_n"), default = 999999, help = "Length difference between sequences can not be higher than x nucleotides. Default = %default (CD-HIT parameter)", metavar = "integer"),
make_option(opt_str = c("-aL", "--alignment_coverage_longer_p"), default = 0.0, help = "Alignment must cover x percent of longer sequence. Default = %default (CD-HIT parameter)", metavar = "double"),
make_option(opt_str = c("-AL", "--alignment_coverage_longer_n"), default = 99999999, help = "There can not be more than x unaligned nucleotides on the longer sequence. Default = %default (CD-HIT parameter)", metavar = "integer"),
make_option(opt_str = c("-aS", "--alignment_coverage_shorter_p"), default = 0.0, help = "Alignment must cover x percent of shorter sequence. Default = %default (CD-HIT parameter)", metavar = "double"),
make_option(opt_str = c("-AS", "--alignment_coverage_shorter_n"), default = 99999999, help = "There can not be more than x unaligned nucleotides on the longer sequence. Default = %default (CD-HIT parameter)", metavar = "integer"),
make_option(opt_str = c("-uL", "--max_unmatched_longer_p"), default = 1.0, help = "Maximum unmatched percentage on longer sequence (excludes leading tailing gap). Default = %default (CD-HIT parameter)", metavar = "double"),
make_option(opt_str = c("-uS", "--max_unmatched_shorter_p"), default = 1.0, help = "Maximum unmatched percentage on shorter sequence (excludes leading tailing gap). Default = %default (CD-HIT parameter)", metavar = "double"),
make_option(opt_str = c("-U", "--max_unmatched_both_n"), default = 99999999, help = "Maximum unmatched nucleotied on both sequences (excludes leading tailing gap). Default = %default (CD-HIT parameter)", metavar = "integer"),
make_option(opt_str = c("-g", "--fast_cluster"), default = 1, help = "Cluster sequence in first cluster that meets threshold = 0 (fast). Or cluster in best Cluster = 1 (accurate). Default = %default (CD-HIT parameter)", metavar = "integer"),
make_option(opt_str = c("-r", "--strand"), default = 0, help = "Align +/+ & +/- (= 1). Or align only +/+ (= 0). Default = %default (CD-HIT parameter)", metavar = "integer"),
make_option(opt_str = c("--match"), default = 2, help = "Matching score. Default = %default (CD-HIT parameter)", metavar = "integer"),
make_option(opt_str = c("--mismatch"), default = -2, help = "Mismatch score. Default = %default (CD-HIT parameter)", metavar = "integer"),
make_option(opt_str = c("--gap"), default = -6, help = "Gap score. Default = %default (CD-HIT parameter)", metavar = "integer"),
make_option(opt_str = c("--gap_ext"), default = -1, help = "Gap extension score. Default = %default (CD-HIT parameter)", metavar = "integer"),
make_option(opt_str = c("-sc", "--sort_cluster_by_size"), default = 1, help = "Either sort cluster by decreasing length (= 0) or by decreasing size (= 1). Default = %default (CD-HIT parameter)", metavar = "integer")
)
opt_parser <- OptionParser(option_list = option_list,
description = "CD-HIT-EST Wrapper function. See https://github.com/weizhongli/cdhit/wiki for more information regarding CD-HITs parameters.")
opt <- parse_args(opt_parser)
#' cd-hit-est wrapper
#' Receives bed with sequences in last column and writes bed with added cluster numbers.
#'
#' @param input Data.table or file in bed format (requires names in fourth column and sequences in last column).
#' @param identity Identity threshold. Default = 0.8 (CD-HIT parameter)
#' @param coverage Minimal alignment length for both sequences in nucelotides. Default = 8 (CD-HIT parameter)
#' @param output Clustered bedfile. Adds cluster number in last column (numbering depend on sort_by_cluster_size parameter). Default = cluster.bed
#' @param clean Clean up after run. Default = TRUE
#' @param threads Number of threads to use (0 = all cores). Default = 1 (CD-HIT parameter)
#' @param global Global sequence identity = 1, local = 0. Default = 0 (CD-HIT parameter)
#' @param band_width "Alignment band width. Default = 20 (CD-HIT parameter)"
#' @param memory Memory limit in MB. 0 for unlimited. Default = 800 (CD-HIT parameter)
#' @param word_length Default = 3 (CD-HIT parameter)
#' @param throw_away_sequences Maximum length of sequences thrown away. Default = %default (CD-HIT parameter)
#' @param length_dif_cutoff_shorter_p Shorter sequences length must be at least x percent of longer sequenecs length. Default = 0 (CD-HIT parameter)
#' @param length_dif_cutoff_shorter_n Length difference between sequences can not be higher than x nucleotides. Default = 999999 (CD-HIT parameter)
#' @param alignment_coverage_longer_p Alignment must cover x percent of longer sequence. Default = 0 (CD-HIT parameter)
#' @param alignment_coverage_longer_n There can not be more than x unaligned nucleotides on the longer sequence. Default = 99999999 (CD-HIT parameter)
#' @param alignment_coverage_shorter_p Alignment must cover x percent of shorter sequence. Default = 0 (CD-HIT parameter)
#' @param alignment_coverage_shorter_n There can not be more than x unaligned nucleotides on the longer sequence. Default = 99999999 (CD-HIT parameter)
#' @param max_unmatched_longer_p Maximum unmatched percentage on longer sequence (excludes leading tailing gap). Default = 1 (CD-HIT parameter)
#' @param max_unmatched_shorter_p Maximum unmatched percentage on shorter sequence (excludes leading tailing gap). Default = 1 (CD-HIT parameter)
#' @param max_unmatched_both_n Maximum unmatched nucleotied on both sequences (excludes leading tailing gap). Default = 99999999 (CD-HIT parameter)
#' @param fast_cluster Cluster sequence in first cluster that meets threshold = 0 (fast). Or cluster in best Cluster = 1 (accurate). Default = 1 (CD-HIT parameter)
#' @param strand Align +/+ & +/- (= 1). Or align only +/+ (= 0). Default = 0 (CD-HIT parameter)
#' @param match Matching score. Default = 2 (CD-HIT parameter)
#' @param mismatch Mismatch score. Default = -2 (CD-HIT parameter)
#' @param gap Gap score. Default = -6 (CD-HIT parameter)
#' @param gat_ext Gap extension score. Default = -1 (CD-HIT parameter)
#' @param sort_cluster_by_size Either sort cluster by decreasing length (= 0) or by decreasing size (= 1). Default = 1 (CD-HIT parameter)
#'
#' TODO check whether cdhit is installed
cdhitest <- function(input, identity = 0.8, coverage = 8, output = "cluster.bed", clean = TRUE, threads = 1, global = 0, band_width = 20, memory = 800, word_length = 3, throw_away_sequences = 5, length_dif_cutoff_shorter_p = 0, length_dif_cutoff_shorter_n = 999999, alignment_coverage_longer_p = 0, alignment_coverage_longer_n = 99999999, alignment_coverage_shorter_p = 0, alignment_coverage_shorter_n = 99999999, max_unmatched_longer_p = 1, max_unmatched_shorter_p = 1, max_unmatched_both_n = 99999999, fast_cluster = 1, strand = 0, match = 2, mismatch = -2, gap = -6, gap_ext = -1, sort_cluster_by_size = 1) {
if (missing(input)) {
stop("Input parameter missing!")
}
message("Loading bed.")
# load bed if neccessary
if (!data.table::is.data.table(input)) {
bed_table <- data.table::fread(input = input, header = FALSE)
} else {
bed_table <- input
}
# check for duplicated names
if (anyDuplicated(bed_table[, "V4"])) {
warning("Found duplicated names. Making names unique.")
bed_table[, V4 := make.unique(V4)]
}
message("Convert to fasta.")
### convert bed to fasta
# 4th column = name
# last column = sequence
fasta_file <- "converted_bed.fasta"
seqinr::write.fasta(sequences = as.list(bed_table[[ncol(bed_table)]]), names = bed_table[[4]], as.string = TRUE, file.out = fasta_file)
message("Clustering.")
### cd-hit-est
cdhit_output <- "cdhit_output"
cdhit_call <- paste("cd-hit-est -i", fasta_file,
"-o", cdhit_output,
"-c", identity,
"-A", coverage,
"-T", threads,
"-G", global,
"-b", band_width,
"-M", memory,
"-n", word_length,
"-l", throw_away_sequences,
"-s", length_dif_cutoff_shorter_p,
"-S", length_dif_cutoff_shorter_n,
"-aL", alignment_coverage_longer_p,
"-AL", alignment_coverage_longer_n,
"-aS", alignment_coverage_shorter_p,
"-AS", alignment_coverage_shorter_n,
"-uL", max_unmatched_longer_p,
"-uS", max_unmatched_shorter_p,
"-U", max_unmatched_both_n,
"-g", fast_cluster,
"-r", strand,
"-match", match,
"-mismatch", mismatch,
"-gap", gap,
"-gap-ext", gap_ext,
"-sc", sort_cluster_by_size,
"-d 0")
system(command = cdhit_call, wait = TRUE)
message("Formatting/ writing results.")
# reformat cluster file
# columns: id, clstr, clstr_size, length, clstr_rep, clstr_iden, clstr_cov
cluster_file <- "reformated.clstr"
cluster_call <- paste("clstr2txt.pl", paste0(cdhit_output, ".clstr"), ">", cluster_file)
system(command = cluster_call, wait = TRUE)
# load reformated file
cluster <- data.table::fread(cluster_file)
### add cluster to bed_table
result <- merge(x = bed_table, y = cluster[, c("id", "clstr")], by.x = "V4", by.y = "id", sort = FALSE)[, union(names(bed_table), names(cluster)[2]), with = FALSE]
# delete files
if (clean) {
file.remove(fasta_file, paste0(cdhit_output, ".clstr"), cdhit_output, cluster_file)
}
data.table::fwrite(x = result, file = output, sep = "\t", col.names = FALSE)
}
# call function with given parameter if not in interactive context (e.g. run from shell)
if (!interactive()) {
# remove last parameter (help param)
params <- opt[-length(opt)]
do.call(cdhitest, args = params)
}