Skip to content

Cluster #3

Merged
merged 15 commits into from
Dec 6, 2018
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
119 changes: 101 additions & 18 deletions bin/cdhit_wrapper.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,50 +2,133 @@
library("optparse")

option_list <- list(
make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input bed-file. Last column must be sequences.", metavar = "character"),
make_option(opt_str = c("-s", "--similarity"), default = 0.8, help = "Similarity threshold. Default = %default", metavar = "double >= 0.8"),
make_option(opt_str = c("-A", "--coverage"), default = 8, help = "Minimal alignment length for both sequences in nucelotides. Default = %default", metavar = "integer"),
make_option(opt_str = c("-o", "--output"), default = "cluster.bed", help = "Output file. Default = %default", metavar = "character"),
make_option(opt_str = c("-c", "--clean"), default = TRUE, help = "Delete all temporary files. Default = %default", metavar = "logical")
# TODO more args
make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input bed-file. Fourth column is expected to contain names, last column must be sequences.", metavar = "character"),
make_option(opt_str = c("-c", "--identity"), default = 0.8, help = "Identity threshold. Default = %default (CD-HIT parameter)", metavar = "double >= 0.8"),
make_option(opt_str = c("-A", "--coverage"), default = 8, help = "Minimal alignment length for both sequences in nucelotides. Default = %default (CD-HIT parameter)", metavar = "integer"),
make_option(opt_str = c("-o", "--output"), default = "cluster.bed", help = "Output file same as input but with appended column of cluster numbers. Default = %default", metavar = "character"),
make_option(opt_str = c("--clean"), default = TRUE, help = "Delete all temporary files. Default = %default", metavar = "logical"),
make_option(opt_str = c("-G", "--global"), default = 0, help = "Global sequence identity = 1, local = 0. Default = %default (CD-HIT parameter)", metavar = "integer"),
make_option(opt_str = c("-b", "--band_width"), default = 20, help = "Alignment band width. Default = %default (CD-HIT parameter)", metavar = "integer"),
make_option(opt_str = c("-M", "--memory"), default = 800, help = "Memory limit in MB. 0 for unlimited. Default = %default (CD-HIT parameter)", metavar = "integer"),
make_option(opt_str = c("-T", "--threads"), default = 1, help = "Number of threads. Default = %default (CD-HIT parameter)", metavar = "integer"),
make_option(opt_str = c("-n", "--word_length"), default = 3, help = "Default = %default (CD-HIT parameter)", metavar = "integer"),
make_option(opt_str = c("-l", "--throw_away_sequences"), default = 5, help = "Maximum length of sequences thrown away. Default = %default (CD-HIT parameter)", metavar = "integer"),
# make_option(opt_str = c("-d", "--description")), # can not produce bed if this is != 0
make_option(opt_str = c("-s", "--length_dif_cutoff_shorter_p"), default = 0.0, help = "Shorter sequences length must be at least x percent of longer sequenecs length. Default = %default (CD-HIT parameter)", metavar = "double"),
make_option(opt_str = c("-S", "--length_dif_cutoff_shorter_n"), default = 999999, help = "Length difference between sequences can not be higher than x nucleotides. Default = %default (CD-HIT parameter)", metavar = "integer"),
make_option(opt_str = c("-aL", "--alignment_coverage_longer_p"), default = 0.0, help = "Alignment must cover x percent of longer sequence. Default = %default (CD-HIT parameter)", metavar = "double"),
make_option(opt_str = c("-AL", "--alignment_coverage_longer_n"), default = 99999999, help = "There can not be more than x unaligned nucleotides on the longer sequence. Default = %default (CD-HIT parameter)", metavar = "integer"),
make_option(opt_str = c("-aS", "--alignment_coverage_shorter_p"), default = 0.0, help = "Alignment must cover x percent of shorter sequence. Default = %default (CD-HIT parameter)", metavar = "double"),
make_option(opt_str = c("-AS", "--alignment_coverage_shorter_n"), default = 99999999, help = "There can not be more than x unaligned nucleotides on the longer sequence. Default = %default (CD-HIT parameter)", metavar = "integer"),
make_option(opt_str = c("-uL", "--max_unmatched_longer_p"), default = 1.0, help = "Maximum unmatched percentage on longer sequence (excludes leading tailing gap). Default = %default (CD-HIT parameter)", metavar = "double"),
make_option(opt_str = c("-uS", "--max_unmatched_shorter_p"), default = 1.0, help = "Maximum unmatched percentage on shorter sequence (excludes leading tailing gap). Default = %default (CD-HIT parameter)", metavar = "double"),
make_option(opt_str = c("-U", "--max_unmatched_both_n"), default = 99999999, help = "Maximum unmatched nucleotied on both sequences (excludes leading tailing gap). Default = %default (CD-HIT parameter)", metavar = "integer"),
make_option(opt_str = c("-g", "--fast_cluster"), default = 1, help = "Cluster sequence in first cluster that meets threshold = 0 (fast). Or cluster in best Cluster = 1 (accurate). Default = %default (CD-HIT parameter)", metavar = "integer"),
make_option(opt_str = c("-r", "--strand"), default = 0, help = "Align +/+ & +/- (= 1). Or align only +/+ (= 0). Default = %default (CD-HIT parameter)", metavar = "integer"),
make_option(opt_str = c("--match"), default = 2, help = "Matching score. Default = %default (CD-HIT parameter)", metavar = "integer"),
make_option(opt_str = c("--mismatch"), default = -2, help = "Mismatch score. Default = %default (CD-HIT parameter)", metavar = "integer"),
make_option(opt_str = c("--gap"), default = -6, help = "Gap score. Default = %default (CD-HIT parameter)", metavar = "integer"),
make_option(opt_str = c("--gap_ext"), default = -1, help = "Gap extension score. Default = %default (CD-HIT parameter)", metavar = "integer"),
make_option(opt_str = c("-sc", "--sort_cluster_by_size"), default = 1, help = "Either sort cluster by decreasing length (= 0) or by decreasing size (= 1). Default = %default (CD-HIT parameter)", metavar = "integer")
)

opt_parser <- OptionParser(option_list = option_list,
description = "CD-HIT-EST Wrapper function.")
description = "CD-HIT-EST Wrapper function. See https://github.com/weizhongli/cdhit/wiki for more information regarding CD-HITs parameters.")

opt <- parse_args(opt_parser)

#' cd-hit wrapper
#' cd-hit-est wrapper
#' Receives bed with sequences in last column and writes bed with added cluster numbers.
#'
#' @param input
#' @param similarity Similarity threshold.
#' @param coverage Minimal alignment length for both sequences in nucelotides.
#' @param output Clustered bedfile. Adds cluster number in last column (lower number = bigger cluster).
#' @param clean Clean up after run.
#' @param input Data.table or file in bed format (requires names in fourth column and sequences in last column).
#' @param identity Identity threshold. Default = 0.8 (CD-HIT parameter)
#' @param coverage Minimal alignment length for both sequences in nucelotides. Default = 8 (CD-HIT parameter)
#' @param output Clustered bedfile. Adds cluster number in last column (numbering depend on sort_by_cluster_size parameter). Default = cluster.bed
#' @param clean Clean up after run. Default = TRUE
#' @param threads Number of threads to use (0 = all cores). Default = 1 (CD-HIT parameter)
#' @param global Global sequence identity = 1, local = 0. Default = 0 (CD-HIT parameter)
#' @param band_width "Alignment band width. Default = 20 (CD-HIT parameter)"
#' @param memory Memory limit in MB. 0 for unlimited. Default = 800 (CD-HIT parameter)
#' @param word_length Default = 3 (CD-HIT parameter)
#' @param throw_away_sequences Maximum length of sequences thrown away. Default = %default (CD-HIT parameter)
#' @param length_dif_cutoff_shorter_p Shorter sequences length must be at least x percent of longer sequenecs length. Default = 0 (CD-HIT parameter)
#' @param length_dif_cutoff_shorter_n Length difference between sequences can not be higher than x nucleotides. Default = 999999 (CD-HIT parameter)
#' @param alignment_coverage_longer_p Alignment must cover x percent of longer sequence. Default = 0 (CD-HIT parameter)
#' @param alignment_coverage_longer_n There can not be more than x unaligned nucleotides on the longer sequence. Default = 99999999 (CD-HIT parameter)
#' @param alignment_coverage_shorter_p Alignment must cover x percent of shorter sequence. Default = 0 (CD-HIT parameter)
#' @param alignment_coverage_shorter_n There can not be more than x unaligned nucleotides on the longer sequence. Default = 99999999 (CD-HIT parameter)
#' @param max_unmatched_longer_p Maximum unmatched percentage on longer sequence (excludes leading tailing gap). Default = 1 (CD-HIT parameter)
#' @param max_unmatched_shorter_p Maximum unmatched percentage on shorter sequence (excludes leading tailing gap). Default = 1 (CD-HIT parameter)
#' @param max_unmatched_both_n Maximum unmatched nucleotied on both sequences (excludes leading tailing gap). Default = 99999999 (CD-HIT parameter)
#' @param fast_cluster Cluster sequence in first cluster that meets threshold = 0 (fast). Or cluster in best Cluster = 1 (accurate). Default = 1 (CD-HIT parameter)
#' @param strand Align +/+ & +/- (= 1). Or align only +/+ (= 0). Default = 0 (CD-HIT parameter)
#' @param match Matching score. Default = 2 (CD-HIT parameter)
#' @param mismatch Mismatch score. Default = -2 (CD-HIT parameter)
#' @param gap Gap score. Default = -6 (CD-HIT parameter)
#' @param gat_ext Gap extension score. Default = -1 (CD-HIT parameter)
#' @param sort_cluster_by_size Either sort cluster by decreasing length (= 0) or by decreasing size (= 1). Default = 1 (CD-HIT parameter)
#'
#' @return bed_table with cluster in last column
#' TODO add all cdhit parameter
#' TODO check whether cdhit is installed
cdhitest <- function(input, similarity = 0.8, coverage = 8, output = "cluster.bed", clean = TRUE) {
cdhitest <- function(input, identity = 0.8, coverage = 8, output = "cluster.bed", clean = TRUE, threads = 1, global = 0, band_width = 20, memory = 800, word_length = 3, throw_away_sequences = 5, length_dif_cutoff_shorter_p = 0, length_dif_cutoff_shorter_n = 999999, alignment_coverage_longer_p = 0, alignment_coverage_longer_n = 99999999, alignment_coverage_shorter_p = 0, alignment_coverage_shorter_n = 99999999, max_unmatched_longer_p = 1, max_unmatched_shorter_p = 1, max_unmatched_both_n = 99999999, fast_cluster = 1, strand = 0, match = 2, mismatch = -2, gap = -6, gap_ext = -1, sort_cluster_by_size = 1) {
if (missing(input)) {
stop("Input parameter missing!")
}

message("Loading bed.")
# load bed if neccessary
if (!data.table::is.data.table(input)) {
bed_table <- data.table::fread(input = input, header = FALSE)
} else {
bed_table <- input
}

# check for duplicated names
if (anyDuplicated(bed_table[, "V4"])) {
warning("Found duplicated names. Making names unique.")
bed_table[, V4 := make.unique(V4)]
}

message("Convert to fasta.")
### convert bed to fasta
# 4th column = name
# last column = sequence
fasta_file <- "converted_bed.fasta"
seqinr::write.fasta(sequences = as.list(bed_table[[ncol(bed_table)]]), names = bed_table[[4]], as.string = TRUE, file.out = fasta_file)

message("Clustering.")
### cd-hit-est
cdhit_output <- "cdhit_output"
cdhit_call <- paste("cd-hit-est -i", fasta_file, "-o", cdhit_output, "-c", similarity, "-A", coverage, "-G 0 -n 3 -g 1 -r 0 -l 5 -sc 1 -d 0")
cdhit_call <- paste("cd-hit-est -i", fasta_file,
"-o", cdhit_output,
"-c", identity,
"-A", coverage,
"-T", threads,
"-G", global,
"-b", band_width,
"-M", memory,
"-n", word_length,
"-l", throw_away_sequences,
"-s", length_dif_cutoff_shorter_p,
"-S", length_dif_cutoff_shorter_n,
"-aL", alignment_coverage_longer_p,
"-AL", alignment_coverage_longer_n,
"-aS", alignment_coverage_shorter_p,
"-AS", alignment_coverage_shorter_n,
"-uL", max_unmatched_longer_p,
"-uS", max_unmatched_shorter_p,
"-U", max_unmatched_both_n,
"-g", fast_cluster,
"-r", strand,
"-match", match,
"-mismatch", mismatch,
"-gap", gap,
"-gap-ext", gap_ext,
"-sc", sort_cluster_by_size,
"-d 0")

system(command = cdhit_call, wait = TRUE)

message("Formatting/ writing results.")
# reformat cluster file
# columns: id, clstr, clstr_size, length, clstr_rep, clstr_iden, clstr_cov
cluster_file <- "reformated.clstr"
Expand All @@ -71,5 +154,5 @@ cdhitest <- function(input, similarity = 0.8, coverage = 8, output = "cluster.be
if (!interactive()) {
# remove last parameter (help param)
params <- opt[-length(opt)]
do.call(cdhitest, args = params)
do.call(cdhitest, args = params)
}