Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
master_project_JLU2018/bin/2.1_clustering/cdhit_wrapper.R
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
221 lines (193 sloc)
14.5 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /bin/Rscript | |
if (!require(optparse, quietly = TRUE)) install.packages("optparse"); library(optparse) | |
option_list <- list( | |
make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input bed-file. Fourth column is expected to contain names, last column must be sequences.", metavar = "character"), | |
make_option(opt_str = c("-c", "--identity"), default = 0.8, help = "Identity threshold. Default = %default (CD-HIT parameter)", metavar = "double >= 0.8"), | |
make_option(opt_str = c("-A", "--coverage"), default = 8, help = "Minimal alignment length for both sequences in nucleotides. Default = %default (CD-HIT parameter)", metavar = "integer"), | |
make_option(opt_str = c("-o", "--output"), default = "cluster.bed", help = "Output file same as input but with appended column of cluster numbers. Default = %default", metavar = "character"), | |
make_option(opt_str = c("--clean"), default = TRUE, help = "Delete all temporary files. Default = %default", metavar = "logical"), | |
make_option(opt_str = c("-G", "--global"), default = 0, help = "Global sequence identity = 1, local = 0. Default = %default (CD-HIT parameter)", metavar = "integer"), | |
make_option(opt_str = c("-b", "--band_width"), default = 20, help = "Alignment band width. Default = %default (CD-HIT parameter)", metavar = "integer"), | |
make_option(opt_str = c("-M", "--memory"), default = 800, help = "Memory limit in MB. 0 for unlimited. Default = %default (CD-HIT parameter)", metavar = "integer"), | |
make_option(opt_str = c("-T", "--threads"), default = 1, help = "Number of threads. Default = %default (CD-HIT parameter)", metavar = "integer"), | |
make_option(opt_str = c("-n", "--word_length"), default = 3, help = "Default = %default (CD-HIT parameter)", metavar = "integer"), | |
make_option(opt_str = c("-l", "--throw_away_sequences"), default = 5, help = "Maximum length of sequences thrown away. Default = %default (CD-HIT parameter)", metavar = "integer"), | |
# make_option(opt_str = c("-d", "--description")), # can not produce bed if this is != 0 | |
make_option(opt_str = c("-s", "--length_dif_cutoff_shorter_p"), default = 0.0, help = "Shorter sequences length must be at least x percent of longer sequences length. Default = %default (CD-HIT parameter)", metavar = "double"), | |
make_option(opt_str = c("-S", "--length_dif_cutoff_shorter_n"), default = 999999, help = "Length difference between sequences can not be higher than x nucleotides. Default = %default (CD-HIT parameter)", metavar = "integer"), | |
make_option(opt_str = c("-e", "--alignment_coverage_longer_p"), default = 0.0, help = "Alignment must cover x percent of longer sequence. Default = %default (CD-HIT parameter: -aL)", metavar = "double"), | |
make_option(opt_str = c("-E", "--alignment_coverage_longer_n"), default = 99999999, help = "There can not be more than x unaligned nucleotides on the longer sequence. Default = %default (CD-HIT parameter: -AL)", metavar = "integer"), | |
make_option(opt_str = c("-f", "--alignment_coverage_shorter_p"), default = 0.0, help = "Alignment must cover x percent of shorter sequence. Default = %default (CD-HIT parameter: -aS)", metavar = "double"), | |
make_option(opt_str = c("-F", "--alignment_coverage_shorter_n"), default = 99999999, help = "There can not be more than x unaligned nucleotides on the longer sequence. Default = %default (CD-HIT parameter: -AS)", metavar = "integer"), | |
make_option(opt_str = c("-w", "--max_unmatched_longer_p"), default = 1.0, help = "Maximum unmatched percentage on longer sequence (excludes leading tailing gap). Default = %default (CD-HIT parameter: -uL)", metavar = "double"), | |
make_option(opt_str = c("-W", "--max_unmatched_shorter_p"), default = 1.0, help = "Maximum unmatched percentage on shorter sequence (excludes leading tailing gap). Default = %default (CD-HIT parameter: -uS)", metavar = "double"), | |
make_option(opt_str = c("-U", "--max_unmatched_both_n"), default = 99999999, help = "Maximum unmatched nucleotide on both sequences (excludes leading tailing gap). Default = %default (CD-HIT parameter)", metavar = "integer"), | |
make_option(opt_str = c("-g", "--fast_cluster"), default = 1, help = "Cluster sequence in first cluster that meets threshold = 0 (fast). Or cluster in best Cluster = 1 (accurate). Default = %default (CD-HIT parameter)", metavar = "integer"), | |
make_option(opt_str = c("-r", "--strand"), default = 0, help = "Align +/+ & +/- (= 1). Or align only +/+ (= 0). Default = %default (CD-HIT parameter)", metavar = "integer"), | |
make_option(opt_str = c("--match"), default = 2, help = "Matching score. Default = %default (CD-HIT parameter)", metavar = "integer"), | |
make_option(opt_str = c("--mismatch"), default = -2, help = "Mismatch score. Default = %default (CD-HIT parameter)", metavar = "integer"), | |
make_option(opt_str = c("--gap"), default = -6, help = "Gap score. Default = %default (CD-HIT parameter)", metavar = "integer"), | |
make_option(opt_str = c("--gap_ext"), default = -1, help = "Gap extension score. Default = %default (CD-HIT parameter)", metavar = "integer"), | |
make_option(opt_str = c("-x", "--sort_cluster_by_size"), default = 1, help = "Either sort cluster by decreasing length (= 0) or by decreasing size (= 1). Default = %default (CD-HIT parameter: -sc)", metavar = "integer"), | |
make_option(opt_str = c("--summary"), default = NULL, help = "Filepath where a small summary file of the run should be written. Default = %default", metavar = "character") | |
) | |
opt_parser <- OptionParser(option_list = option_list, | |
description = "CD-HIT-EST Wrapper function. See https://github.com/weizhongli/cdhit/wiki for more information regarding CD-HITs parameters.", | |
epilogue = "Author: Hendrik Schultheis <Hendrik.Schultheis@mpi-bn.mpg.de>") | |
opt <- parse_args(opt_parser) | |
#' cd-hit-est wrapper | |
#' Receives bed with sequences in last column and writes bed with added cluster numbers. | |
#' | |
#' @param input Data.table or file in bed format (requires names in fourth column and sequences in last column). | |
#' @param identity Identity threshold. Default = 0.8 (CD-HIT parameter) | |
#' @param coverage Minimal alignment length for both sequences in nucleotides. Default = 8 (CD-HIT parameter) | |
#' @param output Clustered bed-file. Adds cluster number in last column (numbering depend on sort_by_cluster_size parameter). Default = cluster.bed | |
#' @param clean Clean up after run. Default = TRUE | |
#' @param threads Number of threads to use (0 = all cores). Default = 1 (CD-HIT parameter) | |
#' @param global Global sequence identity = 1, local = 0. Default = 0 (CD-HIT parameter) | |
#' @param band_width "Alignment band width. Default = 20 (CD-HIT parameter)" | |
#' @param memory Memory limit in MB. 0 for unlimited. Default = 800 (CD-HIT parameter) | |
#' @param word_length Default = 3 (CD-HIT parameter) | |
#' @param throw_away_sequences Maximum length of sequences thrown away. Default = %default (CD-HIT parameter) | |
#' @param length_dif_cutoff_shorter_p Shorter sequences length must be at least x percent of longer sequences length. Default = 0 (CD-HIT parameter) | |
#' @param length_dif_cutoff_shorter_n Length difference between sequences can not be higher than x nucleotides. Default = 999999 (CD-HIT parameter) | |
#' @param alignment_coverage_longer_p Alignment must cover x percent of longer sequence. Default = 0 (CD-HIT parameter) | |
#' @param alignment_coverage_longer_n There can not be more than x unaligned nucleotides on the longer sequence. Default = 99999999 (CD-HIT parameter) | |
#' @param alignment_coverage_shorter_p Alignment must cover x percent of shorter sequence. Default = 0 (CD-HIT parameter) | |
#' @param alignment_coverage_shorter_n There can not be more than x unaligned nucleotides on the longer sequence. Default = 99999999 (CD-HIT parameter) | |
#' @param max_unmatched_longer_p Maximum unmatched percentage on longer sequence (excludes leading tailing gap). Default = 1 (CD-HIT parameter) | |
#' @param max_unmatched_shorter_p Maximum unmatched percentage on shorter sequence (excludes leading tailing gap). Default = 1 (CD-HIT parameter) | |
#' @param max_unmatched_both_n Maximum unmatched nucleotide on both sequences (excludes leading tailing gap). Default = 99999999 (CD-HIT parameter) | |
#' @param fast_cluster Cluster sequence in first cluster that meets threshold = 0 (fast). Or cluster in best Cluster = 1 (accurate). Default = 1 (CD-HIT parameter) | |
#' @param strand Align +/+ & +/- (= 1). Or align only +/+ (= 0). Default = 0 (CD-HIT parameter) | |
#' @param match Matching score. Default = 2 (CD-HIT parameter) | |
#' @param mismatch Mismatch score. Default = -2 (CD-HIT parameter) | |
#' @param gap Gap score. Default = -6 (CD-HIT parameter) | |
#' @param gap_ext Gap extension score. Default = -1 (CD-HIT parameter) | |
#' @param sort_cluster_by_size Either sort cluster by decreasing length (= 0) or by decreasing size (= 1). Default = 1 (CD-HIT parameter) | |
#' @param summary Filepath where a small summary file of the run should be written. | |
#' | |
#' @details If there is a header supplied other then the default data.table naming scheme ('V1', 'V2', etc.) it will be kept and extended. | |
#' | |
#' @author Hendrik Schultheis <Hendrik.Schultheis@@mpi-bn.mpg.de> | |
#' | |
cdhitest <- function(input, identity = 0.8, coverage = 8, output = "cluster.bed", clean = TRUE, threads = 1, global = 0, band_width = 20, memory = 800, word_length = 3, throw_away_sequences = 5, length_dif_cutoff_shorter_p = 0, length_dif_cutoff_shorter_n = 999999, alignment_coverage_longer_p = 0, alignment_coverage_longer_n = 99999999, alignment_coverage_shorter_p = 0, alignment_coverage_shorter_n = 99999999, max_unmatched_longer_p = 1, max_unmatched_shorter_p = 1, max_unmatched_both_n = 99999999, fast_cluster = 1, strand = 0, match = 2, mismatch = -2, gap = -6, gap_ext = -1, sort_cluster_by_size = 1, summary = NULL) { | |
# parameter checks | |
if (system("which cd-hit-est", ignore.stdout = FALSE) != 0) { | |
stop("Required program CD-HIT not found! Please check whether it is installed.") | |
} | |
if (missing(input)) { | |
stop("No input specified! Please forward a valid bed-file.") | |
} | |
if (!file.exists(input)) { | |
stop("File ", input, " does not exist!") | |
} | |
if (!is.logical(clean)) { | |
stop("Parameter clean has to be a boolean value.") | |
} | |
message("Loading bed.") | |
# load bed if necessary | |
if (!data.table::is.data.table(input)) { | |
bed_table <- data.table::fread(input = input) | |
} else { | |
bed_table <- input | |
} | |
# check if there are column names to keep them | |
default_col_names <- grepl(pattern = "^V+\\d$", names(bed_table), perl = TRUE) | |
keep_col_names <- ifelse(any(default_col_names), FALSE, TRUE) | |
# check for duplicated names | |
if (anyDuplicated(bed_table[, 4])) { | |
warning("Found duplicated names. Making names unique.") | |
bed_table[, 4 := make.unique(names(bed_table)[4])] | |
} | |
message("Convert to fasta.") | |
### convert bed to fasta | |
# 4th column = name | |
# last column = sequence | |
fasta_file <- "converted_bed.fasta" | |
seqinr::write.fasta(sequences = as.list(bed_table[[ncol(bed_table)]]), names = bed_table[[4]], as.string = TRUE, file.out = fasta_file) | |
message("Clustering.") | |
### cd-hit-est | |
cdhit_output <- "cdhit_output" | |
cdhit_call <- paste("cd-hit-est -i", fasta_file, | |
"-o", cdhit_output, | |
"-c", identity, | |
"-A", coverage, | |
"-T", threads, | |
"-G", global, | |
"-b", band_width, | |
"-M", memory, | |
"-n", word_length, | |
"-l", throw_away_sequences, | |
"-s", length_dif_cutoff_shorter_p, | |
"-S", length_dif_cutoff_shorter_n, | |
"-aL", alignment_coverage_longer_p, | |
"-AL", alignment_coverage_longer_n, | |
"-aS", alignment_coverage_shorter_p, | |
"-AS", alignment_coverage_shorter_n, | |
"-uL", max_unmatched_longer_p, | |
"-uS", max_unmatched_shorter_p, | |
"-U", max_unmatched_both_n, | |
"-g", fast_cluster, | |
"-r", strand, | |
"-match", match, | |
"-mismatch", mismatch, | |
"-gap", gap, | |
"-gap-ext", gap_ext, | |
"-sc", sort_cluster_by_size, | |
"-d 0") | |
system(command = cdhit_call, wait = TRUE) | |
message("Formatting/ writing results.") | |
# reformat cluster file | |
# columns: id, clstr, clstr_size, length, clstr_rep, clstr_iden, clstr_cov | |
cluster_file <- "reformated.clstr" | |
cluster_call <- paste("clstr2txt.pl", paste0(cdhit_output, ".clstr"), ">", cluster_file) | |
system(command = cluster_call, wait = TRUE) | |
# load reformated file | |
cluster <- data.table::fread(cluster_file)[, c("id", "clstr")] | |
names(cluster) <- c("id", "cluster") | |
### add cluster to bed_table | |
result <- merge(x = bed_table, y = cluster, by.x = names(bed_table)[4], by.y = "id", sort = FALSE)[, union(names(bed_table), names(cluster)[2]), with = FALSE] | |
# delete files | |
if (clean) { | |
file.remove(fasta_file, paste0(cdhit_output, ".clstr"), cdhit_output, cluster_file) | |
} | |
# write summary | |
if (!is.null(summary)) { | |
# script/ function call | |
file_name <- sub("--file=", "", commandArgs()[grep("--file=", commandArgs())]) | |
call <- ifelse(interactive(), deparse(match.call()), | |
paste("Rscript", basename(file_name), paste(commandArgs(trailingOnly = TRUE), collapse = " ")) | |
) | |
total_cluster <- length(unique(result[[ncol(result)]])) | |
# cluster size table | |
# columns: cluster_id, size | |
cluster_table <- data.table::as.data.table(table(result[[ncol(result)]])) | |
names(cluster_table) <- c("cluster_id", "size") | |
summary_data <- c( | |
"Call:", | |
paste0("\t", call), | |
"", | |
"Cluster:", | |
paste("\tTotal", total_cluster), | |
"" | |
) | |
write(x = summary_data, file = summary) | |
data.table::fwrite(x = cluster_table, file = summary, append = TRUE, sep = "\t", col.names = TRUE) | |
} | |
# cast start and end column to integer64 to prevent scientific notation e.g. 1e+10 | |
# start and end are assumed to be at position 2 and 3 | |
result[, c(2, 3) := lapply(.SD, bit64::as.integer64), .SDcols = c(2, 3)] | |
data.table::fwrite(x = result, file = output, sep = "\t", col.names = keep_col_names) | |
} | |
# call function with given parameter if not in interactive context (e.g. run from shell) | |
if (!interactive()) { | |
# remove last parameter (help param) | |
# show help if called without arguments | |
if (length(commandArgs(trailingOnly = TRUE)) <= 0) { | |
print_help(opt_parser) | |
} else { | |
params <- opt[-length(opt)] | |
do.call(cdhitest, args = params) | |
} | |
} |