diff --git a/bin/cdhit_wrapper.R b/bin/cdhit_wrapper.R new file mode 100644 index 0000000..fd2ce49 --- /dev/null +++ b/bin/cdhit_wrapper.R @@ -0,0 +1,75 @@ +#! /bin/Rscript +library("optparse") + +option_list <- list( + make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input bed-file. Last column must be sequences.", metavar = "character"), + make_option(opt_str = c("-s", "--similarity"), default = 0.8, help = "Similarity threshold. Default = %default", metavar = "double >= 0.8"), + make_option(opt_str = c("-A", "--coverage"), default = 8, help = "Minimal alignment length for both sequences in nucelotides. Default = %default", metavar = "integer"), + make_option(opt_str = c("-o", "--output"), default = "cluster.bed", help = "Output file. Default = %default", metavar = "character"), + make_option(opt_str = c("-c", "--clean"), default = TRUE, help = "Delete all temporary files. Default = %default", metavar = "logical") + # TODO more args +) + +opt_parser <- OptionParser(option_list = option_list, + description = "Reduce sequences to frequent regions.") + +opt <- parse_args(opt_parser) + +#' cd-hit wrapper +#' +#' @param input +#' @param similarity cdhit = -c +#' @param coverage In Nucleotides. cdhit = -A +#' @param output Clustered bedfile. Adds cluster number in last column (lower number = bigger cluster). +#' @param clean Clean up after run. +#' +#' @return bed_table with cluster in last column +#' TODO add all cdhit parameter +#' TODO check whether cdhit is installed +cdhitest <- function(input, similarity = 0.8, coverage = 8, output = "cluster.bed", clean = TRUE) { + # load bed if neccessary + if (!data.table::is.data.table(input)) { + bed_table <- data.table::fread(input = input, header = FALSE) + } else { + bed_table <- input + } + + ### convert bed to fasta + # 4th column = name + # last column = sequence + fasta_file <- "converted_bed.fasta" + seqinr::write.fasta(sequences = as.list(bed_table[[ncol(bed_table)]]), names = bed_table[[4]], as.string = TRUE, file.out = fasta_file) + + ### cd-hit-est + cdhit_output <- "cdhit_output" + cdhit_call <- paste("cd-hit-est -i", fasta_file, "-o", cdhit_output, "-c", similarity, "-A", coverage, "-G 0 -n 3 -g 1 -r 0 -l 5 -sc 1") + + system(command = cdhit_call, wait = TRUE) + + # reformat cluster file + # columns: id, clstr, clstr_size, length, clstr_rep, clstr_iden, clstr_cov + cluster_file <- "reformated.clstr" + cluster_call <- paste("clstr2txt.pl", paste0(cdhit_output, ".clstr"), ">", cluster_file) + + system(command = cluster_call, wait = TRUE) + + # load reformated file + cluster <- data.table::fread(cluster_file) + + ### add cluster to bed_table + result <- merge(x = bed_table, y = cluster[, c("id", "clstr")], by.x = "V4", by.y = "id", sort = FALSE)[, union(names(bed_table), names(cluster)[2]), with = FALSE] + + # delete files + if (clean) { + file.remove(fasta_file, paste0(cdhit_output, ".clstr"), cdhit_output, cluster_file) + } + + data.table::fwrite(x = result, file = output, sep = "\t", col.names = FALSE) +} + +# call function with given parameter if not in interactive context (e.g. run from shell) +if (!interactive()) { + # remove last parameter (help param) + params <- opt[-length(opt)] + do.call(cdhitest, args = params) +}