cdhit_wrapper.R

#! /bin/Rscript
library("optparse")

option_list <- list(
  make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input bed-file. Last column must be sequences.", metavar = "character"),
  make_option(opt_str = c("-s", "--similarity"), default = 0.8, help = "Similarity threshold. Default = %default", metavar = "double >= 0.8"),
  make_option(opt_str = c("-A", "--coverage"), default = 8, help = "Minimal alignment length for both sequences in nucelotides. Default = %default", metavar = "integer"),
  make_option(opt_str = c("-o", "--output"), default = "cluster.bed", help = "Output file. Default = %default", metavar = "character"),
  make_option(opt_str = c("-c", "--clean"), default = TRUE, help = "Delete all temporary files. Default = %default", metavar = "logical")
  # TODO more args
)

opt_parser <- OptionParser(option_list = option_list,
                           description = "CD-HIT-EST Wrapper function.")

opt <- parse_args(opt_parser)

#' cd-hit wrapper
#'
#' @param input
#' @param similarity Similarity threshold.
#' @param coverage Minimal alignment length for both sequences in nucelotides.
#' @param output Clustered bedfile. Adds cluster number in last column (lower number = bigger cluster).
#' @param clean Clean up after run.
#'
#' @return bed_table with cluster in last column
#' TODO add all cdhit parameter
#' TODO check whether cdhit is installed
cdhitest <- function(input, similarity = 0.8, coverage = 8, output = "cluster.bed", clean = TRUE) {
  # load bed if neccessary
  if (!data.table::is.data.table(input)) {
    bed_table <- data.table::fread(input = input, header = FALSE)
  } else {
    bed_table <- input
  }

  ### convert bed to fasta
  # 4th column = name
  # last column = sequence
  fasta_file <- "converted_bed.fasta"
  seqinr::write.fasta(sequences = as.list(bed_table[[ncol(bed_table)]]), names = bed_table[[4]], as.string = TRUE, file.out = fasta_file)

  ### cd-hit-est
  cdhit_output <- "cdhit_output"
  cdhit_call <- paste("cd-hit-est -i", fasta_file, "-o", cdhit_output, "-c", similarity, "-A", coverage, "-G 0 -n 3 -g 1 -r 0 -l 5 -sc 1 -d 0")

  system(command = cdhit_call, wait = TRUE)

  # reformat cluster file
  # columns: id, clstr, clstr_size, length, clstr_rep, clstr_iden, clstr_cov
  cluster_file <- "reformated.clstr"
  cluster_call <- paste("clstr2txt.pl", paste0(cdhit_output, ".clstr"), ">", cluster_file)

  system(command = cluster_call, wait = TRUE)

  # load reformated file
  cluster <- data.table::fread(cluster_file)

  ### add cluster to bed_table
  result <- merge(x = bed_table, y = cluster[, c("id", "clstr")], by.x = "V4", by.y = "id", sort = FALSE)[, union(names(bed_table), names(cluster)[2]), with = FALSE]

  # delete files
  if (clean) {
    file.remove(fasta_file, paste0(cdhit_output, ".clstr"), cdhit_output, cluster_file)
  }

  data.table::fwrite(x = result, file = output, sep = "\t", col.names = FALSE)
}

# call function with given parameter if not in interactive context (e.g. run from shell)
if (!interactive()) {
  # remove last parameter (help param)
  params <- opt[-length(opt)]
  do.call(cdhitest, args = params)
}
	#! /bin/Rscript
	library("optparse")

	option_list <- list(
	make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input bed-file. Last column must be sequences.", metavar = "character"),
	make_option(opt_str = c("-s", "--similarity"), default = 0.8, help = "Similarity threshold. Default = %default", metavar = "double >= 0.8"),
	make_option(opt_str = c("-A", "--coverage"), default = 8, help = "Minimal alignment length for both sequences in nucelotides. Default = %default", metavar = "integer"),
	make_option(opt_str = c("-o", "--output"), default = "cluster.bed", help = "Output file. Default = %default", metavar = "character"),
	make_option(opt_str = c("-c", "--clean"), default = TRUE, help = "Delete all temporary files. Default = %default", metavar = "logical")
	# TODO more args
	)

	opt_parser <- OptionParser(option_list = option_list,
	description = "CD-HIT-EST Wrapper function.")

	opt <- parse_args(opt_parser)

	#' cd-hit wrapper
	#'
	#' @param input
	#' @param similarity Similarity threshold.
	#' @param coverage Minimal alignment length for both sequences in nucelotides.
	#' @param output Clustered bedfile. Adds cluster number in last column (lower number = bigger cluster).
	#' @param clean Clean up after run.
	#'
	#' @return bed_table with cluster in last column
	#' TODO add all cdhit parameter
	#' TODO check whether cdhit is installed
	cdhitest <- function(input, similarity = 0.8, coverage = 8, output = "cluster.bed", clean = TRUE) {
	# load bed if neccessary
	if (!data.table::is.data.table(input)) {
	bed_table <- data.table::fread(input = input, header = FALSE)
	} else {
	bed_table <- input
	}

	### convert bed to fasta
	# 4th column = name
	# last column = sequence
	fasta_file <- "converted_bed.fasta"
	seqinr::write.fasta(sequences = as.list(bed_table[[ncol(bed_table)]]), names = bed_table[[4]], as.string = TRUE, file.out = fasta_file)

	### cd-hit-est
	cdhit_output <- "cdhit_output"
	cdhit_call <- paste("cd-hit-est -i", fasta_file, "-o", cdhit_output, "-c", similarity, "-A", coverage, "-G 0 -n 3 -g 1 -r 0 -l 5 -sc 1 -d 0")

	system(command = cdhit_call, wait = TRUE)

	# reformat cluster file
	# columns: id, clstr, clstr_size, length, clstr_rep, clstr_iden, clstr_cov
	cluster_file <- "reformated.clstr"
	cluster_call <- paste("clstr2txt.pl", paste0(cdhit_output, ".clstr"), ">", cluster_file)

	system(command = cluster_call, wait = TRUE)

	# load reformated file
	cluster <- data.table::fread(cluster_file)

	### add cluster to bed_table
	result <- merge(x = bed_table, y = cluster[, c("id", "clstr")], by.x = "V4", by.y = "id", sort = FALSE)[, union(names(bed_table), names(cluster)[2]), with = FALSE]

	# delete files
	if (clean) {
	file.remove(fasta_file, paste0(cdhit_output, ".clstr"), cdhit_output, cluster_file)
	}

	data.table::fwrite(x = result, file = output, sep = "\t", col.names = FALSE)
	}

	# call function with given parameter if not in interactive context (e.g. run from shell)
	if (!interactive()) {
	# remove last parameter (help param)
	params <- opt[-length(opt)]
	do.call(cdhitest, args = params)
	}