bed_to_fasta.R

#!/usr/bin/env Rscript
library("optparse")

option_list <- list(
  make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input bed-file. Second last column must be sequences and last column must be the cluster_id.", metavar = "character"),
  make_option(opt_str = c("-p", "--prefix"), default = "" , help = "Prefix for file names. Default = '%default'", metavar = "character"),
  make_option(opt_str = c("-m", "--min_seq"), default = 100, help = "Minimum amount of sequences in clusters. Default = %default", metavar = "integer")
)

opt_parser <- OptionParser(option_list = option_list,
                           description = "Convert BED-file to one FASTA-file per cluster")

opt <- parse_args(opt_parser)

#' Splitting BED-files depending on their cluster.
#' The Sequences of each cluster are writen as an FASTA-file.
#' @param bedInput <string> BED-file with sequences and cluster-id as last two columns:
#'                              Sequence: second last column; Cluster ID: last column
#' @param prefix <string> prefix for filenames
#' @param min_seq <INT> min. number of sequences per cluster
#'
#' @author René Wiegandt
#' @contact rene.wiegandt(at)mpi-bn.mpg.de
bed_to_fasta <- function(bedInput, prefix = "", min_seq = 100){

  if (is.null(bedInput)) {
    stop("ERROR: Input parameter cannot be null! Please specify the input parameter.")
  }

  bed <- data.table::fread(bedInput, sep = "\t")

  # Get last column of data.table, which refers to the cluster, as a vector.
  cluster_no <- as.vector(bed[[ncol(bed)]])

  # Split data.table bed on its last column (cluster_no) into list of data.frames
  clusters <- split(bed, cluster_no, sorted = TRUE, flatten = FALSE)

  # For each data.frame(cluster) in list clusters:
  discard <- lapply(1:length(clusters), function(i){
    clust <- as.data.frame(clusters[i])
    # Filter data.tables(clusters), which are to small
    if (nrow(clust) >= as.numeric(min_seq) ) {
      # Get second last column, which contains the nucleotide sequences
      sequences <- as.list(clust[[ncol(clust) - 1]])
      # Create filename
      outfile <- paste0(prefix,"_cluster_",i - 1,".FASTA")
      # Write fasta file
      seqinr::write.fasta(sequences = sequences, names = clust[[4]], file.out = outfile, as.string = TRUE)
    } else {
      print(paste0("Cluster: ",i," is to small"))
    }
  })
}

# run function bed_to_fasta with given parameteres if not in interactive context (e.g. run from shell)
if (!interactive()) {
  bed_to_fasta(opt$input, opt$prefix, opt$min_seq)
}
	#!/usr/bin/env Rscript
	library("optparse")

	option_list <- list(
	make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input bed-file. Second last column must be sequences and last column must be the cluster_id.", metavar = "character"),
	make_option(opt_str = c("-p", "--prefix"), default = "" , help = "Prefix for file names. Default = '%default'", metavar = "character"),
	make_option(opt_str = c("-m", "--min_seq"), default = 100, help = "Minimum amount of sequences in clusters. Default = %default", metavar = "integer")
	)

	opt_parser <- OptionParser(option_list = option_list,
	description = "Convert BED-file to one FASTA-file per cluster")

	opt <- parse_args(opt_parser)

	#' Splitting BED-files depending on their cluster.
	#' The Sequences of each cluster are writen as an FASTA-file.
	#' @param bedInput <string> BED-file with sequences and cluster-id as last two columns:
	#' Sequence: second last column; Cluster ID: last column
	#' @param prefix <string> prefix for filenames
	#' @param min_seq <INT> min. number of sequences per cluster
	#'
	#' @author René Wiegandt
	#' @contact rene.wiegandt(at)mpi-bn.mpg.de
	bed_to_fasta <- function(bedInput, prefix = "", min_seq = 100){

	if (is.null(bedInput)) {
	stop("ERROR: Input parameter cannot be null! Please specify the input parameter.")
	}

	bed <- data.table::fread(bedInput, sep = "\t")

	# Get last column of data.table, which refers to the cluster, as a vector.
	cluster_no <- as.vector(bed[[ncol(bed)]])

	# Split data.table bed on its last column (cluster_no) into list of data.frames
	clusters <- split(bed, cluster_no, sorted = TRUE, flatten = FALSE)

	# For each data.frame(cluster) in list clusters:
	discard <- lapply(1:length(clusters), function(i){
	clust <- as.data.frame(clusters[i])
	# Filter data.tables(clusters), which are to small
	if (nrow(clust) >= as.numeric(min_seq) ) {
	# Get second last column, which contains the nucleotide sequences
	sequences <- as.list(clust[[ncol(clust) - 1]])
	# Create filename
	outfile <- paste0(prefix,"_cluster_",i - 1,".FASTA")
	# Write fasta file
	seqinr::write.fasta(sequences = sequences, names = clust[[4]], file.out = outfile, as.string = TRUE)
	} else {
	print(paste0("Cluster: ",i," is to small"))
	}
	})
	}

	# run function bed_to_fasta with given parameteres if not in interactive context (e.g. run from shell)
	if (!interactive()) {
	bed_to_fasta(opt$input, opt$prefix, opt$min_seq)
	}