#!/usr/bin/env Rscript library("optparse") option_list <- list( make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input bed-file. Second last column must be sequences and last column must be the cluster_id.", metavar = "character"), make_option(opt_str = c("-p", "--prefix"), default = "" , help = "Prefix for file names. Default = '%default'", metavar = "character"), make_option(opt_str = c("-m", "--min_seq"), default = 100, help = "Minimum amount of sequences in clusters. Default = %default", metavar = "integer") ) opt_parser <- OptionParser(option_list = option_list, description = "Convert BED-file to one FASTA-file per cluster") opt <- parse_args(opt_parser) #' Splitting BED-files depending on their cluster. #' The Sequences of each cluster are written as an FASTA-file. #' @param bedInput <string> BED-file with sequences and cluster-id as last two columns: #' Sequence: second last column; Cluster ID: last column #' @param prefix <string> prefix for filenames #' @param min_seq <INT> min. number of sequences per cluster #' #' @author René Wiegandt #' @contact rene.wiegandt(at)mpi-bn.mpg.de bed_to_fasta <- function(bedInput, prefix = "", min_seq = 100){ if (is.null(bedInput)) { stop("ERROR: Input parameter cannot be null! Please specify the input parameter.") } bed <- data.table::fread(bedInput, sep = "\t") # Get last column of data.table, which refers to the cluster, as a vector. cluster_no <- as.vector(bed[[ncol(bed)]]) # Split data.table bed on its last column (cluster_no) into list of data.frames clusters <- split(bed, cluster_no, sorted = TRUE, flatten = FALSE) # For each data.frame(cluster) in list clusters: discard <- lapply(1:length(clusters), function(i){ clust <- as.data.frame(clusters[i]) # Filter data.tables(clusters), which are to small if (nrow(clust) >= as.numeric(min_seq) ) { # Get second last column, which contains the nucleotide sequences sequences <- as.list(clust[[ncol(clust) - 1]]) # Create filename outfile <- paste0(prefix,"_cluster_",i - 1,".FASTA") # Write fasta file seqinr::write.fasta(sequences = sequences, names = clust[[4]], file.out = outfile, as.string = TRUE) } else { print(paste0("Cluster: ",i," is to small")) } }) } # run function bed_to_fasta with given parameteres if not in interactive context (e.g. run from shell) if (!interactive()) { bed_to_fasta(opt$input, opt$prefix, opt$min_seq) }