diff --git a/bin/bed_to_fasta.R b/bin/bed_to_fasta.R index dce3839..56ac9dc 100644 --- a/bin/bed_to_fasta.R +++ b/bin/bed_to_fasta.R @@ -1,41 +1,58 @@ -#!/usr/bin/env Rscript - -#' Splitting BED-files depending on their cluster. -#' The Sequences of each cluster are writen as an FASTA-file. -#' @param bedInput BED-file with sequences and cluster-id as last two columns: -#' Sequence: second last column; Cluster ID: last column -#' @param prefix prefix for filenames -#' @param min_seq min. number of sequences per cluster -#' -#' @author René Wiegandt -#' @contact rene.wiegandt(at)mpi-bn.mpg.de - -args = commandArgs(trailingOnly = TRUE) - -bedInput <- args[1] -prefix <- args[2] -min_seq <- args[3] - -bed <- data.table::fread(bedInput, header = FALSE, sep = "\t") - -# Get last column of data.table, which refers to the cluster, as a vector. -cluster_no <- as.vector(bed[[ncol(bed)]]) - -# Split data.table bed on its last column (cluster_no) into list of data.frames -clusters <- split(bed, cluster_no, sorted = TRUE, flatten = FALSE) - -# For each data.frame(cluster) in list clusters: -discard <- lapply(1:length(clusters), function(i){ - clust <- as.data.frame(clusters[i]) - # Filter data.tables(clusters), which are to small - if (nrow(clust) >= as.numeric(min_seq) ) { - # Get second last column, which contains the nucleotide sequences - sequences <- as.list(clust[[ncol(clust) - 1]]) - # Create filename - outfile <- paste0(prefix,"_cluster_",i - 1,".FASTA") - # Write fasta file - seqinr::write.fasta(sequences = sequences, names = clust[[4]], file.out = outfile, as.string = TRUE) - } else { - print(paste0("Cluster: ",i," is to small")) - } -}) +#!/usr/bin/env Rscript +library("optparse") + +option_list <- list( + make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input bed-file. Second last column must be sequences and last column must be the cluster_id.", metavar = "character"), + make_option(opt_str = c("-p", "--prefix"), default = "" , help = "Prefix for file names. Default = '%default'", metavar = "character"), + make_option(opt_str = c("-m", "--min_seq"), default = 100, help = "Minimum amount of sequences in clusters. Default = %default", metavar = "integer") +) + +opt_parser <- OptionParser(option_list = option_list, + description = "Convert BED-file to one FASTA-file per cluster") + +opt <- parse_args(opt_parser) + +#' Splitting BED-files depending on their cluster. +#' The Sequences of each cluster are writen as an FASTA-file. +#' @param bedInput BED-file with sequences and cluster-id as last two columns: +#' Sequence: second last column; Cluster ID: last column +#' @param prefix prefix for filenames +#' @param min_seq min. number of sequences per cluster +#' +#' @author René Wiegandt +#' @contact rene.wiegandt(at)mpi-bn.mpg.de +bed_to_fasta <- function(bedInput, prefix = "", min_seq = 100){ + + if(is.null(bedInput)){ + stop("ERROR: Input parameter cannot be null! Please specify the input parameter.") + } + + bed <- data.table::fread(bedInput, header = FALSE, sep = "\t") + + # Get last column of data.table, which refers to the cluster, as a vector. + cluster_no <- as.vector(bed[[ncol(bed)]]) + + # Split data.table bed on its last column (cluster_no) into list of data.frames + clusters <- split(bed, cluster_no, sorted = TRUE, flatten = FALSE) + + # For each data.frame(cluster) in list clusters: + discard <- lapply(1:length(clusters), function(i){ + clust <- as.data.frame(clusters[i]) + # Filter data.tables(clusters), which are to small + if (nrow(clust) >= as.numeric(min_seq) ) { + # Get second last column, which contains the nucleotide sequences + sequences <- as.list(clust[[ncol(clust) - 1]]) + # Create filename + outfile <- paste0(prefix,"_cluster_",i - 1,".FASTA") + # Write fasta file + seqinr::write.fasta(sequences = sequences, names = clust[[4]], file.out = outfile, as.string = TRUE) + } else { + print(paste0("Cluster: ",i," is to small")) + } + }) +} + +# run function bed_to_fasta with given parameteres if not in interactive context (e.g. run from shell) +if (!interactive()) { + bed_to_fasta(opt$input, opt$prefix, opt$min_seq) +} \ No newline at end of file