Skip to content
Permalink
3c4f733eb6
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
58 lines (48 sloc) 2.5 KB
#!/usr/bin/env Rscript
library("optparse")
option_list <- list(
make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input bed-file. Second last column must be sequences and last column must be the cluster_id.", metavar = "character"),
make_option(opt_str = c("-p", "--prefix"), default = "" , help = "Prefix for file names. Default = '%default'", metavar = "character"),
make_option(opt_str = c("-m", "--min_seq"), default = 100, help = "Minimum amount of sequences in clusters. Default = %default", metavar = "integer")
)
opt_parser <- OptionParser(option_list = option_list,
description = "Convert BED-file to one FASTA-file per cluster")
opt <- parse_args(opt_parser)
#' Splitting BED-files depending on their cluster.
#' The Sequences of each cluster are writen as an FASTA-file.
#' @param bedInput <string> BED-file with sequences and cluster-id as last two columns:
#' Sequence: second last column; Cluster ID: last column
#' @param prefix <string> prefix for filenames
#' @param min_seq <INT> min. number of sequences per cluster
#'
#' @author René Wiegandt
#' @contact rene.wiegandt(at)mpi-bn.mpg.de
bed_to_fasta <- function(bedInput, prefix = "", min_seq = 100){
if(is.null(bedInput)){
stop("ERROR: Input parameter cannot be null! Please specify the input parameter.")
}
bed <- data.table::fread(bedInput, header = FALSE, sep = "\t")
# Get last column of data.table, which refers to the cluster, as a vector.
cluster_no <- as.vector(bed[[ncol(bed)]])
# Split data.table bed on its last column (cluster_no) into list of data.frames
clusters <- split(bed, cluster_no, sorted = TRUE, flatten = FALSE)
# For each data.frame(cluster) in list clusters:
discard <- lapply(1:length(clusters), function(i){
clust <- as.data.frame(clusters[i])
# Filter data.tables(clusters), which are to small
if (nrow(clust) >= as.numeric(min_seq) ) {
# Get second last column, which contains the nucleotide sequences
sequences <- as.list(clust[[ncol(clust) - 1]])
# Create filename
outfile <- paste0(prefix,"_cluster_",i - 1,".FASTA")
# Write fasta file
seqinr::write.fasta(sequences = sequences, names = clust[[4]], file.out = outfile, as.string = TRUE)
} else {
print(paste0("Cluster: ",i," is to small"))
}
})
}
# run function bed_to_fasta with given parameteres if not in interactive context (e.g. run from shell)
if (!interactive()) {
bed_to_fasta(opt$input, opt$prefix, opt$min_seq)
}