Skip to content

Commit

Permalink
bed_to_fasta.R: Imporved parametercalling with optparse
Browse files Browse the repository at this point in the history
  • Loading branch information
renewiegandt committed Dec 19, 2018
1 parent 62f6f3f commit cf9dcd8
Showing 1 changed file with 58 additions and 41 deletions.
99 changes: 58 additions & 41 deletions bin/bed_to_fasta.R
Original file line number Diff line number Diff line change
@@ -1,41 +1,58 @@
#!/usr/bin/env Rscript

#' Splitting BED-files depending on their cluster.
#' The Sequences of each cluster are writen as an FASTA-file.
#' @param bedInput <string> BED-file with sequences and cluster-id as last two columns:
#' Sequence: second last column; Cluster ID: last column
#' @param prefix <string> prefix for filenames
#' @param min_seq <INT> min. number of sequences per cluster
#'
#' @author René Wiegandt
#' @contact rene.wiegandt(at)mpi-bn.mpg.de

args = commandArgs(trailingOnly = TRUE)

bedInput <- args[1]
prefix <- args[2]
min_seq <- args[3]

bed <- data.table::fread(bedInput, header = FALSE, sep = "\t")

# Get last column of data.table, which refers to the cluster, as a vector.
cluster_no <- as.vector(bed[[ncol(bed)]])

# Split data.table bed on its last column (cluster_no) into list of data.frames
clusters <- split(bed, cluster_no, sorted = TRUE, flatten = FALSE)

# For each data.frame(cluster) in list clusters:
discard <- lapply(1:length(clusters), function(i){
clust <- as.data.frame(clusters[i])
# Filter data.tables(clusters), which are to small
if (nrow(clust) >= as.numeric(min_seq) ) {
# Get second last column, which contains the nucleotide sequences
sequences <- as.list(clust[[ncol(clust) - 1]])
# Create filename
outfile <- paste0(prefix,"_cluster_",i - 1,".FASTA")
# Write fasta file
seqinr::write.fasta(sequences = sequences, names = clust[[4]], file.out = outfile, as.string = TRUE)
} else {
print(paste0("Cluster: ",i," is to small"))
}
})
#!/usr/bin/env Rscript
library("optparse")

option_list <- list(
make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input bed-file. Second last column must be sequences and last column must be the cluster_id.", metavar = "character"),
make_option(opt_str = c("-p", "--prefix"), default = "" , help = "Prefix for file names. Default = '%default'", metavar = "character"),
make_option(opt_str = c("-m", "--min_seq"), default = 100, help = "Minimum amount of sequences in clusters. Default = %default", metavar = "integer")
)

opt_parser <- OptionParser(option_list = option_list,
description = "Convert BED-file to one FASTA-file per cluster")

opt <- parse_args(opt_parser)

#' Splitting BED-files depending on their cluster.
#' The Sequences of each cluster are writen as an FASTA-file.
#' @param bedInput <string> BED-file with sequences and cluster-id as last two columns:
#' Sequence: second last column; Cluster ID: last column
#' @param prefix <string> prefix for filenames
#' @param min_seq <INT> min. number of sequences per cluster
#'
#' @author René Wiegandt
#' @contact rene.wiegandt(at)mpi-bn.mpg.de
bed_to_fasta <- function(bedInput, prefix = "", min_seq = 100){

if(is.null(bedInput)){
stop("ERROR: Input parameter cannot be null! Please specify the input parameter.")
}

bed <- data.table::fread(bedInput, header = FALSE, sep = "\t")

# Get last column of data.table, which refers to the cluster, as a vector.
cluster_no <- as.vector(bed[[ncol(bed)]])

# Split data.table bed on its last column (cluster_no) into list of data.frames
clusters <- split(bed, cluster_no, sorted = TRUE, flatten = FALSE)

# For each data.frame(cluster) in list clusters:
discard <- lapply(1:length(clusters), function(i){
clust <- as.data.frame(clusters[i])
# Filter data.tables(clusters), which are to small
if (nrow(clust) >= as.numeric(min_seq) ) {
# Get second last column, which contains the nucleotide sequences
sequences <- as.list(clust[[ncol(clust) - 1]])
# Create filename
outfile <- paste0(prefix,"_cluster_",i - 1,".FASTA")
# Write fasta file
seqinr::write.fasta(sequences = sequences, names = clust[[4]], file.out = outfile, as.string = TRUE)
} else {
print(paste0("Cluster: ",i," is to small"))
}
})
}

# run function bed_to_fasta with given parameteres if not in interactive context (e.g. run from shell)
if (!interactive()) {
bed_to_fasta(opt$input, opt$prefix, opt$min_seq)
}

0 comments on commit cf9dcd8

Please sign in to comment.