-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
bed_to_fasta.R: Imporved parametercalling with optparse
- Loading branch information
1 parent
62f6f3f
commit cf9dcd8
Showing
1 changed file
with
58 additions
and
41 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,41 +1,58 @@ | ||
#!/usr/bin/env Rscript | ||
|
||
#' Splitting BED-files depending on their cluster. | ||
#' The Sequences of each cluster are writen as an FASTA-file. | ||
#' @param bedInput <string> BED-file with sequences and cluster-id as last two columns: | ||
#' Sequence: second last column; Cluster ID: last column | ||
#' @param prefix <string> prefix for filenames | ||
#' @param min_seq <INT> min. number of sequences per cluster | ||
#' | ||
#' @author René Wiegandt | ||
#' @contact rene.wiegandt(at)mpi-bn.mpg.de | ||
|
||
args = commandArgs(trailingOnly = TRUE) | ||
|
||
bedInput <- args[1] | ||
prefix <- args[2] | ||
min_seq <- args[3] | ||
|
||
bed <- data.table::fread(bedInput, header = FALSE, sep = "\t") | ||
|
||
# Get last column of data.table, which refers to the cluster, as a vector. | ||
cluster_no <- as.vector(bed[[ncol(bed)]]) | ||
|
||
# Split data.table bed on its last column (cluster_no) into list of data.frames | ||
clusters <- split(bed, cluster_no, sorted = TRUE, flatten = FALSE) | ||
|
||
# For each data.frame(cluster) in list clusters: | ||
discard <- lapply(1:length(clusters), function(i){ | ||
clust <- as.data.frame(clusters[i]) | ||
# Filter data.tables(clusters), which are to small | ||
if (nrow(clust) >= as.numeric(min_seq) ) { | ||
# Get second last column, which contains the nucleotide sequences | ||
sequences <- as.list(clust[[ncol(clust) - 1]]) | ||
# Create filename | ||
outfile <- paste0(prefix,"_cluster_",i - 1,".FASTA") | ||
# Write fasta file | ||
seqinr::write.fasta(sequences = sequences, names = clust[[4]], file.out = outfile, as.string = TRUE) | ||
} else { | ||
print(paste0("Cluster: ",i," is to small")) | ||
} | ||
}) | ||
#!/usr/bin/env Rscript | ||
library("optparse") | ||
|
||
option_list <- list( | ||
make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input bed-file. Second last column must be sequences and last column must be the cluster_id.", metavar = "character"), | ||
make_option(opt_str = c("-p", "--prefix"), default = "" , help = "Prefix for file names. Default = '%default'", metavar = "character"), | ||
make_option(opt_str = c("-m", "--min_seq"), default = 100, help = "Minimum amount of sequences in clusters. Default = %default", metavar = "integer") | ||
) | ||
|
||
opt_parser <- OptionParser(option_list = option_list, | ||
description = "Convert BED-file to one FASTA-file per cluster") | ||
|
||
opt <- parse_args(opt_parser) | ||
|
||
#' Splitting BED-files depending on their cluster. | ||
#' The Sequences of each cluster are writen as an FASTA-file. | ||
#' @param bedInput <string> BED-file with sequences and cluster-id as last two columns: | ||
#' Sequence: second last column; Cluster ID: last column | ||
#' @param prefix <string> prefix for filenames | ||
#' @param min_seq <INT> min. number of sequences per cluster | ||
#' | ||
#' @author René Wiegandt | ||
#' @contact rene.wiegandt(at)mpi-bn.mpg.de | ||
bed_to_fasta <- function(bedInput, prefix = "", min_seq = 100){ | ||
|
||
if(is.null(bedInput)){ | ||
stop("ERROR: Input parameter cannot be null! Please specify the input parameter.") | ||
} | ||
|
||
bed <- data.table::fread(bedInput, header = FALSE, sep = "\t") | ||
|
||
# Get last column of data.table, which refers to the cluster, as a vector. | ||
cluster_no <- as.vector(bed[[ncol(bed)]]) | ||
|
||
# Split data.table bed on its last column (cluster_no) into list of data.frames | ||
clusters <- split(bed, cluster_no, sorted = TRUE, flatten = FALSE) | ||
|
||
# For each data.frame(cluster) in list clusters: | ||
discard <- lapply(1:length(clusters), function(i){ | ||
clust <- as.data.frame(clusters[i]) | ||
# Filter data.tables(clusters), which are to small | ||
if (nrow(clust) >= as.numeric(min_seq) ) { | ||
# Get second last column, which contains the nucleotide sequences | ||
sequences <- as.list(clust[[ncol(clust) - 1]]) | ||
# Create filename | ||
outfile <- paste0(prefix,"_cluster_",i - 1,".FASTA") | ||
# Write fasta file | ||
seqinr::write.fasta(sequences = sequences, names = clust[[4]], file.out = outfile, as.string = TRUE) | ||
} else { | ||
print(paste0("Cluster: ",i," is to small")) | ||
} | ||
}) | ||
} | ||
|
||
# run function bed_to_fasta with given parameteres if not in interactive context (e.g. run from shell) | ||
if (!interactive()) { | ||
bed_to_fasta(opt$input, opt$prefix, opt$min_seq) | ||
} |