Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
master_project_JLU2018/bin/bed_to_fasta.R
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
58 lines (48 sloc)
2.54 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env Rscript | |
library("optparse") | |
option_list <- list( | |
make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input bed-file. Second last column must be sequences and last column must be the cluster_id.", metavar = "character"), | |
make_option(opt_str = c("-p", "--prefix"), default = "" , help = "Prefix for file names. Default = '%default'", metavar = "character"), | |
make_option(opt_str = c("-m", "--min_seq"), default = 100, help = "Minimum amount of sequences in clusters. Default = %default", metavar = "integer") | |
) | |
opt_parser <- OptionParser(option_list = option_list, | |
description = "Convert BED-file to one FASTA-file per cluster") | |
opt <- parse_args(opt_parser) | |
#' Splitting BED-files depending on their cluster. | |
#' The Sequences of each cluster are writen as an FASTA-file. | |
#' @param bedInput <string> BED-file with sequences and cluster-id as last two columns: | |
#' Sequence: second last column; Cluster ID: last column | |
#' @param prefix <string> prefix for filenames | |
#' @param min_seq <INT> min. number of sequences per cluster | |
#' | |
#' @author René Wiegandt | |
#' @contact rene.wiegandt(at)mpi-bn.mpg.de | |
bed_to_fasta <- function(bedInput, prefix = "", min_seq = 100){ | |
if (is.null(bedInput)) { | |
stop("ERROR: Input parameter cannot be null! Please specify the input parameter.") | |
} | |
bed <- data.table::fread(bedInput, sep = "\t") | |
# Get last column of data.table, which refers to the cluster, as a vector. | |
cluster_no <- as.vector(bed[[ncol(bed)]]) | |
# Split data.table bed on its last column (cluster_no) into list of data.frames | |
clusters <- split(bed, cluster_no, sorted = TRUE, flatten = FALSE) | |
# For each data.frame(cluster) in list clusters: | |
discard <- lapply(1:length(clusters), function(i){ | |
clust <- as.data.frame(clusters[i]) | |
# Filter data.tables(clusters), which are to small | |
if (nrow(clust) >= as.numeric(min_seq) ) { | |
# Get second last column, which contains the nucleotide sequences | |
sequences <- as.list(clust[[ncol(clust) - 1]]) | |
# Create filename | |
outfile <- paste0(prefix,"_cluster_",i - 1,".FASTA") | |
# Write fasta file | |
seqinr::write.fasta(sequences = sequences, names = clust[[4]], file.out = outfile, as.string = TRUE) | |
} else { | |
print(paste0("Cluster: ",i," is to small")) | |
} | |
}) | |
} | |
# run function bed_to_fasta with given parameteres if not in interactive context (e.g. run from shell) | |
if (!interactive()) { | |
bed_to_fasta(opt$input, opt$prefix, opt$min_seq) | |
} |