Skip to content

Motif estiamtion #12

Merged
merged 14 commits into from
Dec 20, 2018
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Reads BED-files with or without header
renewiegandt committed Dec 19, 2018
commit 2fca1581a160844d89be2fda2aea322a1740b9b3
116 changes: 58 additions & 58 deletions bin/bed_to_fasta.R
Original file line number Diff line number Diff line change
@@ -1,58 +1,58 @@
#!/usr/bin/env Rscript
library("optparse")

option_list <- list(
make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input bed-file. Second last column must be sequences and last column must be the cluster_id.", metavar = "character"),
make_option(opt_str = c("-p", "--prefix"), default = "" , help = "Prefix for file names. Default = '%default'", metavar = "character"),
make_option(opt_str = c("-m", "--min_seq"), default = 100, help = "Minimum amount of sequences in clusters. Default = %default", metavar = "integer")
)

opt_parser <- OptionParser(option_list = option_list,
description = "Convert BED-file to one FASTA-file per cluster")

opt <- parse_args(opt_parser)

#' Splitting BED-files depending on their cluster.
#' The Sequences of each cluster are writen as an FASTA-file.
#' @param bedInput <string> BED-file with sequences and cluster-id as last two columns:
#' Sequence: second last column; Cluster ID: last column
#' @param prefix <string> prefix for filenames
#' @param min_seq <INT> min. number of sequences per cluster
#'
#' @author René Wiegandt
#' @contact rene.wiegandt(at)mpi-bn.mpg.de
bed_to_fasta <- function(bedInput, prefix = "", min_seq = 100){

if(is.null(bedInput)){
stop("ERROR: Input parameter cannot be null! Please specify the input parameter.")
}

bed <- data.table::fread(bedInput, header = FALSE, sep = "\t")

# Get last column of data.table, which refers to the cluster, as a vector.
cluster_no <- as.vector(bed[[ncol(bed)]])

# Split data.table bed on its last column (cluster_no) into list of data.frames
clusters <- split(bed, cluster_no, sorted = TRUE, flatten = FALSE)

# For each data.frame(cluster) in list clusters:
discard <- lapply(1:length(clusters), function(i){
clust <- as.data.frame(clusters[i])
# Filter data.tables(clusters), which are to small
if (nrow(clust) >= as.numeric(min_seq) ) {
# Get second last column, which contains the nucleotide sequences
sequences <- as.list(clust[[ncol(clust) - 1]])
# Create filename
outfile <- paste0(prefix,"_cluster_",i - 1,".FASTA")
# Write fasta file
seqinr::write.fasta(sequences = sequences, names = clust[[4]], file.out = outfile, as.string = TRUE)
} else {
print(paste0("Cluster: ",i," is to small"))
}
})
}

# run function bed_to_fasta with given parameteres if not in interactive context (e.g. run from shell)
if (!interactive()) {
bed_to_fasta(opt$input, opt$prefix, opt$min_seq)
}
#!/usr/bin/env Rscript
library("optparse")
option_list <- list(
make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input bed-file. Second last column must be sequences and last column must be the cluster_id.", metavar = "character"),
make_option(opt_str = c("-p", "--prefix"), default = "" , help = "Prefix for file names. Default = '%default'", metavar = "character"),
make_option(opt_str = c("-m", "--min_seq"), default = 100, help = "Minimum amount of sequences in clusters. Default = %default", metavar = "integer")
)
opt_parser <- OptionParser(option_list = option_list,
description = "Convert BED-file to one FASTA-file per cluster")
opt <- parse_args(opt_parser)
#' Splitting BED-files depending on their cluster.
#' The Sequences of each cluster are writen as an FASTA-file.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

written

#' @param bedInput <string> BED-file with sequences and cluster-id as last two columns:
#' Sequence: second last column; Cluster ID: last column
#' @param prefix <string> prefix for filenames
#' @param min_seq <INT> min. number of sequences per cluster
#'
#' @author René Wiegandt
#' @contact rene.wiegandt(at)mpi-bn.mpg.de
bed_to_fasta <- function(bedInput, prefix = "", min_seq = 100){
if (is.null(bedInput)) {
stop("ERROR: Input parameter cannot be null! Please specify the input parameter.")
}
bed <- data.table::fread(bedInput, sep = "\t")
# Get last column of data.table, which refers to the cluster, as a vector.
cluster_no <- as.vector(bed[[ncol(bed)]])
# Split data.table bed on its last column (cluster_no) into list of data.frames
clusters <- split(bed, cluster_no, sorted = TRUE, flatten = FALSE)
# For each data.frame(cluster) in list clusters:
discard <- lapply(1:length(clusters), function(i){
clust <- as.data.frame(clusters[i])
# Filter data.tables(clusters), which are to small
if (nrow(clust) >= as.numeric(min_seq) ) {
# Get second last column, which contains the nucleotide sequences
sequences <- as.list(clust[[ncol(clust) - 1]])
# Create filename
outfile <- paste0(prefix,"_cluster_",i - 1,".FASTA")
# Write fasta file
seqinr::write.fasta(sequences = sequences, names = clust[[4]], file.out = outfile, as.string = TRUE)
} else {
print(paste0("Cluster: ",i," is to small"))
}
})
}
# run function bed_to_fasta with given parameteres if not in interactive context (e.g. run from shell)
if (!interactive()) {
bed_to_fasta(opt$input, opt$prefix, opt$min_seq)
}