Reads BED-files with or without header

loosolab · renewiegandt · Dec 20, 2018 · Dec 18, 2018 · Dec 18, 2018 · Dec 18, 2018
commit 2fca1581a160844d89be2fda2aea322a1740b9b3
diff --git a/bin/bed_to_fasta.R b/bin/bed_to_fasta.R
@@ -1,58 +1,58 @@
-#!/usr/bin/env Rscript
-library("optparse")
-
-option_list <- list(
-  make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input bed-file. Second last column must be sequences and last column must be the cluster_id.", metavar = "character"),
-  make_option(opt_str = c("-p", "--prefix"), default = "" , help = "Prefix for file names. Default = '%default'", metavar = "character"),
-  make_option(opt_str = c("-m", "--min_seq"), default = 100, help = "Minimum amount of sequences in clusters. Default = %default", metavar = "integer")
-)
-
-opt_parser <- OptionParser(option_list = option_list, 
-                           description = "Convert BED-file to one FASTA-file per cluster")
-
-opt <- parse_args(opt_parser)
-
-#' Splitting BED-files depending on their cluster.
-#' The Sequences of each cluster are writen as an FASTA-file.
-#' @param bedInput <string> BED-file with sequences and cluster-id as last two columns:
-#'                              Sequence: second last column; Cluster ID: last column
-#' @param prefix <string> prefix for filenames
-#' @param min_seq <INT> min. number of sequences per cluster
-#'
-#' @author René Wiegandt
-#' @contact rene.wiegandt(at)mpi-bn.mpg.de
-bed_to_fasta <- function(bedInput, prefix = "", min_seq = 100){
-
-  if(is.null(bedInput)){
-    stop("ERROR: Input parameter cannot be null! Please specify the input parameter.")
-  }
-
-  bed <- data.table::fread(bedInput, header = FALSE, sep = "\t")
-
-  # Get last column of data.table, which refers to the cluster, as a vector.
-  cluster_no <- as.vector(bed[[ncol(bed)]])
-
-  # Split data.table bed on its last column (cluster_no) into list of data.frames
-  clusters <- split(bed, cluster_no, sorted = TRUE, flatten = FALSE)
-
-  # For each data.frame(cluster) in list clusters:
-  discard <- lapply(1:length(clusters), function(i){
-    clust <- as.data.frame(clusters[i])
-    # Filter data.tables(clusters), which are to small
-    if (nrow(clust) >= as.numeric(min_seq) ) {
-      # Get second last column, which contains the nucleotide sequences
-      sequences <- as.list(clust[[ncol(clust) - 1]])
-      # Create filename
-      outfile <- paste0(prefix,"_cluster_",i - 1,".FASTA")
-      # Write fasta file
-      seqinr::write.fasta(sequences = sequences, names = clust[[4]], file.out = outfile, as.string = TRUE)
-    } else {
-      print(paste0("Cluster: ",i," is to small"))
-    }
-  })
-}
-
-# run function bed_to_fasta with given parameteres if not in interactive context (e.g. run from shell)
-if (!interactive()) {
-  bed_to_fasta(opt$input, opt$prefix, opt$min_seq)
-}
+#!/usr/bin/env Rscript
+library("optparse")
+
+option_list <- list(
+  make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input bed-file. Second last column must be sequences and last column must be the cluster_id.", metavar = "character"),
+  make_option(opt_str = c("-p", "--prefix"), default = "" , help = "Prefix for file names. Default = '%default'", metavar = "character"),
+  make_option(opt_str = c("-m", "--min_seq"), default = 100, help = "Minimum amount of sequences in clusters. Default = %default", metavar = "integer")
+)
+
+opt_parser <- OptionParser(option_list = option_list, 
+                           description = "Convert BED-file to one FASTA-file per cluster")
+
+opt <- parse_args(opt_parser)
+
+#' Splitting BED-files depending on their cluster.
+#' The Sequences of each cluster are writen as an FASTA-file.
+#' @param bedInput <string> BED-file with sequences and cluster-id as last two columns:
+#'                              Sequence: second last column; Cluster ID: last column
+#' @param prefix <string> prefix for filenames
+#' @param min_seq <INT> min. number of sequences per cluster
+#'
+#' @author René Wiegandt
+#' @contact rene.wiegandt(at)mpi-bn.mpg.de
+bed_to_fasta <- function(bedInput, prefix = "", min_seq = 100){
+  
+  if (is.null(bedInput)) {
+    stop("ERROR: Input parameter cannot be null! Please specify the input parameter.")
+  }
+
+  bed <- data.table::fread(bedInput, sep = "\t")
+  
+  # Get last column of data.table, which refers to the cluster, as a vector.
+  cluster_no <- as.vector(bed[[ncol(bed)]])
+  
+  # Split data.table bed on its last column (cluster_no) into list of data.frames
+  clusters <- split(bed, cluster_no, sorted = TRUE, flatten = FALSE)
+  
+  # For each data.frame(cluster) in list clusters:
+  discard <- lapply(1:length(clusters), function(i){
+    clust <- as.data.frame(clusters[i])
+    # Filter data.tables(clusters), which are to small
+    if (nrow(clust) >= as.numeric(min_seq) ) {
+      # Get second last column, which contains the nucleotide sequences
+      sequences <- as.list(clust[[ncol(clust) - 1]])
+      # Create filename
+      outfile <- paste0(prefix,"_cluster_",i - 1,".FASTA")
+      # Write fasta file
+      seqinr::write.fasta(sequences = sequences, names = clust[[4]], file.out = outfile, as.string = TRUE)
+    } else {
+      print(paste0("Cluster: ",i," is to small"))
+    }
+  })
+}
+
+# run function bed_to_fasta with given parameteres if not in interactive context (e.g. run from shell)
+if (!interactive()) {
+  bed_to_fasta(opt$input, opt$prefix, opt$min_seq)
+}