From cc2325031a60709be32b05f512209e3cd5e1f7ba Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= <rene.wiegandt@mpi-bn.mpg.de>
Date: Tue, 18 Dec 2018 08:00:39 -0500
Subject: [PATCH] Bugfix in bed_to_fasta.R: Get last and second last instead of
 fixed indices

---
 bin/bed_to_fasta.R | 62 +++++++++++++++++++++++++---------------------
 1 file changed, 34 insertions(+), 28 deletions(-)
diff --git a/bin/bed_to_fasta.R b/bin/bed_to_fasta.R
index 84910f3..8af8516 100644
--- a/bin/bed_to_fasta.R
+++ b/bin/bed_to_fasta.R
@@ -1,28 +1,34 @@
-#!/usr/bin/env Rscript
-
-# Splitting BED-files depending on their cluster.
-# The Sequences of each cluster are writen as an FASTA-file.
-# @parameter bedInput <string> BED-file with sequences and cluster-id as columns: Sequence: Column 7; ID:Column 8
-# @parameter prefix <string> prefix for filenames
-# @parameter min_seq <INT> min. number of sequences per cluster
-
-args = commandArgs(trailingOnly = TRUE)
-
-bedInput <- args[1]
-prefix <- args[2]
-min_seq <- args[3]
-
-bed <- data.table::fread(bedInput, header = FALSE, sep = "\t")
-
-clusters <- split(bed, bed$V11, sorted = TRUE, flatten = FALSE) # <---- Cluster column
-discard <- lapply(1:length(clusters), function(i){
-  clust <- as.data.frame(clusters[i])
-  print(nrow(clust))
-  if (nrow(clust) >= as.numeric(min_seq) ) {
-    sequences <- as.list(clust[[10]])   # <---- sequenze column
-    outfile <- paste0(prefix,"_cluster_",i,".FASTA")
-    seqinr::write.fasta(sequences = sequences, names = clust[[4]], file.out = outfile, as.string = TRUE) # <---- Name column
-  } else {
-    print(paste0("Cluster: ",i," is to small"))
-  }
-})
+#!/usr/bin/env Rscript
+
+#' Splitting BED-files depending on their cluster.
+#' The sequences of each cluster are written as a FASTA file.
+#' @parameter bedInput <string> BED-file with sequences and cluster-id as columns: Sequence: second-to-last column; cluster ID: last column; sequence name: column 4
+#' @parameter prefix <string> prefix for filenames
+#' @parameter min_seq <INT> min. number of sequences per cluster
+#'
+#' @author René Wiegandt
+#' @contact rene.wiegandt(at)mpi-bn.mpg.de
+
+args = commandArgs(trailingOnly = TRUE)
+
+bedInput <- args[1]
+prefix <- args[2]
+min_seq <- args[3]
+
+bed <- data.table::fread(bedInput, header = FALSE, sep = "\t")
+
+# Get last column of data.table, which refers to the cluster, as a vector.
+cluster_no <- as.vector(bed[[ncol(bed)]])
+
+clusters <- split(bed, cluster_no, sorted = TRUE, flatten = FALSE) # <---- Cluster column
+
+discard <- lapply(1:length(clusters), function(i){
+  clust <- as.data.frame(clusters[i])
+  if (nrow(clust) >= as.numeric(min_seq) ) {
+    sequences <- as.list(clust[[ncol(clust) - 1]])   # <---- sequence column
+    outfile <- paste0(prefix,"_cluster_",i - 1,".FASTA")
+    seqinr::write.fasta(sequences = sequences, names = clust[[4]], file.out = outfile, as.string = TRUE) # <---- Name column
+  } else {
+    print(paste0("Cluster: ",i," is to small"))
+  }
+})