Bugfix in bed_to_fasta.R: Get last and second last instead of fixed i…

…ndices
loosolab · Dec 18, 2018 · cc23250 · cc23250
1 parent 58c8478
commit cc23250
Showing 1 changed file with 34 additions and 28 deletions.
diff --git a/bin/bed_to_fasta.R b/bin/bed_to_fasta.R
@@ -1,28 +1,34 @@
-#!/usr/bin/env Rscript
-
-# Splitting BED-files depending on their cluster.
-# The Sequences of each cluster are writen as an FASTA-file.
-# @parameter bedInput <string> BED-file with sequences and cluster-id as columns: Sequence: Column 7; ID:Column 8
-# @parameter prefix <string> prefix for filenames
-# @parameter min_seq <INT> min. number of sequences per cluster
-
-args = commandArgs(trailingOnly = TRUE)
-
-bedInput <- args[1]
-prefix <- args[2]
-min_seq <- args[3]
-
-bed <- data.table::fread(bedInput, header = FALSE, sep = "\t")
-
-clusters <- split(bed, bed$V11, sorted = TRUE, flatten = FALSE) # <---- Cluster column
-discard <- lapply(1:length(clusters), function(i){
-  clust <- as.data.frame(clusters[i])
-  print(nrow(clust))
-  if (nrow(clust) >= as.numeric(min_seq) ) {
-    sequences <- as.list(clust[[10]])   # <---- sequenze column
-    outfile <- paste0(prefix,"_cluster_",i,".FASTA")
-    seqinr::write.fasta(sequences = sequences, names = clust[[4]], file.out = outfile, as.string = TRUE) # <---- Name column
-  } else {
-    print(paste0("Cluster: ",i," is to small"))
-  }
-})
+#!/usr/bin/env Rscript
+
+#' Splitting BED-files depending on their cluster.
+#' The sequences of each cluster are written as a FASTA-file.
+#' @param bedInput <string> BED-file with sequences and cluster-id as columns: Name: column 4; Sequence: second-last column; Cluster-ID: last column
+#' @param prefix <string> prefix for filenames
+#' @param min_seq <INT> min. number of sequences per cluster
+#'
+#' @author René Wiegandt
+#' @contact rene.wiegandt(at)mpi-bn.mpg.de
+
+args <- commandArgs(trailingOnly = TRUE)
+if (length(args) < 3) stop("usage: bed_to_fasta.R <bedInput> <prefix> <min_seq>", call. = FALSE)
+bedInput <- args[1]
+prefix <- args[2]
+min_seq <- as.numeric(args[3])  # converted once here instead of on every iteration
+if (is.na(min_seq)) stop("min_seq must be a number", call. = FALSE)
+
+bed <- data.table::fread(bedInput, header = FALSE, sep = "\t")
+# The last column holds the cluster id; split() names every group after its id.
+cluster_no <- as.vector(bed[[ncol(bed)]])
+clusters <- split(bed, cluster_no, sorted = TRUE, flatten = FALSE)
+
+discard <- lapply(seq_along(clusters), function(i) {
+  clust <- as.data.frame(clusters[[i]])
+  cluster_id <- names(clusters)[i]  # real id, so file name and message always agree
+  if (nrow(clust) >= min_seq) {
+    sequences <- as.list(clust[[ncol(clust) - 1]])  # second-last column: sequence
+    outfile <- paste0(prefix, "_cluster_", cluster_id, ".FASTA")
+    seqinr::write.fasta(sequences = sequences, names = clust[[4]], file.out = outfile, as.string = TRUE)  # column 4: name
+  } else {
+    print(paste0("Cluster: ", cluster_id, " is to small"))
+  }
+})