From 62f6f3f1e95ad6038567619a3f846203f1e92751 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= <rene.wiegandt@mpi-bn.mpg.de>
Date: Tue, 18 Dec 2018 08:10:13 -0500
Subject: [PATCH] bed_to_fasta.R: Improved documentation

---
 bin/bed_to_fasta.R | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)
diff --git a/bin/bed_to_fasta.R b/bin/bed_to_fasta.R
index 8af8516..dce3839 100644
--- a/bin/bed_to_fasta.R
+++ b/bin/bed_to_fasta.R
@@ -2,9 +2,10 @@
 
 #' Splitting BED-files depending on their cluster.
 #' The Sequences of each cluster are writen as an FASTA-file.
-#' @parameter bedInput <string> BED-file with sequences and cluster-id as columns: Sequence: Column 7; ID:Column 8
-#' @parameter prefix <string> prefix for filenames
-#' @parameter min_seq <INT> min. number of sequences per cluster
+#' @param bedInput <string> BED-file with sequences and cluster-id as last two columns:
+#'                              Sequence: second-to-last column; Cluster ID: last column
+#' @param prefix <string> prefix for filenames
+#' @param min_seq <INT> min. number of sequences per cluster
 #'
 #' @author René Wiegandt
 #' @contact rene.wiegandt(at)mpi-bn.mpg.de
@@ -20,14 +21,20 @@ bed <- data.table::fread(bedInput, header = FALSE, sep = "\t")
 # Get last column of data.table, which refers to the cluster, as a vector.
 cluster_no <- as.vector(bed[[ncol(bed)]])
 
-clusters <- split(bed, cluster_no, sorted = TRUE, flatten = FALSE) # <---- Cluster column
+# Split data.table bed on its last column (cluster_no) into a list of data.tables
+clusters <- split(bed, cluster_no, sorted = TRUE, flatten = FALSE)
 
+# For each data.frame(cluster) in list clusters:
 discard <- lapply(1:length(clusters), function(i){
   clust <- as.data.frame(clusters[i])
+  # Filter data.tables(clusters), which are too small
   if (nrow(clust) >= as.numeric(min_seq) ) {
-    sequences <- as.list(clust[[ncol(clust) - 1]])   # <---- sequenze column
+    # Get second-to-last column, which contains the nucleotide sequences
+    sequences <- as.list(clust[[ncol(clust) - 1]])
+    # Create filename
     outfile <- paste0(prefix,"_cluster_",i - 1,".FASTA")
-    seqinr::write.fasta(sequences = sequences, names = clust[[4]], file.out = outfile, as.string = TRUE) # <---- Name column
+    # Write fasta file
+    seqinr::write.fasta(sequences = sequences, names = clust[[4]], file.out = outfile, as.string = TRUE)
   } else {
     print(paste0("Cluster: ",i," is to small"))
   }