added filter for small clusters

loosolab · Nov 29, 2018 · a5e902f · a5e902f
1 parent 80ffd2a
commit a5e902f
Showing 1 changed file with 13 additions and 6 deletions.
diff --git a/bin/bed_to_fasta.R b/bin/bed_to_fasta.R
@@ -4,18 +4,25 @@
 # The sequences of each cluster are written as a FASTA-file.
 # @parameter bedInput <string> BED-file with sequences and cluster-id as columns
 # @parameter prefix <string> prefix for filenames
+# @parameter min_seq <INT> min. number of sequences per cluster
 
 args = commandArgs(trailingOnly = TRUE)
 
 bedInput <- args[1]
-prefix <- args[2] # "Fasta" 
+prefix <- args[2] 
+min_seq <- args[3] 
 
 bed <- data.table::fread(bedInput, header = FALSE, sep = "\t")
 
-clusters <- split(bed, bed$V3, sorted = TRUE, flatten = FALSE) # <---- Spalte mit Cluster
+clusters <- split(bed, bed$V8, sorted = TRUE, flatten = FALSE) # <---- Spalte mit Cluster
 discard <- lapply(1:length(clusters), function(i){
-  sequences <- as.list(as.data.frame(clusters[i])[[2]])   # <---- Splate mit Sequenz
-  outfile <- paste0(prefix,"_cluster_",i,".FASTA")
-  seqinr::write.fasta(sequences = sequences, names = as.data.frame(clusters[i])[[1]]
-                      , file.out = outfile, as.string = TRUE) # <---- Spalte mit Name
+  clust <- as.data.frame(clusters[i])
+  print(nrow(clust))
+  if (nrow(clust) >= as.numeric(min_seq) ) {
+    sequences <- as.list(clust[[7]])   # <---- Splate mit Sequenz
+    outfile <- paste0(prefix,"_cluster_",i,".FASTA")
+    seqinr::write.fasta(sequences = sequences, names = clust[[4]], file.out = outfile, as.string = TRUE) # <---- Spalte mit Name
+  } else {
+    print(paste0("Cluster: ",i," is to small"))
+  }
 })