loosolab · renewiegandt · Dec 20, 2018 · Dec 18, 2018 · Dec 18, 2018 · Dec 18, 2018
diff --git a/bin/bed_to_fasta.R b/bin/bed_to_fasta.R
@@ -1,28 +1,58 @@
 #!/usr/bin/env Rscript
+library("optparse")
 
-# Splitting BED-files depending on their cluster.
-# The Sequences of each cluster are writen as an FASTA-file.
-# @parameter bedInput <string> BED-file with sequences and cluster-id as columns: Sequence: Column 7; ID:Column 8
-# @parameter prefix <string> prefix for filenames
-# @parameter min_seq <INT> min. number of sequences per cluster
+option_list <- list(
+  make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input bed-file. Second last column must be sequences and last column must be the cluster_id.", metavar = "character"),
+  make_option(opt_str = c("-p", "--prefix"), default = "" , help = "Prefix for file names. Default = '%default'", metavar = "character"),
+  make_option(opt_str = c("-m", "--min_seq"), default = 100, help = "Minimum amount of sequences in clusters. Default = %default", metavar = "integer")
+)
 
-args = commandArgs(trailingOnly = TRUE)
+opt_parser <- OptionParser(option_list = option_list, 
+                           description = "Convert BED-file to one FASTA-file per cluster")
 
-bedInput <- args[1]
-prefix <- args[2]
-min_seq <- args[3]
+opt <- parse_args(opt_parser)
 
-bed <- data.table::fread(bedInput, header = FALSE, sep = "\t")
-
-clusters <- split(bed, bed$V11, sorted = TRUE, flatten = FALSE) # <---- Cluster column
-discard <- lapply(1:length(clusters), function(i){
-  clust <- as.data.frame(clusters[i])
-  print(nrow(clust))
-  if (nrow(clust) >= as.numeric(min_seq) ) {
-    sequences <- as.list(clust[[10]])   # <---- sequenze column
-    outfile <- paste0(prefix,"_cluster_",i,".FASTA")
-    seqinr::write.fasta(sequences = sequences, names = clust[[4]], file.out = outfile, as.string = TRUE) # <---- Name column
-  } else {
-    print(paste0("Cluster: ",i," is to small"))
+#' Splitting BED-files depending on their cluster.
+#' The sequences of each cluster are written as a FASTA-file.
+#' @param bedInput <string> BED-file with sequences and cluster-id as last two columns:
+#'                              Sequence: second last column; Cluster ID: last column
+#' @param prefix <string> prefix for filenames
+#' @param min_seq <INT> min. number of sequences per cluster
+#'
+#' @author René Wiegandt
+#' @contact rene.wiegandt(at)mpi-bn.mpg.de
+bed_to_fasta <- function(bedInput, prefix = "", min_seq = 100){
+
+  if(is.null(bedInput)){
+    stop("ERROR: Input parameter cannot be null! Please specify the input parameter.")
   }
-})
+
+  bed <- data.table::fread(bedInput, header = FALSE, sep = "\t")
+
+  # Get last column of data.table, which refers to the cluster, as a vector.
+  cluster_no <- as.vector(bed[[ncol(bed)]])
+
+  # Split data.table bed on its last column (cluster_no) into list of data.frames
+  clusters <- split(bed, cluster_no, sorted = TRUE, flatten = FALSE)
+
+  # For each data.frame(cluster) in list clusters:
+  discard <- lapply(1:length(clusters), function(i){
+    clust <- as.data.frame(clusters[i])
+    # Filter out clusters (data.frames) that are too small
+    if (nrow(clust) >= as.numeric(min_seq) ) {
+      # Get second last column, which contains the nucleotide sequences
+      sequences <- as.list(clust[[ncol(clust) - 1]])
+      # Create filename
+      outfile <- paste0(prefix,"_cluster_",i - 1,".FASTA")
+      # Write fasta file
+      seqinr::write.fasta(sequences = sequences, names = clust[[4]], file.out = outfile, as.string = TRUE)
+    } else {
+      print(paste0("Cluster: ",i," is to small"))
+    }
+  })
+}
+
+# Run bed_to_fasta with the given parameters when not in an interactive context (e.g. run from shell)
+if (!interactive()) {
+  bed_to_fasta(opt$input, opt$prefix, opt$min_seq)
+}
diff --git a/bin/get_best_motif.py b/bin/get_best_motif.py
@@ -1,26 +1,64 @@
-# parses arguments using argparse
-# @return args list of all parameters
+'''
+parses arguments using argparse
+@return args list of all parameters
+'''
 def parse_arguments():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("meme", help="Path to meme file")
+    parser = argparse.ArgumentParser(description='A script to convert from GLAM2 output to MEME-format and parsing only the [num] first motifs from file to the output.')
+    parser.add_argument("meme", help="Path to 'meme' file generated by GLAM2")
     parser.add_argument("output", help="Output file")
     parser.add_argument("num", help="Number of motifs parsed from file")
     args = parser.parse_args()
     return args
 
-# write lines of file till certain line (MOTIF + [num])
+'''
+The script has two functions:
+    1. Writing lines of file till certain line (MOTIF + [num])
+    2. Converting GLAM2 output to minimal meme-format
+@params meme STRING Path to 'meme' file generated from Meme suite
+@params output STRING Output file
+@params num INT Number of motifs parsed from file
+
+@author René Wiegandt
+@contact rene.wiegandt(at)mpi-bn.mpg.de
+'''
 def main():
+
     args = parse_arguments()
     out = open(args.output, "w+")
+
+    '''
+    Create pattern where script should stop writing
+    For Example:
+    If num == 3, which means that you want the first/best 3 Motifs, the script
+    should stop writing lines to output if loop reaches line 'MOTIF  4'
+    '''
     number = int(args.num) + 1
-    motif = "MOTIF " + str(number)
+    break_header = "MOTIF  " + str(number)
+
+    # Pattern for motif header
+    pattern = re.compile("^MOTIF\s{2}(\d)+")
+    # Init count
+    count = 0
+
     with open(args.meme) as f:
         for line in f:
-            if motif in line:
+            ## do not write [count] lines after each header -> needed for meme-format
+            if count > 0:
+                count-=1
+                continue
+            if pattern.match(line):
+                # if line is a motif header
+                count = 2
+            ##
+
+            if break_header in line:
+                # line matches break_header, e.g. 'MOTIF  4'
                 break
-            out.write(line)
+            else:
+                out.write(line)
 
 
 if __name__ == "__main__":
     import argparse
+    import re
     main()
diff --git a/pipeline.nf b/pipeline.nf
@@ -324,7 +324,7 @@ process bed_to_clustered_fasta {
 
     script:
     """
-    Rscript ${path_bin}/bed_to_fasta.R ${bed} ${name} ${params.min_seq}
+    Rscript ${path_bin}/bed_to_fasta.R -i ${bed} -p ${name} -m ${params.min_seq}
     """
 }