Merge pull request #47 from loosolab/estimation_motifs

2.2 creates json file containing motif sequences
loosolab · Jan 10, 2019 · 428302a · 428302a
2 parents 13a610f + b70a38b
commit 428302a
Show file tree

Hide file tree

Showing 6 changed files with 213 additions and 44 deletions.
diff --git a/bin/2.2_motif_estimation/bed_to_fasta.R b/bin/2.2_motif_estimation/bed_to_fasta.R
@@ -1,5 +1,5 @@
 #!/usr/bin/env Rscript
-if (!require(optparse)) install.packages("optparse"); library(optparse)
+if (!require(optparse, quietly = T)) install.packages("optparse"); library(optparse)
 
 option_list <- list(
   make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input bed-file. Second last column must be sequences and last column must be the cluster id.", metavar = "character"),
@@ -36,8 +36,11 @@ bed_to_fasta <- function(bedInput, prefix = "", min_seq = 100){
   # Split data.table bed on its last column (cluster_no) into list of data.frames
   clusters <- split(bed, cluster_no, sorted = TRUE, flatten = FALSE)
 
+  # Get number of all clusters
+  num_clusters <- length(clusters)
+
   # For each data.frame(cluster) in list clusters:
-  discard <- lapply(seq_len(length(clusters)), function(i){
+  discard <- vapply(seq_len(length(clusters)), function(i){
     clust <- as.data.frame(clusters[i])
     # Filter data.tables(clusters), which are to small
     if (nrow(clust) >= as.numeric(min_seq) ) {
@@ -47,10 +50,21 @@ bed_to_fasta <- function(bedInput, prefix = "", min_seq = 100){
       outfile <- paste0(prefix,"_cluster_",i - 1,".FASTA")
       # Write fasta file
       seqinr::write.fasta(sequences = sequences, names = clust[[4]], file.out = outfile, as.string = TRUE)
+      return(TRUE)
     } else {
       print(paste0("Cluster: ",i," is to small"))
+      return(FALSE)
     }
-  })
+  }, FUN.VALUE=logical(1))
+
+  # Get number of clusters which contain enought sequences
+  count_clust <- length(discard[discard == FALSE])
+
+  # Write log-file
+  write(paste0( "------------------------------------------------------------\nfile: ",
+                bedInput,"\nNumber of all clusters: ", num_clusters, "\nRemoved small clusters ( < ",
+                min_seq," sequences ): " ,count_clust , "\nRemaining number of clusters: ", num_clusters - count_clust),
+        file = "bed_to_fasta.log", append = T)
 }
 
 # run function bed_to_fasta with given parameteres if not in interactive context (e.g. run from shell)

diff --git a/bin/2.2_motif_estimation/get_best_motif.py b/bin/2.2_motif_estimation/get_best_motif.py
@@ -1,3 +1,5 @@
+#!/usr/bin/env python
+
 '''
 parses arguments using argparse
 @return args list of all parameters
@@ -7,6 +9,7 @@ def parse_arguments():
     parser.add_argument("meme", help="Path to 'meme' file generated by GLAM2")
     parser.add_argument("output", help="Output file")
     parser.add_argument("num", help="Number of motifs parsed from file")
+    parser.add_argument("id", help="Cluster ID")
     args = parser.parse_args()
     return args
 
@@ -35,27 +38,31 @@ def main():
     number = int(args.num) + 1
     break_header = "MOTIF  " + str(number)
 
+    # Get cluster_id
+    cluster_id = args.id
+
     # Pattern for motif header
     pattern = re.compile("^MOTIF\s{2}(\d)+")
     # Init count
     count = 0
 
     with open(args.meme) as f:
         for line in f:
-            ## do not write [count] lines after each header -> needed for meme-format
-            if count > 0:
-                count-=1
-                continue
-            if pattern.match(line):
-                # if line is a motif header
-                count = 2
-            ##
-
             if break_header in line:
                 # line matches breaking_header, e.g. 'MOTIF  4'
                 break
             else:
-                out.write(line)
+                ## do not write [count] lines after each header -> needed for meme-format
+                if count > 0:
+                    count-=1
+                    continue
+                if pattern.match(line):
+                    # if line is a motif header
+                    count = 2
+                    out.write(line.strip('\n').replace("  "," ") + " Cluster_" + cluster_id + '\n')
+                ##
+                else:
+                    out.write(line)
 
 
 if __name__ == "__main__":

diff --git a/bin/2.2_motif_estimation/get_motif_seq.R b/bin/2.2_motif_estimation/get_motif_seq.R
@@ -0,0 +1,85 @@
+#!/usr/bin/env Rscript
+if (!require(optparse, quietly = T)) install.packages("optparse"); library(optparse)
+
+option_list <- list(
+  make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input file. Output txt-file from GLAM2.", metavar = "character"),
+  make_option(opt_str = c("-o", "--output"), default = "sequences.json" , help = "Output JSON-file. Default = '%default'", metavar = "character"),
+  make_option(opt_str = c("-n", "--num"), default = 3 , help = "Get best (num) motifs. Default = '%default'", metavar = "numeric"),
+  make_option(opt_str = c("-c", "--cluster_id"), default = "./" , help = "Cluster ID", metavar = "numeric"),
+  make_option(opt_str = c("-t", "--tmp"), default = "./" , help = "Path for tmp-files. Default = '%default'", metavar = "character")
+)
+
+opt_parser <- OptionParser(option_list = option_list,
+                           description = "Creating JSON-file with sequence ids which were used to create the best (num) motifs.",
+                           epilogue = "Author: Rene Wiegandt <Rene.Wiegandt@mpi-bn.mpg.de>")
+
+opt <- parse_args(opt_parser)
+
+#' Reading files with fread.
+#' Only read the first column.
+#' @param path Path to file
+#' @return first column as vector
+read_data <- function(path){
+
+  f <- data.table::fread(path, select = 1)
+  return(f[[1]])
+}
+
+
+#' Creating JSON-file with sequence ids which were used to create the best (num) motifs.
+#'
+#' @param input Input file.Output txt-file from GLAM2.
+#' @param output Output JSON-file
+#' @param num Get best (num) motifs.
+#'
+#' @author René Wiegandt <Rene.Wiegandt(at)mpi-bn.mpg.de>
+create_seq_json <- function(input, output, num, tmp_path, cluster_id) {
+
+  if (!file.exists(input)) {
+    stop(paste0("Input file does not exists. Please check the given path: ", input))
+  }
+
+  if ( !is.numeric(num)) {
+    stop("Parameter num needs to be an integer")
+  }
+
+  if (num > 10 || num <= 0 ) {
+    stop(paste0("Parameter 'num' needs to be an number between 1 and 10! Your input: ", num))
+  }
+
+  if ( !varhandle::check.numeric(cluster_id)) {
+    stop(paste0("CLUSTER ID could not be found. Please make sure that your file path contains _[cluster_id] at the end. Found: ", cluster_id,"\n For example: /test_cluster_1/glam.txt"))
+  }
+
+  dir.create(tmp_path, showWarnings = FALSE)
+
+  file_dir <- tmp_path
+
+  # Split glam.txt file on lines that start with Score:
+  system(paste0("csplit ", input, " '/^Score:.*/' '{*}' -f ", file_dir, "/f_id_test.pholder"))
+  # Only keep the lines that start with 'f' to get the lines with the sequence ids
+  system(paste0("for i in ", file_dir, "/*.pholder0[1-", num, "];do grep \"^f\" $i > \"${i}.done\";done"))
+
+  # Getting the filepaths of first 3 files with sequence ids
+  fnames <- file.path(file_dir,dir(file_dir, pattern = "done"))
+
+  # Running read_data on files
+  datalist <- lapply(fnames, read_data)
+
+  # Create json file
+  ## naming
+  names(datalist) <- paste0(c("Motif_", "Motif_", "Motif_"),seq(1,as.numeric(num),1) , " Cluster_", cluster_id)
+  ## creating json object
+  json <- RJSONIO::toJSON(datalist, pretty = T , .withNames = T)
+  ## writing file
+  write(json, file = output )
+}
+
+# run function create_seq_json with given parameteres if not in interactive context (e.g. run from shell)
+if (!interactive()) {
+  if (length(commandArgs(trailingOnly = TRUE)) <= 0) {
+    print_help(opt_parser)
+  } else {
+    create_seq_json(opt$input, opt$output, opt$num, opt$tmp, opt$cluster_id)
+  }
+}
diff --git a/bin/2.2_motif_estimation/motif_estimation.nf b/bin/2.2_motif_estimation/motif_estimation.nf
@@ -1,3 +1,4 @@
+#!/usr/bin/env nextflow
 
 params.bed = ""
 params.out = ""

diff --git a/masterenv.yml b/masterenv.yml
@@ -14,6 +14,8 @@ dependencies:
   - r-stringi
   - r-stringr
   - r-optparse
+  - r-RJSONIO
+  - r-varhandle
   - bioconductor-iranges
   - moods
   - biopython