From fe5ab42391b9ebfde7d2cc229ea0e59a03170972 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= <rene.wiegandt@mpi-bn.mpg.de>
Date: Tue, 8 Jan 2019 08:25:56 -0500
Subject: [PATCH 01/17] get_best_motif.py: Added alternative name to best_motif
 file

---
 bin/2.2_motif_estimation/get_best_motif.py | 25 +++++++++++++---------
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/bin/2.2_motif_estimation/get_best_motif.py b/bin/2.2_motif_estimation/get_best_motif.py
index a506bd8..20922b9 100644
--- a/bin/2.2_motif_estimation/get_best_motif.py
+++ b/bin/2.2_motif_estimation/get_best_motif.py
@@ -35,6 +35,9 @@ def main():
     number = int(args.num) + 1
     break_header = "MOTIF  " + str(number)
 
+    # Get cluster_id
+    cluster_id = os.path.splitext(args.meme)[0].split('/')[-2].split('_')[-1]
+
     # Pattern for motif header
     pattern = re.compile("^MOTIF\s{2}(\d)+")
     # Init count
@@ -42,23 +45,25 @@ def main():
 
     with open(args.meme) as f:
         for line in f:
-            ## do not write [count] lines after each header -> needed for meme-format
-            if count > 0:
-                count-=1
-                continue
-            if pattern.match(line):
-                # if line is a motif header
-                count = 2
-            ##
-
             if break_header in line:
                 # line matches breaking_header, e.g. 'MOTIF  4'
                 break
             else:
-                out.write(line)
+                ## do not write [count] lines after each header -> needed for meme-format
+                if count > 0:
+                    count-=1
+                    continue
+                if pattern.match(line):
+                    # if line is a motif header
+                    count = 2
+                    out.write(line.strip('\n') + "  Cluster_" + cluster_id + '\n')
+                ##
+                else:
+                    out.write(line)
 
 
 if __name__ == "__main__":
     import argparse
     import re
+    import os
     main()

From 79b7e2476e39ac29e1274e90bee69876b62d33e9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= <rene.wiegandt@mpi-bn.mpg.de>
Date: Tue, 8 Jan 2019 14:55:39 -0500
Subject: [PATCH 02/17] get_best_motif.py: removed whitespace from motif header

---
 bin/2.2_motif_estimation/get_best_motif.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bin/2.2_motif_estimation/get_best_motif.py b/bin/2.2_motif_estimation/get_best_motif.py
index 20922b9..beacb1c 100644
--- a/bin/2.2_motif_estimation/get_best_motif.py
+++ b/bin/2.2_motif_estimation/get_best_motif.py
@@ -56,7 +56,7 @@ def main():
                 if pattern.match(line):
                     # if line is a motif header
                     count = 2
-                    out.write(line.strip('\n') + "  Cluster_" + cluster_id + '\n')
+                    out.write(line.strip('\n') + " Cluster_" + cluster_id + '\n')
                 ##
                 else:
                     out.write(line)

From 837734031c8b2b26f0212ba0ea90ce49cb090b27 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= <rene.wiegandt@mpi-bn.mpg.de>
Date: Tue, 8 Jan 2019 14:56:16 -0500
Subject: [PATCH 03/17] added script get_motif_seq.R

---
 bin/2.2_motif_estimation/get_motif_seq.R | 86 ++++++++++++++++++++++++
 1 file changed, 86 insertions(+)
 create mode 100644 bin/2.2_motif_estimation/get_motif_seq.R

diff --git a/bin/2.2_motif_estimation/get_motif_seq.R b/bin/2.2_motif_estimation/get_motif_seq.R
new file mode 100644
index 0000000..b79ac8d
--- /dev/null
+++ b/bin/2.2_motif_estimation/get_motif_seq.R
@@ -0,0 +1,86 @@
+#!/usr/bin/env Rscript
+if (!require(optparse, quietly = T)) install.packages("optparse"); library(optparse)
+
+option_list <- list(
+  make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input file. Output txt-file from GLAM2.", metavar = "character"),
+  make_option(opt_str = c("-o", "--output"), default = "sequences.json" , help = "Output JSON-file. Default = '%default'", metavar = "character"),
+  make_option(opt_str = c("-n", "--num"), default = 3 , help = "Get best (num) motifs. Default = '%default'", metavar = "numeric")
+)
+
+opt_parser <- OptionParser(option_list = option_list,
+                           description = "Creating JSON-file with sequence ids which were used to create the best (num) motifs.",
+                           epilogue = "Author: Rene Wiegandt <Rene.Wiegandt@mpi-bn.mpg.de>")
+
+opt <- parse_args(opt_parser)
+
+#' Reading files with fread.
+#' Only read the first column.
+#' @param path Path to file
+#' @return first column as vector
+read_data <- function(path){
+  
+  f <- data.table::fread(path, select = 1)
+  return(f[[1]])
+}
+
+
+#' Creating JSON-file with sequence ids which were used to create the best (num) motifs.
+#' 
+#' @param input Input file.Output txt-file from GLAM2.
+#' @param output Output JSON-file
+#' @param num Get best (num) motifs.
+#' 
+#' @author René Wiegandt <Rene.Wiegandt(at)mpi-bn.mpg.de>
+create_seq_json <- function(input, output, num) {
+  
+  if (!file.exists(input)) {
+    stop(paste0("Input file does not exists. Please check the given path: ", input))
+  }
+  
+  if ( !is.numeric(num)) {
+    stop("Parameter num needs to be an integer")
+  }
+  
+  if (num > 10 || num <= 0 ) {
+    stop(paste0("Parameter 'num' needs to be an number between 1 and 10! Your input: ", num))
+  }
+
+  # Getting cluster id
+  split_path <- unlist(strsplit(dirname(input),'_'))
+  cluster_id <- split_path[[length(split_path)]]
+  
+  if ( !varhandle::check.numeric(cluster_id)) {
+    stop(paste0("CLUSTER ID could not be found. Please make sure that your file path contains _[cluster_id] at the end. Found: ", cluster_id,"\n For example: /test_cluster_1/glam.txt"))
+  }
+  
+  file_dir <- dirname(input)
+  
+  # Split glam.txt file on lines that start with Score:
+  system(paste0("csplit ", input, " '/^Score:.*/' '{*}' -f ", file_dir, "/f_id_test.pholder"))
+  # Only keep the lines that start with 'f' to get the lines with the sequence ids
+  system(paste0("for i in ", file_dir, "/*.pholder0[1-", num, "];do grep \"^f\" $i > \"${i}.done\";done"))
+  
+  # Getting the filepaths of first 3 files with sequence ids
+  fnames <- file.path(file_dir,dir(file_dir, pattern = "done"))
+  
+  # Running read_data on files
+  datalist <- lapply(fnames, read_data)
+  
+  
+  # Create json file
+  ## naming
+  names(datalist) <- paste0(c("Motif_", "Motif_", "Motif_"),seq(1,as.numeric(num),1) , " Cluster_", cluster_id)
+  ## creating json object
+  json <- RJSONIO::toJSON(datalist, pretty = T , .withNames = T)
+  ## writing file
+  write(json, file = output )
+}
+
+# run function create_seq_json with given parameteres if not in interactive context (e.g. run from shell)
+if (!interactive()) {
+  if (length(commandArgs(trailingOnly = TRUE)) <= 0) {
+    print_help(opt_parser)
+  } else {
+    create_seq_json(opt$input, opt$output, opt$num)
+  }
+}

From 3f9e4bce32cf700e1e5b69b8e6bc2776596c9930 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= <rene.wiegandt@mpi-bn.mpg.de>
Date: Tue, 8 Jan 2019 15:14:28 -0500
Subject: [PATCH 04/17] pipeline.nf: implemented get_motif_seq.R

---
 pipeline.nf | 33 ++++++++++++++++++++++++++++-----
 1 file changed, 28 insertions(+), 5 deletions(-)

diff --git a/pipeline.nf b/pipeline.nf
index 93190a3..af27a1f 100644
--- a/pipeline.nf
+++ b/pipeline.nf
@@ -357,6 +357,7 @@ process glam2 {
 	set name, file("${name}/*.meme") into meme_for_tomtom
 	set name, file("${name}/*.meme") into meme_to_search_in_merged
 	set name, file("${name}/*.meme") into meme_for_filter
+	set name, file("${name}/*.txt") into glam_for_seq
 	file ('*')
 
     script:
@@ -508,7 +509,7 @@ Run GLAM2 on emrged FASTA-files
 */
 process clustered_glam2 {
 
-	publishDir "${params.out}/2.2_motif_estimation/cluster_motifs/glam2/${name}/", mode: 'copy'
+	publishDir "${params.out}/2.2_motif_estimation/cluster_motifs/glam2/", mode: 'copy'
 	tag{name}
 	conda "${path_env}"
 
@@ -516,8 +517,9 @@ process clustered_glam2 {
 	file (fasta) from motif_clustered_fasta_flat
 
 	output:
-	set name, file ('*.meme') into clustered_meme_for_tomtom
-  	set name, file ('*.meme') into clustered_meme_for_filter
+	set name, file ("/${name}/*.meme") into clustered_meme_for_tomtom
+  	set name, file ("/${name}/*.meme") into clustered_meme_for_filter
+	set name, file("/${name}/glam2.txt") into clust_glam_for_seq
   	file('*')
 
 	when:
@@ -526,7 +528,7 @@ process clustered_glam2 {
 	script:
   	name = fasta.getBaseName()
 	"""
-	glam2 n ${fasta} -O . -a ${params.motif_min_key} -b ${params.motif_max_key} -z 5 -n ${params.iteration}
+	glam2 n ${fasta} -O ./${name}/ -a ${params.motif_min_key} -b ${params.motif_max_key} -z 5 -n ${params.iteration}
 	"""
 }
 
@@ -537,9 +539,11 @@ If motif clustering is activated or not.
 if(params.cluster_motif == 1){
 	for_tomtom = clustered_meme_for_tomtom
 	for_filter = clustered_meme_for_filter
+	seq_to_json = clust_glam_for_seq
 } else {
 	for_tomtom = meme_for_tomtom
 	for_filter = meme_for_filter
+	seq_to_json = glam_for_seq
 }
 
 
@@ -594,7 +598,7 @@ process check_for_unknown_motifs {
 }
 
 
-//Get the best(first) Motif from each MEME-file
+//Get the best (first) X Motifs from each MEME-file
 process get_best_motif {
 
 	conda "${path_env}"
@@ -614,6 +618,25 @@ process get_best_motif {
 }
 
 
+process get_best_motif_seq {
+
+	conda "${path_env}"
+	tag{name}
+	publishDir "${params.out}/2.2_motif_estimation/best_unknown_motifs/motif_sequences/", mode: 'copy'
+
+	input:
+	set name, file(txt) from seq_to_json
+
+	output:
+	file('*.json')
+
+	script:
+	"""
+	Rscript ${path_bin}/2.2_motif_estimation/ get_motif_seq.R -i ${txt} -o ${name}_seq.json -n ${ĂľĂ¦rams.best_motif}
+	"""
+}
+
+
 best_motif.combine(fa_scan).set {files_for_genome_scan}
 
 /*

From 49ec389e943f6d10f88daabb4a32f7ab1654094e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= <rene.wiegandt@mpi-bn.mpg.de>
Date: Tue, 8 Jan 2019 15:30:36 -0500
Subject: [PATCH 05/17] added r packages: RJSONIO, varhandle to masterenv.yml

---
 masterenv.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/masterenv.yml b/masterenv.yml
index 751964e..bd0f2ca 100644
--- a/masterenv.yml
+++ b/masterenv.yml
@@ -14,6 +14,8 @@ dependencies:
   - r-stringi
   - r-stringr
   - r-optparse
+  - r-RJSONIO
+  - r-varhandle
   - bioconductor-iranges
   - moods
   - biopython

From d979e16e25877207654c7269c9461082ac7914cb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= <rene.wiegandt@mpi-bn.mpg.de>
Date: Wed, 9 Jan 2019 08:23:28 -0500
Subject: [PATCH 06/17] get_best_motif.py: added parameter cluster id

---
 bin/2.2_motif_estimation/get_best_motif.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/bin/2.2_motif_estimation/get_best_motif.py b/bin/2.2_motif_estimation/get_best_motif.py
index beacb1c..7f9589d 100644
--- a/bin/2.2_motif_estimation/get_best_motif.py
+++ b/bin/2.2_motif_estimation/get_best_motif.py
@@ -7,6 +7,7 @@ def parse_arguments():
     parser.add_argument("meme", help="Path to 'meme' file generated by GLAM2")
     parser.add_argument("output", help="Output file")
     parser.add_argument("num", help="Number of motifs parsed from file")
+    parser.add_argument("id", help="Cluster ID")
     args = parser.parse_args()
     return args
 
@@ -36,7 +37,7 @@ def main():
     break_header = "MOTIF  " + str(number)
 
     # Get cluster_id
-    cluster_id = os.path.splitext(args.meme)[0].split('/')[-2].split('_')[-1]
+    cluster_id = args.id
 
     # Pattern for motif header
     pattern = re.compile("^MOTIF\s{2}(\d)+")

From ea2e1710e418e9ef61f3542b33b7fa8f11ff5f48 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= <rene.wiegandt@mpi-bn.mpg.de>
Date: Wed, 9 Jan 2019 08:24:12 -0500
Subject: [PATCH 07/17] get_motif_seq.R: added parameter tmp_path and
 cluster_id

---
 bin/2.2_motif_estimation/get_motif_seq.R | 42 ++++++++++++------------
 1 file changed, 21 insertions(+), 21 deletions(-)

diff --git a/bin/2.2_motif_estimation/get_motif_seq.R b/bin/2.2_motif_estimation/get_motif_seq.R
index b79ac8d..6f298b3 100644
--- a/bin/2.2_motif_estimation/get_motif_seq.R
+++ b/bin/2.2_motif_estimation/get_motif_seq.R
@@ -4,7 +4,9 @@ if (!require(optparse, quietly = T)) install.packages("optparse"); library(optpa
 option_list <- list(
   make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input file. Output txt-file from GLAM2.", metavar = "character"),
   make_option(opt_str = c("-o", "--output"), default = "sequences.json" , help = "Output JSON-file. Default = '%default'", metavar = "character"),
-  make_option(opt_str = c("-n", "--num"), default = 3 , help = "Get best (num) motifs. Default = '%default'", metavar = "numeric")
+  make_option(opt_str = c("-n", "--num"), default = 3 , help = "Get best (num) motifs. Default = '%default'", metavar = "numeric"),
+  make_option(opt_str = c("-c", "--cluster_id"), default = "./" , help = "Cluster ID", metavar = "numeric"),
+  make_option(opt_str = c("-t", "--tmp"), default = "./" , help = "Path for tmp-files. Default = '%default'", metavar = "character")
 )
 
 opt_parser <- OptionParser(option_list = option_list,
@@ -18,55 +20,53 @@ opt <- parse_args(opt_parser)
 #' @param path Path to file
 #' @return first column as vector
 read_data <- function(path){
-  
+
   f <- data.table::fread(path, select = 1)
   return(f[[1]])
 }
 
 
 #' Creating JSON-file with sequence ids which were used to create the best (num) motifs.
-#' 
+#'
 #' @param input Input file.Output txt-file from GLAM2.
 #' @param output Output JSON-file
 #' @param num Get best (num) motifs.
-#' 
-#' @author René Wiegandt <Rene.Wiegandt(at)mpi-bn.mpg.de>
-create_seq_json <- function(input, output, num) {
-  
+#'
+#' @author Renďż˝ Wiegandt <Rene.Wiegandt(at)mpi-bn.mpg.de>
+create_seq_json <- function(input, output, num, tmp_path, cluster_id) {
+
   if (!file.exists(input)) {
     stop(paste0("Input file does not exists. Please check the given path: ", input))
   }
-  
+
   if ( !is.numeric(num)) {
     stop("Parameter num needs to be an integer")
   }
-  
+
   if (num > 10 || num <= 0 ) {
     stop(paste0("Parameter 'num' needs to be an number between 1 and 10! Your input: ", num))
   }
 
-  # Getting cluster id
-  split_path <- unlist(strsplit(dirname(input),'_'))
-  cluster_id <- split_path[[length(split_path)]]
-  
   if ( !varhandle::check.numeric(cluster_id)) {
     stop(paste0("CLUSTER ID could not be found. Please make sure that your file path contains _[cluster_id] at the end. Found: ", cluster_id,"\n For example: /test_cluster_1/glam.txt"))
   }
-  
-  file_dir <- dirname(input)
-  
+
+  dir.create(tmp_path, showWarnings = FALSE)
+
+  file_dir <- tmp_path
+
   # Split glam.txt file on lines that start with Score:
   system(paste0("csplit ", input, " '/^Score:.*/' '{*}' -f ", file_dir, "/f_id_test.pholder"))
   # Only keep the lines that start with 'f' to get the lines with the sequence ids
   system(paste0("for i in ", file_dir, "/*.pholder0[1-", num, "];do grep \"^f\" $i > \"${i}.done\";done"))
-  
+
   # Getting the filepaths of first 3 files with sequence ids
   fnames <- file.path(file_dir,dir(file_dir, pattern = "done"))
-  
+
   # Running read_data on files
   datalist <- lapply(fnames, read_data)
-  
-  
+
+
   # Create json file
   ## naming
   names(datalist) <- paste0(c("Motif_", "Motif_", "Motif_"),seq(1,as.numeric(num),1) , " Cluster_", cluster_id)
@@ -81,6 +81,6 @@ if (!interactive()) {
   if (length(commandArgs(trailingOnly = TRUE)) <= 0) {
     print_help(opt_parser)
   } else {
-    create_seq_json(opt$input, opt$output, opt$num)
+    create_seq_json(opt$input, opt$output, opt$num, opt$tmp, opt$cluster_id)
   }
 }

From 8e5049ea4092ecc5e0c62481f02ac38852128e5d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= <rene.wiegandt@mpi-bn.mpg.de>
Date: Wed, 9 Jan 2019 08:25:34 -0500
Subject: [PATCH 08/17] pipeline.nf: adjusting to new parameters required by 
 get_best_motif; get_motif_seq

---
 pipeline.nf | 46 ++++++++++++++++++++++++----------------------
 1 file changed, 24 insertions(+), 22 deletions(-)

diff --git a/pipeline.nf b/pipeline.nf
index af27a1f..4ebab19 100644
--- a/pipeline.nf
+++ b/pipeline.nf
@@ -189,7 +189,7 @@ bigwig_input.combine(bed_input).set{footprint_in}
 This process uses the uncontinuous score from a bigWig file to estimate footpints within peaks of interest
 */
 process footprint_extraction {
-	conda "${path_env}"
+	//conda "${path_env}"
 
 	tag{name}
 	publishDir "${params.out}/1.1_footprint_extraction/", mode: 'copy', pattern: '*.bed'
@@ -214,7 +214,7 @@ for_tfbs = fa_overlap.combine(db_for_motivscan).combine(bed_for_tfbsscan)
 */
 process extract_known_TFBS {
 
-	conda "${path_env}"
+	//conda "${path_env}"
 
 	publishDir "${params.out}/1.2_filter_motifs/TFBSscan/", mode: 'copy', pattern: '*.bed'
 
@@ -250,7 +250,7 @@ if(params.tfbs_path == "") {
 /*
 */
 process overlap_with_known_TFBS {
-	conda "${path_env}"
+	//conda "${path_env}"
 	publishDir "${params.out}/1.2_filter_motifs/compareBed/", mode :'copy'
 
 	input:
@@ -272,8 +272,8 @@ process overlap_with_known_TFBS {
 Reduce each sequence to its most conserved region.
 */
 process reduce_sequence {
-	conda "${path_env}"
-	echo true
+	//conda "${path_env}"
+
 	publishDir "${params.out}/2.1_clustering/reduced_bed/", mode: 'copy'
 
 	input:
@@ -293,7 +293,7 @@ process reduce_sequence {
 Cluster all sequences. Appends a column with cluster numbers to the bed-file.
 */
 process clustering {
-	conda "${path_env}"
+	//conda "${path_env}"
 	echo true
 
 	publishDir "${params.out}/2.1_clustering/", mode: 'copy', pattern: '*.bed'
@@ -316,7 +316,7 @@ Converting BED-File to one FASTA-File per cluster
 */
 process bed_to_clustered_fasta {
 
-	conda "${path_env}"
+	//conda "${path_env}"
 	publishDir "${params.out}/2.2_motif_estimation/fasta/", mode: 'copy'
     tag{name}
 
@@ -347,7 +347,7 @@ process glam2 {
 
     tag{name}
 	publishDir "${params.out}/2.2_motif_estimation/glam2/", mode: 'copy'
-	conda "${path_env}"
+	//conda "${path_env}"
 
     input:
     set name, file (fasta) from fasta_for_glam2
@@ -373,7 +373,7 @@ The paths are sorted numerically depending on the cluster ID.
 process merge_meme {
 
 	publishDir "${params.out}/2.2_motif_estimation/cluster_motifs/merged_meme/", mode: 'copy'
-	conda "${path_env}"
+	//conda "${path_env}"
 
 	input:
 	val (memelist) from meme_to_merge.toList()
@@ -406,7 +406,7 @@ process find_similar_motifs {
 
 	tag{name}
 	publishDir "${params.out}/2.2_motif_estimation/cluster_motifs/cluster_similarity/", mode: 'copy'
-	conda "${path_env}"
+	//conda "${path_env}"
 
 	input:
 	set name, file (meme) ,file (merged_meme) from to_find_similar_motifs
@@ -429,7 +429,7 @@ Label first column of TSV-file with Cluster ID
 process label_cluster {
 
 	tag{name}
-	conda "${path_env}"
+	//conda "${path_env}"
 
 	input:
 	set name, file (tsv) from motif_similarity
@@ -480,7 +480,7 @@ Merging FASTA-files of similar clusters
 */
 process merge_fasta {
 
-	conda "${path_env}"
+	//conda "${path_env}"
 	publishDir "${params.out}/2.2_motif_estimation/cluster_motifs/merged_fasta/", mode: 'copy'
 
 	input:
@@ -511,7 +511,7 @@ process clustered_glam2 {
 
 	publishDir "${params.out}/2.2_motif_estimation/cluster_motifs/glam2/", mode: 'copy'
 	tag{name}
-	conda "${path_env}"
+	//conda "${path_env}"
 
 	input:
 	file (fasta) from motif_clustered_fasta_flat
@@ -555,7 +555,7 @@ process tomtom {
 
     tag{name}
 	publishDir "${params.out}/2.2_motif_estimation/tomtom/", mode: 'copy'
-	conda "${path_env}"
+	//conda "${path_env}"
 
     input:
     set name, file (meme) from for_tomtom
@@ -601,7 +601,7 @@ process check_for_unknown_motifs {
 //Get the best (first) X Motifs from each MEME-file
 process get_best_motif {
 
-	conda "${path_env}"
+	//conda "${path_env}"
 	tag{name}
 	publishDir "${params.out}/2.2_motif_estimation/best_unknown_motifs/", mode: 'copy'
 
@@ -612,27 +612,29 @@ process get_best_motif {
 	set name, file('*_best.meme') into best_motif
 
 	script:
+	cluster_id = name.split('_')[-1]
 	"""
-	python ${path_bin}/2.2_motif_estimation/get_best_motif.py ${meme} ${name}_best.meme ${params.best_motif}
+	python ${path_bin}/2.2_motif_estimation/get_best_motif.py ${meme} ${name}_best.meme ${params.best_motif} ${cluster_id}
 	"""
 }
 
 
 process get_best_motif_seq {
 
-	conda "${path_env}"
+	//conda "${path_env}"
 	tag{name}
 	publishDir "${params.out}/2.2_motif_estimation/best_unknown_motifs/motif_sequences/", mode: 'copy'
 
 	input:
-	set name, file(txt) from seq_to_json
+	set name, file (txt) from seq_to_json
 
 	output:
-	file('*.json')
+	file("${name}_seq.json")
 
 	script:
+	cluster_id = name.split('_')[-1]
 	"""
-	Rscript ${path_bin}/2.2_motif_estimation/ get_motif_seq.R -i ${txt} -o ${name}_seq.json -n ${ĂľĂ¦rams.best_motif}
+	Rscript ${path_bin}/2.2_motif_estimation/get_motif_seq.R -i ${txt} -o ${name}_seq.json -n ${params.best_motif} -t ${path_bin}/2.2_motif_estimation/tmp/cluster_${cluster_id} -c ${cluster_id}
 	"""
 }
 
@@ -641,7 +643,7 @@ best_motif.combine(fa_scan).set {files_for_genome_scan}
 
 /*
 process genome_scan {
-	conda "${path_env}"
+	//conda "${path_env}"
 
 	input:
 	set name, file(meme), file(fasta) from files_for_genome_scan
@@ -670,7 +672,7 @@ process cluster_quality {
 
 
 process create_GTF {
-	conda "${path_env}"
+	//conda "${path_env}"
 
 	publishDir "${params.out}/3.1_create_gtf/", mode: 'copy'
 

From 14390450c3056a6db8b9c8fb11a6586679ee0d6e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= <rene.wiegandt@mpi-bn.mpg.de>
Date: Wed, 9 Jan 2019 09:39:58 -0500
Subject: [PATCH 09/17] Added log file for part 2.2_motif_estimation

---
 bin/2.2_motif_estimation/bed_to_fasta.R | 17 +++++++++++++++-
 pipeline.nf                             | 27 ++++++++++++++++++++++---
 2 files changed, 40 insertions(+), 4 deletions(-)

diff --git a/bin/2.2_motif_estimation/bed_to_fasta.R b/bin/2.2_motif_estimation/bed_to_fasta.R
index 7e716b7..7dbb36f 100644
--- a/bin/2.2_motif_estimation/bed_to_fasta.R
+++ b/bin/2.2_motif_estimation/bed_to_fasta.R
@@ -1,5 +1,5 @@
 #!/usr/bin/env Rscript
-if (!require(optparse)) install.packages("optparse"); library(optparse)
+if (!require(optparse, quietly = T)) install.packages("optparse"); library(optparse)
 
 option_list <- list(
   make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input bed-file. Second last column must be sequences and last column must be the cluster id.", metavar = "character"),
@@ -36,6 +36,9 @@ bed_to_fasta <- function(bedInput, prefix = "", min_seq = 100){
   # Split data.table bed on its last column (cluster_no) into list of data.frames
   clusters <- split(bed, cluster_no, sorted = TRUE, flatten = FALSE)
 
+  # Get number of all clusters
+  num_clusters <- length(clusters)
+
   # For each data.frame(cluster) in list clusters:
   discard <- lapply(seq_len(length(clusters)), function(i){
     clust <- as.data.frame(clusters[i])
@@ -47,10 +50,22 @@ bed_to_fasta <- function(bedInput, prefix = "", min_seq = 100){
       outfile <- paste0(prefix,"_cluster_",i - 1,".FASTA")
       # Write fasta file
       seqinr::write.fasta(sequences = sequences, names = clust[[4]], file.out = outfile, as.string = TRUE)
+      return(TRUE)
     } else {
       print(paste0("Cluster: ",i," is to small"))
+      return(FALSE)
     }
   })
+
+  # Get number of clusters which contain enought sequences
+  tmp <- unlist(discard)
+  count_clust <- length(tmp[tmp == FALSE])
+
+  # Write log-file
+  write(paste0( "------------------------------------------------------------\nfile: ",
+                bedInput,"\nNumber of all clusters: ", num_clusters, "\nRemoved small clusters ( < ",
+                min_seq," sequences ): " ,count_clust , "\nRemaining number of clusters: ", num_clusters - count_clust),
+        file = "bed_to_fasta.log", append = T)
 }
 
 # run function bed_to_fasta with given parameteres if not in interactive context (e.g. run from shell)
diff --git a/pipeline.nf b/pipeline.nf
index 4ebab19..48b1455 100644
--- a/pipeline.nf
+++ b/pipeline.nf
@@ -200,6 +200,7 @@ process footprint_extraction {
 
 	output:
 	set name, file ('*.bed') into bed_for_overlap_with_TFBS
+	file('*.log')
 
 	script:
 	"""
@@ -317,7 +318,7 @@ Converting BED-File to one FASTA-File per cluster
 process bed_to_clustered_fasta {
 
 	//conda "${path_env}"
-	publishDir "${params.out}/2.2_motif_estimation/fasta/", mode: 'copy'
+	publishDir "${params.out}/2.2_motif_estimation/fasta/", mode: 'copy', pattern: '*.FASTA'
     tag{name}
 
     input:
@@ -326,6 +327,7 @@ process bed_to_clustered_fasta {
     output:
     file ('*.FASTA') into fasta_for_glam2
     file ('*.FASTA') into fasta_for_motif_cluster
+	file ('*.log') into 22_log
 
     script:
     """
@@ -572,14 +574,33 @@ process tomtom {
 
 //Joining channels with meme and tsv files. Filter joined channel on line count.
 //Only meme-files which corresponding tsv files have linecount <= 1 are writen to next channel.
-for_filter2 = for_filter.join( tsv_for_filter )
+for_filter.join( tsv_for_filter ).into {for_filter2; for_log}
 for_filter2
     .filter { name, meme, tsv ->
         long count = tsv.readLines().size()
         count <= 1
         }
-    .into { meme_for_scan; check }
+    .into { meme_for_scan; check; num_cluster }
 
+count_cluster = num_cluster.count()
+count_cluster_before_filter = for_log.count()
+
+process write_log_for_motif_estimation {
+
+	publishDir "${params.out}/2.2_motif_estimation/log/", mode: 'copy'
+
+	input:
+	file log, val (after_filter), val (before_filter) from 22_log.combine(count_cluster).combine(count_cluster_before_filter = for_log.count())
+
+	output:
+	file ('*.log')
+
+	script:
+	removed = before_filter - after_filter
+	"""
+	printf "\nMotifs found in Database: ${removed}\nNumber of remaining unknown motifs/cluster${after_filter}" >> log
+	"""
+}
 
 //If channel 'check' is empty print errormessage
 process check_for_unknown_motifs {

From 0f4bf4694c967395116e2670c4aaee549eb52c6e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= <rene.wiegandt@mpi-bn.mpg.de>
Date: Wed, 9 Jan 2019 14:24:23 -0500
Subject: [PATCH 10/17] added missing shebangs to 2.2 scripts  #44

---
 bin/2.2_motif_estimation/get_best_motif.py   | 2 ++
 bin/2.2_motif_estimation/get_motif_seq.R     | 2 +-
 bin/2.2_motif_estimation/motif_estimation.nf | 1 +
 3 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/bin/2.2_motif_estimation/get_best_motif.py b/bin/2.2_motif_estimation/get_best_motif.py
index 7f9589d..532a1f3 100644
--- a/bin/2.2_motif_estimation/get_best_motif.py
+++ b/bin/2.2_motif_estimation/get_best_motif.py
@@ -1,3 +1,5 @@
+#!/usr/bin/env python
+
 '''
 parses arguments using argparse
 @return args list of all parameters
diff --git a/bin/2.2_motif_estimation/get_motif_seq.R b/bin/2.2_motif_estimation/get_motif_seq.R
index 6f298b3..dda4bd3 100644
--- a/bin/2.2_motif_estimation/get_motif_seq.R
+++ b/bin/2.2_motif_estimation/get_motif_seq.R
@@ -32,7 +32,7 @@ read_data <- function(path){
 #' @param output Output JSON-file
 #' @param num Get best (num) motifs.
 #'
-#' @author Renďż˝ Wiegandt <Rene.Wiegandt(at)mpi-bn.mpg.de>
+#' @author RenĂ© Wiegandt <Rene.Wiegandt(at)mpi-bn.mpg.de>
 create_seq_json <- function(input, output, num, tmp_path, cluster_id) {
 
   if (!file.exists(input)) {
diff --git a/bin/2.2_motif_estimation/motif_estimation.nf b/bin/2.2_motif_estimation/motif_estimation.nf
index 430040e..f0eb530 100644
--- a/bin/2.2_motif_estimation/motif_estimation.nf
+++ b/bin/2.2_motif_estimation/motif_estimation.nf
@@ -1,3 +1,4 @@
+#!/usr/bin/env nextflow
 
 params.bed = ""
 params.out = ""

From c0a462be625151a9d2900d3884c15128246285aa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= <rene.wiegandt@mpi-bn.mpg.de>
Date: Wed, 9 Jan 2019 14:25:14 -0500
Subject: [PATCH 11/17] pipeline.nf: adjusting to new parameter required by 
 2.1 scripts

---
 pipeline.nf | 30 ++++++++++++++++++++++--------
 1 file changed, 22 insertions(+), 8 deletions(-)

diff --git a/pipeline.nf b/pipeline.nf
index 48b1455..4e8708c 100644
--- a/pipeline.nf
+++ b/pipeline.nf
@@ -20,6 +20,7 @@
 //filter_unknown_motifs
 	params.min_size_fp=10
 	params.max_size_fp=100
+	params.tfbsscan_method = "moods"
 
 //clustering
   	//reduce_sequence
@@ -88,6 +89,7 @@ Optional arguments:
 	Filter unknown motifs:
 	--min_size_fp INT	Minimum sequence length threshold. Smaller sequences are discarded. (Default: 10)
 	--max_size_fp INT	Maximum sequence length threshold. Discards all sequences longer than this value. (Default: 100)
+	--tfbsscan_method [moods|fimo] Method used by tfbsscan. (Default: moods)
 
 	Clustering:
 	Sequence preparation/ reduction:
@@ -143,6 +145,7 @@ int_params = ["window_length", "step", "min_size_fp", "max_size_fp", "kmer",
 req_params = ["bigwig", "bed", "genome_fasta", "motif_db", "config"]
 
 valid_organism = ["hg38", "hg19", "mm9", "mm10"]
+valid_tfbsscan_methods = ["moods","fimo"]
 
 params.each { key, value ->
   if(int_params.contains(key)) {
@@ -163,6 +166,11 @@ if (!("${params.identity}" ==~ /^0\.[8-9][[0-9]*]?|^1(\.0)?/ )){
   System.exit(2)
 }
 
+if (!valid_tfbsscan_methods.contains(params.tfbsscan_method)) {
+  println("ERROR: Invalid Method for tfbsscan! Valid methods: " + valid_tfbsscan_methods)
+  System.exit(2)
+}
+
 if (!valid_organism.contains(params.organism)) {
   println("ERROR: Invalid organism! Valid organisms: " + valid_organism)
   System.exit(2)
@@ -230,7 +238,7 @@ process extract_known_TFBS {
 
 	script:
 	"""
-	python ${path_bin}/1.2_filter_motifs/tfbsscan.py --use moods --core ${params.threads} -m ${db} -g ${fasta} -o ${params.create_known_tfbs_path} -b ${bed}
+	python ${path_bin}/1.2_filter_motifs/tfbsscan.py --use ${params.tfbsscan_method} --core ${params.threads} -m ${db} -g ${fasta} -o ${params.create_known_tfbs_path} -b ${bed}
 	"""
 }
 
@@ -275,17 +283,19 @@ Reduce each sequence to its most conserved region.
 process reduce_sequence {
 	//conda "${path_env}"
 
-	publishDir "${params.out}/2.1_clustering/reduced_bed/", mode: 'copy'
+	publishDir "${params.out}/2.1_clustering/reduced_bed/", mode: 'copy', pattern: '*.bed'
+	publishDir "${params.out}/2.1_clustering//log", mode: 'copy', pattern: '*.log'
 
 	input:
 	set name, file (bed) from bed_for_reducing
 
 	output:
 	set name, file ('*.bed') into bed_for_clustering
+	file ('*.log')
 
 	script:
 	"""
-	Rscript ${path_bin}/2.1_clustering/reduce_sequence.R -i ${bed} -k ${params.kmer} -m ${params.aprox_motif_len} -o ${name}_reduced.bed -t ${params.threads} -f ${params.motif_occurence} -s ${params.min_seq_length}
+	Rscript ${path_bin}/2.1_clustering/reduce_sequence.R -i ${bed} -k ${params.kmer} -m ${params.aprox_motif_len} -o ${name}_reduced.bed -t ${params.threads} -f ${params.motif_occurence} -s ${params.min_seq_length} --summary reduce_sequence.log
 	"""
 }
 
@@ -295,19 +305,20 @@ Cluster all sequences. Appends a column with cluster numbers to the bed-file.
 */
 process clustering {
 	//conda "${path_env}"
-	echo true
 
 	publishDir "${params.out}/2.1_clustering/", mode: 'copy', pattern: '*.bed'
+	publishDir "${params.out}/2.1_clustering//log", mode: 'copy', pattern: '*.log'
 
 	input:
 	set name, file (bed) from bed_for_clustering
 
 	output:
 	set name, file ('*.bed') into bed_for_motif_esitmation
+	file ('*.log')
 
 	script:
 	"""
-	Rscript ${path_bin}/2.1_clustering/cdhit_wrapper.R -i ${bed} -A ${params.sequence_coverage} -o ${name}_clusterd.bed -c ${params.identity} -G ${params.global} -M ${params.memory} -l ${params.throw_away_seq} -r ${params.strand} -T ${params.threads}
+	Rscript ${path_bin}/2.1_clustering/cdhit_wrapper.R -i ${bed} -A ${params.sequence_coverage} -o ${name}_clusterd.bed -c ${params.identity} -G ${params.global} -M ${params.memory} -l ${params.throw_away_seq} -r ${params.strand} -T ${params.threads} --summary reduce_sequence.log
 	"""
 }
 
@@ -327,7 +338,7 @@ process bed_to_clustered_fasta {
     output:
     file ('*.FASTA') into fasta_for_glam2
     file ('*.FASTA') into fasta_for_motif_cluster
-	file ('*.log') into 22_log
+	file ('*.log') into log22
 
     script:
     """
@@ -585,12 +596,14 @@ for_filter2
 count_cluster = num_cluster.count()
 count_cluster_before_filter = for_log.count()
 
+log22.combine(count_cluster).combine(count_cluster_before_filter ).set {log22_final}
+
 process write_log_for_motif_estimation {
 
 	publishDir "${params.out}/2.2_motif_estimation/log/", mode: 'copy'
 
 	input:
-	file log, val (after_filter), val (before_filter) from 22_log.combine(count_cluster).combine(count_cluster_before_filter = for_log.count())
+	set file (logfile), after_filter, before_filter from log22_final
 
 	output:
 	file ('*.log')
@@ -598,7 +611,8 @@ process write_log_for_motif_estimation {
 	script:
 	removed = before_filter - after_filter
 	"""
-	printf "\nMotifs found in Database: ${removed}\nNumber of remaining unknown motifs/cluster${after_filter}" >> log
+	cat ${logfile} > motif_estimation.log
+	printf "\nMotifs found in Database: $removed\nNumber of remaining unknown motifs/cluster: $after_filter" >> motif_estimation.log
 	"""
 }
 

From e62bb3674ff4f328a51132467156385b33ac583c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= <rene.wiegandt@mpi-bn.mpg.de>
Date: Wed, 9 Jan 2019 19:45:28 -0500
Subject: [PATCH 12/17] pipeline.nf: fixed typos

---
 pipeline.nf | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/pipeline.nf b/pipeline.nf
index 4e8708c..74da2e1 100644
--- a/pipeline.nf
+++ b/pipeline.nf
@@ -284,7 +284,7 @@ process reduce_sequence {
 	//conda "${path_env}"
 
 	publishDir "${params.out}/2.1_clustering/reduced_bed/", mode: 'copy', pattern: '*.bed'
-	publishDir "${params.out}/2.1_clustering//log", mode: 'copy', pattern: '*.log'
+	publishDir "${params.out}/2.1_clustering/log", mode: 'copy', pattern: '*.log'
 
 	input:
 	set name, file (bed) from bed_for_reducing
@@ -307,7 +307,7 @@ process clustering {
 	//conda "${path_env}"
 
 	publishDir "${params.out}/2.1_clustering/", mode: 'copy', pattern: '*.bed'
-	publishDir "${params.out}/2.1_clustering//log", mode: 'copy', pattern: '*.log'
+	publishDir "${params.out}/2.1_clustering/log", mode: 'copy', pattern: '*.log'
 
 	input:
 	set name, file (bed) from bed_for_clustering
@@ -318,7 +318,7 @@ process clustering {
 
 	script:
 	"""
-	Rscript ${path_bin}/2.1_clustering/cdhit_wrapper.R -i ${bed} -A ${params.sequence_coverage} -o ${name}_clusterd.bed -c ${params.identity} -G ${params.global} -M ${params.memory} -l ${params.throw_away_seq} -r ${params.strand} -T ${params.threads} --summary reduce_sequence.log
+	Rscript ${path_bin}/2.1_clustering/cdhit_wrapper.R -i ${bed} -A ${params.sequence_coverage} -o ${name}_clusterd.bed -c ${params.identity} -G ${params.global} -M ${params.memory} -l ${params.throw_away_seq} -r ${params.strand} -T ${params.threads} --summary clustering.log
 	"""
 }
 
@@ -715,7 +715,7 @@ process create_GTF {
 	file ('*.gtf') into gtf
 
 	when:
-	params.gtf_path == ""
+	params.gtf_path == "kill"
 
 	script:
 	"""

From f3b3db40d86559ab3a59cf695654d1986b9425d3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= <rene.wiegandt@mpi-bn.mpg.de>
Date: Thu, 10 Jan 2019 03:55:50 -0500
Subject: [PATCH 13/17] get_best_motif.py: Removed additional whitespace in
 motif header

---
 bin/2.2_motif_estimation/get_best_motif.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bin/2.2_motif_estimation/get_best_motif.py b/bin/2.2_motif_estimation/get_best_motif.py
index 532a1f3..a4c57bf 100644
--- a/bin/2.2_motif_estimation/get_best_motif.py
+++ b/bin/2.2_motif_estimation/get_best_motif.py
@@ -59,7 +59,7 @@ def main():
                 if pattern.match(line):
                     # if line is a motif header
                     count = 2
-                    out.write(line.strip('\n') + " Cluster_" + cluster_id + '\n')
+                    out.write(line.strip('\n').replace("  "," ") + " Cluster_" + cluster_id + '\n')
                 ##
                 else:
                     out.write(line)

From 2d2f8cbfb7b2d4af40b718448dbc93b64f753b3b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= <rene.wiegandt@mpi-bn.mpg.de>
Date: Thu, 10 Jan 2019 03:58:16 -0500
Subject: [PATCH 14/17] pipeline.nf: removed code for debuging

---
 pipeline.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pipeline.nf b/pipeline.nf
index 74da2e1..6499a41 100644
--- a/pipeline.nf
+++ b/pipeline.nf
@@ -715,7 +715,7 @@ process create_GTF {
 	file ('*.gtf') into gtf
 
 	when:
-	params.gtf_path == "kill"
+	params.gtf_path == ""
 
 	script:
 	"""

From 43c55aee173630cd0270fd664546a3ff4d565ea6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= <rene.wiegandt@mpi-bn.mpg.de>
Date: Thu, 10 Jan 2019 07:34:16 -0500
Subject: [PATCH 15/17] bed_to_fasta.R: changes lapply to vapply

---
 bin/2.2_motif_estimation/bed_to_fasta.R | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/bin/2.2_motif_estimation/bed_to_fasta.R b/bin/2.2_motif_estimation/bed_to_fasta.R
index 7dbb36f..322f01f 100644
--- a/bin/2.2_motif_estimation/bed_to_fasta.R
+++ b/bin/2.2_motif_estimation/bed_to_fasta.R
@@ -40,7 +40,7 @@ bed_to_fasta <- function(bedInput, prefix = "", min_seq = 100){
   num_clusters <- length(clusters)
 
   # For each data.frame(cluster) in list clusters:
-  discard <- lapply(seq_len(length(clusters)), function(i){
+  discard <- vapply(seq_len(length(clusters)), function(i){
     clust <- as.data.frame(clusters[i])
     # Filter data.tables(clusters), which are to small
     if (nrow(clust) >= as.numeric(min_seq) ) {
@@ -55,11 +55,10 @@ bed_to_fasta <- function(bedInput, prefix = "", min_seq = 100){
       print(paste0("Cluster: ",i," is to small"))
       return(FALSE)
     }
-  })
+  }, FUN.VALUE=logical(1))
 
   # Get number of clusters which contain enought sequences
-  tmp <- unlist(discard)
-  count_clust <- length(tmp[tmp == FALSE])
+  count_clust <- length(discard[discard == FALSE])
 
   # Write log-file
   write(paste0( "------------------------------------------------------------\nfile: ",

From 310702eaf5ff39eb60db9aba7b25402cbb3eebc5 Mon Sep 17 00:00:00 2001
From: renewiegandt <rene.wiegandt@mpi-bn.mpg.de>
Date: Thu, 10 Jan 2019 13:54:12 +0100
Subject: [PATCH 16/17] Removed import os from get_best_motif.py

---
 bin/2.2_motif_estimation/get_best_motif.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/bin/2.2_motif_estimation/get_best_motif.py b/bin/2.2_motif_estimation/get_best_motif.py
index a4c57bf..21ddcbb 100644
--- a/bin/2.2_motif_estimation/get_best_motif.py
+++ b/bin/2.2_motif_estimation/get_best_motif.py
@@ -68,5 +68,4 @@ def main():
 if __name__ == "__main__":
     import argparse
     import re
-    import os
     main()

From b70a38b8c3fd531bbfcd1f534169f0764ee9863c Mon Sep 17 00:00:00 2001
From: renewiegandt <rene.wiegandt@mpi-bn.mpg.de>
Date: Thu, 10 Jan 2019 13:55:10 +0100
Subject: [PATCH 17/17] Removed newline from get_motif_seq

---
 bin/2.2_motif_estimation/get_motif_seq.R | 1 -
 1 file changed, 1 deletion(-)

diff --git a/bin/2.2_motif_estimation/get_motif_seq.R b/bin/2.2_motif_estimation/get_motif_seq.R
index dda4bd3..acce6f2 100644
--- a/bin/2.2_motif_estimation/get_motif_seq.R
+++ b/bin/2.2_motif_estimation/get_motif_seq.R
@@ -66,7 +66,6 @@ create_seq_json <- function(input, output, num, tmp_path, cluster_id) {
   # Running read_data on files
   datalist <- lapply(fnames, read_data)
 
-
   # Create json file
   ## naming
   names(datalist) <- paste0(c("Motif_", "Motif_", "Motif_"),seq(1,as.numeric(num),1) , " Cluster_", cluster_id)