From 6954ecd11635c1ffb84fae041944cd4da5eec91e Mon Sep 17 00:00:00 2001
From: renewiegandt <rene.wiegandt@mpi-bn.mpg.de>
Date: Sat, 15 Dec 2018 19:47:19 +0100
Subject: [PATCH 01/11] Update README.md

---
 README.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 3bf19c3..87be038 100644
--- a/README.md
+++ b/README.md
@@ -59,9 +59,9 @@ Optional arguments:
 	--tfbs_path Path to directory with output from tfbsscan. If given tfbsscan will not be run.
 	
 	Footprint extraction:
-	--window_length INT	(Default: 200)
-	--step INT		(Default: 100)
-	--percentage INT	(Default: 0)
+	--window_length INT	This parameter sets the length of a sliding window. (Default: 200)
+	--step INT		This parameter sets the number of positions to slide the window forward. (Default: 100)
+	--percentage INT	Threshold in percent (Default: 0)
 	
 	Filter unknown motifs:
 	--min_size_fp INT	(Default: 10)

From 6c42389c831626713dcfc1eb0c351c052bd44ca3 Mon Sep 17 00:00:00 2001
From: renewiegandt <rene.wiegandt@mpi-bn.mpg.de>
Date: Sat, 15 Dec 2018 19:49:05 +0100
Subject: [PATCH 02/11] Update README.md

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 87be038..a3c4947 100644
--- a/README.md
+++ b/README.md
@@ -64,8 +64,8 @@ Optional arguments:
 	--percentage INT	Threshold in percent (Default: 0)
 	
 	Filter unknown motifs:
-	--min_size_fp INT	(Default: 10)
-	--max_size_fp INT	(Default: 100)
+	--min_size_fp INT	Minimum sequence length threshold. Smaller sequences are discarded. (Default: 10)
+	--max_size_fp INT	Maximum sequence length threshold. Discards all sequences longer than this value. (Default: 100)
 	
 	Clustering:
 	Sequence preparation/ reduction:

From a159e4137773e7e2e0ebbafb0a4c93101136801c Mon Sep 17 00:00:00 2001
From: renewiegandt <rene.wiegandt@mpi-bn.mpg.de>
Date: Sat, 15 Dec 2018 19:49:39 +0100
Subject: [PATCH 03/11] Update pipeline.nf

---
 pipeline.nf | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/pipeline.nf b/pipeline.nf
index e1977d2..3148d11 100644
--- a/pipeline.nf
+++ b/pipeline.nf
@@ -71,23 +71,23 @@ Required arguments:
 Optional arguments:
 
 	--tfbs_path Path to directory with output from tfbsscan. If given tfbsscan will not be run.
-
+	
 	Footprint extraction:
-	--window_length INT	(Default: 200)
-	--step INT		(Default: 100)
-	--percentage INT	(Default: 0)
-
+	--window_length INT	This parameter sets the length of a sliding window. (Default: 200)
+	--step INT		This parameter sets the number of positions to slide the window forward. (Default: 100)
+	--percentage INT	Threshold in percent (Default: 0)
+	
 	Filter unknown motifs:
-	--min_size_fp INT	(Default: 10)
-	--max_size_fp INT	(Default: 100)
-
+	--min_size_fp INT	Minimum sequence length threshold. Smaller sequences are discarded. (Default: 10)
+	--max_size_fp INT	Maximum sequence length threshold. Discards all sequences longer than this value. (Default: 100)
+	
 	Clustering:
 	Sequence preparation/ reduction:
 	--kmer INT		Kmer length (Default: 10)
 	--aprox_motif_len INT	Motif length (Default: 10)
 	--motif_occurence FLOAT	Percentage of motifs over all sequences. Use 1 (Default) to assume every sequence contains a motif.
 	--min_seq_length Interations	Remove all sequences below this value. (Default: 10)
-
+	
 	Clustering:
 	--global INT		Global (=1) or local (=0) alignment. (Default: 0)
 	--identity FLOAT	Identity threshold. (Default: 0.8)
@@ -95,7 +95,7 @@ Optional arguments:
 	--memory INT		Memory limit in MB. 0 for unlimited. (Default: 800)
 	--throw_away_seq INT	Remove all sequences equal or below this length before clustering. (Default: 9)
 	--strand INT		Align +/+ & +/- (= 1). Or align only +/+ (= 0). (Default: 0)
-
+	
 	Motif estimation:
 	--min_seq INT 		Sets the minimum number of sequences required for the FASTA-files given to GLAM2. (Default: 100)
 	--motif_min_key INT	Minimum number of key positions (aligned columns) in the alignment done by GLAM2. (Default: 8)
@@ -103,12 +103,12 @@ Optional arguments:
 	--iteration INT		Number of iterations done by glam2. More Iterations: better results, higher runtime. (Default: 10000)
 	--tomtom_treshold float	Threshold for similarity score. (Default: 0.01)
 	--best_motif INT	Get the best X motifs per cluster. (Default: 3)
-
+	
 	Moitf clustering:
 	--cluster_motif	Boolean	If 1 pipeline clusters motifs. If its 0 it does not. (Defaul: 0)
 	--edge_weight INT	Minimum weight of edges in motif-cluster-graph (Default: 5)
 	--motif_similarity_thresh FLOAT	Threshold for motif similarity score (Default: 0.00001)
-
+	
 	Creating GTF:
 	--organism [hg38 | hg19 | mm9 | mm10]	Input organism
 	--tissues List/String 	List of one or more keywords for tissue-/category-activity, categories must be specified as in JSON

From 550ad78e628049645499fe7ae0c15ad7eb60f07e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= <rene.wiegandt@mpi-bn.mpg.de>
Date: Sat, 15 Dec 2018 16:59:30 -0500
Subject: [PATCH 04/11] added parameter help

---
 pipeline.nf | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/pipeline.nf b/pipeline.nf
index 3148d11..ad4b199 100644
--- a/pipeline.nf
+++ b/pipeline.nf
@@ -8,6 +8,7 @@
 	params.config=""
 	params.tfbs_path=""
 	params.create_known_tfbs_path = "./"
+	params.help = 0
 
 //peak_calling
 	params.window_length = 200
@@ -56,7 +57,7 @@
 	params.organism="hg38"
 	params.tissue=""
 
-if (params.bigwig == "" || params.bed == "" || params.genome_fasta == "" || params.motif_db == "" || params.config == ""){
+if (params.bigwig == "" || params.bed == "" || params.genome_fasta == "" || params.motif_db == "" || params.config == "" || "${params.help}" == "1"){
 log.info """
 Usage: nextflow run pipeline.nf --bigwig [BigWig-file] --bed [BED-file] --genome_fasta [FASTA-file] --motif_db [MEME-file]
 
@@ -71,23 +72,23 @@ Required arguments:
 Optional arguments:
 
 	--tfbs_path Path to directory with output from tfbsscan. If given tfbsscan will not be run.
-	
+
 	Footprint extraction:
 	--window_length INT	This parameter sets the length of a sliding window. (Default: 200)
 	--step INT		This parameter sets the number of positions to slide the window forward. (Default: 100)
 	--percentage INT	Threshold in percent (Default: 0)
-	
+
 	Filter unknown motifs:
 	--min_size_fp INT	Minimum sequence length threshold. Smaller sequences are discarded. (Default: 10)
 	--max_size_fp INT	Maximum sequence length threshold. Discards all sequences longer than this value. (Default: 100)
-	
+
 	Clustering:
 	Sequence preparation/ reduction:
 	--kmer INT		Kmer length (Default: 10)
 	--aprox_motif_len INT	Motif length (Default: 10)
 	--motif_occurence FLOAT	Percentage of motifs over all sequences. Use 1 (Default) to assume every sequence contains a motif.
 	--min_seq_length Interations	Remove all sequences below this value. (Default: 10)
-	
+
 	Clustering:
 	--global INT		Global (=1) or local (=0) alignment. (Default: 0)
 	--identity FLOAT	Identity threshold. (Default: 0.8)
@@ -95,7 +96,7 @@ Optional arguments:
 	--memory INT		Memory limit in MB. 0 for unlimited. (Default: 800)
 	--throw_away_seq INT	Remove all sequences equal or below this length before clustering. (Default: 9)
 	--strand INT		Align +/+ & +/- (= 1). Or align only +/+ (= 0). (Default: 0)
-	
+
 	Motif estimation:
 	--min_seq INT 		Sets the minimum number of sequences required for the FASTA-files given to GLAM2. (Default: 100)
 	--motif_min_key INT	Minimum number of key positions (aligned columns) in the alignment done by GLAM2. (Default: 8)
@@ -103,12 +104,12 @@ Optional arguments:
 	--iteration INT		Number of iterations done by glam2. More Iterations: better results, higher runtime. (Default: 10000)
 	--tomtom_treshold float	Threshold for similarity score. (Default: 0.01)
 	--best_motif INT	Get the best X motifs per cluster. (Default: 3)
-	
+
 	Moitf clustering:
 	--cluster_motif	Boolean	If 1 pipeline clusters motifs. If its 0 it does not. (Defaul: 0)
 	--edge_weight INT	Minimum weight of edges in motif-cluster-graph (Default: 5)
 	--motif_similarity_thresh FLOAT	Threshold for motif similarity score (Default: 0.00001)
-	
+
 	Creating GTF:
 	--organism [hg38 | hg19 | mm9 | mm10]	Input organism
 	--tissues List/String 	List of one or more keywords for tissue-/category-activity, categories must be specified as in JSON
@@ -116,6 +117,7 @@ Optional arguments:
 All arguments can be set in the configuration files
  ```
 """
+System.exit(2)
 } else {
 	Channel.fromPath(params.bigwig).map {it -> [it.simpleName, it]}.set {bigwig_input}
 	Channel.fromPath(params.bed).set {bed_input}

From 1bb8521dbf4e12a2d59402a87e1cf437afe691eb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= <rene.wiegandt@mpi-bn.mpg.de>
Date: Sat, 15 Dec 2018 17:00:22 -0500
Subject: [PATCH 05/11] added separate motif_estimation script

---
 bin/motif_estimation.nf | 283 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 283 insertions(+)
 create mode 100644 bin/motif_estimation.nf

diff --git a/bin/motif_estimation.nf b/bin/motif_estimation.nf
new file mode 100644
index 0000000..f5b96e3
--- /dev/null
+++ b/bin/motif_estimation.nf
@@ -0,0 +1,283 @@
+
+params.bed = ""
+params.out = ""
+params.motif_db = ""
+params.path_env = ""
+params.help = 0
+params.path_bin = "."
+
+//bed_to_clustered_fasta
+params.min_seq = 100  // Minimum number of sequences in the fasta-files for glam2
+
+//glam2
+params.motif_min_key = 8 // Minimum number of key positions (aligned columns)
+params.motif_max_key = 20 // Maximum number of key positions (aligned columns)
+params.iteration = 10000 // Number of Iterations done by glam2. A high iteration number equals a more accurate result but with an higher runtime.
+
+//tomtom
+params.tomtom_treshold = 0.01 // threshold for similarity score.
+
+//cluster motifs
+params.cluster_motif = 0 // Boolean if 1 motifs are clustered else they are not
+params.edge_weight = 50 // Minimum weight of edges in motif-cluster-graph
+motif_similarity_thresh = 0.00001 // threshold for motif similarity score
+
+if (params.bed == "" || params.out == "" || params.motif_db == "" || params.path_env || "${params.help}" == "1") {
+log.info """
+Usage: nextflow run pipeline.nf --bed [PATH] --out [PATH] --motif_db [PATH] --path_env YAML-file
+--bed Path			Input BED-file (Column with cluster: 11; column with sequenze: 10)!!
+--out Path			Output Path
+--motif_db Path		Path to MEME-Database
+--path_env Path		Path to YAML file for conda enviroment.
+--path_bin Path		Path to directory with subscripts
+--help 0|1 Print help message if help == 1 (Default: 0)
+
+Motif estimation:
+--min_seq INT 		Sets the minimum number of sequences required for the FASTA-files given to GLAM2. (Default: 100)
+--motif_min_key INT	Minimum number of key positions (aligned columns) in the alignment done by GLAM2. (Default: 8)
+--motif_max_key INT	Maximum number of key positions (aligned columns) in the alignment done by GLAM2.f (Default: 20)
+--iteration INT		Number of iterations done by glam2. More Iterations: better results, higher runtime. (Default: 10000)
+--tomtom_treshold float	Threshold for similarity score. (Default: 0.01)
+--best_motif INT	Get the best X motifs per cluster. (Default: 3)
+
+Moitf clustering:
+--cluster_motif	Boolean	If 1 pipeline clusters motifs. If its 0 it does not. (Defaul: 0)
+--edge_weight INT	Minimum weight of edges in motif-cluster-graph (Default: 5)
+--motif_similarity_thresh FLOAT	Threshold for motif similarity score (Default: 0.00001)
+"""
+System.exit(2)
+} else {
+	Channel.fromPath(params.bed).map {it -> [it.simpleName, it]}.set {bed_for_motif_esitmation}
+}
+
+/*
+Converting BED-File to one FASTA-File per cluster
+*/
+process bed_to_clustered_fasta {
+	conda "${params.path_env}"
+	publishDir "${params.out}/esimated_motifs/clustered_motifs/clustered_fasta/", mode: 'copy'
+    tag{name}
+
+    input:
+    set name, file (bed) from bed_for_motif_esitmation
+
+    output:
+    file ('*.FASTA') into fasta_for_glam2
+    file ('*.FASTA') into fasta_for_motif_cluster
+
+    script:
+    """
+    Rscript ${params.path_bin}/bed_to_fasta.R ${bed} ${name} ${params.min_seq}
+    """
+}
+
+
+//flatten list and adding name of file to channel value
+fasta_for_glam2 = fasta_for_glam2.flatten().map {it -> [it.simpleName, it]}
+//combine fasta files in one list
+fasta_for_motif_cluster = fasta_for_motif_cluster.toList()
+
+/*
+Running GLAM2 on FASTA-files.
+Generating Motifs through alignment and scoring best local matches.
+*/
+process glam2 {
+
+    tag{name}
+	publishDir "${params.out}/esimated_motifs/clustered_motifs/${name}/", mode: 'copy'
+
+    input:
+    set name, file (fasta) from fasta_for_glam2
+
+    output:
+    file("${name}/*.meme") into meme_to_merge
+	set name, file("${name}/*.meme") into meme_for_tomtom
+	set name, file("${name}/*.meme") into meme_for_filter
+	file ('*')
+
+    script:
+    """
+    glam2 n ${fasta} -O ./${name}/ -a ${params.motif_min_key} -b ${params.motif_max_key} -z 5 -n ${params.iteration}
+    """
+}
+
+/*
+Combining all MEME-files to one big MEME-file.
+The paths are sorted numerically depending on the cluster number.
+*/
+process merge_meme {
+
+	publishDir "${params.out}/esimated_motifs/merged_meme/", mode: 'copy'
+
+	input:
+	val (memelist) from meme_to_merge.toList()
+
+	output:
+	file ('merged_meme.meme') into merged_meme
+
+	when:
+	params.cluster_motif == 1
+
+	script:
+	memes = memelist.collect{it.toString().replaceAll(/\/glam2.meme/,"") }
+	meme_sorted = memes.sort(false) { it.toString().tokenize('_')[-1] as Integer }
+	meme_sorted_full = meme_sorted.collect {it.toString() + "/glam2.meme"}
+	meme_list = meme_sorted_full.toString().replaceAll(/\,|\[|\]/,"")
+	"""
+	meme2meme ${meme_list} > merged_meme.meme
+	"""
+}
+
+/*
+Running Tomtom on merged meme-files.
+Output table has the information which clusters are similar to each other.
+*/
+process find_similar_motifs {
+
+	publishDir "${params.out}/esimated_motifs/cluster_similarity/", mode: 'copy'
+	input:
+	file (merged_meme) from merged_meme
+
+	output:
+	file ('all_to_all.tsv') into motif_similarity
+
+	when:
+	params.cluster_motif == 1
+
+	script:
+	"""
+	tomtom ${merged_meme} ${merged_meme} -thresh ${params.motif_similarity_thresh} -text --norc | sed '/^#/ d' | sed '/^\$/d' > all_to_all.tsv
+	"""
+}
+
+
+files_for_merge_fasta = motif_similarity.combine(fasta_for_motif_cluster)
+
+
+/*
+Merging FASTA-files of similar clusters
+*/
+process merge_fasta {
+	conda "${params.path_env}"
+	publishDir "${params.out}/esimated_motifs/merged_fasta/", mode: 'copy'
+
+	input:
+	set file (motiv_sim), val (fasta_list) from files_for_merge_fasta
+
+	output:
+	file ('*.fasta') into motif_clustered_fasta_list
+  	file('*.png')
+
+  	when:
+  	params.cluster_motif == 1
+
+	script:
+  	fa_sorted = fasta_list.sort(false) { it.getBaseName().tokenize('_')[-1] as Integer }
+  	fastalist = fa_sorted.toString().replaceAll(/\s|\[|\]/,"")
+	"""
+	Rscript ${params.path_bin}/merge_similar_clusters.R ${motiv_sim} ${fastalist} ${params.edge_weight}
+	"""
+}
+
+
+motif_clustered_fasta_flat = motif_clustered_fasta_list.flatten()
+
+
+process clustered_glam2 {
+
+	publishDir "${params.out}/final_esimated_motifs/${name}/", mode: 'copy'
+
+	input:
+	file (fasta) from motif_clustered_fasta_flat
+
+	output:
+	set name, file ('*.meme') into clustered_meme_for_tomtom
+  	set name, file ('*.meme') into clustered_meme_for_filter
+  	file('*')
+
+	when:
+	params.cluster_motif == 1
+
+	script:
+  	name = fasta.getBaseName()
+	"""
+	glam2 n ${fasta} -O . -a ${params.motif_min_key} -b ${params.motif_max_key} -z 5 -n ${params.iteration}
+	"""
+}
+
+if(params.cluster_motif == 1){
+	for_tomtom = clustered_meme_for_tomtom
+	for_filter = clustered_meme_for_filter
+} else {
+	for_tomtom = meme_for_tomtom
+	for_filter = meme_for_filter
+}
+
+
+/*
+Running Tomtom on meme-files generated by GLAM2.
+Tomtom searches motifs in databases.
+*/
+process tomtom {
+
+    tag{name}
+	publishDir "${params.out}/esimated_motifs/tomtom/", mode: 'copy'
+
+    input:
+    set name, file (meme) from for_tomtom
+
+    output:
+    set name, file ('*.tsv') into tsv_for_filter
+
+    script:
+    """
+    tomtom ${meme} ${params.motif_db} -thresh ${params.tomtom_treshold} -mi 1 -text | sed '/^#/ d' | sed '/^\$/d' > ${name}_known_motif.tsv
+    """
+}
+
+
+//Joining channels with meme and tsv files. Filter joined channel on line count.
+//Only meme-files which corresponding tsv files have linecount <= 1 are writen to next channel.
+for_filter2 = for_filter.join( tsv_for_filter )
+for_filter2
+    .filter { name, meme, tsv ->
+        long count = tsv.readLines().size()
+        count <= 1
+        }
+    .into { meme_for_scan; check }
+
+
+//If channel 'check' is empty print errormessage
+process check_for_unknown_motifs {
+    echo true
+
+    input:
+    val x from check.ifEmpty('EMPTY')
+
+    when:
+    x == 'EMPTY'
+
+    """
+    echo '>>> STOPPED: No unknown Motifs were found.'
+    """
+
+}
+
+
+//Get the best(first) Motif from each MEME-file
+process get_best_motif {
+	conda "${params.path_env}"
+
+	publishDir "${params.out}/esimated_motifs/unknown_motifs/", mode: 'copy'
+
+	input:
+	set name, file(meme), file(tsv) from meme_for_scan
+
+	output:
+	set name, file('*_best.meme') into best_motif
+
+	script:
+	"""
+	python ${params.path_bin}/get_best_motif.py ${meme} ${name}_best.meme ${params.best_motif}
+	"""
+}

From 76b07f929fe19375166e7288c325a24a4dafa97f Mon Sep 17 00:00:00 2001
From: renewiegandt <rene.wiegandt@mpi-bn.mpg.de>
Date: Sat, 15 Dec 2018 23:09:35 +0100
Subject: [PATCH 06/11] Update pipeline.nf

---
 pipeline.nf | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/pipeline.nf b/pipeline.nf
index ad4b199..71709f9 100644
--- a/pipeline.nf
+++ b/pipeline.nf
@@ -9,6 +9,7 @@
 	params.tfbs_path=""
 	params.create_known_tfbs_path = "./"
 	params.help = 0
+	params.out = "./out/"
 
 //peak_calling
 	params.window_length = 200
@@ -57,9 +58,9 @@
 	params.organism="hg38"
 	params.tissue=""
 
-if (params.bigwig == "" || params.bed == "" || params.genome_fasta == "" || params.motif_db == "" || params.config == "" || "${params.help}" == "1"){
+if (params.bigwig == "" || params.bed == "" || params.genome_fasta == "" || params.motif_db == "" || params.config == "" || "${params.help}" != "0"){
 log.info """
-Usage: nextflow run pipeline.nf --bigwig [BigWig-file] --bed [BED-file] --genome_fasta [FASTA-file] --motif_db [MEME-file]
+Usage: nextflow run pipeline.nf --bigwig [BigWig-file] --bed [BED-file] --genome_fasta [FASTA-file] --motif_db [MEME-file] --config [UROPA-config-file]
 
 Required arguments:
 	--bigwig		 Path to BigWig-file
@@ -69,9 +70,12 @@ Required arguments:
 	--config		 Path to UROPA configuration file
 	--create_known_tfbs_path Path to directory where output from tfbsscan (known motifs) are stored.
 				 Path can be set as tfbs_path in next run. (Default: './')
+	--out			 Output Directory (Default: './out/')	
+	
 Optional arguments:
-
-	--tfbs_path Path to directory with output from tfbsscan. If given tfbsscan will not be run.
+	
+	--help [0|1]		1 to show this help message. (Default: 0)
+	--tfbs_path 		Path to directory with output from tfbsscan. If given tfbsscan will not be run.
 
 	Footprint extraction:
 	--window_length INT	This parameter sets the length of a sliding window. (Default: 200)

From e0ec81bee9d9358d30bda5107eabe65ce75653a5 Mon Sep 17 00:00:00 2001
From: renewiegandt <rene.wiegandt@mpi-bn.mpg.de>
Date: Sat, 15 Dec 2018 23:10:12 +0100
Subject: [PATCH 07/11] Update README.md

---
 README.md | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index a3c4947..1aa01e6 100644
--- a/README.md
+++ b/README.md
@@ -41,7 +41,7 @@ When the enviroment is created, set the variable 'path_env' in the configuration
 
 ## Quick Start
 ```console
-nextflow run pipeline.nf --bigwig [BigWig-file] --bed [BED-file] --genome_fasta [FASTA-file] --motif_db [MEME-file]
+nextflow run pipeline.nf --bigwig [BigWig-file] --bed [BED-file] --genome_fasta [FASTA-file] --motif_db [MEME-file] --config [UROPA-config-file]
 ```
 ## Parameters
 For a detailed overview for all parameters follow this [link](https://github.molgen.mpg.de/loosolab/masterJLU2018/wiki/Configuration).
@@ -54,26 +54,29 @@ Required arguments:
 	--config		 Path to UROPA configuration file
 	--create_known_tfbs_path Path to directory where output from tfbsscan (known motifs) are stored.
 				 Path can be set as tfbs_path in next run. (Default: './')
+	--out			 Output Directory (Default: './out/')	
+	
 Optional arguments:
-
-	--tfbs_path Path to directory with output from tfbsscan. If given tfbsscan will not be run.
 	
+	--help [0|1]		1 to show this help message. (Default: 0)
+	--tfbs_path 		Path to directory with output from tfbsscan. If given tfbsscan will not be run.
+
 	Footprint extraction:
 	--window_length INT	This parameter sets the length of a sliding window. (Default: 200)
 	--step INT		This parameter sets the number of positions to slide the window forward. (Default: 100)
 	--percentage INT	Threshold in percent (Default: 0)
-	
+
 	Filter unknown motifs:
 	--min_size_fp INT	Minimum sequence length threshold. Smaller sequences are discarded. (Default: 10)
 	--max_size_fp INT	Maximum sequence length threshold. Discards all sequences longer than this value. (Default: 100)
-	
+
 	Clustering:
 	Sequence preparation/ reduction:
 	--kmer INT		Kmer length (Default: 10)
 	--aprox_motif_len INT	Motif length (Default: 10)
 	--motif_occurence FLOAT	Percentage of motifs over all sequences. Use 1 (Default) to assume every sequence contains a motif.
 	--min_seq_length Interations	Remove all sequences below this value. (Default: 10)
-	
+
 	Clustering:
 	--global INT		Global (=1) or local (=0) alignment. (Default: 0)
 	--identity FLOAT	Identity threshold. (Default: 0.8)
@@ -81,7 +84,7 @@ Optional arguments:
 	--memory INT		Memory limit in MB. 0 for unlimited. (Default: 800)
 	--throw_away_seq INT	Remove all sequences equal or below this length before clustering. (Default: 9)
 	--strand INT		Align +/+ & +/- (= 1). Or align only +/+ (= 0). (Default: 0)
-	
+
 	Motif estimation:
 	--min_seq INT 		Sets the minimum number of sequences required for the FASTA-files given to GLAM2. (Default: 100)
 	--motif_min_key INT	Minimum number of key positions (aligned columns) in the alignment done by GLAM2. (Default: 8)
@@ -89,12 +92,12 @@ Optional arguments:
 	--iteration INT		Number of iterations done by glam2. More Iterations: better results, higher runtime. (Default: 10000)
 	--tomtom_treshold float	Threshold for similarity score. (Default: 0.01)
 	--best_motif INT	Get the best X motifs per cluster. (Default: 3)
-	
+
 	Moitf clustering:
 	--cluster_motif	Boolean	If 1 pipeline clusters motifs. If its 0 it does not. (Defaul: 0)
 	--edge_weight INT	Minimum weight of edges in motif-cluster-graph (Default: 5)
 	--motif_similarity_thresh FLOAT	Threshold for motif similarity score (Default: 0.00001)
-	
+
 	Creating GTF:
 	--organism [hg38 | hg19 | mm9 | mm10]	Input organism
 	--tissues List/String 	List of one or more keywords for tissue-/category-activity, categories must be specified as in JSON

From 50fb433424cc63f14539cca2d46fd37f1f7fee8e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= <rene.wiegandt@mpi-bn.mpg.de>
Date: Sat, 15 Dec 2018 17:14:31 -0500
Subject: [PATCH 08/11] documentation

---
 bin/bed_to_fasta.R      | 6 +++---
 bin/motif_estimation.nf | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/bin/bed_to_fasta.R b/bin/bed_to_fasta.R
index 6cefe43..84910f3 100644
--- a/bin/bed_to_fasta.R
+++ b/bin/bed_to_fasta.R
@@ -14,14 +14,14 @@ min_seq <- args[3]
 
 bed <- data.table::fread(bedInput, header = FALSE, sep = "\t")
 
-clusters <- split(bed, bed$V11, sorted = TRUE, flatten = FALSE) # <---- Spalte mit Cluster
+clusters <- split(bed, bed$V11, sorted = TRUE, flatten = FALSE) # <---- Cluster column
 discard <- lapply(1:length(clusters), function(i){
   clust <- as.data.frame(clusters[i])
   print(nrow(clust))
   if (nrow(clust) >= as.numeric(min_seq) ) {
-    sequences <- as.list(clust[[10]])   # <---- Splate mit Sequenz
+    sequences <- as.list(clust[[10]])   # <---- sequenze column
     outfile <- paste0(prefix,"_cluster_",i,".FASTA")
-    seqinr::write.fasta(sequences = sequences, names = clust[[4]], file.out = outfile, as.string = TRUE) # <---- Spalte mit Name
+    seqinr::write.fasta(sequences = sequences, names = clust[[4]], file.out = outfile, as.string = TRUE) # <---- Name column
   } else {
     print(paste0("Cluster: ",i," is to small"))
   }
diff --git a/bin/motif_estimation.nf b/bin/motif_estimation.nf
index f5b96e3..430040e 100644
--- a/bin/motif_estimation.nf
+++ b/bin/motif_estimation.nf
@@ -24,7 +24,7 @@ motif_similarity_thresh = 0.00001 // threshold for motif similarity score
 
 if (params.bed == "" || params.out == "" || params.motif_db == "" || params.path_env || "${params.help}" == "1") {
 log.info """
-Usage: nextflow run pipeline.nf --bed [PATH] --out [PATH] --motif_db [PATH] --path_env YAML-file
+Usage: nextflow run motif_estimation.nf --bed [PATH] --out [PATH] --motif_db [PATH] --path_env YAML-file
 --bed Path			Input BED-file (Column with cluster: 11; column with sequenze: 10)!!
 --out Path			Output Path
 --motif_db Path		Path to MEME-Database

From c998452f74999ee34d8295dd289b9e09c0b415af Mon Sep 17 00:00:00 2001
From: renewiegandt <rene.wiegandt@mpi-bn.mpg.de>
Date: Sat, 15 Dec 2018 23:44:59 +0100
Subject: [PATCH 09/11] Changes output path for create_gtf

---
 pipeline.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pipeline.nf b/pipeline.nf
index 71709f9..fe031af 100644
--- a/pipeline.nf
+++ b/pipeline.nf
@@ -575,7 +575,7 @@ process cluster_quality {
 process create_GTF {
 	conda "${path_env}"
 
-	publishDir 'Path', mode:'copy'
+	publishDir "${params.out}/gtf/", mode: 'copy'
 
 	output:
 	file ('*.gtf') into gtf_for_uropa

From a2f15b396f660934cf08a1b75bf8474d70389c32 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= <rene.wiegandt@mpi-bn.mpg.de>
Date: Sat, 15 Dec 2018 18:29:03 -0500
Subject: [PATCH 10/11] added complete info text

---
 pipeline.nf | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/pipeline.nf b/pipeline.nf
index fe031af..f5358a6 100644
--- a/pipeline.nf
+++ b/pipeline.nf
@@ -70,10 +70,10 @@ Required arguments:
 	--config		 Path to UROPA configuration file
 	--create_known_tfbs_path Path to directory where output from tfbsscan (known motifs) are stored.
 				 Path can be set as tfbs_path in next run. (Default: './')
-	--out			 Output Directory (Default: './out/')	
-	
+	--out			 Output Directory (Default: './out/')
+
 Optional arguments:
-	
+
 	--help [0|1]		1 to show this help message. (Default: 0)
 	--tfbs_path 		Path to directory with output from tfbsscan. If given tfbsscan will not be run.
 
@@ -634,3 +634,16 @@ process filter {
 	"""
 	"""
 } */
+
+workflow.onComplete {
+log.info"""
+Pipeline execution summary
+---------------------------
+Completed at: ${workflow.complete}
+Duration    : ${workflow.duration}
+Success     : ${workflow.success}
+workDir     : ${workflow.workDir}
+exit status : ${workflow.exitStatus}
+Error report: ${workflow.errorReport ?: '-'}
+"""
+}

From e330274ec3230ba22a912d347eebfd12c67d7846 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= <rene.wiegandt@mpi-bn.mpg.de>
Date: Sun, 16 Dec 2018 08:32:17 -0500
Subject: [PATCH 11/11] bug fix

---
 bin/merge_similar_clusters.R | 12 ++++++++----
 pipeline.nf                  | 10 +++++-----
 2 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/bin/merge_similar_clusters.R b/bin/merge_similar_clusters.R
index e72fa03..03b8cf1 100644
--- a/bin/merge_similar_clusters.R
+++ b/bin/merge_similar_clusters.R
@@ -1,7 +1,7 @@
 #!/usr/bin/env Rscript
 
 # Merging FASTA-files, which motifs are similar.
-# 
+#
 # @parameter tsv_in <string> Path to TSV file generated by Tomtom.
 #                            The input for Tomtom is a from all clusters merged meme-file.
 # @parameter file_list <string> Numerically sorted whitespace separated list of absolute fasta-file paths
@@ -10,7 +10,7 @@
 
 args = commandArgs(trailingOnly = TRUE)
 
-tsv_in <- args[1] 
+tsv_in <- args[1]
 file_list <- args[2]
 min_weight <- args[3]
 
@@ -38,7 +38,7 @@ edgelist <- sim_not_unique[query_cluster != target_cluster]
 g <- igraph::graph_from_edgelist(as.matrix(edgelist))
 # converting graph to adjacency matrix
 adj_matrix <- igraph::get.adjacency(g, names = T)
-# generating weighted graph from adjacency matrix 
+# generating weighted graph from adjacency matrix
 g_adj <- igraph::graph_from_adjacency_matrix(adj_matrix, weighted = T)
 
 # get subgraphs from graph with edges of weight > min_weight
@@ -47,7 +47,11 @@ png('motif_clusters.png')
 plot(s1)
 dev.off()
 clust <- igraph::clusters(s1)
-
+if (clust$no < 1){
+  b <- lapply(files, function(f){
+    system(paste("cat",f,">",basename(f)))
+  })
+}
 # merge FASTA-files depending on the clustered graphs
 a <- lapply(seq(from = 1, to = clust$no, by = 1), function(i){
   cl <- as.vector(which(clust$membership %in% c(i)))
diff --git a/pipeline.nf b/pipeline.nf
index f5358a6..a39616a 100644
--- a/pipeline.nf
+++ b/pipeline.nf
@@ -124,7 +124,7 @@ All arguments can be set in the configuration files
 System.exit(2)
 } else {
 	Channel.fromPath(params.bigwig).map {it -> [it.simpleName, it]}.set {bigwig_input}
-	Channel.fromPath(params.bed).set {bed_input}
+	Channel.fromPath(params.bed).into {bed_input; bed_for_tfbsscan}
 	Channel.fromPath(params.genome_fasta).into {fa_overlap; fa_scan; fa_overlap_2}
 	Channel.fromPath(params.motif_db).into {db_for_motivscan; db_for_tomtom}
 	Channel.fromPath(params.config).set {config}
@@ -205,6 +205,7 @@ process footprint_extraction {
 	"""
 }
 
+for_tfbs = fa_overlap.combine(db_for_motivscan).combine(bed_for_tfbsscan)
 
 /*
 
@@ -216,8 +217,7 @@ process extract_known_TFBS {
 	publishDir "${params.out}/known_TFBS/", mode: 'copy', pattern: '*.bed'
 
 	input:
-	file (fasta) from fa_overlap
-	file (db) from db_for_motivscan
+	set file (fasta), file (db), file (bed) from for_tfbs
 
 	output:
 	val ('done') into known_TFBS_for_overlap
@@ -227,7 +227,7 @@ process extract_known_TFBS {
 
 	script:
 	"""
-	python ${path_bin}/tfbsscan.py --use moods --core ${params.threads} -m ${db} -g ${fasta} -o ${params.create_known_tfbs_path}
+	python ${path_bin}/tfbsscan.py --use moods --core ${params.threads} -m ${db} -g ${fasta} -o ${params.create_known_tfbs_path} -b ${bed}
 	"""
 }
 
@@ -417,7 +417,7 @@ Merging FASTA-files of similar clusters
 process merge_fasta {
 	conda "${path_env}"
 	publishDir "${params.out}/esimated_motifs/merged_fasta/", mode: 'copy'
-
+	echo true
 	input:
 	set file (motiv_sim), val (fasta_list) from files_for_merge_fasta