From 603c814f4120e2580b760c698e71558d43107c8d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= <rene.wiegandt@mpi-bn.mpg.de>
Date: Thu, 8 Nov 2018 04:50:05 -0500
Subject: [PATCH 1/8] added uropa config

---
 uropa.config | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)
 create mode 100644 uropa.config

diff --git a/uropa.config b/uropa.config
new file mode 100644
index 0000000..3c4a189
--- /dev/null
+++ b/uropa.config
@@ -0,0 +1,16 @@
+{
+"queries":[
+    {"feature":"",
+    "feature.anchor": "",
+    "distance": [ , ],
+    "strand":"",
+    "direction":"",
+    "internals":"",
+    "filter.attribute":"",
+    "attribute.value":"",
+    "show.attributes":"" }
+          ],
+"priority": "",
+"gtf": "placeholder_gtf",
+"bed": "placeholder_bed"
+}
\ No newline at end of file

From 9f4e4809cdb59e43f02e0d1b7caf4f9ab2c8bed6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= <rene.wiegandt@mpi-bn.mpg.de>
Date: Thu, 8 Nov 2018 04:51:21 -0500
Subject: [PATCH 2/8] added process create_uropa_config

---
 pipeline.nf | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/pipeline.nf b/pipeline.nf
index 49adb09..0609bb1 100644
--- a/pipeline.nf
+++ b/pipeline.nf
@@ -3,6 +3,7 @@
 Channel.fromPath(params.input).map {it -> [it.simpleName, it]}.set {bigwig_input}
 Channel.fromPath(params.genome_fasta).into {fa_overlap; fa_scan}
 Channel.fromPath(params.jaspar_db).into {db_for_motivscan; db_for_tomtom}
+Channel.fromPath(params.config).set {config}
 
 process footprint_extraction {
 
@@ -212,15 +213,23 @@ process create_GTF {
 }
 
 
-process create_UROPA_config  {
+bed_for_final_filter.combine(gtf_for_uropa).set {uropa_in}
+
+
+process create_uropa_config {
+
+	publishDir '/mnt/agnerds/Rene.Wiegandt/10_Master/', mode: 'copy'
 
 	input:
+	set val(bed), val(gtf) from uropa_in.toList()
+	file (conf) from config
 
 	output:
 	file ('uropa.config') into uropa_config
 
 	script:
 	"""
+  	sed -- 's/placeholder_gtf/${gtf}/g; s/placeholder_bed/${bed}/g' ${conf} > uropa.config.final
 	"""
 }
 

From 87bbda0c4cd6ccd1d5507f54e480aa975f13813d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= <rene.wiegandt@mpi-bn.mpg.de>
Date: Thu, 8 Nov 2018 07:26:59 -0500
Subject: [PATCH 3/8] added scripts for part 1.2

---
 bin/compareBed.sh | 183 ++++++++++++++++++++++++++++++++++++++++++++++
 bin/maxScore.R    |   9 +++
 bin/merge.R       |  23 ++++++
 3 files changed, 215 insertions(+)
 create mode 100644 bin/compareBed.sh
 create mode 100644 bin/maxScore.R
 create mode 100644 bin/merge.R

diff --git a/bin/compareBed.sh b/bin/compareBed.sh
new file mode 100644
index 0000000..bcf52fb
--- /dev/null
+++ b/bin/compareBed.sh
@@ -0,0 +1,183 @@
+#!/bin/bash
+#data
+#motifs
+#workdir
+#fasta
+#min
+#max
+#output
+wrong=()
+da=false
+mo=false
+wo=false
+fa=false
+mi=false
+ma=false
+ou=false
+he=false
+
+if [ $# -eq 0 ]
+then
+	he=true
+fi
+
+while [[ $# -gt 0 ]]
+do
+key="$1"
+
+case $key in
+	-d|--data)
+	data="$2"
+	da=true
+	shift
+	shift
+	;;
+	-m|--motifs)
+	motifs="$2"
+	mo=true
+	shift
+	shift
+	;;
+	-w|--workdir)
+	workdir="$2"
+	wo=true
+	shift
+	shift
+	;;
+	-f|--fasta)
+	fasta="$2"
+	fa=true
+	shift
+	shift
+	;;
+	-min|--min)
+	min=$2
+	mi=true
+	shift
+	shift
+	;;
+	-max|--max)
+	max=$2
+	ma=true
+	shift
+	shift
+	;;
+	-o|--output)
+	output="$2"
+	ou=true
+	shift
+	shift
+	;;
+	-h|--help)
+	help=true
+	he=true
+	shift
+	;;
+	*)
+	wrong+=("$1")
+	shift
+	;;
+esac
+done
+
+count=${#wrong[@]}
+if [ $count -gt 0 ]
+then
+	for i in ${wrong[@]}
+	do
+		echo wrong parameter $i
+		echo call script without parameters for help or call --help
+		echo exit
+	done
+exit 1
+fi
+
+if [ $he == true ]
+then
+	echo "This script utilies bedtools to select new footprints from data."
+	echo "Therefore the data is compared with known footprint motifs."
+	echo "The output is a new .bed file with added sequence information."
+	echo "Required arguments are data and motifs, both in bed-format, and the fasta genome sequences."
+	echo "If a parameter is chosen, a value must be provided or an error will occur."
+	echo "--------------------"
+	echo "usage: compareBed.sh --data \<path/to/file.bed\> --motifs \<path/to/known/motifs.bed\> --fasta \<path/to/genome.fasta\> \[optional_parameter \<value\>\] ..."
+	echo "--------------------"
+	echo "              required parameter:"
+	echo "              -d   --data         the path to the .bed file of the footprints"
+	echo "              -m   --motifs       the path to the .bed file of the scanned motifs"
+	echo "              -f   --fasta        the path to the .fasta file of genome"
+	echo "              "
+	echo "              optional parameter:"
+	echo "              -w   --workdir      the path to directory where temporary files will be created"
+	echo "                                  default is a new directory that is created in the current directory"
+	echo "              -min --min          minimum size of footprints\; default is 10"
+	echo "              -max --max          maximum size of footprints\; default is 60"
+	echo "              -o   --output       output path/file \; default dir is workdir and filename is newFootprints.bed and newFootprints.bed.fasta"
+	echo "              -h --help           shows this help message"
+exit 0
+fi
+
+echo selected parameters
+echo -------------------
+echo data: $da \(required\)
+echo motifs: $mo \(required\)
+echo workdir: $wo
+echo fasta: $fa \(required\)
+echo min: $mi
+echo max: $ma
+echo output: $ou
+echo help: $he
+
+if [ $da == false ] || [ $mo == false ] || [ $fa == false ]
+then
+	echo required parameters not given.
+	echo required are: --data \<path/data.bed\> --motifs \<path/motifs.bed\> --fasta \<path/file.fasta\>
+	exit 1
+fi
+
+if [ $wo == false ]
+then
+	if [ ! -d workdir1337 ]
+	then
+		mkdir workdir1337
+	fi
+	wo=true
+	workdir="workdir1337"
+fi
+
+if [ $ou == false ]
+then
+	output="newFootprints.bed"
+	ou=true
+fi
+
+if [ $mi == false ]
+then
+	min=10
+	mi=true
+fi
+
+if [ $ma == false ]
+then
+	max=60
+	ma=true
+fi
+#1. first filter. no overlap vs. overlap
+bedtools intersect -v -a $data -b $motifs > "$workdir"/pass1Tr.bed
+bedtools intersect -wa -a $data -b $motifs > "$workdir"/pass1Fa.bed
+
+#2. compute absolut maxscore position
+Rscript --vanilla maxScore.R "$workdir"/pass1Fa.bed
+
+#3. subtract overlapping regions for additional motifs
+bedtools subtract -a "$workdir"/pass1Fa.bed -b $motifs > "$workdir"/pass2Tr.bed
+
+#4. remove short/long motivs, make unique ids (relevant for some splitted tfbs from subtract) and handle maxScorePosition 
+Rscript --vanilla merge.R $min $max "$workdir"
+
+#5. add fasta sequences to bed and create fasta file
+bedtools getfasta -fi $fasta -bed "$workdir"/merged.bed -bedOut > "$workdir"/"$output"
+bedtools getfasta -name -fi $fasta -bed "$workdir"/"$output" -fo "$workdir"/"$output".fasta
+
+#6 clean up
+rm "$workdir"/pass1Fa.bed "$workdir"/pass1Tr.bed "$workdir"/pass2Tr.bed "$workdir"/merged.bed
\ No newline at end of file
diff --git a/bin/maxScore.R b/bin/maxScore.R
new file mode 100644
index 0000000..4ea545b
--- /dev/null
+++ b/bin/maxScore.R
@@ -0,0 +1,9 @@
+#!/home/jhamp/.conda/envs/tfbs/bin/Rscript
+args = commandArgs(TRUE)
+file = args[1]
+
+tab = read.table(file, header=FALSE)
+colnames(tab) = c("chromosome", "start", "stop", "id", "score", "maxpos", "length")
+tab$maxpos = tab$start + tab$maxpos
+
+write.table(tab, file, row.names=FALSE, col.names=FALSE, quote=FALSE, sep='\t')
diff --git a/bin/merge.R b/bin/merge.R
new file mode 100644
index 0000000..bbe3c9a
--- /dev/null
+++ b/bin/merge.R
@@ -0,0 +1,23 @@
+#!/home/jhamp/.conda/envs/tfbs/bin/Rscript
+args=commandArgs(TRUE)
+min=as.numeric(args[1])
+max=as.numeric(args[2])
+folder=args[3]
+splitted = read.table(paste(folder, "/pass2Tr.bed", sep=''), header=FALSE)
+colnames(splitted) = c("chromosome", "start", "stop", "id", "score", "maxpos", "length")
+p1 = read.table(paste(folder, "/pass1Tr.bed", sep=''), header=FALSE)
+colnames(p1) = c("chromosome", "start", "stop", "id", "score", "maxpos", "length")
+p1$maxpos = p1$start + p1$maxpos
+
+splitted=rbind(splitted, p1)
+
+splitted=splitted[which(splitted$stop - splitted$start >= min),]
+splitted=splitted[which(splitted$stop - splitted$start <= max),]
+splitted$id=make.unique(as.character(splitted$id))
+splitted$length=splitted$stop - splitted$start
+
+splitted=cbind(splitted, containsMaxpos=0)
+splitted$containsMaxpos[intersect(which(splitted$start <= splitted$maxpos), which(splitted$stop > splitted$maxpos))] = 1
+splitted$maxpos = splitted$maxpos - splitted$start
+write.table(splitted, paste(folder, "/merged.bed", sep=''), row.names=FALSE, col.names=FALSE, quote=FALSE, sep='\t')
+

From da7736423dc4cf5effc7797ebcd742ebc5f7b32d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= <rene.wiegandt@mpi-bn.mpg.de>
Date: Thu, 8 Nov 2018 07:28:05 -0500
Subject: [PATCH 4/8] Update: process overlap_with_known_TFBS

---
 pipeline.nf | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/pipeline.nf b/pipeline.nf
index 0609bb1..83e4736 100644
--- a/pipeline.nf
+++ b/pipeline.nf
@@ -1,7 +1,7 @@
 //!/usr/bin/env nextflow
 
 Channel.fromPath(params.input).map {it -> [it.simpleName, it]}.set {bigwig_input}
-Channel.fromPath(params.genome_fasta).into {fa_overlap; fa_scan}
+Channel.fromPath(params.genome_fasta).into {fa_overlap; fa_scan; fa_overlap_2}
 Channel.fromPath(params.jaspar_db).into {db_for_motivscan; db_for_tomtom}
 Channel.fromPath(params.config).set {config}
 
@@ -13,14 +13,14 @@ process footprint_extraction {
 	set name, file (bigWig) from bigwig_input
 
 	output:
-	set name, file ('*.bed') into bed_for_overlapp_with_TFBS
+	set name, file ('*.bed') into bed_for_overlap_with_TFBS
 
 	script:
 	"""
 	"""
 }
 
-
+//Abfrage ob ausgeführt werden muss.
 process extract_known_TFBS {
 
 	input:
@@ -36,16 +36,20 @@ process extract_known_TFBS {
 }
 
 
+bed_for_overlap_with_TFBS.combine(known_TFBS_for_overlap).combine(fa_overlap_2).set {for_overlap}
+
+
 process overlap_with_known_TFBS {
 
 	input:
-	file (TFBS) from known_TFBS_for_overlap
+	set file (bed_footprints), file (bed_motifs), file (fasta) from for_overlap
 
 	output:
 	file ('*.FASTA') into FASTA_for_clustering
 
 	script:
 	"""
+	${path_bin}/compareBed.sh --data ${bed_footprints} --motifs ${bed_motifs} --fasta ${fasta} -o ${name_placeholder} -min ${params.min_size_fp} -max ${params.max_size_fp}
 	"""
 }
 

From 1a6ce69573c213fad5655810a12eed2b6d43940a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= <rene.wiegandt@mpi-bn.mpg.de>
Date: Mon, 19 Nov 2018 04:43:48 -0500
Subject: [PATCH 5/8] added script for process 'footprint_extraction'

---
 pipeline.nf | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/pipeline.nf b/pipeline.nf
index 83e4736..dd67c85 100644
--- a/pipeline.nf
+++ b/pipeline.nf
@@ -58,7 +58,7 @@ process overlap_with_known_TFBS {
 process clustering {
 
 	input:
-	file (faste) from FASTA_for_clustering
+	file (fasta) from FASTA_for_clustering
 
 	output:
 	set name, file ('*.bed') into bed_for_motif_esitmation
@@ -132,7 +132,7 @@ process tomtom {
 }
 
 
-//Joining channels with meme and tsv files. Filter joined channel on linecount.
+//Joining channels with meme and tsv files. Filter joined channel on line count.
 //Only meme-files which corresponding tsv files have linecount <= 1 are writen to next channel.
 for_filter = meme_for_filter.join( tsv_for_filter )
 for_filter
@@ -143,7 +143,7 @@ for_filter
     .into { meme_for_scan; check }
 
 
-//If unknown channel is empty print errormessage
+//If channel 'check' is empty print errormessage
 process check_for_unknown_motifs {
     echo true
 
@@ -160,6 +160,7 @@ process check_for_unknown_motifs {
 }
 
 
+//Get the best(first) Motif from each MEME-file
 process get_best_motif {
 
 	input:
@@ -220,6 +221,8 @@ process create_GTF {
 bed_for_final_filter.combine(gtf_for_uropa).set {uropa_in}
 
 
+// Create configuration file for UROPA.
+// Takes template and replaces bed- and gtf-placeholders with actual paths.
 process create_uropa_config {
 
 	publishDir '/mnt/agnerds/Rene.Wiegandt/10_Master/', mode: 'copy'

From 80ffd2a7ae43dce8877863fb0decbdd128d26ec7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= <rene.wiegandt@mpi-bn.mpg.de>
Date: Mon, 19 Nov 2018 06:35:08 -0500
Subject: [PATCH 6/8] added script for process 'footprint_extraction'

---
 pipeline.nf | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pipeline.nf b/pipeline.nf
index dd67c85..45db184 100644
--- a/pipeline.nf
+++ b/pipeline.nf
@@ -17,6 +17,7 @@ process footprint_extraction {
 
 	script:
 	"""
+	python ${path_bin}/call_peaks.py --bigwig ${bigWig} --bed ${bed} --output_file ${name}_called_peaks.bed --window_length ${params.window_length} --step ${params.step} --threshold ${params.threshold_footprint_extraction}
 	"""
 }
 

From a5e902fa670b55a3ed08075adc827e085308b80b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= <rene.wiegandt@mpi-bn.mpg.de>
Date: Thu, 29 Nov 2018 07:53:04 -0500
Subject: [PATCH 7/8] added filter for small clusters

---
 bin/bed_to_fasta.R | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/bin/bed_to_fasta.R b/bin/bed_to_fasta.R
index 6767ab5..09e3f4f 100644
--- a/bin/bed_to_fasta.R
+++ b/bin/bed_to_fasta.R
@@ -4,18 +4,25 @@
 # The Sequences of each cluster are writen as an FASTA-file.
 # @parameter bedInput <string> BED-file with sequences and cluster-id as column"TEs
 # @parameter prefix <string> prefix for filenames
+# @parameter min_seq <INT> min. number of sequences per cluster
 
 args = commandArgs(trailingOnly = TRUE)
 
 bedInput <- args[1]
-prefix <- args[2] # "Fasta" 
+prefix <- args[2] 
+min_seq <- args[3] 
 
 bed <- data.table::fread(bedInput, header = FALSE, sep = "\t")
 
-clusters <- split(bed, bed$V3, sorted = TRUE, flatten = FALSE) # <---- Spalte mit Cluster
+clusters <- split(bed, bed$V8, sorted = TRUE, flatten = FALSE) # <---- Spalte mit Cluster
 discard <- lapply(1:length(clusters), function(i){
-  sequences <- as.list(as.data.frame(clusters[i])[[2]])   # <---- Splate mit Sequenz
-  outfile <- paste0(prefix,"_cluster_",i,".FASTA")
-  seqinr::write.fasta(sequences = sequences, names = as.data.frame(clusters[i])[[1]]
-                      , file.out = outfile, as.string = TRUE) # <---- Spalte mit Name
+  clust <- as.data.frame(clusters[i])
+  print(nrow(clust))
+  if (nrow(clust) >= as.numeric(min_seq) ) {
+    sequences <- as.list(clust[[7]])   # <---- Splate mit Sequenz
+    outfile <- paste0(prefix,"_cluster_",i,".FASTA")
+    seqinr::write.fasta(sequences = sequences, names = clust[[4]], file.out = outfile, as.string = TRUE) # <---- Spalte mit Name
+  } else {
+    print(paste0("Cluster: ",i," is to small"))
+  }
 })

From 420cb1f07ece20199ccaaa6bd157074394b7831e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= <rene.wiegandt@mpi-bn.mpg.de>
Date: Thu, 29 Nov 2018 07:53:52 -0500
Subject: [PATCH 8/8] changed tomtom/glam2 parameters; added min_seq parameter

---
 pipeline.nf | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/pipeline.nf b/pipeline.nf
index 45db184..e4d502f 100644
--- a/pipeline.nf
+++ b/pipeline.nf
@@ -5,6 +5,8 @@ Channel.fromPath(params.genome_fasta).into {fa_overlap; fa_scan; fa_overlap_2}
 Channel.fromPath(params.jaspar_db).into {db_for_motivscan; db_for_tomtom}
 Channel.fromPath(params.config).set {config}
 
+params.min_seq = 10
+
 process footprint_extraction {
 
 	tag{name}
@@ -84,7 +86,7 @@ process bed_to_clustered_fasta {
 
     script:
     """
-    Rscript ${path_bin}/bed_to_fasta.R ${bed} ${name}
+    Rscript ${path_bin}/bed_to_fasta.R ${bed} ${name} ${params.min_seq}
     """
 }
 
@@ -107,7 +109,7 @@ process glam2 {
 
     script:
     """
-    glam2 n ${fasta} -O .
+    glam2 n ${fasta} -O . -a 10 -b 20 -z 5
     """
 }
 
@@ -128,7 +130,7 @@ process tomtom {
 
     script:
     """
-    tomtom ${meme} ${jaspar_db} -thresh 0.01 -mi 1 -text | sed '/^#/ d' | sed '/^\$/d' > ${name}_known_motif.tsv
+    tomtom ${meme} ${jaspar_db} -thresh 0.01 -text --norc | sed '/^#/ d' | sed '/^\$/d' > ${name}_known_motif.tsv
     """
 }