From 694291cba9a542cea84365ec79c23c6970a2a2cd Mon Sep 17 00:00:00 2001
From: JannikHamp <Jannik.Hamp@mpi-bn.mpg.de>
Date: Fri, 4 Jan 2019 12:20:06 +0100
Subject: [PATCH 01/21] updated script. Now the path of subscripts is
 automatically aquired. Fixed error which wrote output to the path of
 subscripts instead of the working directory

---
 bin/1.2_filter_motifs/compareBed.sh | 20 +++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)

diff --git a/bin/1.2_filter_motifs/compareBed.sh b/bin/1.2_filter_motifs/compareBed.sh
index aa2c37e..938be63 100644
--- a/bin/1.2_filter_motifs/compareBed.sh
+++ b/bin/1.2_filter_motifs/compareBed.sh
@@ -14,6 +14,7 @@ fa=false
 mi=false
 ma=false
 ou=false
+pa=false
 he=false
 
 if [ $# -eq 0 ]
@@ -70,6 +71,7 @@ case $key in
 	;;
 	-p|--path)
 	path="$2"
+	pa=true
 	shift
 	shift
 	;;
@@ -120,6 +122,7 @@ then
 	echo "              -max --max          maximum size of footprints; default is 80"
 	echo "              -o   --output       output path/file ; default dir is workdir and filename is newMotifs.bed and newMotifs.bed.fasta"
 	echo "              -h --help           shows this help message"
+	echo "              -p --path           the path where the required scripts merge.R and maxScore.R are stored. Default: same path as this scripts path"
 exit 0
 fi
 
@@ -133,6 +136,7 @@ echo min: $mi
 echo max: $ma
 echo output: $ou
 echo help: $he
+echo path of scripts: $pa
 
 if [ $da == false ] || [ $mo == false ] || [ $fa == false ]
 then
@@ -143,12 +147,8 @@ fi
 
 if [ $wo == false ]
 then
-	#if [ ! -d workdir1337 ]
-	#then
-	#	mkdir workdir1337
-	#fi
 	wo=true
-	workdir=$path
+	workdir=$PWD
 fi
 
 if [ $ou == false ]
@@ -169,9 +169,10 @@ then
 	ma=true
 fi
 
-if [ ! -d $workdir ]
+if [ $pa == false ]
 then
-	mkdir $workdir
+	path=`echo $0 | sed 's/\/[^\/]*$/\//g'`
+	pa=true
 fi
 
 #1. first filter. no overlap vs. overlap
@@ -268,8 +269,5 @@ fi
 Rscript --vanilla $path/merge.R $min $max $workdir $data
 
 #5. add fasta sequences to bed and create fasta file
-bedtools getfasta -fi $fasta -bed "$workdir"/merged.bed -bedOut > $output
+bedtools getfasta -fi $fasta -bed $workdir/merged.bed -bedOut > $output
 bedtools getfasta -name -fi $fasta -bed "$output" -fo "$output".fasta
-
-#6 clean up
-#rm "$workdir"/pass1Fa.bed "$workdir"/pass1Tr.bed "$workdir"/pass2Tr.bed "$workdir"/merged.bed "$workdir"/pass1FaHelp.bed "$workdir"/pass1TrHelp.bed

From 3ebb1b8c6cd96abaffa7dc378c885ca427fcd6ef Mon Sep 17 00:00:00 2001
From: JannikHamp <Jannik.Hamp@mpi-bn.mpg.de>
Date: Fri, 4 Jan 2019 16:28:59 +0100
Subject: [PATCH 02/21] added some documentation

---
 bin/1.2_filter_motifs/compareBed.sh | 33 ++++++++++++++++++++++-------
 1 file changed, 25 insertions(+), 8 deletions(-)

diff --git a/bin/1.2_filter_motifs/compareBed.sh b/bin/1.2_filter_motifs/compareBed.sh
index 938be63..5e8e6d9 100644
--- a/bin/1.2_filter_motifs/compareBed.sh
+++ b/bin/1.2_filter_motifs/compareBed.sh
@@ -1,11 +1,11 @@
 #!/bin/bash
-#data
-#motifs
-#workdir
-#fasta
-#min
-#max
-#output
+
+# This script utilizes bedtools to gain non-overlapping sequence parts between bed-files
+# merge.R and maxScore.R are needed to be saved in the same directory than this to make it work
+# For more information read the wiki or run ./compareBed.sh without parameters
+# author Jannik Hamp
+# jannik.hamp@googlemail.com
+
 wrong=()
 da=false
 mo=false
@@ -17,11 +17,13 @@ ou=false
 pa=false
 he=false
 
+#no parameters chosen means help is going to be displayed
 if [ $# -eq 0 ]
 then
 	he=true
 fi
 
+#parsing the parameters, if a parameter is chosen, a value must be provided aswell. No standalone parameters
 while [[ $# -gt 0 ]]
 do
 key="$1"
@@ -87,6 +89,7 @@ case $key in
 esac
 done
 
+# stores unknown selected parameters for error report 
 count=${#wrong[@]}
 if [ $count -gt 0 ]
 then
@@ -99,6 +102,7 @@ then
 exit 1
 fi
 
+# the help message
 if [ $he == true ]
 then
 	echo "This script utilies bedtools to select new footprints from data."
@@ -126,6 +130,7 @@ then
 exit 0
 fi
 
+# prints which parameters have been selected
 echo selected parameters
 echo -------------------
 echo data: $da \(required\)
@@ -138,6 +143,7 @@ echo output: $ou
 echo help: $he
 echo path of scripts: $pa
 
+#checks if the 3 required parameters have been selected
 if [ $da == false ] || [ $mo == false ] || [ $fa == false ]
 then
 	echo required parameters not given.
@@ -145,12 +151,14 @@ then
 	exit 1
 fi
 
+#workdir is set to the current directory if it was not specified with a parameter
 if [ $wo == false ]
 then
 	wo=true
 	workdir=$PWD
 fi
 
+#if output was not specified it will be at "$workdir"/newMotifs.bed
 if [ $ou == false ]
 then
 	output=${workdir}/"newMotifs.bed"
@@ -165,10 +173,12 @@ fi
 
 if [ $ma == false ]
 then
-	max=80
+	max=200
 	ma=true
 fi
 
+#default path of the scripts merge.R and maxScore.R is the same path as compareBed.sh has
+# the `echo $0 | sed 's/\/[^\/]*$/\//g'` command extracts the path from $0, where the command itself is stored 
 if [ $pa == false ]
 then
 	path=`echo $0 | sed 's/\/[^\/]*$/\//g'`
@@ -176,8 +186,12 @@ then
 fi
 
 #1. first filter. no overlap vs. overlap
+# This is done with the data of new footprints and each motif file subsequently.
+# The output of one iteration is the input data for the new footprints of the next iteration, in which a new motif is checked.
 echo get sequences with no overlap
 cat $data > "$workdir"/pass1Tr.bed
+# help variable is needed, because in bash I cannot write to the same file from which I am also reading the data.
+# Thus, there are 2 files which are alternating in each iteration
 help=true
 
 if [ -d "$motifs" ]
@@ -194,6 +208,7 @@ do
 	fi
 	echo $i
 done
+# if the -m parameter is not a directory, it is expected to be a comma separated list of files
 else
 declare -a motiffiles=(`echo $motifs | sed 's/,/ /g'`)
 for i in ${motiffiles[@]}
@@ -210,6 +225,7 @@ do
 done
 fi
 
+# After the final iteration, the last output is written to pass1Tr.bed if it is not already there
 if [ $help == false ]
 then
 	cat "$workdir"/pass1TrHelp.bed > "$workdir"/pass1Tr.bed
@@ -256,6 +272,7 @@ do
 done
 fi
 
+#The output of the last iteration of the subtract loop is written to pass2Tr.bed 
 if [ $help == false ]
 then
 	cat "$workdir"/pass1FaHelp.bed > "$workdir"/pass2Tr.bed

From 7d834a4a37be5ba4054a1e13333cfcea8d437056 Mon Sep 17 00:00:00 2001
From: JannikHamp <Jannik.Hamp@mpi-bn.mpg.de>
Date: Fri, 4 Jan 2019 16:32:19 +0100
Subject: [PATCH 03/21] added collumn for strand information

---
 bin/1.2_filter_motifs/maxScore.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bin/1.2_filter_motifs/maxScore.R b/bin/1.2_filter_motifs/maxScore.R
index 6b90984..fae6af7 100644
--- a/bin/1.2_filter_motifs/maxScore.R
+++ b/bin/1.2_filter_motifs/maxScore.R
@@ -4,7 +4,7 @@ args = commandArgs(TRUE)
 file = args[1]
 
 tab = fread(file)
-colnames(tab) = c("chromosome", "start", "stop", "id", "score", "length", "maxpos", "info")
+colnames(tab) = c("chromosome", "start", "stop", "id", "score", "strand", "length", "maxpos", "info")
 
 tab$maxpos = tab$start + tab$maxpos
 

From 7de9a1b3e9e9e937310b86afaff35f06ebf6c8ac Mon Sep 17 00:00:00 2001
From: JannikHamp <Jannik.Hamp@mpi-bn.mpg.de>
Date: Fri, 4 Jan 2019 16:34:07 +0100
Subject: [PATCH 04/21] added column "strand"

---
 bin/1.2_filter_motifs/merge.R | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/bin/1.2_filter_motifs/merge.R b/bin/1.2_filter_motifs/merge.R
index 129e41d..c4a0d4d 100644
--- a/bin/1.2_filter_motifs/merge.R
+++ b/bin/1.2_filter_motifs/merge.R
@@ -6,9 +6,9 @@ max=as.numeric(args[2])
 folder=args[3]
 
 splitted = fread(paste(folder, "/pass2Tr.bed", sep=''))
-colnames(splitted) = c("chromosome", "start", "stop", "id", "score", "length", "maxpos", "info")
+colnames(splitted) = c("chromosome", "start", "stop", "id", "score", "strand", "length", "maxpos", "info")
 p1 = fread(paste(folder, "/pass1Tr.bed", sep=''))
-colnames(p1) = c("chromosome", "start", "stop", "id", "score", "length", "maxpos", "info")
+colnames(p1) = c("chromosome", "start", "stop", "id", "score", "strand", "length", "maxpos", "info")
 
 p1$maxpos = p1$start + p1$maxpos
 
@@ -33,4 +33,4 @@ loss = formatC(1 - suma/sumb, digits=2)
 lengthb = formatC(mean(before$V3-before$V2), digits=4)
 lengtha = formatC(mean(splitted$length), digits=4)
 stats=data.frame(sum_nt_input=sumb, sum_nt_filtered=suma, factor=difference, loss=loss, mean_length_input=lengthb, mean_length_filtered=lengtha, flag_1_ratio=length(which(splitted$containsMaxpos == 1))/dim(splitted)[1])
-write.table(stats, "../FilterMotifs.stats", row.names=FALSE, quote=FALSE, sep='\t')
+write.table(stats, "./FilterMotifs.stats", row.names=FALSE, quote=FALSE, sep='\t')

From 1a498214752125b5701beea593d0b4c0801c7416 Mon Sep 17 00:00:00 2001
From: JannikHamp <Jannik.Hamp@mpi-bn.mpg.de>
Date: Fri, 4 Jan 2019 16:35:55 +0100
Subject: [PATCH 05/21] documentation

---
 bin/1.2_filter_motifs/compareBed.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bin/1.2_filter_motifs/compareBed.sh b/bin/1.2_filter_motifs/compareBed.sh
index 5e8e6d9..d0a1baf 100644
--- a/bin/1.2_filter_motifs/compareBed.sh
+++ b/bin/1.2_filter_motifs/compareBed.sh
@@ -123,7 +123,7 @@ then
 	echo "              -w   --workdir      the path to directory where temporary files will be created"
 	echo "                                  default is the current directory"
 	echo "              -min --min          minimum size of footprints; default is 10"
-	echo "              -max --max          maximum size of footprints; default is 80"
+	echo "              -max --max          maximum size of footprints; default is 200"
 	echo "              -o   --output       output path/file ; default dir is workdir and filename is newMotifs.bed and newMotifs.bed.fasta"
 	echo "              -h --help           shows this help message"
 	echo "              -p --path           the path where the required scripts merge.R and maxScore.R are stored. Default: same path as this scripts path"

From 9b5b06135e24f46ff62c5889e7566651e38fb061 Mon Sep 17 00:00:00 2001
From: JannikHamp <Jannik.Hamp@mpi-bn.mpg.de>
Date: Fri, 4 Jan 2019 16:51:17 +0100
Subject: [PATCH 06/21] documentation

---
 bin/1.2_filter_motifs/merge.R | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/bin/1.2_filter_motifs/merge.R b/bin/1.2_filter_motifs/merge.R
index c4a0d4d..750cf93 100644
--- a/bin/1.2_filter_motifs/merge.R
+++ b/bin/1.2_filter_motifs/merge.R
@@ -1,29 +1,49 @@
 #!/bin/Rscript
+
+# This script merges the non-overlapping regions and the non-overlapping parts of overlapping regions
+# It also removes sequences that are smaller than the --min paramter or bigger than the --max parameter
+# Additionally, information of the comparison is written into an output file
+# author Janni Hamp
+# email jannik.hamp@googlemail.com
+
 library(data.table)
 args=commandArgs(TRUE)
 min=as.numeric(args[1])
 max=as.numeric(args[2])
 folder=args[3]
 
+# reading the first dataframe: called splitted (contains splitted regions because of partial overlap)
 splitted = fread(paste(folder, "/pass2Tr.bed", sep=''))
 colnames(splitted) = c("chromosome", "start", "stop", "id", "score", "strand", "length", "maxpos", "info")
+
+# reading the second dataframe: called p1 (all sequences with zero overlap)
 p1 = fread(paste(folder, "/pass1Tr.bed", sep=''))
 colnames(p1) = c("chromosome", "start", "stop", "id", "score", "strand", "length", "maxpos", "info")
 
+# calculates the absolute position of maximum signal of each footprint
 p1$maxpos = p1$start + p1$maxpos
 
+# the new combined dataframe is now called splitted
 splitted=rbind(splitted, p1)
 
+# only keep entries with a larger sequence than min and a smaller sequence than max
 splitted=splitted[which(splitted$stop - splitted$start >= min),]
 splitted=splitted[which(splitted$stop - splitted$start <= max),]
+
+# make the ids unique (because of duplicated ids of some footprints that got spliited in 2)
 splitted$id=make.unique(as.character(splitted$id))
+# calculate new length values (because of the splitted footprints)
 splitted$length=splitted$stop - splitted$start
 
+# add column containsMaxpos (0 means that maxpos has any overlap and 1 means that maxpos has no overlap with any motif) 
 splitted=cbind(splitted, containsMaxpos=0)
 splitted$containsMaxpos[intersect(which(splitted$start <= splitted$maxpos), which(splitted$stop > splitted$maxpos))] = 1
+
+#calculate relative maxpos values
 splitted$maxpos = splitted$maxpos - splitted$start
 data.table::fwrite(splitted, paste(folder, "/merged.bed", sep=''), row.names=FALSE, col.names=FALSE, quote=FALSE, sep='\t')
 
+#aditional information of the comparison of the unknown footprints and the known motifs are computed and written to FilterMotifs.stats
 before = fread(args[4], header=FALSE)
 
 sumb=sum(before$V3-before$V2)

From 9ecba8ac1de0e83a556f723dd8277ad49453b3a2 Mon Sep 17 00:00:00 2001
From: JannikHamp <Jannik.Hamp@mpi-bn.mpg.de>
Date: Fri, 4 Jan 2019 17:08:38 +0100
Subject: [PATCH 07/21] documentation

---
 bin/1.2_filter_motifs/maxScore.R | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/bin/1.2_filter_motifs/maxScore.R b/bin/1.2_filter_motifs/maxScore.R
index fae6af7..e93429a 100644
--- a/bin/1.2_filter_motifs/maxScore.R
+++ b/bin/1.2_filter_motifs/maxScore.R
@@ -1,4 +1,10 @@
 #!/bin/Rscript
+
+# The script is used with the script: compareBed.sh
+# This calculates the absolute maxpos values of a bed-file and overwrites them
+# author: Jannik Hamp
+# email: jannik.hamp@googlemail.com
+
 library(data.table)
 args = commandArgs(TRUE)
 file = args[1]

From 58eff9871baa56c7765731870e09b247f5002151 Mon Sep 17 00:00:00 2001
From: JannikHamp <Jannik.Hamp@mpi-bn.mpg.de>
Date: Fri, 4 Jan 2019 17:09:40 +0100
Subject: [PATCH 08/21] documentation

---
 bin/1.2_filter_motifs/merge.R | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/bin/1.2_filter_motifs/merge.R b/bin/1.2_filter_motifs/merge.R
index 750cf93..507a746 100644
--- a/bin/1.2_filter_motifs/merge.R
+++ b/bin/1.2_filter_motifs/merge.R
@@ -1,9 +1,10 @@
 #!/bin/Rscript
 
+# The script is used in the script: compareBed.sh
 # This script merges the non-overlapping regions and the non-overlapping parts of overlapping regions
 # It also removes sequences that are smaller than the --min paramter or bigger than the --max parameter
 # Additionally, information of the comparison is written into an output file
-# author Janni Hamp
+# author Jannik Hamp
 # email jannik.hamp@googlemail.com
 
 library(data.table)

From b1cab105b50e7537cc28187ffd3352c13ce6f1c3 Mon Sep 17 00:00:00 2001
From: JannikHamp <Jannik.Hamp@mpi-bn.mpg.de>
Date: Sat, 5 Jan 2019 19:03:55 +0100
Subject: [PATCH 09/21] Update compareBed.sh

---
 bin/1.2_filter_motifs/compareBed.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bin/1.2_filter_motifs/compareBed.sh b/bin/1.2_filter_motifs/compareBed.sh
index d0a1baf..e1ac62f 100644
--- a/bin/1.2_filter_motifs/compareBed.sh
+++ b/bin/1.2_filter_motifs/compareBed.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 
 # This script utilizes bedtools to gain non-overlapping sequence parts between bed-files
-# merge.R and maxScore.R are needed to be saved in the same directory than this to make it work
+# merge.R and maxScore.R are needed to be saved in the same directory as this to make it work
 # For more information read the wiki or run ./compareBed.sh without parameters
 # author Jannik Hamp
 # jannik.hamp@googlemail.com

From cca8ac474cb4980eb40d55c2d457cf0e260e6083 Mon Sep 17 00:00:00 2001
From: renewiegandt <rene.wiegandt@mpi-bn.mpg.de>
Date: Sun, 6 Jan 2019 15:34:38 +0100
Subject: [PATCH 10/21] Added check for \t at the end of lines in BED-file

---
 bin/1.2_filter_motifs/compareBed.sh | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/bin/1.2_filter_motifs/compareBed.sh b/bin/1.2_filter_motifs/compareBed.sh
index e1ac62f..99423f7 100644
--- a/bin/1.2_filter_motifs/compareBed.sh
+++ b/bin/1.2_filter_motifs/compareBed.sh
@@ -89,7 +89,7 @@ case $key in
 esac
 done
 
-# stores unknown selected parameters for error report 
+# stores unknown selected parameters for error report
 count=${#wrong[@]}
 if [ $count -gt 0 ]
 then
@@ -178,7 +178,7 @@ then
 fi
 
 #default path of the scripts merge.R and maxScore.R is the same path as compareBed.sh has
-# the `echo $0 | sed 's/\/[^\/]*$/\//g'` command extracts the path from $0, where the command itself is stored 
+# the `echo $0 | sed 's/\/[^\/]*$/\//g'` command extracts the path from $0, where the command itself is stored
 if [ $pa == false ]
 then
 	path=`echo $0 | sed 's/\/[^\/]*$/\//g'`
@@ -198,6 +198,7 @@ if [ -d "$motifs" ]
 then
 for i in "$motifs"/*.bed
 do
+	sed -i 's/[ \t]*$//' $i
 	if [ $help == true ]
 	then
 		help=false
@@ -272,7 +273,7 @@ do
 done
 fi
 
-#The output of the last iteration of the subtract loop is written to pass2Tr.bed 
+#The output of the last iteration of the subtract loop is written to pass2Tr.bed
 if [ $help == false ]
 then
 	cat "$workdir"/pass1FaHelp.bed > "$workdir"/pass2Tr.bed

From f983d2222bf681bc10ca4e5d6487e3312b42965d Mon Sep 17 00:00:00 2001
From: renewiegandt <rene.wiegandt@mpi-bn.mpg.de>
Date: Sun, 6 Jan 2019 15:35:17 +0100
Subject: [PATCH 11/21] Remove header = false from fread

---
 bin/1.2_filter_motifs/merge.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bin/1.2_filter_motifs/merge.R b/bin/1.2_filter_motifs/merge.R
index 507a746..9d97da0 100644
--- a/bin/1.2_filter_motifs/merge.R
+++ b/bin/1.2_filter_motifs/merge.R
@@ -45,7 +45,7 @@ splitted$maxpos = splitted$maxpos - splitted$start
 data.table::fwrite(splitted, paste(folder, "/merged.bed", sep=''), row.names=FALSE, col.names=FALSE, quote=FALSE, sep='\t')
 
 #aditional information of the comparison of the unknown footprints and the known motifs are computed and written to FilterMotifs.stats
-before = fread(args[4], header=FALSE)
+before = fread(args[4])
 
 sumb=sum(before$V3-before$V2)
 suma=sum(splitted$length)

From 1134d7485c0814ca73e0ecf002edb2247d60d213 Mon Sep 17 00:00:00 2001
From: renewiegandt <rene.wiegandt@mpi-bn.mpg.de>
Date: Sun, 6 Jan 2019 21:49:02 +0100
Subject: [PATCH 12/21] merge.R: Set separator from auto to '\t' in fread

---
 bin/1.2_filter_motifs/merge.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/bin/1.2_filter_motifs/merge.R b/bin/1.2_filter_motifs/merge.R
index 9d97da0..bd009cc 100644
--- a/bin/1.2_filter_motifs/merge.R
+++ b/bin/1.2_filter_motifs/merge.R
@@ -14,11 +14,11 @@ max=as.numeric(args[2])
 folder=args[3]
 
 # reading the first dataframe: called splitted (contains splitted regions because of partial overlap)
-splitted = fread(paste(folder, "/pass2Tr.bed", sep=''))
+splitted = fread(paste(folder, "/pass2Tr.bed", sep=''), sep = '\t')
 colnames(splitted) = c("chromosome", "start", "stop", "id", "score", "strand", "length", "maxpos", "info")
 
 # reading the second dataframe: called p1 (all sequences with zero overlap)
-p1 = fread(paste(folder, "/pass1Tr.bed", sep=''))
+p1 = fread(paste(folder, "/pass1Tr.bed", sep=''), sep = '\t')
 colnames(p1) = c("chromosome", "start", "stop", "id", "score", "strand", "length", "maxpos", "info")
 
 # calculates the absolute position of maximum signal of each footprint

From 37c69e6660ebd826241f4e46bc1bd951897d0459 Mon Sep 17 00:00:00 2001
From: JannikHamp <Jannik.Hamp@mpi-bn.mpg.de>
Date: Tue, 8 Jan 2019 13:53:06 +0100
Subject: [PATCH 13/21] this r script is no more necessary, the other does its
 job

---
 bin/1.2_filter_motifs/maxScore.R | 17 -----------------
 1 file changed, 17 deletions(-)
 delete mode 100644 bin/1.2_filter_motifs/maxScore.R

diff --git a/bin/1.2_filter_motifs/maxScore.R b/bin/1.2_filter_motifs/maxScore.R
deleted file mode 100644
index e93429a..0000000
--- a/bin/1.2_filter_motifs/maxScore.R
+++ /dev/null
@@ -1,17 +0,0 @@
-#!/bin/Rscript
-
-# The script is used with the script: compareBed.sh
-# This calculates the absolute maxpos values of a bed-file and overwrites them
-# author: Jannik Hamp
-# email: jannik.hamp@googlemail.com
-
-library(data.table)
-args = commandArgs(TRUE)
-file = args[1]
-
-tab = fread(file)
-colnames(tab) = c("chromosome", "start", "stop", "id", "score", "strand", "length", "maxpos", "info")
-
-tab$maxpos = tab$start + tab$maxpos
-
-fwrite(tab, file, row.names=FALSE, col.names=FALSE, quote=FALSE, sep='\t')

From 6802d72736a0c2a4f272f7b364e347389f4d0dd9 Mon Sep 17 00:00:00 2001
From: JannikHamp <Jannik.Hamp@mpi-bn.mpg.de>
Date: Tue, 8 Jan 2019 13:54:10 +0100
Subject: [PATCH 14/21] New updated version. Faster and more robust

---
 bin/1.2_filter_motifs/compareBed.sh | 406 ++++++++++++++--------------
 1 file changed, 198 insertions(+), 208 deletions(-)

diff --git a/bin/1.2_filter_motifs/compareBed.sh b/bin/1.2_filter_motifs/compareBed.sh
index 99423f7..1571065 100644
--- a/bin/1.2_filter_motifs/compareBed.sh
+++ b/bin/1.2_filter_motifs/compareBed.sh
@@ -1,95 +1,101 @@
 #!/bin/bash
 
-# This script utilizes bedtools to gain non-overlapping sequence parts between bed-files
-# merge.R and maxScore.R are needed to be saved in the same directory as this to make it work
-# For more information read the wiki or run ./compareBed.sh without parameters
-# author Jannik Hamp
-# jannik.hamp@googlemail.com
+# author: Jannik Hamp
+# email: jannik.hamp@googlemail.com
 
-wrong=()
+# desciption: This scripts uses bedtools to find new, non overlapping regions
+#             between input data in BED-format and a group of known motifs in
+#             BED-format. 
+
+#             For parameter description, run the script without parameters or -h.
+
+# One R script is used, called compareBed_runinfo.R, stored in the same directory
+
+# default parameters
+workdir=$PWD
+output="newMotifs.bed"
+min=10
+max=200
+path=`echo $0 | sed 's/\/[^\/]*$/\//g'`
+help=false
+# required parameters given
 da=false
 mo=false
-wo=false
 fa=false
-mi=false
-ma=false
-ou=false
-pa=false
-he=false
 
-#no parameters chosen means help is going to be displayed
+# display help when no parameters chosen
 if [ $# -eq 0 ]
 then
 	he=true
 fi
 
-#parsing the parameters, if a parameter is chosen, a value must be provided aswell. No standalone parameters
+# parsing parameters
+wrong=()
 while [[ $# -gt 0 ]]
 do
-key="$1"
+	key="$1"
+	if [[ "^-" =~ $2 ]]
+	then
+		echo "Each parameter needs a value (except the help parameter), values must not start with a '-'!"
+		exit 1
+	fi
 
-case $key in
-	-d|--data)
-	data="$2"
-	da=true
-	shift
-	shift
-	;;
-	-m|--motifs)
-	motifs="$2"
-	mo=true
-	shift
-	shift
-	;;
-	-w|--workdir)
-	workdir="$2"
-	wo=true
-	shift
-	shift
-	;;
-	-f|--fasta)
-	fasta="$2"
-	fa=true
-	shift
-	shift
-	;;
-	-min|--min)
-	min=$2
-	mi=true
-	shift
-	shift
-	;;
-	-max|--max)
-	max=$2
-	ma=true
-	shift
-	shift
-	;;
-	-o|--output)
-	output="$2"
-	ou=true
-	shift
-	shift
-	;;
-	-p|--path)
-	path="$2"
-	pa=true
-	shift
-	shift
-	;;
-	-h|--help)
-	help=true
-	he=true
-	shift
-	;;
-	*)
-	wrong+=("$1")
-	shift
-	;;
-esac
+	case $key in
+		-d|--data)
+			data="$2"
+			da=true
+			shift
+			shift
+		;;
+		-m|--motifs)
+			motifs="$2"
+			mo=true
+			shift
+			shift
+		;;
+		-w|--workdir)
+			workdir="$2"
+			shift
+			shift
+		;;
+		-f|--fasta)
+			fasta="$2"
+			fa=true
+			shift
+			shift
+		;;
+		-min|--min)
+			min=$2
+			shift
+			shift
+		;;
+		-max|--max)
+			max=$2
+			shift
+			shift
+		;;
+		-o|--output)
+			output="$2"
+			shift
+			shift
+		;;
+		-p|--path)
+			path="$2"
+			shift
+			shift
+		;;
+		-h|--help)
+			help=true
+			shift
+		;;
+		*)
+			wrong+=("$1")
+			shift
+		;;
+	esac
 done
 
-# stores unknown selected parameters for error report
+# if wrong parameters were chosen..
 count=${#wrong[@]}
 if [ $count -gt 0 ]
 then
@@ -103,7 +109,7 @@ exit 1
 fi
 
 # the help message
-if [ $he == true ]
+if [ $help == true ]
 then
 	echo "This script utilies bedtools to select new footprints from data."
 	echo "Therefore the data is compared with known footprint motifs."
@@ -123,169 +129,153 @@ then
 	echo "              -w   --workdir      the path to directory where temporary files will be created"
 	echo "                                  default is the current directory"
 	echo "              -min --min          minimum size of footprints; default is 10"
-	echo "              -max --max          maximum size of footprints; default is 200"
-	echo "              -o   --output       output path/file ; default dir is workdir and filename is newMotifs.bed and newMotifs.bed.fasta"
+	echo "              -max --max          maximum size of footprints; default is 80"
+	echo "              -o   --output       output path/file ; default dir current directory and filename is newMotifs.bed and newMotifs.bed.fasta"
 	echo "              -h --help           shows this help message"
 	echo "              -p --path           the path where the required scripts merge.R and maxScore.R are stored. Default: same path as this scripts path"
 exit 0
 fi
 
-# prints which parameters have been selected
+# summary of parameters
 echo selected parameters
 echo -------------------
-echo data: $da \(required\)
-echo motifs: $mo \(required\)
-echo workdir: $wo
-echo fasta: $fa \(required\)
-echo min: $mi
-echo max: $ma
-echo output: $ou
-echo help: $he
-echo path of scripts: $pa
+echo data: $data \(required\)
+echo motifs: $motifs \(required\)
+echo workdir: $workdir
+echo fasta: $fasta \(required\)
+echo min: $min
+echo max: $max
+echo output: $output
+echo path of scripts: $path
 
-#checks if the 3 required parameters have been selected
-if [ $da == false ] || [ $mo == false ] || [ $fa == false ]
+# check required parameters
+if [ -z $data ] || [ -z $motifs ] || [ -z $fasta ]
 then
 	echo required parameters not given.
 	echo required are: --data \<path/data.bed\> --motifs \<path/motifs.bed\> --fasta \<path/file.fasta\>
 	exit 1
 fi
 
-#workdir is set to the current directory if it was not specified with a parameter
-if [ $wo == false ]
-then
-	wo=true
-	workdir=$PWD
-fi
-
-#if output was not specified it will be at "$workdir"/newMotifs.bed
-if [ $ou == false ]
-then
-	output=${workdir}/"newMotifs.bed"
-	ou=true
-fi
-
-if [ $mi == false ]
-then
-	min=10
-	mi=true
-fi
-
-if [ $ma == false ]
-then
-	max=200
-	ma=true
-fi
-
-#default path of the scripts merge.R and maxScore.R is the same path as compareBed.sh has
-# the `echo $0 | sed 's/\/[^\/]*$/\//g'` command extracts the path from $0, where the command itself is stored
-if [ $pa == false ]
-then
-	path=`echo $0 | sed 's/\/[^\/]*$/\//g'`
-	pa=true
-fi
-
-#1. first filter. no overlap vs. overlap
-# This is done with the data of new footprints and each motif file subsequently.
-# The output of one iteration is the input data for the new footprints of the next iteration, in which a new motif is checked.
-echo get sequences with no overlap
-cat $data > "$workdir"/pass1Tr.bed
-# help variable is needed, because in bash I cannot write to the same file from which I am also reading the data.
-# Thus, there are 2 files which are alternating in each iteration
-help=true
+# comparing unknown footprints with regions of known motifs
+# comparison is done iteratively
+# remove overlapping regions in unknown footprints
 
+# remove trailing tabs in footprints
+cat $data | sed 's/[ \t]*$//' > "$workdir"/filtered.bed
+temp_switch=true
+all_empty=true
 if [ -d "$motifs" ]
 then
-for i in "$motifs"/*.bed
-do
-	sed -i 's/[ \t]*$//' $i
-	if [ $help == true ]
+	# check if all motiffiles are empty/only consist of header. exit if all are empty 
+	for i in "$motifs"/*.bed
+	do
+		if [ $all_empty == true ]
+		then
+			lines=`cat $i | wc -l`
+			if [ $lines -gt 1 ]
+			then
+				all_empty=false
+			fi
+		fi
+	done
+	if [ $all_empty == true ]
 	then
-		help=false
-		bedtools intersect -v -a "$workdir"/pass1Tr.bed -b $i > "$workdir"/pass1TrHelp.bed
-	else
-		help=true
-		bedtools intersect -v -a "$workdir"/pass1TrHelp.bed -b $i > "$workdir"/pass1Tr.bed
+		echo All motiffiles were empty!
+		echo Fix motiffiles and try again.
+		echo exiting
+		exit 1
 	fi
-	echo $i
-done
-# if the -m parameter is not a directory, it is expected to be a comma separated list of files
+
+	# bedtools comparisons
+	for i in "$motifs"/*.bed
+	do
+		# remove trailing tabs in motiffile
+		sed -i 's/[ \t]*$//' $i
+
+		if [ $temp_switch == true ]
+		then
+			temp_switch=false
+			bedtools subtract -a "$workdir"/filtered.bed -b $i > "$workdir"/filtered_temp.bed
+		else
+			temp_switch=true
+			bedtools subtract -a "$workdir"/filtered_temp.bed	-b $i > "$workdir"/filtered.bed
+		fi
+		echo $i
+	done
+
+# the else case means, that the motiffiles were passed comma separated with no whitespace.
 else
-declare -a motiffiles=(`echo $motifs | sed 's/,/ /g'`)
-for i in ${motiffiles[@]}
-do
-	if [ $help == true ]
+	declare -a motiffiles=(`echo $motifs | sed 's/,/ /g'`)
+	# check if files exist and if they are all empty (exiting if all empty)
+	for i in ${motiffiles[@]}
+	do
+		if [ -f $i ]
+		then
+			if [ $all_empty == true ]
+			then
+				lines=`cat $i | wc -l`
+				if [ $lines -gt 1 ]
+				then
+					all_empty=false
+				fi
+			fi
+		else
+			echo file $i does not exist
+			echo please use correct paths. exiting.
+			exit 1
+		fi
+	done
+	if [ $all_empty == true ]
 	then
-		help=false
-		bedtools intersect -v -a "$workdir"/pass1Tr.bed -b $i > "$workdir"/pass1TrHelp.bed
-	else
-		help=true
-		bedtools intersect -v -a "$workdir"/pass1TrHelp.bed -b $i > "$workdir"/pass1Tr.bed
-	fi
-	echo $i
-done
+		echo All motiffiles were empty!
+		echo Fix motiffiles and try again.
+		echo exiting
+		exit 1
+	fi	
+
+	# bedtools comparisons
+	for i in ${motiffiles[@]}
+	do
+		# remove trailing tabs in motiffile
+		sed -i 's/[ \t]*$//' $i
+
+		if [ $temp_switch == true ]
+		then
+			help=false
+			bedtools subtract -a "$workdir"/filtered.bed -b $i > "$workdir"/filtered_temp.bed
+		else
+			temp_switch=true
+			bedtools subtract -a "$workdir"/filtered_temp.bed	-b $i > "$workdir"/filtered.bed
+		fi
+		echo $i
+	done
 fi
 
-# After the final iteration, the last output is written to pass1Tr.bed if it is not already there
-if [ $help == false ]
+# get file of last iteration an write its content into filtered.bed
+if [ $temp_switch == false ]
 then
-	cat "$workdir"/pass1TrHelp.bed > "$workdir"/pass1Tr.bed
+	cat "$workdir"/filtered_temp.bed > "$workdir"/filtered.bed
 fi
-echo get overlapping sequences
-bedtools intersect -v -a $data -b "$workdir"/pass1Tr.bed > "$workdir"/pass1Fa.bed
-cat "$workdir"/pass1Fa.bed | wc -l
-
-echo calc maxScore
-#2. compute absolut maxscore position
 
-Rscript --vanilla $path/maxScore.R "$workdir"/pass1Fa.bed
-
-#3 subtract overlapping regions for additional motifs
-#echo "also get subsequences with no overlap of overlapping sequences"
-help=true
-
-if [ -d "$motifs" ]
+# remove short/long motivs, make unique ids (relevant for some splitted tfbs from subtract) and handle maxScorePosition
+# also creates a small output file with information about the comparison
+Rscript $path/compareBed_runinfo.R $min $max $data "$workdir"/filtered.bed "$workdir"/filtered_flagged.bed
+# check if Rscript executed without errors
+if [ $? -gt 0 ] 
 then
-for i in "$motifs"/*.bed
-do
-	if [ $help == true ]
-	then
-		help=false
-		bedtools subtract -a "$workdir"/pass1Fa.bed -b $i > "$workdir"/pass1FaHelp.bed
-	else
-		help=true
-		bedtools subtract -a "$workdir"/pass1FaHelp.bed	-b $i > "$workdir"/pass1Fa.bed
-	fi
-	echo $i
-done
-else
-for i in ${motiffiles[@]}
-do
-	if [ $help == true ]
-	then
-		help=false
-		bedtools subtract -a "$workdir"/pass1Fa.bed -b $i > "$workdir"/pass1FaHelp.bed
-	else
-		help=true
-		bedtools subtract -a "$workdir"/pass1FaHelp.bed	-b $i > "$workdir"/pass1Fa.bed
-	fi
-	echo $i
-done
+	exit 1
 fi
 
-#The output of the last iteration of the subtract loop is written to pass2Tr.bed
-if [ $help == false ]
+# check if header existed. If so, final output also has a header.
+first_line=`sed -n 1p $data | sed "s/$/\tcontains_maxpos\tsequence/"`
+if [[ ${first_line:0:1} == "#" ]]
 then
-	cat "$workdir"/pass1FaHelp.bed > "$workdir"/pass2Tr.bed
+	echo "$first_line" > $output
 else
-	cat "$workdir"/pass1Fa.bed > "$workdir"/pass2Tr.bed
+	# output will be overwritten if it exists
+	rm -f $output
 fi
 
-#4. remove short/long motivs, make unique ids (relevant for some splitted tfbs from subtract) and handle maxScorePosition
-# also creates a small output file with information about the comparison
-
-Rscript --vanilla $path/merge.R $min $max $workdir $data
-
-#5. add fasta sequences to bed and create fasta file
-bedtools getfasta -fi $fasta -bed $workdir/merged.bed -bedOut > $output
+# add fasta sequences to bed and create fasta file
+bedtools getfasta -fi $fasta -bed "$workdir"/filtered_flagged.bed -bedOut >> $output
 bedtools getfasta -name -fi $fasta -bed "$output" -fo "$output".fasta

From b787d14b46e9ad21d14f9df28161cb7c262c1cbc Mon Sep 17 00:00:00 2001
From: JannikHamp <Jannik.Hamp@mpi-bn.mpg.de>
Date: Tue, 8 Jan 2019 13:55:39 +0100
Subject: [PATCH 15/21] updated version.. make unique updated, more robust in
 general

---
 bin/1.2_filter_motifs/compareBed_runinfo.R | 65 ++++++++++++++++++++++
 bin/1.2_filter_motifs/merge.R              | 57 -------------------
 2 files changed, 65 insertions(+), 57 deletions(-)
 create mode 100644 bin/1.2_filter_motifs/compareBed_runinfo.R
 delete mode 100644 bin/1.2_filter_motifs/merge.R

diff --git a/bin/1.2_filter_motifs/compareBed_runinfo.R b/bin/1.2_filter_motifs/compareBed_runinfo.R
new file mode 100644
index 0000000..4a7af6f
--- /dev/null
+++ b/bin/1.2_filter_motifs/compareBed_runinfo.R
@@ -0,0 +1,65 @@
+#!/bin/Rscript
+
+# author: Jannik Hamp
+# email: jannik.hamp@googlemail.com
+
+# desciption: This script gets called by compareBed.sh
+#             It removes too small/large footprints, makes names unique,
+#             adds a column with a flag "contains_maxpos"
+#             and creates a file with information of the bedtool comparison
+
+# TODO: check number of columns, implement make.unique in a better way 
+
+# parsing parameters
+library(data.table)
+args = commandArgs(TRUE)
+min = as.numeric(args[1])
+max = as.numeric(args[2])
+input_raw = args[3]
+input_filtered = args[4]
+output = args[5]
+
+data_filtered = fread(input_filtered)
+# check if data has less than 9 columns
+if (ncol(data_filtered) < 9) {
+	stop("footprint file has less than 9 columns. exiting.")
+}
+
+# remove sequences that are smaller than minimum (parameter)
+# remove sequences that are longer than maximum (parameter)
+data_filtered = data_filtered[which(data_filtered[[3]] - data_filtered[[2]] >= min),]
+data_filtered = data_filtered[which(data_filtered[[3]] - data_filtered[[2]] <= max),]
+
+# make unique names and adjust length for splitted footprints
+# duplicated names have .0 , .1 , .2 ... added
+names=data_filtered[[4]]
+names(data_filtered)[4] = "name"
+# all duplicated names
+duplicants = unique(data_filtered[duplicated(name)][[4]])
+data_filtered[[4]] = make.unique(as.character(data_filtered[[4]]))
+data_filtered[match(duplicants, names), name := paste0(name,".0")]
+
+# recalculate length of sequences
+data_filtered[[7]] = data_filtered[[3]] - data_filtered[[2]]
+
+# adding column "contains_maxpos", containing flag (0 or 1)
+# max_pos is the position of maximum score of a footprint
+data_filtered = cbind(data_filtered, contains_maxpos = 0)
+data_filtered$contains_maxpos[intersect(which(data_filtered[[2]] <= data_filtered[[8]]), which(data_filtered[[3]] > data_filtered[[8]]))] = 1
+data_filtered[[8]] = data_filtered[[8]] - data_filtered[[2]]
+
+fwrite(data_filtered, output, col.names=FALSE, quote = FALSE, sep = '\t')
+
+# data is the initial data before any comparisons have been done (-d parameter of compareBed.sh)
+data = fread(input_raw)
+
+# some statistics about the bedtool comparisons are stored in FilterMotifs.stats
+sum_data = sum(data[[3]]-data[[2]])
+sum_filtered = sum(data_filtered[[7]])
+difference_nt = formatC(sum_data/sum_filtered, digits = 4)
+loss_nt = formatC(1 - sum_filtered/sum_data, digits = 2)
+length_data = formatC(mean(data[[3]]-data[[2]]), digits = 4)
+length_filtered = formatC(mean(data_filtered[[7]]), digits = 4)
+stats = data.frame(sum_nt_input = sum_data, sum_nt_filtered = sum_filtered, quotient_of_nt = difference_nt, loss_of_nt = loss_nt, mean_length_input = length_data, mean_length_filtered = length_filtered, flag_1_ratio = length(which(data_filtered$containsMaxpos == 1))/dim(data_filtered)[1])
+stats = t(stats)
+write.table(stats, "./FilterMotifs.stats", col.names = FALSE, quote = FALSE, sep = '\t')
diff --git a/bin/1.2_filter_motifs/merge.R b/bin/1.2_filter_motifs/merge.R
deleted file mode 100644
index bd009cc..0000000
--- a/bin/1.2_filter_motifs/merge.R
+++ /dev/null
@@ -1,57 +0,0 @@
-#!/bin/Rscript
-
-# The script is used in the script: compareBed.sh
-# This script merges the non-overlapping regions and the non-overlapping parts of overlapping regions
-# It also removes sequences that are smaller than the --min paramter or bigger than the --max parameter
-# Additionally, information of the comparison is written into an output file
-# author Jannik Hamp
-# email jannik.hamp@googlemail.com
-
-library(data.table)
-args=commandArgs(TRUE)
-min=as.numeric(args[1])
-max=as.numeric(args[2])
-folder=args[3]
-
-# reading the first dataframe: called splitted (contains splitted regions because of partial overlap)
-splitted = fread(paste(folder, "/pass2Tr.bed", sep=''), sep = '\t')
-colnames(splitted) = c("chromosome", "start", "stop", "id", "score", "strand", "length", "maxpos", "info")
-
-# reading the second dataframe: called p1 (all sequences with zero overlap)
-p1 = fread(paste(folder, "/pass1Tr.bed", sep=''), sep = '\t')
-colnames(p1) = c("chromosome", "start", "stop", "id", "score", "strand", "length", "maxpos", "info")
-
-# calculates the absolute position of maximum signal of each footprint
-p1$maxpos = p1$start + p1$maxpos
-
-# the new combined dataframe is now called splitted
-splitted=rbind(splitted, p1)
-
-# only keep entries with a larger sequence than min and a smaller sequence than max
-splitted=splitted[which(splitted$stop - splitted$start >= min),]
-splitted=splitted[which(splitted$stop - splitted$start <= max),]
-
-# make the ids unique (because of duplicated ids of some footprints that got spliited in 2)
-splitted$id=make.unique(as.character(splitted$id))
-# calculate new length values (because of the splitted footprints)
-splitted$length=splitted$stop - splitted$start
-
-# add column containsMaxpos (0 means that maxpos has any overlap and 1 means that maxpos has no overlap with any motif) 
-splitted=cbind(splitted, containsMaxpos=0)
-splitted$containsMaxpos[intersect(which(splitted$start <= splitted$maxpos), which(splitted$stop > splitted$maxpos))] = 1
-
-#calculate relative maxpos values
-splitted$maxpos = splitted$maxpos - splitted$start
-data.table::fwrite(splitted, paste(folder, "/merged.bed", sep=''), row.names=FALSE, col.names=FALSE, quote=FALSE, sep='\t')
-
-#aditional information of the comparison of the unknown footprints and the known motifs are computed and written to FilterMotifs.stats
-before = fread(args[4])
-
-sumb=sum(before$V3-before$V2)
-suma=sum(splitted$length)
-difference = formatC(sumb/suma, digits=4)
-loss = formatC(1 - suma/sumb, digits=2)
-lengthb = formatC(mean(before$V3-before$V2), digits=4)
-lengtha = formatC(mean(splitted$length), digits=4)
-stats=data.frame(sum_nt_input=sumb, sum_nt_filtered=suma, factor=difference, loss=loss, mean_length_input=lengthb, mean_length_filtered=lengtha, flag_1_ratio=length(which(splitted$containsMaxpos == 1))/dim(splitted)[1])
-write.table(stats, "./FilterMotifs.stats", row.names=FALSE, quote=FALSE, sep='\t')

From becfeae762c4066439462438b6b758cd3cf46b6f Mon Sep 17 00:00:00 2001
From: JannikHamp <Jannik.Hamp@mpi-bn.mpg.de>
Date: Tue, 8 Jan 2019 16:03:02 +0100
Subject: [PATCH 16/21] added information for logfile

---
 bin/1.2_filter_motifs/compareBed.sh | 146 ++++++++++++----------------
 1 file changed, 61 insertions(+), 85 deletions(-)

diff --git a/bin/1.2_filter_motifs/compareBed.sh b/bin/1.2_filter_motifs/compareBed.sh
index 1571065..1840ba8 100644
--- a/bin/1.2_filter_motifs/compareBed.sh
+++ b/bin/1.2_filter_motifs/compareBed.sh
@@ -18,15 +18,11 @@ min=10
 max=200
 path=`echo $0 | sed 's/\/[^\/]*$/\//g'`
 help=false
-# required parameters given
-da=false
-mo=false
-fa=false
 
 # display help when no parameters chosen
 if [ $# -eq 0 ]
 then
-	he=true
+	help=true
 fi
 
 # parsing parameters
@@ -34,7 +30,7 @@ wrong=()
 while [[ $# -gt 0 ]]
 do
 	key="$1"
-	if [[ "^-" =~ $2 ]]
+	if [[ ${2:0:1} == "-" ]]
 	then
 		echo "Each parameter needs a value (except the help parameter), values must not start with a '-'!"
 		exit 1
@@ -43,13 +39,11 @@ do
 	case $key in
 		-d|--data)
 			data="$2"
-			da=true
 			shift
 			shift
 		;;
 		-m|--motifs)
 			motifs="$2"
-			mo=true
 			shift
 			shift
 		;;
@@ -60,7 +54,6 @@ do
 		;;
 		-f|--fasta)
 			fasta="$2"
-			fa=true
 			shift
 			shift
 		;;
@@ -102,10 +95,10 @@ then
 	for i in ${wrong[@]}
 	do
 		echo wrong parameter $i
-		echo call script without parameters for help or call --help
-		echo exit
 	done
-exit 1
+
+	echo call script without parameters for help or call --help
+	exit 1
 fi
 
 # the help message
@@ -151,105 +144,78 @@ echo path of scripts: $path
 # check required parameters
 if [ -z $data ] || [ -z $motifs ] || [ -z $fasta ]
 then
+	echo ERROR
 	echo required parameters not given.
 	echo required are: --data \<path/data.bed\> --motifs \<path/motifs.bed\> --fasta \<path/file.fasta\>
 	exit 1
 fi
 
-# comparing unknown footprints with regions of known motifs
-# comparison is done iteratively
-# remove overlapping regions in unknown footprints
-
 # remove trailing tabs in footprints
 cat $data | sed 's/[ \t]*$//' > "$workdir"/filtered.bed
-temp_switch=true
 all_empty=true
+
+# motiffiles either from a directory OR comma separated list
 if [ -d "$motifs" ]
 then
-	# check if all motiffiles are empty/only consist of header. exit if all are empty 
-	for i in "$motifs"/*.bed
-	do
+	# creates an array of all files with bed in its name in the directory $motifs  
+	declare -a motiffiles=(`ls $motifs | grep bed | sed "s|^|$motifs\/|g" | tr '\n' ' ' | sed "s|//|/|g"`)
+
+# the else case means, that the motiffiles were passed comma separated with no whitespace.
+else
+	declare -a motiffiles=(`echo $motifs | sed 's/,/ /g'`)
+fi
+
+# check if files exist and if they are all empty (exiting if all empty)
+for i in ${motiffiles[@]}
+do
+	if [ -f $i ]
+	then
 		if [ $all_empty == true ]
 		then
 			lines=`cat $i | wc -l`
 			if [ $lines -gt 1 ]
 			then
 				all_empty=false
+				break
 			fi
 		fi
-	done
-	if [ $all_empty == true ]
-	then
-		echo All motiffiles were empty!
-		echo Fix motiffiles and try again.
-		echo exiting
+	else
+		echo file $i does not exist
+		echo please use correct paths. exiting.
 		exit 1
 	fi
+done
 
-	# bedtools comparisons
-	for i in "$motifs"/*.bed
-	do
-		# remove trailing tabs in motiffile
-		sed -i 's/[ \t]*$//' $i
-
-		if [ $temp_switch == true ]
-		then
-			temp_switch=false
-			bedtools subtract -a "$workdir"/filtered.bed -b $i > "$workdir"/filtered_temp.bed
-		else
-			temp_switch=true
-			bedtools subtract -a "$workdir"/filtered_temp.bed	-b $i > "$workdir"/filtered.bed
-		fi
-		echo $i
-	done
-
-# the else case means, that the motiffiles were passed comma separated with no whitespace.
-else
-	declare -a motiffiles=(`echo $motifs | sed 's/,/ /g'`)
-	# check if files exist and if they are all empty (exiting if all empty)
-	for i in ${motiffiles[@]}
-	do
-		if [ -f $i ]
-		then
-			if [ $all_empty == true ]
-			then
-				lines=`cat $i | wc -l`
-				if [ $lines -gt 1 ]
-				then
-					all_empty=false
-				fi
-			fi
-		else
-			echo file $i does not exist
-			echo please use correct paths. exiting.
-			exit 1
-		fi
-	done
-	if [ $all_empty == true ]
+# error report of rare case of only empty motiffiles
+if [ $all_empty == true ]
 	then
+		echo ERROR
 		echo All motiffiles were empty!
 		echo Fix motiffiles and try again.
-		echo exiting
 		exit 1
-	fi	
+fi
 
-	# bedtools comparisons
-	for i in ${motiffiles[@]}
-	do
-		# remove trailing tabs in motiffile
-		sed -i 's/[ \t]*$//' $i
+# comparing unknown footprints with regions of known motifs
+# comparison is done iteratively
+# remove overlapping regions in unknown footprints
+temp_switch=true
+counter=1
+for i in ${motiffiles[@]}
+do
+	# remove trailing tabs in motiffile
+	sed -i 's/[ \t]*$//' $i
 
-		if [ $temp_switch == true ]
-		then
-			help=false
-			bedtools subtract -a "$workdir"/filtered.bed -b $i > "$workdir"/filtered_temp.bed
-		else
-			temp_switch=true
-			bedtools subtract -a "$workdir"/filtered_temp.bed	-b $i > "$workdir"/filtered.bed
-		fi
-		echo $i
-	done
-fi
+	if [ $temp_switch == true ]
+	then
+		temp_switch=false
+		bedtools subtract -a "$workdir"/filtered.bed -b $i > "$workdir"/filtered_temp.bed
+	else
+		temp_switch=true
+		bedtools subtract -a "$workdir"/filtered_temp.bed	-b $i > "$workdir"/filtered.bed
+	fi
+	echo "$i  ---  $counter of ${#motiffiles[@]}"
+	counter=`expr $counter + 1`
+done
 
 # get file of last iteration an write its content into filtered.bed
 if [ $temp_switch == false ]
@@ -259,7 +225,7 @@ fi
 
 # remove short/long motivs, make unique ids (relevant for some splitted tfbs from subtract) and handle maxScorePosition
 # also creates a small output file with information about the comparison
-Rscript $path/compareBed_runinfo.R $min $max $data "$workdir"/filtered.bed "$workdir"/filtered_flagged.bed
+Rscript $path/compareBed_runinfo.R $min $max $data "$workdir"/filtered.bed "$workdir"/filtered_flagged.bed "$workdir"/FilterMotifs.stats
 # check if Rscript executed without errors
 if [ $? -gt 0 ] 
 then
@@ -271,9 +237,19 @@ first_line=`sed -n 1p $data | sed "s/$/\tcontains_maxpos\tsequence/"`
 if [[ ${first_line:0:1} == "#" ]]
 then
 	echo "$first_line" > $output
+	# add some final values to the log file
+	fp_initial=`cat $data | wc -l`
+	fp_initial=`expr $fp_initial - 1`
+	fp_final=`cat "$workdir"/filtered.bed | wc -l`
+	fp_final=`expr $fp_final - 1`
+	echo $fp_initial | sed 's/^/initial number of footprints: /g' >> "$workdir"/FilterMotifs.stats
+	echo $fp_final | sed 's/^/number of footprints after subtract: /g' >> "$workdir"/FilterMotifs.stats
 else
 	# output will be overwritten if it exists
 	rm -f $output
+	# add some final values to the log file
+	cat $data | wc -l | sed 's/^/initial number of footprints: /g' >> "$workdir"/FilterMotifs.stats
+	cat "$workdir"/filtered.bed | wc -l | sed 's/^/number of footprints after subtract: /g' >> "$workdir"/FilterMotifs.stats
 fi
 
 # add fasta sequences to bed and create fasta file

From c55e8ff1c5fb1525eb52e3e15c6cf25423e8ba05 Mon Sep 17 00:00:00 2001
From: JannikHamp <Jannik.Hamp@mpi-bn.mpg.de>
Date: Tue, 8 Jan 2019 16:03:57 +0100
Subject: [PATCH 17/21] added dicumentation and parameter for .stats output
 file

---
 bin/1.2_filter_motifs/compareBed_runinfo.R | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/bin/1.2_filter_motifs/compareBed_runinfo.R b/bin/1.2_filter_motifs/compareBed_runinfo.R
index 4a7af6f..4c5686d 100644
--- a/bin/1.2_filter_motifs/compareBed_runinfo.R
+++ b/bin/1.2_filter_motifs/compareBed_runinfo.R
@@ -8,7 +8,13 @@
 #             adds a column with a flag "contains_maxpos"
 #             and creates a file with information of the bedtool comparison
 
-# TODO: check number of columns, implement make.unique in a better way 
+# parameters: Parameters are not named. Respect the parameter order.
+#             min:            minimum footprint size threshold 
+#             max:            maximum footprint size threshold
+#             input_raw:      unfiltered BED-file
+#             input_filtered: filtered BED-file (after bedtools subtract)
+#             output:         output path/file
+#             output_stats:   output file with general info of the comparison
 
 # parsing parameters
 library(data.table)
@@ -18,8 +24,10 @@ max = as.numeric(args[2])
 input_raw = args[3]
 input_filtered = args[4]
 output = args[5]
+output_stats = args[6]
+
+data_filtered = fread(input_filtered, sep='\t')
 
-data_filtered = fread(input_filtered)
 # check if data has less than 9 columns
 if (ncol(data_filtered) < 9) {
 	stop("footprint file has less than 9 columns. exiting.")
@@ -51,7 +59,7 @@ data_filtered[[8]] = data_filtered[[8]] - data_filtered[[2]]
 fwrite(data_filtered, output, col.names=FALSE, quote = FALSE, sep = '\t')
 
 # data is the initial data before any comparisons have been done (-d parameter of compareBed.sh)
-data = fread(input_raw)
+data = fread(input_raw, sep='\t')
 
 # some statistics about the bedtool comparisons are stored in FilterMotifs.stats
 sum_data = sum(data[[3]]-data[[2]])
@@ -62,4 +70,4 @@ length_data = formatC(mean(data[[3]]-data[[2]]), digits = 4)
 length_filtered = formatC(mean(data_filtered[[7]]), digits = 4)
 stats = data.frame(sum_nt_input = sum_data, sum_nt_filtered = sum_filtered, quotient_of_nt = difference_nt, loss_of_nt = loss_nt, mean_length_input = length_data, mean_length_filtered = length_filtered, flag_1_ratio = length(which(data_filtered$containsMaxpos == 1))/dim(data_filtered)[1])
 stats = t(stats)
-write.table(stats, "./FilterMotifs.stats", col.names = FALSE, quote = FALSE, sep = '\t')
+write.table(stats, output_stats, col.names = FALSE, quote = FALSE, sep = '\t')

From 4acc20f8e761bf74e4363381f04176dc33a43f86 Mon Sep 17 00:00:00 2001
From: JannikHamp <Jannik.Hamp@mpi-bn.mpg.de>
Date: Wed, 9 Jan 2019 10:13:10 +0100
Subject: [PATCH 18/21] documentation changes

---
 bin/1.2_filter_motifs/compareBed.sh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/bin/1.2_filter_motifs/compareBed.sh b/bin/1.2_filter_motifs/compareBed.sh
index 1840ba8..73bfa8f 100644
--- a/bin/1.2_filter_motifs/compareBed.sh
+++ b/bin/1.2_filter_motifs/compareBed.sh
@@ -125,7 +125,7 @@ then
 	echo "              -max --max          maximum size of footprints; default is 80"
 	echo "              -o   --output       output path/file ; default dir current directory and filename is newMotifs.bed and newMotifs.bed.fasta"
 	echo "              -h --help           shows this help message"
-	echo "              -p --path           the path where the required scripts merge.R and maxScore.R are stored. Default: same path as this scripts path"
+	echo "              -p --path           the path where the required script compareBed_runinfo.R is stored. Default: same path as this scripts path"
 exit 0
 fi
 
@@ -139,7 +139,7 @@ echo fasta: $fasta \(required\)
 echo min: $min
 echo max: $max
 echo output: $output
-echo path of scripts: $path
+echo relative path of subscript: $path
 
 # check required parameters
 if [ -z $data ] || [ -z $motifs ] || [ -z $fasta ]
@@ -190,7 +190,7 @@ done
 if [ $all_empty == true ]
 	then
 		echo ERROR
-		echo All motiffiles were empty!
+		echo All motiffiles have less than 2 lines!
 		echo Fix motiffiles and try again.
 		exit 1
 fi

From 90c8c0593a636a00698508d8af454ba02c0e092d Mon Sep 17 00:00:00 2001
From: JannikHamp <Jannik.Hamp@mpi-bn.mpg.de>
Date: Wed, 9 Jan 2019 15:29:41 +0100
Subject: [PATCH 19/21] updated check for trailing tabs in motiffiles

---
 bin/1.2_filter_motifs/compareBed.sh | 77 ++++++++++++++++++++---------
 1 file changed, 55 insertions(+), 22 deletions(-)

diff --git a/bin/1.2_filter_motifs/compareBed.sh b/bin/1.2_filter_motifs/compareBed.sh
index 73bfa8f..223f497 100644
--- a/bin/1.2_filter_motifs/compareBed.sh
+++ b/bin/1.2_filter_motifs/compareBed.sh
@@ -8,17 +8,19 @@
 #             BED-format. 
 
 #             For parameter description, run the script without parameters or -h.
+#             The output is a file with the filtered footprints and the log file FilterMotivs.stats
 
-# One R script is used, called compareBed_runinfo.R, stored in the same directory
+# One R script is used, compareBed_runinfo.R, stored in the same directory.
 
 # default parameters
 workdir=$PWD
 output="newMotifs.bed"
 min=10
 max=200
-path=`echo $0 | sed 's/\/[^\/]*$/\//g'`
 help=false
 
+path=`echo $0 | sed 's/\/[^\/]*$/\//g'`
+
 # display help when no parameters chosen
 if [ $# -eq 0 ]
 then
@@ -32,7 +34,7 @@ do
 	key="$1"
 	if [[ ${2:0:1} == "-" ]]
 	then
-		echo "Each parameter needs a value (except the help parameter), values must not start with a '-'!"
+		echo "ERROR: Each parameter needs a value (except the help parameter), values must not start with a '-'!"
 		exit 1
 	fi
 
@@ -72,11 +74,6 @@ do
 			shift
 			shift
 		;;
-		-p|--path)
-			path="$2"
-			shift
-			shift
-		;;
 		-h|--help)
 			help=true
 			shift
@@ -94,7 +91,7 @@ if [ $count -gt 0 ]
 then
 	for i in ${wrong[@]}
 	do
-		echo wrong parameter $i
+		echo ERROR: wrong parameter $i
 	done
 
 	echo call script without parameters for help or call --help
@@ -104,11 +101,11 @@ fi
 # the help message
 if [ $help == true ]
 then
-	echo "This script utilies bedtools to select new footprints from data."
+	echo "This script utilizes bedtools to select new footprints from data."
 	echo "Therefore the data is compared with known footprint motifs."
-	echo "The output is a new .bed file with added sequence information and a column with flags for better sequences (1)"
-	echo "Required arguments are data and motifs, both in bed-format, and the fasta genome sequences."
-	echo "If a parameter is chosen, a value must be provided or an error will occur."
+	echo "The output is a new BED-file with added sequence information and a flag contains_maxpos (1/0)"
+	echo "Required arguments are data, motifs, both in bed-format and the fasta genome sequences."
+	echo "If a parameter is chosen, a value must be provided aswell. (exception -h)"
 	echo "--------------------"
 	echo "usage: compareBed.sh --data <path/to/file.bed> --motifs <path/to/known/motifs.bed> --fasta <path/to/genome.fasta> [optional_parameter <value>] ..."
 	echo "--------------------"
@@ -125,8 +122,7 @@ then
 	echo "              -max --max          maximum size of footprints; default is 80"
 	echo "              -o   --output       output path/file ; default dir current directory and filename is newMotifs.bed and newMotifs.bed.fasta"
 	echo "              -h --help           shows this help message"
-	echo "              -p --path           the path where the required script compareBed_runinfo.R is stored. Default: same path as this scripts path"
-exit 0
+	exit 0
 fi
 
 # summary of parameters
@@ -139,7 +135,6 @@ echo fasta: $fasta \(required\)
 echo min: $min
 echo max: $max
 echo output: $output
-echo relative path of subscript: $path
 
 # check required parameters
 if [ -z $data ] || [ -z $motifs ] || [ -z $fasta ]
@@ -149,10 +144,38 @@ then
 	echo required are: --data \<path/data.bed\> --motifs \<path/motifs.bed\> --fasta \<path/file.fasta\>
 	exit 1
 fi
+if [ ! -f $data ]
+then
+	echo ERROR
+	echo $data does not exist. Please check input parameter -d / --data
+	exit 1
+fi
+if [ ! -f $fasta ]
+then
+	echo ERROR
+	echo $fasta does not exist. Please check input parameter -f / --fasta
+	exit 1
+fi
+#check other parameters
+if [ $min -lt 0 ]
+then
+	min=10
+	echo "min can't be negative. Default value of 10 is choosen"
+fi
+if [ $max -lt $min ]
+then
+	max=200
+	echo "max must be greater than min. Default value of 200 is choosen"
+fi
+if [ ! -d $workdir ]
+then
+	echo ERROR
+	echo "directory $workdir does not exist. Please check parameter -w / --workdir"
+	exit 1
+fi
 
 # remove trailing tabs in footprints
 cat $data | sed 's/[ \t]*$//' > "$workdir"/filtered.bed
-all_empty=true
 
 # motiffiles either from a directory OR comma separated list
 if [ -d "$motifs" ]
@@ -166,6 +189,7 @@ else
 fi
 
 # check if files exist and if they are all empty (exiting if all empty)
+all_empty=true
 for i in ${motiffiles[@]}
 do
 	if [ -f $i ]
@@ -180,7 +204,7 @@ do
 			fi
 		fi
 	else
-		echo file $i does not exist
+		echo ERROR: file $i does not exist
 		echo please use correct paths. exiting.
 		exit 1
 	fi
@@ -202,18 +226,26 @@ temp_switch=true
 counter=1
 for i in ${motiffiles[@]}
 do
-	# remove trailing tabs in motiffile
-	sed -i 's/[ \t]*$//' $i
+	echo "$i  ---  $counter of ${#motiffiles[@]}"
+	# remove trailing tabs in motiffile, but only if the second line in the file ends with a tab.
+	# checking all lines for trailing tabs would be time consuming.
+	secnd_line=`sed -n 2p $i | tr '\t' '#'`
+	echo $secnd_line
+	if [[ ${secnd_line: -1} == "#" ]]
+	then
+		echo trailing tabs have been found. removing trailing tabs.
+		sed -i 's/[ \t]*$//' $i
+	fi
 
+	# bedtools comparisons
 	if [ $temp_switch == true ]
 	then
 		temp_switch=false
 		bedtools subtract -a "$workdir"/filtered.bed -b $i > "$workdir"/filtered_temp.bed
 	else
 		temp_switch=true
-		bedtools subtract -a "$workdir"/filtered_temp.bed	-b $i > "$workdir"/filtered.bed
+		bedtools subtract -a "$workdir"/filtered_temp.bed -b $i > "$workdir"/filtered.bed
 	fi
-	echo "$i  ---  $counter of ${#motiffiles[@]}"
 	counter=`expr $counter + 1`
 done
 
@@ -253,5 +285,6 @@ else
 fi
 
 # add fasta sequences to bed and create fasta file
+echo getting sequences from fasta-file
 bedtools getfasta -fi $fasta -bed "$workdir"/filtered_flagged.bed -bedOut >> $output
 bedtools getfasta -name -fi $fasta -bed "$output" -fo "$output".fasta

From 1a35ce5fbaceb4c19a097f013e499589db6bc490 Mon Sep 17 00:00:00 2001
From: JannikHamp <Jannik.Hamp@mpi-bn.mpg.de>
Date: Wed, 9 Jan 2019 15:34:56 +0100
Subject: [PATCH 20/21] removed echo from testing

---
 bin/1.2_filter_motifs/compareBed.sh | 1 -
 1 file changed, 1 deletion(-)

diff --git a/bin/1.2_filter_motifs/compareBed.sh b/bin/1.2_filter_motifs/compareBed.sh
index 223f497..38027f3 100644
--- a/bin/1.2_filter_motifs/compareBed.sh
+++ b/bin/1.2_filter_motifs/compareBed.sh
@@ -230,7 +230,6 @@ do
 	# remove trailing tabs in motiffile, but only if the second line in the file ends with a tab.
 	# checking all lines for trailing tabs would be time consuming.
 	secnd_line=`sed -n 2p $i | tr '\t' '#'`
-	echo $secnd_line
 	if [[ ${secnd_line: -1} == "#" ]]
 	then
 		echo trailing tabs have been found. removing trailing tabs.

From 530627cc4499cd5c176abd4a7e047942a89c91a9 Mon Sep 17 00:00:00 2001
From: JannikHamp <Jannik.Hamp@mpi-bn.mpg.de>
Date: Wed, 9 Jan 2019 15:55:52 +0100
Subject: [PATCH 21/21] more documentation, replce =, <-

---
 bin/1.2_filter_motifs/compareBed_runinfo.R | 58 ++++++++++++----------
 1 file changed, 32 insertions(+), 26 deletions(-)

diff --git a/bin/1.2_filter_motifs/compareBed_runinfo.R b/bin/1.2_filter_motifs/compareBed_runinfo.R
index 4c5686d..4888c02 100644
--- a/bin/1.2_filter_motifs/compareBed_runinfo.R
+++ b/bin/1.2_filter_motifs/compareBed_runinfo.R
@@ -18,15 +18,15 @@
 
 # parsing parameters
 library(data.table)
-args = commandArgs(TRUE)
-min = as.numeric(args[1])
-max = as.numeric(args[2])
-input_raw = args[3]
-input_filtered = args[4]
-output = args[5]
-output_stats = args[6]
+args <- commandArgs(TRUE)
+min <- as.numeric(args[1])
+max <- as.numeric(args[2])
+input_raw <- args[3]
+input_filtered <- args[4]
+output <- args[5]
+output_stats <- args[6]
 
-data_filtered = fread(input_filtered, sep='\t')
+data_filtered <- fread(input_filtered, sep='\t')
 
 # check if data has less than 9 columns
 if (ncol(data_filtered) < 9) {
@@ -35,39 +35,45 @@ if (ncol(data_filtered) < 9) {
 
 # remove sequences that are smaller than minimum (parameter)
 # remove sequences that are longer than maximum (parameter)
-data_filtered = data_filtered[which(data_filtered[[3]] - data_filtered[[2]] >= min),]
-data_filtered = data_filtered[which(data_filtered[[3]] - data_filtered[[2]] <= max),]
+data_filtered <- data_filtered[which(data_filtered[[3]] - data_filtered[[2]] >= min),]
+data_filtered <- data_filtered[which(data_filtered[[3]] - data_filtered[[2]] <= max),]
 
 # make unique names and adjust length for splitted footprints
 # duplicated names have .0 , .1 , .2 ... added
-names=data_filtered[[4]]
-names(data_filtered)[4] = "name"
+names <- data_filtered[[4]]
+names(data_filtered)[4] <- "name"
 # all duplicated names
-duplicants = unique(data_filtered[duplicated(name)][[4]])
-data_filtered[[4]] = make.unique(as.character(data_filtered[[4]]))
+duplicants <- unique(data_filtered[duplicated(name)][[4]])
+data_filtered[[4]] <- make.unique(as.character(data_filtered[[4]]))
 data_filtered[match(duplicants, names), name := paste0(name,".0")]
 
 # recalculate length of sequences
-data_filtered[[7]] = data_filtered[[3]] - data_filtered[[2]]
+data_filtered[[7]] <- data_filtered[[3]] - data_filtered[[2]]
 
 # adding column "contains_maxpos", containing flag (0 or 1)
 # max_pos is the position of maximum score of a footprint
-data_filtered = cbind(data_filtered, contains_maxpos = 0)
+data_filtered <- cbind(data_filtered, contains_maxpos = 0)
 data_filtered$contains_maxpos[intersect(which(data_filtered[[2]] <= data_filtered[[8]]), which(data_filtered[[3]] > data_filtered[[8]]))] = 1
-data_filtered[[8]] = data_filtered[[8]] - data_filtered[[2]]
+data_filtered[[8]] <- data_filtered[[8]] - data_filtered[[2]]
 
 fwrite(data_filtered, output, col.names=FALSE, quote = FALSE, sep = '\t')
 
 # data is the initial data before any comparisons have been done (-d parameter of compareBed.sh)
-data = fread(input_raw, sep='\t')
+data <- fread(input_raw, sep='\t')
 
 # some statistics about the bedtool comparisons are stored in FilterMotifs.stats
-sum_data = sum(data[[3]]-data[[2]])
-sum_filtered = sum(data_filtered[[7]])
-difference_nt = formatC(sum_data/sum_filtered, digits = 4)
-loss_nt = formatC(1 - sum_filtered/sum_data, digits = 2)
-length_data = formatC(mean(data[[3]]-data[[2]]), digits = 4)
-length_filtered = formatC(mean(data_filtered[[7]]), digits = 4)
-stats = data.frame(sum_nt_input = sum_data, sum_nt_filtered = sum_filtered, quotient_of_nt = difference_nt, loss_of_nt = loss_nt, mean_length_input = length_data, mean_length_filtered = length_filtered, flag_1_ratio = length(which(data_filtered$containsMaxpos == 1))/dim(data_filtered)[1])
-stats = t(stats)
+# number of nucleotides input
+sum_data <- sum(data[[3]]-data[[2]])
+# number of nucleotides after filter
+sum_filtered <- sum(data_filtered[[7]])
+# quotient: sum_data/sum_filtered
+difference_nt <- formatC(sum_data/sum_filtered, digits = 4)
+# loss: 1 - sum_filtered/sum_data
+loss_nt <- formatC(1 - sum_filtered/sum_data, digits = 2)
+# mean length of footprints input
+length_data <- formatC(mean(data[[3]]-data[[2]]), digits = 4)
+# mean -ength of footprints after filter
+length_filtered <- formatC(mean(data_filtered[[7]]), digits = 4)
+stats <- data.frame(sum_nt_input = sum_data, sum_nt_filtered = sum_filtered, quotient_of_nt = difference_nt, loss_of_nt = loss_nt, mean_length_input = length_data, mean_length_filtered = length_filtered, flag_1_ratio = length(which(data_filtered$containsMaxpos == 1))/dim(data_filtered)[1])
+stats <- t(stats)
 write.table(stats, output_stats, col.names = FALSE, quote = FALSE, sep = '\t')