loosolab · renewiegandt · Jan 9, 2019 · Jan 4, 2019 · Jan 4, 2019 · Jan 4, 2019
diff --git a/bin/1.2_filter_motifs/compareBed.sh b/bin/1.2_filter_motifs/compareBed.sh
@@ -1,11 +1,11 @@
 #!/bin/bash
-#data
-#motifs
-#workdir
-#fasta
-#min
-#max
-#output
+
+# This script utilizes bedtools to obtain the non-overlapping sequence parts between bed-files
+# merge.R and maxScore.R must be stored in the same directory as this script for it to work
+# For more information read the wiki or run ./compareBed.sh without parameters
+# author Jannik Hamp
+# jannik.hamp@googlemail.com
+
 wrong=()
 da=false
 mo=false
@@ -17,11 +17,13 @@ ou=false
 pa=false
 he=false
 
+#no parameters chosen means help is going to be displayed
 if [ $# -eq 0 ]
 then
 	he=true
 fi
 
+# parsing the parameters: if a parameter is chosen, a value must be provided as well (there are no standalone parameters)
 while [[ $# -gt 0 ]]
 do
 key="$1"
@@ -87,6 +89,7 @@ case $key in
 esac
 done
 
+# if unknown parameters were collected in 'wrong' during parsing, report them and exit with an error
 count=${#wrong[@]}
 if [ $count -gt 0 ]
 then
@@ -99,6 +102,7 @@ then
 exit 1
 fi
 
+# the help message
 if [ $he == true ]
 then
 	echo "This script utilies bedtools to select new footprints from data."
@@ -119,13 +123,14 @@ then
 	echo "              -w   --workdir      the path to directory where temporary files will be created"
 	echo "                                  default is the current directory"
 	echo "              -min --min          minimum size of footprints; default is 10"
-	echo "              -max --max          maximum size of footprints; default is 80"
+	echo "              -max --max          maximum size of footprints; default is 200"
 	echo "              -o   --output       output path/file ; default dir is workdir and filename is newMotifs.bed and newMotifs.bed.fasta"
 	echo "              -h --help           shows this help message"
 	echo "              -p --path           the path where the required scripts merge.R and maxScore.R are stored. Default: same path as this scripts path"
 exit 0
 fi
 
+# prints which parameters have been selected
 echo selected parameters
 echo -------------------
 echo data: $da \(required\)
@@ -138,19 +143,22 @@ echo output: $ou
 echo help: $he
 echo path of scripts: $pa
 
+#checks if the 3 required parameters have been selected
 if [ $da == false ] || [ $mo == false ] || [ $fa == false ]
 then
 	echo required parameters not given.
 	echo required are: --data \<path/data.bed\> --motifs \<path/motifs.bed\> --fasta \<path/file.fasta\>
 	exit 1
 fi
 
+#workdir is set to the current directory if it was not specified with a parameter
 if [ $wo == false ]
 then
 	wo=true
 	workdir=$PWD
 fi
 
+#if output was not specified it will be at "$workdir"/newMotifs.bed
 if [ $ou == false ]
 then
 	output=${workdir}/"newMotifs.bed"
@@ -165,19 +173,25 @@ fi
 
 if [ $ma == false ]
 then
-	max=80
+	max=200
 	ma=true
 fi
 
+# by default, the scripts merge.R and maxScore.R are expected in the same directory as compareBed.sh
+# the `echo $0 | sed 's/\/[^\/]*$/\//g'` command extracts the directory part of $0, i.e. the location where this script itself is stored
 if [ $pa == false ]
 then
 	path=`echo $0 | sed 's/\/[^\/]*$/\//g'`
 	pa=true
 fi
 
 #1. first filter. no overlap vs. overlap
+# This is done with the data of new footprints and each motif file subsequently.
+# The output of one iteration is the input data for the new footprints of the next iteration, in which a new motif is checked.
 echo get sequences with no overlap
 cat $data > "$workdir"/pass1Tr.bed
+# the flag 'help' is needed because in bash a file cannot be written to while data is still being read from it.
+# Thus, two files are used alternately: each iteration reads from one and writes to the other
 help=true
 
 if [ -d "$motifs" ]
@@ -194,6 +208,7 @@ do
 	fi
 	echo $i
 done
+# if the -m parameter is not a directory, it is expected to be a comma separated list of files
 else
 declare -a motiffiles=(`echo $motifs | sed 's/,/ /g'`)
 for i in ${motiffiles[@]}
@@ -210,6 +225,7 @@ do
 done
 fi
 
+# After the final iteration, the last output is written to pass1Tr.bed if it is not already there
 if [ $help == false ]
 then
 	cat "$workdir"/pass1TrHelp.bed > "$workdir"/pass1Tr.bed
@@ -256,6 +272,7 @@ do
 done
 fi
 
+# After the last iteration of the subtract loop, its output is written to pass2Tr.bed if it is not already there
 if [ $help == false ]
 then
 	cat "$workdir"/pass1FaHelp.bed > "$workdir"/pass2Tr.bed

diff --git a/bin/1.2_filter_motifs/maxScore.R b/bin/1.2_filter_motifs/maxScore.R
@@ -1,10 +1,16 @@
 #!/bin/Rscript
+
+# The script is used with the script: compareBed.sh
+# This calculates the absolute maxpos values of a bed-file and overwrites them
+# author: Jannik Hamp
+# email: jannik.hamp@googlemail.com
+
 library(data.table)
 args = commandArgs(TRUE)
 file = args[1]
 
 tab = fread(file)
-colnames(tab) = c("chromosome", "start", "stop", "id", "score", "length", "maxpos", "info")
+colnames(tab) = c("chromosome", "start", "stop", "id", "score", "strand", "length", "maxpos", "info")
 
 tab$maxpos = tab$start + tab$maxpos
 

diff --git a/bin/1.2_filter_motifs/merge.R b/bin/1.2_filter_motifs/merge.R
@@ -1,29 +1,50 @@
 #!/bin/Rscript
+
+# The script is used in the script: compareBed.sh
+# This script merges the non-overlapping regions and the non-overlapping parts of overlapping regions
+# It also removes sequences that are smaller than the --min parameter or bigger than the --max parameter
+# Additionally, information of the comparison is written into an output file
+# author Jannik Hamp
+# email jannik.hamp@googlemail.com
+
 library(data.table)
 args=commandArgs(TRUE)
 min=as.numeric(args[1])
 max=as.numeric(args[2])
 folder=args[3]
 
+# reading the first dataframe, called splitted (contains regions that were split because of partial overlap)
 splitted = fread(paste(folder, "/pass2Tr.bed", sep=''))
-colnames(splitted) = c("chromosome", "start", "stop", "id", "score", "length", "maxpos", "info")
+colnames(splitted) = c("chromosome", "start", "stop", "id", "score", "strand", "length", "maxpos", "info")
+
+# reading the second dataframe: called p1 (all sequences with zero overlap)
 p1 = fread(paste(folder, "/pass1Tr.bed", sep=''))
-colnames(p1) = c("chromosome", "start", "stop", "id", "score", "length", "maxpos", "info")
+colnames(p1) = c("chromosome", "start", "stop", "id", "score", "strand", "length", "maxpos", "info")
 
+# calculates the absolute position of maximum signal of each footprint
 p1$maxpos = p1$start + p1$maxpos
 
+# the new combined dataframe is now called splitted
 splitted=rbind(splitted, p1)
 
+# only keep entries whose length (stop - start) is at least min and at most max
 splitted=splitted[which(splitted$stop - splitted$start >= min),]
 splitted=splitted[which(splitted$stop - splitted$start <= max),]
+
+# make the ids unique (some footprints were split in two, which leaves duplicated ids)
 splitted$id=make.unique(as.character(splitted$id))
+# calculate new length values (because of the splitted footprints)
 splitted$length=splitted$stop - splitted$start
 
+# add column containsMaxpos: 1 if the absolute maxpos lies inside the entry (start <= maxpos < stop), i.e. maxpos does not overlap any motif; 0 otherwise
 splitted=cbind(splitted, containsMaxpos=0)
 splitted$containsMaxpos[intersect(which(splitted$start <= splitted$maxpos), which(splitted$stop > splitted$maxpos))] = 1
+
+#calculate relative maxpos values
 splitted$maxpos = splitted$maxpos - splitted$start
 data.table::fwrite(splitted, paste(folder, "/merged.bed", sep=''), row.names=FALSE, col.names=FALSE, quote=FALSE, sep='\t')
 
+# additional information about the comparison of the unknown footprints and the known motifs is computed and written to FilterMotifs.stats
 before = fread(args[4], header=FALSE)
 
 sumb=sum(before$V3-before$V2)
@@ -33,4 +54,4 @@ loss = formatC(1 - suma/sumb, digits=2)
 lengthb = formatC(mean(before$V3-before$V2), digits=4)
 lengtha = formatC(mean(splitted$length), digits=4)
 stats=data.frame(sum_nt_input=sumb, sum_nt_filtered=suma, factor=difference, loss=loss, mean_length_input=lengthb, mean_length_filtered=lengtha, flag_1_ratio=length(which(splitted$containsMaxpos == 1))/dim(splitted)[1])
-write.table(stats, "../FilterMotifs.stats", row.names=FALSE, quote=FALSE, sep='\t')
+write.table(stats, "./FilterMotifs.stats", row.names=FALSE, quote=FALSE, sep='\t')