From 694291cba9a542cea84365ec79c23c6970a2a2cd Mon Sep 17 00:00:00 2001 From: JannikHamp Date: Fri, 4 Jan 2019 12:20:06 +0100 Subject: [PATCH 01/21] updated script. Now the path of subscripts is automatically aquired. Fixed error which wrote output to the path of subscripts instead of the working directory --- bin/1.2_filter_motifs/compareBed.sh | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/bin/1.2_filter_motifs/compareBed.sh b/bin/1.2_filter_motifs/compareBed.sh index aa2c37e..938be63 100644 --- a/bin/1.2_filter_motifs/compareBed.sh +++ b/bin/1.2_filter_motifs/compareBed.sh @@ -14,6 +14,7 @@ fa=false mi=false ma=false ou=false +pa=false he=false if [ $# -eq 0 ] @@ -70,6 +71,7 @@ case $key in ;; -p|--path) path="$2" + pa=true shift shift ;; @@ -120,6 +122,7 @@ then echo " -max --max maximum size of footprints; default is 80" echo " -o --output output path/file ; default dir is workdir and filename is newMotifs.bed and newMotifs.bed.fasta" echo " -h --help shows this help message" + echo " -p --path the path where the required scripts merge.R and maxScore.R are stored. Default: same path as this scripts path" exit 0 fi @@ -133,6 +136,7 @@ echo min: $mi echo max: $ma echo output: $ou echo help: $he +echo path of scripts: $pa if [ $da == false ] || [ $mo == false ] || [ $fa == false ] then @@ -143,12 +147,8 @@ fi if [ $wo == false ] then - #if [ ! -d workdir1337 ] - #then - # mkdir workdir1337 - #fi wo=true - workdir=$path + workdir=$PWD fi if [ $ou == false ] @@ -169,9 +169,10 @@ then ma=true fi -if [ ! -d $workdir ] +if [ $pa == false ] then - mkdir $workdir + path=`echo $0 | sed 's/\/[^\/]*$/\//g'` + pa=true fi #1. first filter. no overlap vs. overlap @@ -268,8 +269,5 @@ fi Rscript --vanilla $path/merge.R $min $max $workdir $data #5. add fasta sequences to bed and create fasta file -bedtools getfasta -fi $fasta -bed "$workdir"/merged.bed -bedOut > $output +bedtools getfasta -fi $fasta -bed $workdir/merged.bed -bedOut > $output bedtools getfasta -name -fi $fasta -bed "$output" -fo "$output".fasta - -#6 clean up -#rm "$workdir"/pass1Fa.bed "$workdir"/pass1Tr.bed "$workdir"/pass2Tr.bed "$workdir"/merged.bed "$workdir"/pass1FaHelp.bed "$workdir"/pass1TrHelp.bed From 3ebb1b8c6cd96abaffa7dc378c885ca427fcd6ef Mon Sep 17 00:00:00 2001 From: JannikHamp Date: Fri, 4 Jan 2019 16:28:59 +0100 Subject: [PATCH 02/21] added some documentation --- bin/1.2_filter_motifs/compareBed.sh | 33 ++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/bin/1.2_filter_motifs/compareBed.sh b/bin/1.2_filter_motifs/compareBed.sh index 938be63..5e8e6d9 100644 --- a/bin/1.2_filter_motifs/compareBed.sh +++ b/bin/1.2_filter_motifs/compareBed.sh @@ -1,11 +1,11 @@ #!/bin/bash -#data -#motifs -#workdir -#fasta -#min -#max -#output + +# This script utilizes bedtools to gain non-overlapping sequence parts between bed-files +# merge.R and maxScore.R are needed to be saved in the same directory than this to make it work +# For more information read the wiki or run ./compareBed.sh without parameters +# author Jannik Hamp +# jannik.hamp@googlemail.com + wrong=() da=false mo=false @@ -17,11 +17,13 @@ ou=false pa=false he=false +#no parameters chosen means help is going to be displayed if [ $# -eq 0 ] then he=true fi +#parsing the parameters, if a parameter is chosen, a value must be provided aswell. No standalone parameters while [[ $# -gt 0 ]] do key="$1" @@ -87,6 +89,7 @@ case $key in esac done +# stores unknown selected parameters for error report count=${#wrong[@]} if [ $count -gt 0 ] then @@ -99,6 +102,7 @@ then exit 1 fi +# the help message if [ $he == true ] then echo "This script utilies bedtools to select new footprints from data." @@ -126,6 +130,7 @@ then exit 0 fi +# prints which parameters have been selected echo selected parameters echo ------------------- echo data: $da \(required\) @@ -138,6 +143,7 @@ echo output: $ou echo help: $he echo path of scripts: $pa +#checks if the 3 required parameters have been selected if [ $da == false ] || [ $mo == false ] || [ $fa == false ] then echo required parameters not given. @@ -145,12 +151,14 @@ then exit 1 fi +#workdir is set to the current directory if it was not specified with a parameter if [ $wo == false ] then wo=true workdir=$PWD fi +#if output was not specified it will be at "$workdir"/newMotifs.bed if [ $ou == false ] then output=${workdir}/"newMotifs.bed" @@ -165,10 +173,12 @@ fi if [ $ma == false ] then - max=80 + max=200 ma=true fi +#default path of the scripts merge.R and maxScore.R is the same path as compareBed.sh has +# the `echo $0 | sed 's/\/[^\/]*$/\//g'` command extracts the path from $0, where the command itself is stored if [ $pa == false ] then path=`echo $0 | sed 's/\/[^\/]*$/\//g'` @@ -176,8 +186,12 @@ then fi #1. first filter. no overlap vs. overlap +# This is done with the data of new footprints and each motif file subsequently. +# The output of one iteration is the input data for the new footprints of the next iteration, in which a new motif is checked. echo get sequences with no overlap cat $data > "$workdir"/pass1Tr.bed +# help variable is needed, because in bash I cannot write to the same file from which I am also reading the data. +# Thus, there are 2 files which are alternating in each iteration help=true if [ -d "$motifs" ] @@ -194,6 +208,7 @@ do fi echo $i done +# if the -m parameter is not a directory, it is expected to be a comma separated list of files else declare -a motiffiles=(`echo $motifs | sed 's/,/ /g'`) for i in ${motiffiles[@]} @@ -210,6 +225,7 @@ do done fi +# After the final iteration, the last output is written to pass1Tr.bed if it is not already there if [ $help == false ] then cat "$workdir"/pass1TrHelp.bed > "$workdir"/pass1Tr.bed @@ -256,6 +272,7 @@ do done fi +#The output of the last iteration of the subtract loop is written to pass2Tr.bed if [ $help == false ] then cat "$workdir"/pass1FaHelp.bed > "$workdir"/pass2Tr.bed From 7d834a4a37be5ba4054a1e13333cfcea8d437056 Mon Sep 17 00:00:00 2001 From: JannikHamp Date: Fri, 4 Jan 2019 16:32:19 +0100 Subject: [PATCH 03/21] added collumn for strand information --- bin/1.2_filter_motifs/maxScore.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/1.2_filter_motifs/maxScore.R b/bin/1.2_filter_motifs/maxScore.R index 6b90984..fae6af7 100644 --- a/bin/1.2_filter_motifs/maxScore.R +++ b/bin/1.2_filter_motifs/maxScore.R @@ -4,7 +4,7 @@ args = commandArgs(TRUE) file = args[1] tab = fread(file) -colnames(tab) = c("chromosome", "start", "stop", "id", "score", "length", "maxpos", "info") +colnames(tab) = c("chromosome", "start", "stop", "id", "score", "strand", "length", "maxpos", "info") tab$maxpos = tab$start + tab$maxpos From 7de9a1b3e9e9e937310b86afaff35f06ebf6c8ac Mon Sep 17 00:00:00 2001 From: JannikHamp Date: Fri, 4 Jan 2019 16:34:07 +0100 Subject: [PATCH 04/21] added column "strand" --- bin/1.2_filter_motifs/merge.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bin/1.2_filter_motifs/merge.R b/bin/1.2_filter_motifs/merge.R index 129e41d..c4a0d4d 100644 --- a/bin/1.2_filter_motifs/merge.R +++ b/bin/1.2_filter_motifs/merge.R @@ -6,9 +6,9 @@ max=as.numeric(args[2]) folder=args[3] splitted = fread(paste(folder, "/pass2Tr.bed", sep='')) -colnames(splitted) = c("chromosome", "start", "stop", "id", "score", "length", "maxpos", "info") +colnames(splitted) = c("chromosome", "start", "stop", "id", "score", "strand", "length", "maxpos", "info") p1 = fread(paste(folder, "/pass1Tr.bed", sep='')) -colnames(p1) = c("chromosome", "start", "stop", "id", "score", "length", "maxpos", "info") +colnames(p1) = c("chromosome", "start", "stop", "id", "score", "strand", "length", "maxpos", "info") p1$maxpos = p1$start + p1$maxpos @@ -33,4 +33,4 @@ loss = formatC(1 - suma/sumb, digits=2) lengthb = formatC(mean(before$V3-before$V2), digits=4) lengtha = formatC(mean(splitted$length), digits=4) stats=data.frame(sum_nt_input=sumb, sum_nt_filtered=suma, factor=difference, loss=loss, mean_length_input=lengthb, mean_length_filtered=lengtha, flag_1_ratio=length(which(splitted$containsMaxpos == 1))/dim(splitted)[1]) -write.table(stats, "../FilterMotifs.stats", row.names=FALSE, quote=FALSE, sep='\t') +write.table(stats, "./FilterMotifs.stats", row.names=FALSE, quote=FALSE, sep='\t') From 1a498214752125b5701beea593d0b4c0801c7416 Mon Sep 17 00:00:00 2001 From: JannikHamp Date: Fri, 4 Jan 2019 16:35:55 +0100 Subject: [PATCH 05/21] documentation --- bin/1.2_filter_motifs/compareBed.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/1.2_filter_motifs/compareBed.sh b/bin/1.2_filter_motifs/compareBed.sh index 5e8e6d9..d0a1baf 100644 --- a/bin/1.2_filter_motifs/compareBed.sh +++ b/bin/1.2_filter_motifs/compareBed.sh @@ -123,7 +123,7 @@ then echo " -w --workdir the path to directory where temporary files will be created" echo " default is the current directory" echo " -min --min minimum size of footprints; default is 10" - echo " -max --max maximum size of footprints; default is 80" + echo " -max --max maximum size of footprints; default is 200" echo " -o --output output path/file ; default dir is workdir and filename is newMotifs.bed and newMotifs.bed.fasta" echo " -h --help shows this help message" echo " -p --path the path where the required scripts merge.R and maxScore.R are stored. Default: same path as this scripts path" From 9b5b06135e24f46ff62c5889e7566651e38fb061 Mon Sep 17 00:00:00 2001 From: JannikHamp Date: Fri, 4 Jan 2019 16:51:17 +0100 Subject: [PATCH 06/21] documentation --- bin/1.2_filter_motifs/merge.R | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/bin/1.2_filter_motifs/merge.R b/bin/1.2_filter_motifs/merge.R index c4a0d4d..750cf93 100644 --- a/bin/1.2_filter_motifs/merge.R +++ b/bin/1.2_filter_motifs/merge.R @@ -1,29 +1,49 @@ #!/bin/Rscript + +# This script merges the non-overlapping regions and the non-overlapping parts of overlapping regions +# It also removes sequences that are smaller than the --min paramter or bigger than the --max parameter +# Additionally, information of the comparison is written into an output file +# author Janni Hamp +# email jannik.hamp@googlemail.com + library(data.table) args=commandArgs(TRUE) min=as.numeric(args[1]) max=as.numeric(args[2]) folder=args[3] +# reading the first dataframe: called splitted (contains splitted regions because of partial overlap) splitted = fread(paste(folder, "/pass2Tr.bed", sep='')) colnames(splitted) = c("chromosome", "start", "stop", "id", "score", "strand", "length", "maxpos", "info") + +# reading the second dataframe: called p1 (all sequences with zero overlap) p1 = fread(paste(folder, "/pass1Tr.bed", sep='')) colnames(p1) = c("chromosome", "start", "stop", "id", "score", "strand", "length", "maxpos", "info") +# calculates the absolute position of maximum signal of each footprint p1$maxpos = p1$start + p1$maxpos +# the new combined dataframe is now called splitted splitted=rbind(splitted, p1) +# only keep entries with a larger sequence than min and a smaller sequence than max splitted=splitted[which(splitted$stop - splitted$start >= min),] splitted=splitted[which(splitted$stop - splitted$start <= max),] + +# make the ids unique (because of duplicated ids of some footprints that got spliited in 2) splitted$id=make.unique(as.character(splitted$id)) +# calculate new length values (because of the splitted footprints) splitted$length=splitted$stop - splitted$start +# add column containsMaxpos (0 means that maxpos has any overlap and 1 means that maxpos has no overlap with any motif) splitted=cbind(splitted, containsMaxpos=0) splitted$containsMaxpos[intersect(which(splitted$start <= splitted$maxpos), which(splitted$stop > splitted$maxpos))] = 1 + +#calculate relative maxpos values splitted$maxpos = splitted$maxpos - splitted$start data.table::fwrite(splitted, paste(folder, "/merged.bed", sep=''), row.names=FALSE, col.names=FALSE, quote=FALSE, sep='\t') +#aditional information of the comparison of the unknown footprints and the known motifs are computed and written to FilterMotifs.stats before = fread(args[4], header=FALSE) sumb=sum(before$V3-before$V2) From 9ecba8ac1de0e83a556f723dd8277ad49453b3a2 Mon Sep 17 00:00:00 2001 From: JannikHamp Date: Fri, 4 Jan 2019 17:08:38 +0100 Subject: [PATCH 07/21] documentation --- bin/1.2_filter_motifs/maxScore.R | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/bin/1.2_filter_motifs/maxScore.R b/bin/1.2_filter_motifs/maxScore.R index fae6af7..e93429a 100644 --- a/bin/1.2_filter_motifs/maxScore.R +++ b/bin/1.2_filter_motifs/maxScore.R @@ -1,4 +1,10 @@ #!/bin/Rscript + +# The script is used with the script: compareBed.sh +# This calculates the absolute maxpos values of a bed-file and overwrites them +# author: Jannik Hamp +# email: jannik.hamp@googlemail.com + library(data.table) args = commandArgs(TRUE) file = args[1] From 58eff9871baa56c7765731870e09b247f5002151 Mon Sep 17 00:00:00 2001 From: JannikHamp Date: Fri, 4 Jan 2019 17:09:40 +0100 Subject: [PATCH 08/21] documentation --- bin/1.2_filter_motifs/merge.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bin/1.2_filter_motifs/merge.R b/bin/1.2_filter_motifs/merge.R index 750cf93..507a746 100644 --- a/bin/1.2_filter_motifs/merge.R +++ b/bin/1.2_filter_motifs/merge.R @@ -1,9 +1,10 @@ #!/bin/Rscript +# The script is used in the script: compareBed.sh # This script merges the non-overlapping regions and the non-overlapping parts of overlapping regions # It also removes sequences that are smaller than the --min paramter or bigger than the --max parameter # Additionally, information of the comparison is written into an output file -# author Janni Hamp +# author Jannik Hamp # email jannik.hamp@googlemail.com library(data.table) From b1cab105b50e7537cc28187ffd3352c13ce6f1c3 Mon Sep 17 00:00:00 2001 From: JannikHamp Date: Sat, 5 Jan 2019 19:03:55 +0100 Subject: [PATCH 09/21] Update compareBed.sh --- bin/1.2_filter_motifs/compareBed.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/1.2_filter_motifs/compareBed.sh b/bin/1.2_filter_motifs/compareBed.sh index d0a1baf..e1ac62f 100644 --- a/bin/1.2_filter_motifs/compareBed.sh +++ b/bin/1.2_filter_motifs/compareBed.sh @@ -1,7 +1,7 @@ #!/bin/bash # This script utilizes bedtools to gain non-overlapping sequence parts between bed-files -# merge.R and maxScore.R are needed to be saved in the same directory than this to make it work +# merge.R and maxScore.R are needed to be saved in the same directory as this to make it work # For more information read the wiki or run ./compareBed.sh without parameters # author Jannik Hamp # jannik.hamp@googlemail.com From cca8ac474cb4980eb40d55c2d457cf0e260e6083 Mon Sep 17 00:00:00 2001 From: renewiegandt Date: Sun, 6 Jan 2019 15:34:38 +0100 Subject: [PATCH 10/21] Added check for \t at the end of lines in BED-file --- bin/1.2_filter_motifs/compareBed.sh | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/bin/1.2_filter_motifs/compareBed.sh b/bin/1.2_filter_motifs/compareBed.sh index e1ac62f..99423f7 100644 --- a/bin/1.2_filter_motifs/compareBed.sh +++ b/bin/1.2_filter_motifs/compareBed.sh @@ -89,7 +89,7 @@ case $key in esac done -# stores unknown selected parameters for error report +# stores unknown selected parameters for error report count=${#wrong[@]} if [ $count -gt 0 ] then @@ -178,7 +178,7 @@ then fi #default path of the scripts merge.R and maxScore.R is the same path as compareBed.sh has -# the `echo $0 | sed 's/\/[^\/]*$/\//g'` command extracts the path from $0, where the command itself is stored +# the `echo $0 | sed 's/\/[^\/]*$/\//g'` command extracts the path from $0, where the command itself is stored if [ $pa == false ] then path=`echo $0 | sed 's/\/[^\/]*$/\//g'` @@ -198,6 +198,7 @@ if [ -d "$motifs" ] then for i in "$motifs"/*.bed do + sed -i 's/[ \t]*$//' $i if [ $help == true ] then help=false @@ -272,7 +273,7 @@ do done fi -#The output of the last iteration of the subtract loop is written to pass2Tr.bed +#The output of the last iteration of the subtract loop is written to pass2Tr.bed if [ $help == false ] then cat "$workdir"/pass1FaHelp.bed > "$workdir"/pass2Tr.bed From f983d2222bf681bc10ca4e5d6487e3312b42965d Mon Sep 17 00:00:00 2001 From: renewiegandt Date: Sun, 6 Jan 2019 15:35:17 +0100 Subject: [PATCH 11/21] Remove header = false from fread --- bin/1.2_filter_motifs/merge.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/1.2_filter_motifs/merge.R b/bin/1.2_filter_motifs/merge.R index 507a746..9d97da0 100644 --- a/bin/1.2_filter_motifs/merge.R +++ b/bin/1.2_filter_motifs/merge.R @@ -45,7 +45,7 @@ splitted$maxpos = splitted$maxpos - splitted$start data.table::fwrite(splitted, paste(folder, "/merged.bed", sep=''), row.names=FALSE, col.names=FALSE, quote=FALSE, sep='\t') #aditional information of the comparison of the unknown footprints and the known motifs are computed and written to FilterMotifs.stats -before = fread(args[4], header=FALSE) +before = fread(args[4]) sumb=sum(before$V3-before$V2) suma=sum(splitted$length) From 1134d7485c0814ca73e0ecf002edb2247d60d213 Mon Sep 17 00:00:00 2001 From: renewiegandt Date: Sun, 6 Jan 2019 21:49:02 +0100 Subject: [PATCH 12/21] merge.R: Set separator from auto to '\t' in fread --- bin/1.2_filter_motifs/merge.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/1.2_filter_motifs/merge.R b/bin/1.2_filter_motifs/merge.R index 9d97da0..bd009cc 100644 --- a/bin/1.2_filter_motifs/merge.R +++ b/bin/1.2_filter_motifs/merge.R @@ -14,11 +14,11 @@ max=as.numeric(args[2]) folder=args[3] # reading the first dataframe: called splitted (contains splitted regions because of partial overlap) -splitted = fread(paste(folder, "/pass2Tr.bed", sep='')) +splitted = fread(paste(folder, "/pass2Tr.bed", sep=''), sep = '\t') colnames(splitted) = c("chromosome", "start", "stop", "id", "score", "strand", "length", "maxpos", "info") # reading the second dataframe: called p1 (all sequences with zero overlap) -p1 = fread(paste(folder, "/pass1Tr.bed", sep='')) +p1 = fread(paste(folder, "/pass1Tr.bed", sep=''), sep = '\t') colnames(p1) = c("chromosome", "start", "stop", "id", "score", "strand", "length", "maxpos", "info") # calculates the absolute position of maximum signal of each footprint From 37c69e6660ebd826241f4e46bc1bd951897d0459 Mon Sep 17 00:00:00 2001 From: JannikHamp Date: Tue, 8 Jan 2019 13:53:06 +0100 Subject: [PATCH 13/21] this r script is no more necessary, the other does its job --- bin/1.2_filter_motifs/maxScore.R | 17 ----------------- 1 file changed, 17 deletions(-) delete mode 100644 bin/1.2_filter_motifs/maxScore.R diff --git a/bin/1.2_filter_motifs/maxScore.R b/bin/1.2_filter_motifs/maxScore.R deleted file mode 100644 index e93429a..0000000 --- a/bin/1.2_filter_motifs/maxScore.R +++ /dev/null @@ -1,17 +0,0 @@ -#!/bin/Rscript - -# The script is used with the script: compareBed.sh -# This calculates the absolute maxpos values of a bed-file and overwrites them -# author: Jannik Hamp -# email: jannik.hamp@googlemail.com - -library(data.table) -args = commandArgs(TRUE) -file = args[1] - -tab = fread(file) -colnames(tab) = c("chromosome", "start", "stop", "id", "score", "strand", "length", "maxpos", "info") - -tab$maxpos = tab$start + tab$maxpos - -fwrite(tab, file, row.names=FALSE, col.names=FALSE, quote=FALSE, sep='\t') From 6802d72736a0c2a4f272f7b364e347389f4d0dd9 Mon Sep 17 00:00:00 2001 From: JannikHamp Date: Tue, 8 Jan 2019 13:54:10 +0100 Subject: [PATCH 14/21] New updated version. Faster and more robust --- bin/1.2_filter_motifs/compareBed.sh | 406 ++++++++++++++-------------- 1 file changed, 198 insertions(+), 208 deletions(-) diff --git a/bin/1.2_filter_motifs/compareBed.sh b/bin/1.2_filter_motifs/compareBed.sh index 99423f7..1571065 100644 --- a/bin/1.2_filter_motifs/compareBed.sh +++ b/bin/1.2_filter_motifs/compareBed.sh @@ -1,95 +1,101 @@ #!/bin/bash -# This script utilizes bedtools to gain non-overlapping sequence parts between bed-files -# merge.R and maxScore.R are needed to be saved in the same directory as this to make it work -# For more information read the wiki or run ./compareBed.sh without parameters -# author Jannik Hamp -# jannik.hamp@googlemail.com +# author: Jannik Hamp +# email: jannik.hamp@googlemail.com -wrong=() +# desciption: This scripts uses bedtools to find new, non overlapping regions +# between input data in BED-format and a group of known motifs in +# BED-format. + +# For parameter description, run the script without parameters or -h. + +# One R script is used, called compareBed_runinfo.R, stored in the same directory + +# default parameters +workdir=$PWD +output="newMotifs.bed" +min=10 +max=200 +path=`echo $0 | sed 's/\/[^\/]*$/\//g'` +help=false +# required parameters given da=false mo=false -wo=false fa=false -mi=false -ma=false -ou=false -pa=false -he=false -#no parameters chosen means help is going to be displayed +# display help when no parameters chosen if [ $# -eq 0 ] then he=true fi -#parsing the parameters, if a parameter is chosen, a value must be provided aswell. No standalone parameters +# parsing parameters +wrong=() while [[ $# -gt 0 ]] do -key="$1" + key="$1" + if [[ "^-" =~ $2 ]] + then + echo "Each parameter needs a value (except the help parameter), values must not start with a '-'!" + exit 1 + fi -case $key in - -d|--data) - data="$2" - da=true - shift - shift - ;; - -m|--motifs) - motifs="$2" - mo=true - shift - shift - ;; - -w|--workdir) - workdir="$2" - wo=true - shift - shift - ;; - -f|--fasta) - fasta="$2" - fa=true - shift - shift - ;; - -min|--min) - min=$2 - mi=true - shift - shift - ;; - -max|--max) - max=$2 - ma=true - shift - shift - ;; - -o|--output) - output="$2" - ou=true - shift - shift - ;; - -p|--path) - path="$2" - pa=true - shift - shift - ;; - -h|--help) - help=true - he=true - shift - ;; - *) - wrong+=("$1") - shift - ;; -esac + case $key in + -d|--data) + data="$2" + da=true + shift + shift + ;; + -m|--motifs) + motifs="$2" + mo=true + shift + shift + ;; + -w|--workdir) + workdir="$2" + shift + shift + ;; + -f|--fasta) + fasta="$2" + fa=true + shift + shift + ;; + -min|--min) + min=$2 + shift + shift + ;; + -max|--max) + max=$2 + shift + shift + ;; + -o|--output) + output="$2" + shift + shift + ;; + -p|--path) + path="$2" + shift + shift + ;; + -h|--help) + help=true + shift + ;; + *) + wrong+=("$1") + shift + ;; + esac done -# stores unknown selected parameters for error report +# if wrong parameters were chosen.. count=${#wrong[@]} if [ $count -gt 0 ] then @@ -103,7 +109,7 @@ exit 1 fi # the help message -if [ $he == true ] +if [ $help == true ] then echo "This script utilies bedtools to select new footprints from data." echo "Therefore the data is compared with known footprint motifs." @@ -123,169 +129,153 @@ then echo " -w --workdir the path to directory where temporary files will be created" echo " default is the current directory" echo " -min --min minimum size of footprints; default is 10" - echo " -max --max maximum size of footprints; default is 200" - echo " -o --output output path/file ; default dir is workdir and filename is newMotifs.bed and newMotifs.bed.fasta" + echo " -max --max maximum size of footprints; default is 80" + echo " -o --output output path/file ; default dir current directory and filename is newMotifs.bed and newMotifs.bed.fasta" echo " -h --help shows this help message" echo " -p --path the path where the required scripts merge.R and maxScore.R are stored. Default: same path as this scripts path" exit 0 fi -# prints which parameters have been selected +# summary of parameters echo selected parameters echo ------------------- -echo data: $da \(required\) -echo motifs: $mo \(required\) -echo workdir: $wo -echo fasta: $fa \(required\) -echo min: $mi -echo max: $ma -echo output: $ou -echo help: $he -echo path of scripts: $pa +echo data: $data \(required\) +echo motifs: $motifs \(required\) +echo workdir: $workdir +echo fasta: $fasta \(required\) +echo min: $min +echo max: $max +echo output: $output +echo path of scripts: $path -#checks if the 3 required parameters have been selected -if [ $da == false ] || [ $mo == false ] || [ $fa == false ] +# check required parameters +if [ -z $data ] || [ -z $motifs ] || [ -z $fasta ] then echo required parameters not given. echo required are: --data \ --motifs \ --fasta \ exit 1 fi -#workdir is set to the current directory if it was not specified with a parameter -if [ $wo == false ] -then - wo=true - workdir=$PWD -fi - -#if output was not specified it will be at "$workdir"/newMotifs.bed -if [ $ou == false ] -then - output=${workdir}/"newMotifs.bed" - ou=true -fi - -if [ $mi == false ] -then - min=10 - mi=true -fi - -if [ $ma == false ] -then - max=200 - ma=true -fi - -#default path of the scripts merge.R and maxScore.R is the same path as compareBed.sh has -# the `echo $0 | sed 's/\/[^\/]*$/\//g'` command extracts the path from $0, where the command itself is stored -if [ $pa == false ] -then - path=`echo $0 | sed 's/\/[^\/]*$/\//g'` - pa=true -fi - -#1. first filter. no overlap vs. overlap -# This is done with the data of new footprints and each motif file subsequently. -# The output of one iteration is the input data for the new footprints of the next iteration, in which a new motif is checked. -echo get sequences with no overlap -cat $data > "$workdir"/pass1Tr.bed -# help variable is needed, because in bash I cannot write to the same file from which I am also reading the data. -# Thus, there are 2 files which are alternating in each iteration -help=true +# comparing unknown footprints with regions of known motifs +# comparison is done iteratively +# remove overlapping regions in unknown footprints +# remove trailing tabs in footprints +cat $data | sed 's/[ \t]*$//' > "$workdir"/filtered.bed +temp_switch=true +all_empty=true if [ -d "$motifs" ] then -for i in "$motifs"/*.bed -do - sed -i 's/[ \t]*$//' $i - if [ $help == true ] + # check if all motiffiles are empty/only consist of header. exit if all are empty + for i in "$motifs"/*.bed + do + if [ $all_empty == true ] + then + lines=`cat $i | wc -l` + if [ $lines -gt 1 ] + then + all_empty=false + fi + fi + done + if [ $all_empty == true ] then - help=false - bedtools intersect -v -a "$workdir"/pass1Tr.bed -b $i > "$workdir"/pass1TrHelp.bed - else - help=true - bedtools intersect -v -a "$workdir"/pass1TrHelp.bed -b $i > "$workdir"/pass1Tr.bed + echo All motiffiles were empty! + echo Fix motiffiles and try again. + echo exiting + exit 1 fi - echo $i -done -# if the -m parameter is not a directory, it is expected to be a comma separated list of files + + # bedtools comparisons + for i in "$motifs"/*.bed + do + # remove trailing tabs in motiffile + sed -i 's/[ \t]*$//' $i + + if [ $temp_switch == true ] + then + temp_switch=false + bedtools subtract -a "$workdir"/filtered.bed -b $i > "$workdir"/filtered_temp.bed + else + temp_switch=true + bedtools subtract -a "$workdir"/filtered_temp.bed -b $i > "$workdir"/filtered.bed + fi + echo $i + done + +# the else case means, that the motiffiles were passed comma separated with no whitespace. else -declare -a motiffiles=(`echo $motifs | sed 's/,/ /g'`) -for i in ${motiffiles[@]} -do - if [ $help == true ] + declare -a motiffiles=(`echo $motifs | sed 's/,/ /g'`) + # check if files exist and if they are all empty (exiting if all empty) + for i in ${motiffiles[@]} + do + if [ -f $i ] + then + if [ $all_empty == true ] + then + lines=`cat $i | wc -l` + if [ $lines -gt 1 ] + then + all_empty=false + fi + fi + else + echo file $i does not exist + echo please use correct paths. exiting. + exit 1 + fi + done + if [ $all_empty == true ] then - help=false - bedtools intersect -v -a "$workdir"/pass1Tr.bed -b $i > "$workdir"/pass1TrHelp.bed - else - help=true - bedtools intersect -v -a "$workdir"/pass1TrHelp.bed -b $i > "$workdir"/pass1Tr.bed - fi - echo $i -done + echo All motiffiles were empty! + echo Fix motiffiles and try again. + echo exiting + exit 1 + fi + + # bedtools comparisons + for i in ${motiffiles[@]} + do + # remove trailing tabs in motiffile + sed -i 's/[ \t]*$//' $i + + if [ $temp_switch == true ] + then + help=false + bedtools subtract -a "$workdir"/filtered.bed -b $i > "$workdir"/filtered_temp.bed + else + temp_switch=true + bedtools subtract -a "$workdir"/filtered_temp.bed -b $i > "$workdir"/filtered.bed + fi + echo $i + done fi -# After the final iteration, the last output is written to pass1Tr.bed if it is not already there -if [ $help == false ] +# get file of last iteration an write its content into filtered.bed +if [ $temp_switch == false ] then - cat "$workdir"/pass1TrHelp.bed > "$workdir"/pass1Tr.bed + cat "$workdir"/filtered_temp.bed > "$workdir"/filtered.bed fi -echo get overlapping sequences -bedtools intersect -v -a $data -b "$workdir"/pass1Tr.bed > "$workdir"/pass1Fa.bed -cat "$workdir"/pass1Fa.bed | wc -l - -echo calc maxScore -#2. compute absolut maxscore position -Rscript --vanilla $path/maxScore.R "$workdir"/pass1Fa.bed - -#3 subtract overlapping regions for additional motifs -#echo "also get subsequences with no overlap of overlapping sequences" -help=true - -if [ -d "$motifs" ] +# remove short/long motivs, make unique ids (relevant for some splitted tfbs from subtract) and handle maxScorePosition +# also creates a small output file with information about the comparison +Rscript $path/compareBed_runinfo.R $min $max $data "$workdir"/filtered.bed "$workdir"/filtered_flagged.bed +# check if Rscript executed without errors +if [ $? -gt 0 ] then -for i in "$motifs"/*.bed -do - if [ $help == true ] - then - help=false - bedtools subtract -a "$workdir"/pass1Fa.bed -b $i > "$workdir"/pass1FaHelp.bed - else - help=true - bedtools subtract -a "$workdir"/pass1FaHelp.bed -b $i > "$workdir"/pass1Fa.bed - fi - echo $i -done -else -for i in ${motiffiles[@]} -do - if [ $help == true ] - then - help=false - bedtools subtract -a "$workdir"/pass1Fa.bed -b $i > "$workdir"/pass1FaHelp.bed - else - help=true - bedtools subtract -a "$workdir"/pass1FaHelp.bed -b $i > "$workdir"/pass1Fa.bed - fi - echo $i -done + exit 1 fi -#The output of the last iteration of the subtract loop is written to pass2Tr.bed -if [ $help == false ] +# check if header existed. If so, final output also has a header. +first_line=`sed -n 1p $data | sed "s/$/\tcontains_maxpos\tsequence/"` +if [[ ${first_line:0:1} == "#" ]] then - cat "$workdir"/pass1FaHelp.bed > "$workdir"/pass2Tr.bed + echo "$first_line" > $output else - cat "$workdir"/pass1Fa.bed > "$workdir"/pass2Tr.bed + # output will be overwritten if it exists + rm -f $output fi -#4. remove short/long motivs, make unique ids (relevant for some splitted tfbs from subtract) and handle maxScorePosition -# also creates a small output file with information about the comparison - -Rscript --vanilla $path/merge.R $min $max $workdir $data - -#5. add fasta sequences to bed and create fasta file -bedtools getfasta -fi $fasta -bed $workdir/merged.bed -bedOut > $output +# add fasta sequences to bed and create fasta file +bedtools getfasta -fi $fasta -bed "$workdir"/filtered_flagged.bed -bedOut >> $output bedtools getfasta -name -fi $fasta -bed "$output" -fo "$output".fasta From b787d14b46e9ad21d14f9df28161cb7c262c1cbc Mon Sep 17 00:00:00 2001 From: JannikHamp Date: Tue, 8 Jan 2019 13:55:39 +0100 Subject: [PATCH 15/21] updated version.. make unique updated, more robust in general --- bin/1.2_filter_motifs/compareBed_runinfo.R | 65 ++++++++++++++++++++++ bin/1.2_filter_motifs/merge.R | 57 ------------------- 2 files changed, 65 insertions(+), 57 deletions(-) create mode 100644 bin/1.2_filter_motifs/compareBed_runinfo.R delete mode 100644 bin/1.2_filter_motifs/merge.R diff --git a/bin/1.2_filter_motifs/compareBed_runinfo.R b/bin/1.2_filter_motifs/compareBed_runinfo.R new file mode 100644 index 0000000..4a7af6f --- /dev/null +++ b/bin/1.2_filter_motifs/compareBed_runinfo.R @@ -0,0 +1,65 @@ +#!/bin/Rscript + +# author: Jannik Hamp +# email: jannik.hamp@googlemail.com + +# desciption: This script gets called by compareBed.sh +# It removes too small/large footprints, makes names unique, +# adds a column with a flag "contains_maxpos" +# and creates a file with information of the bedtool comparison + +# TODO: check number of columns, implement make.unique in a better way + +# parsing parameters +library(data.table) +args = commandArgs(TRUE) +min = as.numeric(args[1]) +max = as.numeric(args[2]) +input_raw = args[3] +input_filtered = args[4] +output = args[5] + +data_filtered = fread(input_filtered) +# check if data has less than 9 columns +if (ncol(data_filtered) < 9) { + stop("footprint file has less than 9 columns. exiting.") +} + +# remove sequences that are smaller than minimum (parameter) +# remove sequences that are longer than maximum (parameter) +data_filtered = data_filtered[which(data_filtered[[3]] - data_filtered[[2]] >= min),] +data_filtered = data_filtered[which(data_filtered[[3]] - data_filtered[[2]] <= max),] + +# make unique names and adjust length for splitted footprints +# duplicated names have .0 , .1 , .2 ... added +names=data_filtered[[4]] +names(data_filtered)[4] = "name" +# all duplicated names +duplicants = unique(data_filtered[duplicated(name)][[4]]) +data_filtered[[4]] = make.unique(as.character(data_filtered[[4]])) +data_filtered[match(duplicants, names), name := paste0(name,".0")] + +# recalculate length of sequences +data_filtered[[7]] = data_filtered[[3]] - data_filtered[[2]] + +# adding column "contains_maxpos", containing flag (0 or 1) +# max_pos is the position of maximum score of a footprint +data_filtered = cbind(data_filtered, contains_maxpos = 0) +data_filtered$contains_maxpos[intersect(which(data_filtered[[2]] <= data_filtered[[8]]), which(data_filtered[[3]] > data_filtered[[8]]))] = 1 +data_filtered[[8]] = data_filtered[[8]] - data_filtered[[2]] + +fwrite(data_filtered, output, col.names=FALSE, quote = FALSE, sep = '\t') + +# data is the initial data before any comparisons have been done (-d parameter of compareBed.sh) +data = fread(input_raw) + +# some statistics about the bedtool comparisons are stored in FilterMotifs.stats +sum_data = sum(data[[3]]-data[[2]]) +sum_filtered = sum(data_filtered[[7]]) +difference_nt = formatC(sum_data/sum_filtered, digits = 4) +loss_nt = formatC(1 - sum_filtered/sum_data, digits = 2) +length_data = formatC(mean(data[[3]]-data[[2]]), digits = 4) +length_filtered = formatC(mean(data_filtered[[7]]), digits = 4) +stats = data.frame(sum_nt_input = sum_data, sum_nt_filtered = sum_filtered, quotient_of_nt = difference_nt, loss_of_nt = loss_nt, mean_length_input = length_data, mean_length_filtered = length_filtered, flag_1_ratio = length(which(data_filtered$containsMaxpos == 1))/dim(data_filtered)[1]) +stats = t(stats) +write.table(stats, "./FilterMotifs.stats", col.names = FALSE, quote = FALSE, sep = '\t') diff --git a/bin/1.2_filter_motifs/merge.R b/bin/1.2_filter_motifs/merge.R deleted file mode 100644 index bd009cc..0000000 --- a/bin/1.2_filter_motifs/merge.R +++ /dev/null @@ -1,57 +0,0 @@ -#!/bin/Rscript - -# The script is used in the script: compareBed.sh -# This script merges the non-overlapping regions and the non-overlapping parts of overlapping regions -# It also removes sequences that are smaller than the --min paramter or bigger than the --max parameter -# Additionally, information of the comparison is written into an output file -# author Jannik Hamp -# email jannik.hamp@googlemail.com - -library(data.table) -args=commandArgs(TRUE) -min=as.numeric(args[1]) -max=as.numeric(args[2]) -folder=args[3] - -# reading the first dataframe: called splitted (contains splitted regions because of partial overlap) -splitted = fread(paste(folder, "/pass2Tr.bed", sep=''), sep = '\t') -colnames(splitted) = c("chromosome", "start", "stop", "id", "score", "strand", "length", "maxpos", "info") - -# reading the second dataframe: called p1 (all sequences with zero overlap) -p1 = fread(paste(folder, "/pass1Tr.bed", sep=''), sep = '\t') -colnames(p1) = c("chromosome", "start", "stop", "id", "score", "strand", "length", "maxpos", "info") - -# calculates the absolute position of maximum signal of each footprint -p1$maxpos = p1$start + p1$maxpos - -# the new combined dataframe is now called splitted -splitted=rbind(splitted, p1) - -# only keep entries with a larger sequence than min and a smaller sequence than max -splitted=splitted[which(splitted$stop - splitted$start >= min),] -splitted=splitted[which(splitted$stop - splitted$start <= max),] - -# make the ids unique (because of duplicated ids of some footprints that got spliited in 2) -splitted$id=make.unique(as.character(splitted$id)) -# calculate new length values (because of the splitted footprints) -splitted$length=splitted$stop - splitted$start - -# add column containsMaxpos (0 means that maxpos has any overlap and 1 means that maxpos has no overlap with any motif) -splitted=cbind(splitted, containsMaxpos=0) -splitted$containsMaxpos[intersect(which(splitted$start <= splitted$maxpos), which(splitted$stop > splitted$maxpos))] = 1 - -#calculate relative maxpos values -splitted$maxpos = splitted$maxpos - splitted$start -data.table::fwrite(splitted, paste(folder, "/merged.bed", sep=''), row.names=FALSE, col.names=FALSE, quote=FALSE, sep='\t') - -#aditional information of the comparison of the unknown footprints and the known motifs are computed and written to FilterMotifs.stats -before = fread(args[4]) - -sumb=sum(before$V3-before$V2) -suma=sum(splitted$length) -difference = formatC(sumb/suma, digits=4) -loss = formatC(1 - suma/sumb, digits=2) -lengthb = formatC(mean(before$V3-before$V2), digits=4) -lengtha = formatC(mean(splitted$length), digits=4) -stats=data.frame(sum_nt_input=sumb, sum_nt_filtered=suma, factor=difference, loss=loss, mean_length_input=lengthb, mean_length_filtered=lengtha, flag_1_ratio=length(which(splitted$containsMaxpos == 1))/dim(splitted)[1]) -write.table(stats, "./FilterMotifs.stats", row.names=FALSE, quote=FALSE, sep='\t') From becfeae762c4066439462438b6b758cd3cf46b6f Mon Sep 17 00:00:00 2001 From: JannikHamp Date: Tue, 8 Jan 2019 16:03:02 +0100 Subject: [PATCH 16/21] added information for logfile --- bin/1.2_filter_motifs/compareBed.sh | 146 ++++++++++++---------------- 1 file changed, 61 insertions(+), 85 deletions(-) diff --git a/bin/1.2_filter_motifs/compareBed.sh b/bin/1.2_filter_motifs/compareBed.sh index 1571065..1840ba8 100644 --- a/bin/1.2_filter_motifs/compareBed.sh +++ b/bin/1.2_filter_motifs/compareBed.sh @@ -18,15 +18,11 @@ min=10 max=200 path=`echo $0 | sed 's/\/[^\/]*$/\//g'` help=false -# required parameters given -da=false -mo=false -fa=false # display help when no parameters chosen if [ $# -eq 0 ] then - he=true + help=true fi # parsing parameters @@ -34,7 +30,7 @@ wrong=() while [[ $# -gt 0 ]] do key="$1" - if [[ "^-" =~ $2 ]] + if [[ ${2:0:1} == "-" ]] then echo "Each parameter needs a value (except the help parameter), values must not start with a '-'!" exit 1 @@ -43,13 +39,11 @@ do case $key in -d|--data) data="$2" - da=true shift shift ;; -m|--motifs) motifs="$2" - mo=true shift shift ;; @@ -60,7 +54,6 @@ do ;; -f|--fasta) fasta="$2" - fa=true shift shift ;; @@ -102,10 +95,10 @@ then for i in ${wrong[@]} do echo wrong parameter $i - echo call script without parameters for help or call --help - echo exit done -exit 1 + + echo call script without parameters for help or call --help + exit 1 fi # the help message @@ -151,105 +144,78 @@ echo path of scripts: $path # check required parameters if [ -z $data ] || [ -z $motifs ] || [ -z $fasta ] then + echo ERROR echo required parameters not given. echo required are: --data \ --motifs \ --fasta \ exit 1 fi -# comparing unknown footprints with regions of known motifs -# comparison is done iteratively -# remove overlapping regions in unknown footprints - # remove trailing tabs in footprints cat $data | sed 's/[ \t]*$//' > "$workdir"/filtered.bed -temp_switch=true all_empty=true + +# motiffiles either from a directory OR comma separated list if [ -d "$motifs" ] then - # check if all motiffiles are empty/only consist of header. exit if all are empty - for i in "$motifs"/*.bed - do + # creates an array of all files with bed in its name in the directory $motifs + declare -a motiffiles=(`ls $motifs | grep bed | sed "s|^|$motifs\/|g" | tr '\n' ' ' | sed "s|//|/|g"`) + +# the else case means, that the motiffiles were passed comma separated with no whitespace. +else + declare -a motiffiles=(`echo $motifs | sed 's/,/ /g'`) +fi + +# check if files exist and if they are all empty (exiting if all empty) +for i in ${motiffiles[@]} +do + if [ -f $i ] + then if [ $all_empty == true ] then lines=`cat $i | wc -l` if [ $lines -gt 1 ] then all_empty=false + break fi fi - done - if [ $all_empty == true ] - then - echo All motiffiles were empty! - echo Fix motiffiles and try again. - echo exiting + else + echo file $i does not exist + echo please use correct paths. exiting. exit 1 fi +done - # bedtools comparisons - for i in "$motifs"/*.bed - do - # remove trailing tabs in motiffile - sed -i 's/[ \t]*$//' $i - - if [ $temp_switch == true ] - then - temp_switch=false - bedtools subtract -a "$workdir"/filtered.bed -b $i > "$workdir"/filtered_temp.bed - else - temp_switch=true - bedtools subtract -a "$workdir"/filtered_temp.bed -b $i > "$workdir"/filtered.bed - fi - echo $i - done - -# the else case means, that the motiffiles were passed comma separated with no whitespace. -else - declare -a motiffiles=(`echo $motifs | sed 's/,/ /g'`) - # check if files exist and if they are all empty (exiting if all empty) - for i in ${motiffiles[@]} - do - if [ -f $i ] - then - if [ $all_empty == true ] - then - lines=`cat $i | wc -l` - if [ $lines -gt 1 ] - then - all_empty=false - fi - fi - else - echo file $i does not exist - echo please use correct paths. exiting. - exit 1 - fi - done - if [ $all_empty == true ] +# error report of rare case of only empty motiffiles +if [ $all_empty == true ] then + echo ERROR echo All motiffiles were empty! echo Fix motiffiles and try again. - echo exiting exit 1 - fi +fi - # bedtools comparisons - for i in ${motiffiles[@]} - do - # remove trailing tabs in motiffile - sed -i 's/[ \t]*$//' $i +# comparing unknown footprints with regions of known motifs +# comparison is done iteratively +# remove overlapping regions in unknown footprints +temp_switch=true +counter=1 +for i in ${motiffiles[@]} +do + # remove trailing tabs in motiffile + sed -i 's/[ \t]*$//' $i - if [ $temp_switch == true ] - then - help=false - bedtools subtract -a "$workdir"/filtered.bed -b $i > "$workdir"/filtered_temp.bed - else - temp_switch=true - bedtools subtract -a "$workdir"/filtered_temp.bed -b $i > "$workdir"/filtered.bed - fi - echo $i - done -fi + if [ $temp_switch == true ] + then + temp_switch=false + bedtools subtract -a "$workdir"/filtered.bed -b $i > "$workdir"/filtered_temp.bed + else + temp_switch=true + bedtools subtract -a "$workdir"/filtered_temp.bed -b $i > "$workdir"/filtered.bed + fi + echo "$i --- $counter of ${#motiffiles[@]}" + counter=`expr $counter + 1` +done # get file of last iteration an write its content into filtered.bed if [ $temp_switch == false ] @@ -259,7 +225,7 @@ fi # remove short/long motivs, make unique ids (relevant for some splitted tfbs from subtract) and handle maxScorePosition # also creates a small output file with information about the comparison -Rscript $path/compareBed_runinfo.R $min $max $data "$workdir"/filtered.bed "$workdir"/filtered_flagged.bed +Rscript $path/compareBed_runinfo.R $min $max $data "$workdir"/filtered.bed "$workdir"/filtered_flagged.bed "$workdir"/FilterMotifs.stats # check if Rscript executed without errors if [ $? -gt 0 ] then @@ -271,9 +237,19 @@ first_line=`sed -n 1p $data | sed "s/$/\tcontains_maxpos\tsequence/"` if [[ ${first_line:0:1} == "#" ]] then echo "$first_line" > $output + # add some final values to the log file + fp_initial=`cat $data | wc -l` + fp_initial=`expr $fp_initial - 1` + fp_final=`cat "$workdir"/filtered.bed | wc -l` + fp_final=`expr $fp_final - 1` + echo $fp_initial | sed 's/^/initial number of footprints: /g' >> "$workdir"/FilterMotifs.stats + echo $fp_final | sed 's/^/number of footprints after subtract: /g' >> "$workdir"/FilterMotifs.stats else # output will be overwritten if it exists rm -f $output + # add some final values to the log file + cat $data | wc -l | sed 's/^/initial number of footprints: /g' >> "$workdir"/FilterMotifs.stats + cat "$workdir"/filtered.bed | wc -l | sed 's/^/number of footprints after subtract: /g' >> "$workdir"/FilterMotifs.stats fi # add fasta sequences to bed and create fasta file From c55e8ff1c5fb1525eb52e3e15c6cf25423e8ba05 Mon Sep 17 00:00:00 2001 From: JannikHamp Date: Tue, 8 Jan 2019 16:03:57 +0100 Subject: [PATCH 17/21] added dicumentation and parameter for .stats output file --- bin/1.2_filter_motifs/compareBed_runinfo.R | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/bin/1.2_filter_motifs/compareBed_runinfo.R b/bin/1.2_filter_motifs/compareBed_runinfo.R index 4a7af6f..4c5686d 100644 --- a/bin/1.2_filter_motifs/compareBed_runinfo.R +++ b/bin/1.2_filter_motifs/compareBed_runinfo.R @@ -8,7 +8,13 @@ # adds a column with a flag "contains_maxpos" # and creates a file with information of the bedtool comparison -# TODO: check number of columns, implement make.unique in a better way +# parameters: Parameters are not named. Respect the parameter order. +# min: minimum footprint size threshold +# max: maximum footprint size threshold +# input_raw: unfiltered BED-file +# input_filtered: filtered BED-file (after bedtools subtract) +# output: output path/file +# output_stats: output file with general info of the comparison # parsing parameters library(data.table) @@ -18,8 +24,10 @@ max = as.numeric(args[2]) input_raw = args[3] input_filtered = args[4] output = args[5] +output_stats = args[6] + +data_filtered = fread(input_filtered, sep='\t') -data_filtered = fread(input_filtered) # check if data has less than 9 columns if (ncol(data_filtered) < 9) { stop("footprint file has less than 9 columns. exiting.") @@ -51,7 +59,7 @@ data_filtered[[8]] = data_filtered[[8]] - data_filtered[[2]] fwrite(data_filtered, output, col.names=FALSE, quote = FALSE, sep = '\t') # data is the initial data before any comparisons have been done (-d parameter of compareBed.sh) -data = fread(input_raw) +data = fread(input_raw, sep='\t') # some statistics about the bedtool comparisons are stored in FilterMotifs.stats sum_data = sum(data[[3]]-data[[2]]) @@ -62,4 +70,4 @@ length_data = formatC(mean(data[[3]]-data[[2]]), digits = 4) length_filtered = formatC(mean(data_filtered[[7]]), digits = 4) stats = data.frame(sum_nt_input = sum_data, sum_nt_filtered = sum_filtered, quotient_of_nt = difference_nt, loss_of_nt = loss_nt, mean_length_input = length_data, mean_length_filtered = length_filtered, flag_1_ratio = length(which(data_filtered$containsMaxpos == 1))/dim(data_filtered)[1]) stats = t(stats) -write.table(stats, "./FilterMotifs.stats", col.names = FALSE, quote = FALSE, sep = '\t') +write.table(stats, output_stats, col.names = FALSE, quote = FALSE, sep = '\t') From 4acc20f8e761bf74e4363381f04176dc33a43f86 Mon Sep 17 00:00:00 2001 From: JannikHamp Date: Wed, 9 Jan 2019 10:13:10 +0100 Subject: [PATCH 18/21] documentation changes --- bin/1.2_filter_motifs/compareBed.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bin/1.2_filter_motifs/compareBed.sh b/bin/1.2_filter_motifs/compareBed.sh index 1840ba8..73bfa8f 100644 --- a/bin/1.2_filter_motifs/compareBed.sh +++ b/bin/1.2_filter_motifs/compareBed.sh @@ -125,7 +125,7 @@ then echo " -max --max maximum size of footprints; default is 80" echo " -o --output output path/file ; default dir current directory and filename is newMotifs.bed and newMotifs.bed.fasta" echo " -h --help shows this help message" - echo " -p --path the path where the required scripts merge.R and maxScore.R are stored. Default: same path as this scripts path" + echo " -p --path the path where the required script compareBed_runinfo.R is stored. Default: same path as this scripts path" exit 0 fi @@ -139,7 +139,7 @@ echo fasta: $fasta \(required\) echo min: $min echo max: $max echo output: $output -echo path of scripts: $path +echo relative path of subscript: $path # check required parameters if [ -z $data ] || [ -z $motifs ] || [ -z $fasta ] @@ -190,7 +190,7 @@ done if [ $all_empty == true ] then echo ERROR - echo All motiffiles were empty! + echo All motiffiles have less than 2 lines! echo Fix motiffiles and try again. exit 1 fi From 90c8c0593a636a00698508d8af454ba02c0e092d Mon Sep 17 00:00:00 2001 From: JannikHamp Date: Wed, 9 Jan 2019 15:29:41 +0100 Subject: [PATCH 19/21] updated check for trailing tabs in motiffiles --- bin/1.2_filter_motifs/compareBed.sh | 77 ++++++++++++++++++++--------- 1 file changed, 55 insertions(+), 22 deletions(-) diff --git a/bin/1.2_filter_motifs/compareBed.sh b/bin/1.2_filter_motifs/compareBed.sh index 73bfa8f..223f497 100644 --- a/bin/1.2_filter_motifs/compareBed.sh +++ b/bin/1.2_filter_motifs/compareBed.sh @@ -8,17 +8,19 @@ # BED-format. # For parameter description, run the script without parameters or -h. +# The output is a file with the filtered footprints and the log file FilterMotivs.stats -# One R script is used, called compareBed_runinfo.R, stored in the same directory +# One R script is used, compareBed_runinfo.R, stored in the same directory. # default parameters workdir=$PWD output="newMotifs.bed" min=10 max=200 -path=`echo $0 | sed 's/\/[^\/]*$/\//g'` help=false +path=`echo $0 | sed 's/\/[^\/]*$/\//g'` + # display help when no parameters chosen if [ $# -eq 0 ] then @@ -32,7 +34,7 @@ do key="$1" if [[ ${2:0:1} == "-" ]] then - echo "Each parameter needs a value (except the help parameter), values must not start with a '-'!" + echo "ERROR: Each parameter needs a value (except the help parameter), values must not start with a '-'!" exit 1 fi @@ -72,11 +74,6 @@ do shift shift ;; - -p|--path) - path="$2" - shift - shift - ;; -h|--help) help=true shift @@ -94,7 +91,7 @@ if [ $count -gt 0 ] then for i in ${wrong[@]} do - echo wrong parameter $i + echo ERROR: wrong parameter $i done echo call script without parameters for help or call --help @@ -104,11 +101,11 @@ fi # the help message if [ $help == true ] then - echo "This script utilies bedtools to select new footprints from data." + echo "This script utilizes bedtools to select new footprints from data." echo "Therefore the data is compared with known footprint motifs." - echo "The output is a new .bed file with added sequence information and a column with flags for better sequences (1)" - echo "Required arguments are data and motifs, both in bed-format, and the fasta genome sequences." - echo "If a parameter is chosen, a value must be provided or an error will occur." + echo "The output is a new BED-file with added sequence information and a flag contains_maxpos (1/0)" + echo "Required arguments are data, motifs, both in bed-format and the fasta genome sequences." + echo "If a parameter is chosen, a value must be provided aswell. (exception -h)" echo "--------------------" echo "usage: compareBed.sh --data --motifs --fasta [optional_parameter ] ..." echo "--------------------" @@ -125,8 +122,7 @@ then echo " -max --max maximum size of footprints; default is 80" echo " -o --output output path/file ; default dir current directory and filename is newMotifs.bed and newMotifs.bed.fasta" echo " -h --help shows this help message" - echo " -p --path the path where the required script compareBed_runinfo.R is stored. Default: same path as this scripts path" -exit 0 + exit 0 fi # summary of parameters @@ -139,7 +135,6 @@ echo fasta: $fasta \(required\) echo min: $min echo max: $max echo output: $output -echo relative path of subscript: $path # check required parameters if [ -z $data ] || [ -z $motifs ] || [ -z $fasta ] @@ -149,10 +144,38 @@ then echo required are: --data \ --motifs \ --fasta \ exit 1 fi +if [ ! -f $data ] +then + echo ERROR + echo $data does not exist. Please check input parameter -d / --data + exit 1 +fi +if [ ! -f $fasta ] +then + echo ERROR + echo $fasta does not exist. Please check input parameter -f / --fasta + exit 1 +fi +#check other parameters +if [ $min -lt 0 ] +then + min=10 + echo "min can't be negative. Default value of 10 is choosen" +fi +if [ $max -lt $min ] +then + max=200 + echo "max must be greater than min. Default value of 200 is choosen" +fi +if [ ! -d $workdir ] +then + echo ERROR + echo "directory $workdir does not exist. Please check parameter -w / --workdir" + exit 1 +fi # remove trailing tabs in footprints cat $data | sed 's/[ \t]*$//' > "$workdir"/filtered.bed -all_empty=true # motiffiles either from a directory OR comma separated list if [ -d "$motifs" ] @@ -166,6 +189,7 @@ else fi # check if files exist and if they are all empty (exiting if all empty) +all_empty=true for i in ${motiffiles[@]} do if [ -f $i ] @@ -180,7 +204,7 @@ do fi fi else - echo file $i does not exist + echo ERROR: file $i does not exist echo please use correct paths. exiting. exit 1 fi @@ -202,18 +226,26 @@ temp_switch=true counter=1 for i in ${motiffiles[@]} do - # remove trailing tabs in motiffile - sed -i 's/[ \t]*$//' $i + echo "$i --- $counter of ${#motiffiles[@]}" + # remove trailing tabs in motiffile, but only if the second line in the file ends with a tab. + # checking all lines for trailing tabs would be time consuming. + secnd_line=`sed -n 2p $i | tr '\t' '#'` + echo $secnd_line + if [[ ${secnd_line: -1} == "#" ]] + then + echo trailing tabs have been found. removing trailing tabs. + sed -i 's/[ \t]*$//' $i + fi + # bedtools comparisons if [ $temp_switch == true ] then temp_switch=false bedtools subtract -a "$workdir"/filtered.bed -b $i > "$workdir"/filtered_temp.bed else temp_switch=true - bedtools subtract -a "$workdir"/filtered_temp.bed -b $i > "$workdir"/filtered.bed + bedtools subtract -a "$workdir"/filtered_temp.bed -b $i > "$workdir"/filtered.bed fi - echo "$i --- $counter of ${#motiffiles[@]}" counter=`expr $counter + 1` done @@ -253,5 +285,6 @@ else fi # add fasta sequences to bed and create fasta file +echo getting sequences from fasta-file bedtools getfasta -fi $fasta -bed "$workdir"/filtered_flagged.bed -bedOut >> $output bedtools getfasta -name -fi $fasta -bed "$output" -fo "$output".fasta From 1a35ce5fbaceb4c19a097f013e499589db6bc490 Mon Sep 17 00:00:00 2001 From: JannikHamp Date: Wed, 9 Jan 2019 15:34:56 +0100 Subject: [PATCH 20/21] removed echo from testing --- bin/1.2_filter_motifs/compareBed.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/bin/1.2_filter_motifs/compareBed.sh b/bin/1.2_filter_motifs/compareBed.sh index 223f497..38027f3 100644 --- a/bin/1.2_filter_motifs/compareBed.sh +++ b/bin/1.2_filter_motifs/compareBed.sh @@ -230,7 +230,6 @@ do # remove trailing tabs in motiffile, but only if the second line in the file ends with a tab. # checking all lines for trailing tabs would be time consuming. secnd_line=`sed -n 2p $i | tr '\t' '#'` - echo $secnd_line if [[ ${secnd_line: -1} == "#" ]] then echo trailing tabs have been found. removing trailing tabs. From 530627cc4499cd5c176abd4a7e047942a89c91a9 Mon Sep 17 00:00:00 2001 From: JannikHamp Date: Wed, 9 Jan 2019 15:55:52 +0100 Subject: [PATCH 21/21] more documentation, replce =, <- --- bin/1.2_filter_motifs/compareBed_runinfo.R | 58 ++++++++++++---------- 1 file changed, 32 insertions(+), 26 deletions(-) diff --git a/bin/1.2_filter_motifs/compareBed_runinfo.R b/bin/1.2_filter_motifs/compareBed_runinfo.R index 4c5686d..4888c02 100644 --- a/bin/1.2_filter_motifs/compareBed_runinfo.R +++ b/bin/1.2_filter_motifs/compareBed_runinfo.R @@ -18,15 +18,15 @@ # parsing parameters library(data.table) -args = commandArgs(TRUE) -min = as.numeric(args[1]) -max = as.numeric(args[2]) -input_raw = args[3] -input_filtered = args[4] -output = args[5] -output_stats = args[6] +args <- commandArgs(TRUE) +min <- as.numeric(args[1]) +max <- as.numeric(args[2]) +input_raw <- args[3] +input_filtered <- args[4] +output <- args[5] +output_stats <- args[6] -data_filtered = fread(input_filtered, sep='\t') +data_filtered <- fread(input_filtered, sep='\t') # check if data has less than 9 columns if (ncol(data_filtered) < 9) { @@ -35,39 +35,45 @@ if (ncol(data_filtered) < 9) { # remove sequences that are smaller than minimum (parameter) # remove sequences that are longer than maximum (parameter) -data_filtered = data_filtered[which(data_filtered[[3]] - data_filtered[[2]] >= min),] -data_filtered = data_filtered[which(data_filtered[[3]] - data_filtered[[2]] <= max),] +data_filtered <- data_filtered[which(data_filtered[[3]] - data_filtered[[2]] >= min),] +data_filtered <- data_filtered[which(data_filtered[[3]] - data_filtered[[2]] <= max),] # make unique names and adjust length for splitted footprints # duplicated names have .0 , .1 , .2 ... added -names=data_filtered[[4]] -names(data_filtered)[4] = "name" +names <- data_filtered[[4]] +names(data_filtered)[4] <- "name" # all duplicated names -duplicants = unique(data_filtered[duplicated(name)][[4]]) -data_filtered[[4]] = make.unique(as.character(data_filtered[[4]])) +duplicants <- unique(data_filtered[duplicated(name)][[4]]) +data_filtered[[4]] <- make.unique(as.character(data_filtered[[4]])) data_filtered[match(duplicants, names), name := paste0(name,".0")] # recalculate length of sequences -data_filtered[[7]] = data_filtered[[3]] - data_filtered[[2]] +data_filtered[[7]] <- data_filtered[[3]] - data_filtered[[2]] # adding column "contains_maxpos", containing flag (0 or 1) # max_pos is the position of maximum score of a footprint -data_filtered = cbind(data_filtered, contains_maxpos = 0) +data_filtered <- cbind(data_filtered, contains_maxpos = 0) data_filtered$contains_maxpos[intersect(which(data_filtered[[2]] <= data_filtered[[8]]), which(data_filtered[[3]] > data_filtered[[8]]))] = 1 -data_filtered[[8]] = data_filtered[[8]] - data_filtered[[2]] +data_filtered[[8]] <- data_filtered[[8]] - data_filtered[[2]] fwrite(data_filtered, output, col.names=FALSE, quote = FALSE, sep = '\t') # data is the initial data before any comparisons have been done (-d parameter of compareBed.sh) -data = fread(input_raw, sep='\t') +data <- fread(input_raw, sep='\t') # some statistics about the bedtool comparisons are stored in FilterMotifs.stats -sum_data = sum(data[[3]]-data[[2]]) -sum_filtered = sum(data_filtered[[7]]) -difference_nt = formatC(sum_data/sum_filtered, digits = 4) -loss_nt = formatC(1 - sum_filtered/sum_data, digits = 2) -length_data = formatC(mean(data[[3]]-data[[2]]), digits = 4) -length_filtered = formatC(mean(data_filtered[[7]]), digits = 4) -stats = data.frame(sum_nt_input = sum_data, sum_nt_filtered = sum_filtered, quotient_of_nt = difference_nt, loss_of_nt = loss_nt, mean_length_input = length_data, mean_length_filtered = length_filtered, flag_1_ratio = length(which(data_filtered$containsMaxpos == 1))/dim(data_filtered)[1]) -stats = t(stats) +# number of nucleotides input +sum_data <- sum(data[[3]]-data[[2]]) +# number of nucleotides after filter +sum_filtered <- sum(data_filtered[[7]]) +# quotient: sum_data/sum_filtered +difference_nt <- formatC(sum_data/sum_filtered, digits = 4) +# loss: 1 - sum_filtered/sum_data +loss_nt <- formatC(1 - sum_filtered/sum_data, digits = 2) +# mean length of footprints input +length_data <- formatC(mean(data[[3]]-data[[2]]), digits = 4) +# mean -ength of footprints after filter +length_filtered <- formatC(mean(data_filtered[[7]]), digits = 4) +stats <- data.frame(sum_nt_input = sum_data, sum_nt_filtered = sum_filtered, quotient_of_nt = difference_nt, loss_of_nt = loss_nt, mean_length_input = length_data, mean_length_filtered = length_filtered, flag_1_ratio = length(which(data_filtered$containsMaxpos == 1))/dim(data_filtered)[1]) +stats <- t(stats) write.table(stats, output_stats, col.names = FALSE, quote = FALSE, sep = '\t')