From 881b445ed41f41588be932378795a87100c3d238 Mon Sep 17 00:00:00 2001 From: JannikHamp Date: Thu, 10 Jan 2019 11:02:53 +0100 Subject: [PATCH 01/14] fixed max_pos calculation --- bin/1.2_filter_motifs/compareBed_runinfo.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bin/1.2_filter_motifs/compareBed_runinfo.R b/bin/1.2_filter_motifs/compareBed_runinfo.R index 4888c02..ea6b1a6 100644 --- a/bin/1.2_filter_motifs/compareBed_runinfo.R +++ b/bin/1.2_filter_motifs/compareBed_runinfo.R @@ -52,9 +52,10 @@ data_filtered[[7]] <- data_filtered[[3]] - data_filtered[[2]] # adding column "contains_maxpos", containing flag (0 or 1) # max_pos is the position of maximum score of a footprint +data_filtered[[8]] <- data_filtered[[8]] - data_filtered[[2]] data_filtered <- cbind(data_filtered, contains_maxpos = 0) data_filtered$contains_maxpos[intersect(which(data_filtered[[2]] <= data_filtered[[8]]), which(data_filtered[[3]] > data_filtered[[8]]))] = 1 -data_filtered[[8]] <- data_filtered[[8]] - data_filtered[[2]] +data_filtered[[8]] <- data_filtered[[8]] + data_filtered[[2]] fwrite(data_filtered, output, col.names=FALSE, quote = FALSE, sep = '\t') From b128196a2cf492a838af5940e926d4f6257f115f Mon Sep 17 00:00:00 2001 From: JannikHamp Date: Thu, 10 Jan 2019 11:04:14 +0100 Subject: [PATCH 02/14] correct maxpos calculation --- bin/1.2_filter_motifs/compareBed_runinfo.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/1.2_filter_motifs/compareBed_runinfo.R b/bin/1.2_filter_motifs/compareBed_runinfo.R index ea6b1a6..04b9f87 100644 --- a/bin/1.2_filter_motifs/compareBed_runinfo.R +++ b/bin/1.2_filter_motifs/compareBed_runinfo.R @@ -8,7 +8,7 @@ # adds a column with a flag "contains_maxpos" # and creates a file with information of the bedtool comparison -# parameters: Parameters are not named. Respect the parameter order. +# parameters: Parameters are not named, respect the parameter order. # min: minimum footprint size threshold # max: maximum footprint size threshold # input_raw: unfiltered BED-file From 5343f6052f151429d2747727a7d2bdbd7ecbf8d0 Mon Sep 17 00:00:00 2001 From: JannikHamp Date: Thu, 10 Jan 2019 12:18:54 +0100 Subject: [PATCH 03/14] fixed for correct contains_maxpos calculation --- bin/1.2_filter_motifs/compareBed_runinfo.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/1.2_filter_motifs/compareBed_runinfo.R b/bin/1.2_filter_motifs/compareBed_runinfo.R index 04b9f87..2723fb5 100644 --- a/bin/1.2_filter_motifs/compareBed_runinfo.R +++ b/bin/1.2_filter_motifs/compareBed_runinfo.R @@ -52,10 +52,10 @@ data_filtered[[7]] <- data_filtered[[3]] - data_filtered[[2]] # adding column "contains_maxpos", containing flag (0 or 1) # max_pos is the position of maximum score of a footprint -data_filtered[[8]] <- data_filtered[[8]] - data_filtered[[2]] +data_filtered[[8]] <- data_filtered[[8]] + data_filtered[[2]] data_filtered <- cbind(data_filtered, contains_maxpos = 0) data_filtered$contains_maxpos[intersect(which(data_filtered[[2]] <= data_filtered[[8]]), which(data_filtered[[3]] > data_filtered[[8]]))] = 1 -data_filtered[[8]] <- data_filtered[[8]] + data_filtered[[2]] +data_filtered[[8]] <- data_filtered[[8]] - data_filtered[[2]] fwrite(data_filtered, output, col.names=FALSE, quote = FALSE, sep = '\t') From 4397f77c6357c4e6aed6b21b4a3c1d1665d1faf9 Mon Sep 17 00:00:00 2001 From: JannikHamp Date: Thu, 10 Jan 2019 14:02:22 +0100 Subject: [PATCH 04/14] script for calculation of absolute max_pos values at start --- bin/1.2_filter_motifs/abs_max_score.R | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 bin/1.2_filter_motifs/abs_max_score.R diff --git a/bin/1.2_filter_motifs/abs_max_score.R b/bin/1.2_filter_motifs/abs_max_score.R new file mode 100644 index 0000000..b6e5de0 --- /dev/null +++ b/bin/1.2_filter_motifs/abs_max_score.R @@ -0,0 +1,24 @@ +#!/bin/Rscript + +# author: Jannik Hamp +# email: jannik.hamp@googlemail.com + +# desciption: This script gets called by compareBed.sh + +# It calculates the absolute position of maximum score of the footprints. +# This information is later used for an additional flag column in the bed file. + +library(data.table) +args = commandArgs(TRUE) +file = args[1] + +tab = fread(file, sep='\t') + +# check if data has less than 9 columns +if (ncol(tab) < 9) { + stop("footprint file has less than 9 columns. exiting.") +} + +tab[[8]] = tab[[2]] + tab[[8]] + +fwrite(tab, file, row.names=FALSE, col.names=FALSE, quote=FALSE, sep='\t') From 9a4c9c1580090d5e8cd8793903bd71c886029caf Mon Sep 17 00:00:00 2001 From: JannikHamp Date: Thu, 10 Jan 2019 14:03:18 +0100 Subject: [PATCH 05/14] added Rscript for maxpos calculation --- bin/1.2_filter_motifs/compareBed.sh | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/bin/1.2_filter_motifs/compareBed.sh b/bin/1.2_filter_motifs/compareBed.sh index 38027f3..86adb6a 100644 --- a/bin/1.2_filter_motifs/compareBed.sh +++ b/bin/1.2_filter_motifs/compareBed.sh @@ -10,7 +10,7 @@ # For parameter description, run the script without parameters or -h. # The output is a file with the filtered footprints and the log file FilterMotivs.stats -# One R script is used, compareBed_runinfo.R, stored in the same directory. +# Two R scripts are used, compareBed_runinfo.R and abs_max_score.R, stored in the same directory. # default parameters workdir=$PWD @@ -181,7 +181,7 @@ cat $data | sed 's/[ \t]*$//' > "$workdir"/filtered.bed if [ -d "$motifs" ] then # creates an array of all files with bed in its name in the directory $motifs - declare -a motiffiles=(`ls $motifs | grep bed | sed "s|^|$motifs\/|g" | tr '\n' ' ' | sed "s|//|/|g"`) + declare -a motiffiles=(`ls $motifs | grep -i '.bed$' | sed "s|^|$motifs\/|g" | tr '\n' ' ' | sed "s|//|/|g"`) # the else case means, that the motiffiles were passed comma separated with no whitespace. else @@ -219,6 +219,14 @@ if [ $all_empty == true ] exit 1 fi +#calculate absolute max_score position +Rscript abs_max_score.R "$workdir"/filtered.bed +# check if Rscript executed without errors +if [ $? -gt 0 ] +then + exit 1 +fi + # comparing unknown footprints with regions of known motifs # comparison is done iteratively # remove overlapping regions in unknown footprints From 327dd04e914a16812c4dc25426919f89d7536521 Mon Sep 17 00:00:00 2001 From: JannikHamp Date: Thu, 10 Jan 2019 14:05:52 +0100 Subject: [PATCH 06/14] fixed for correct max_pos calculation after possible splitting a new R script has been added, which calculates the absolut positions first. Here the relative positions are calculated again, after the column with contains_maxpos flags has been created. --- bin/1.2_filter_motifs/compareBed_runinfo.R | 1 - 1 file changed, 1 deletion(-) diff --git a/bin/1.2_filter_motifs/compareBed_runinfo.R b/bin/1.2_filter_motifs/compareBed_runinfo.R index 2723fb5..075ed56 100644 --- a/bin/1.2_filter_motifs/compareBed_runinfo.R +++ b/bin/1.2_filter_motifs/compareBed_runinfo.R @@ -52,7 +52,6 @@ data_filtered[[7]] <- data_filtered[[3]] - data_filtered[[2]] # adding column "contains_maxpos", containing flag (0 or 1) # max_pos is the position of maximum score of a footprint -data_filtered[[8]] <- data_filtered[[8]] + data_filtered[[2]] data_filtered <- cbind(data_filtered, contains_maxpos = 0) data_filtered$contains_maxpos[intersect(which(data_filtered[[2]] <= data_filtered[[8]]), which(data_filtered[[3]] > data_filtered[[8]]))] = 1 data_filtered[[8]] <- data_filtered[[8]] - data_filtered[[2]] From 20849d51f9fef4e882f43b667c8484a12b6c3abb Mon Sep 17 00:00:00 2001 From: JannikHamp Date: Thu, 10 Jan 2019 14:37:37 +0100 Subject: [PATCH 07/14] forgot $path to start the new Rscript. now fixed --- bin/1.2_filter_motifs/compareBed.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/1.2_filter_motifs/compareBed.sh b/bin/1.2_filter_motifs/compareBed.sh index 86adb6a..1cb3c1b 100644 --- a/bin/1.2_filter_motifs/compareBed.sh +++ b/bin/1.2_filter_motifs/compareBed.sh @@ -220,7 +220,7 @@ if [ $all_empty == true ] fi #calculate absolute max_score position -Rscript abs_max_score.R "$workdir"/filtered.bed +Rscript $path/abs_max_score.R "$workdir"/filtered.bed # check if Rscript executed without errors if [ $? -gt 0 ] then From 7514dfb2759e63ffe082452c7a7a2d002a81de40 Mon Sep 17 00:00:00 2001 From: JannikHamp Date: Fri, 11 Jan 2019 10:36:27 +0100 Subject: [PATCH 08/14] Fixed 1 output value of FilterMotifs.stats output. Flag 1 ratio was not correct, due to variable naming issue --- bin/1.2_filter_motifs/compareBed_runinfo.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/1.2_filter_motifs/compareBed_runinfo.R b/bin/1.2_filter_motifs/compareBed_runinfo.R index 075ed56..b4f180f 100644 --- a/bin/1.2_filter_motifs/compareBed_runinfo.R +++ b/bin/1.2_filter_motifs/compareBed_runinfo.R @@ -74,6 +74,6 @@ loss_nt <- formatC(1 - sum_filtered/sum_data, digits = 2) length_data <- formatC(mean(data[[3]]-data[[2]]), digits = 4) # mean -ength of footprints after filter length_filtered <- formatC(mean(data_filtered[[7]]), digits = 4) -stats <- data.frame(sum_nt_input = sum_data, sum_nt_filtered = sum_filtered, quotient_of_nt = difference_nt, loss_of_nt = loss_nt, mean_length_input = length_data, mean_length_filtered = length_filtered, flag_1_ratio = length(which(data_filtered$containsMaxpos == 1))/dim(data_filtered)[1]) +stats <- data.frame(sum_nt_input = sum_data, sum_nt_filtered = sum_filtered, quotient_of_nt = difference_nt, loss_of_nt = loss_nt, mean_length_input = length_data, mean_length_filtered = length_filtered, flag_1_ratio = length(which(data_filtered$contains_maxpos == 1))/dim(data_filtered)[1]) stats <- t(stats) write.table(stats, output_stats, col.names = FALSE, quote = FALSE, sep = '\t') From 6cd8c7fd9f665315bb7d552f706c19aaa9e20129 Mon Sep 17 00:00:00 2001 From: JannikHamp Date: Fri, 11 Jan 2019 11:04:26 +0100 Subject: [PATCH 09/14] check if output ends with .bed --- bin/1.2_filter_motifs/compareBed.sh | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/bin/1.2_filter_motifs/compareBed.sh b/bin/1.2_filter_motifs/compareBed.sh index 1cb3c1b..c50687a 100644 --- a/bin/1.2_filter_motifs/compareBed.sh +++ b/bin/1.2_filter_motifs/compareBed.sh @@ -10,7 +10,7 @@ # For parameter description, run the script without parameters or -h. # The output is a file with the filtered footprints and the log file FilterMotivs.stats -# Two R scripts are used, compareBed_runinfo.R and abs_max_score.R, stored in the same directory. +# One R scripts is used, compareBed_runinfo.R, stored in the same directory. # default parameters workdir=$PWD @@ -173,6 +173,10 @@ then echo "directory $workdir does not exist. Please check parameter -w / --workdir" exit 1 fi +if [[ ${output: -4: -1} != '.bed' ]] +then + output=`echo $output | sed "s|$|.bed|g"` +fi # remove trailing tabs in footprints cat $data | sed 's/[ \t]*$//' > "$workdir"/filtered.bed @@ -219,14 +223,6 @@ if [ $all_empty == true ] exit 1 fi -#calculate absolute max_score position -Rscript $path/abs_max_score.R "$workdir"/filtered.bed -# check if Rscript executed without errors -if [ $? -gt 0 ] -then - exit 1 -fi - # comparing unknown footprints with regions of known motifs # comparison is done iteratively # remove overlapping regions in unknown footprints @@ -292,6 +288,7 @@ else fi # add fasta sequences to bed and create fasta file +out_fasta=`echo $output | sed "s|.bed$|.fasta|g"` echo getting sequences from fasta-file bedtools getfasta -fi $fasta -bed "$workdir"/filtered_flagged.bed -bedOut >> $output -bedtools getfasta -name -fi $fasta -bed "$output" -fo "$output".fasta +bedtools getfasta -name -fi $fasta -bed "$output" -fo "$out_fasta" From f1a0690387a34f9775567a3b8cf1d005f2d9faca Mon Sep 17 00:00:00 2001 From: JannikHamp Date: Fri, 11 Jan 2019 11:04:39 +0100 Subject: [PATCH 10/14] Delete abs_max_score.R --- bin/1.2_filter_motifs/abs_max_score.R | 24 ------------------------ 1 file changed, 24 deletions(-) delete mode 100644 bin/1.2_filter_motifs/abs_max_score.R diff --git a/bin/1.2_filter_motifs/abs_max_score.R b/bin/1.2_filter_motifs/abs_max_score.R deleted file mode 100644 index b6e5de0..0000000 --- a/bin/1.2_filter_motifs/abs_max_score.R +++ /dev/null @@ -1,24 +0,0 @@ -#!/bin/Rscript - -# author: Jannik Hamp -# email: jannik.hamp@googlemail.com - -# desciption: This script gets called by compareBed.sh - -# It calculates the absolute position of maximum score of the footprints. -# This information is later used for an additional flag column in the bed file. - -library(data.table) -args = commandArgs(TRUE) -file = args[1] - -tab = fread(file, sep='\t') - -# check if data has less than 9 columns -if (ncol(tab) < 9) { - stop("footprint file has less than 9 columns. exiting.") -} - -tab[[8]] = tab[[2]] + tab[[8]] - -fwrite(tab, file, row.names=FALSE, col.names=FALSE, quote=FALSE, sep='\t') From 9144c1f9261a30f72b9a5af639fbff923697ed59 Mon Sep 17 00:00:00 2001 From: JannikHamp Date: Fri, 11 Jan 2019 11:05:17 +0100 Subject: [PATCH 11/14] max_pos calculation is done in one Rscript --- bin/1.2_filter_motifs/compareBed_runinfo.R | 26 ++++++++++++++-------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/bin/1.2_filter_motifs/compareBed_runinfo.R b/bin/1.2_filter_motifs/compareBed_runinfo.R index b4f180f..f45b4a0 100644 --- a/bin/1.2_filter_motifs/compareBed_runinfo.R +++ b/bin/1.2_filter_motifs/compareBed_runinfo.R @@ -27,12 +27,29 @@ output <- args[5] output_stats <- args[6] data_filtered <- fread(input_filtered, sep='\t') +# data is the initial data before any comparisons have been done (-d parameter of compareBed.sh) +data <- fread(input_raw, sep='\t') # check if data has less than 9 columns if (ncol(data_filtered) < 9) { stop("footprint file has less than 9 columns. exiting.") } +# define needed col names +names(data)[2] <- "old_start" +names(data_filtered)[c(2, 3, 8)] <- c("start", "end", "max_pos") +# save corect column order +correct_order <- c(names(data_filtered), names(data)[2]) +# compute new maxpos (relevant for splitted footprints) +data_filtered <- merge(x=data_filtered, y=data[,c(2,4)], by.x=names(data_filtered)[4], by.y=names(data)[4], all.x=TRUE, all.y=FALSE, sort=FALSE) +data_filtered <- data_filtered[, correct_order, with = FALSE] +data_filtered[, max_pos := max_pos + old_start - start][, old_start := NULL] + +# adding column "contains_maxpos", containing flag (0 or 1) +# max_pos is the position of maximum score of a footprint +data_filtered <- cbind(data_filtered, contains_maxpos = 0) +data_filtered[max_pos >= 0 & start + max_pos <= end, contains_maxpos := 1] + # remove sequences that are smaller than minimum (parameter) # remove sequences that are longer than maximum (parameter) data_filtered <- data_filtered[which(data_filtered[[3]] - data_filtered[[2]] >= min),] @@ -50,17 +67,8 @@ data_filtered[match(duplicants, names), name := paste0(name,".0")] # recalculate length of sequences data_filtered[[7]] <- data_filtered[[3]] - data_filtered[[2]] -# adding column "contains_maxpos", containing flag (0 or 1) -# max_pos is the position of maximum score of a footprint -data_filtered <- cbind(data_filtered, contains_maxpos = 0) -data_filtered$contains_maxpos[intersect(which(data_filtered[[2]] <= data_filtered[[8]]), which(data_filtered[[3]] > data_filtered[[8]]))] = 1 -data_filtered[[8]] <- data_filtered[[8]] - data_filtered[[2]] - fwrite(data_filtered, output, col.names=FALSE, quote = FALSE, sep = '\t') -# data is the initial data before any comparisons have been done (-d parameter of compareBed.sh) -data <- fread(input_raw, sep='\t') - # some statistics about the bedtool comparisons are stored in FilterMotifs.stats # number of nucleotides input sum_data <- sum(data[[3]]-data[[2]]) From 8ed11f2a2d8cec121a8ed87e69229024798b0c37 Mon Sep 17 00:00:00 2001 From: JannikHamp Date: Fri, 11 Jan 2019 11:07:26 +0100 Subject: [PATCH 12/14] documentation --- bin/1.2_filter_motifs/compareBed.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/1.2_filter_motifs/compareBed.sh b/bin/1.2_filter_motifs/compareBed.sh index c50687a..5aa5f45 100644 --- a/bin/1.2_filter_motifs/compareBed.sh +++ b/bin/1.2_filter_motifs/compareBed.sh @@ -120,7 +120,7 @@ then echo " default is the current directory" echo " -min --min minimum size of footprints; default is 10" echo " -max --max maximum size of footprints; default is 80" - echo " -o --output output path/file ; default dir current directory and filename is newMotifs.bed and newMotifs.bed.fasta" + echo " -o --output output path/file ; default dir current directory and filename is newMotifs.bed and newMotifs.fasta" echo " -h --help shows this help message" exit 0 fi From aa306f9c8b162ad1d9e8e532f0a8f5ffffedb015 Mon Sep 17 00:00:00 2001 From: JannikHamp Date: Fri, 11 Jan 2019 11:23:23 +0100 Subject: [PATCH 13/14] changed stats file name to compareBed.stats --- bin/1.2_filter_motifs/compareBed.sh | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/bin/1.2_filter_motifs/compareBed.sh b/bin/1.2_filter_motifs/compareBed.sh index 5aa5f45..d358865 100644 --- a/bin/1.2_filter_motifs/compareBed.sh +++ b/bin/1.2_filter_motifs/compareBed.sh @@ -8,7 +8,7 @@ # BED-format. # For parameter description, run the script without parameters or -h. -# The output is a file with the filtered footprints and the log file FilterMotivs.stats +# The output is a file with the filtered footprints and the log file compareBed.stats # One R scripts is used, compareBed_runinfo.R, stored in the same directory. @@ -120,7 +120,7 @@ then echo " default is the current directory" echo " -min --min minimum size of footprints; default is 10" echo " -max --max maximum size of footprints; default is 80" - echo " -o --output output path/file ; default dir current directory and filename is newMotifs.bed and newMotifs.fasta" + echo " -o --output output path/file ; default dir current directory and filename is newMotifs.bed and newMotifs.bed.fasta" echo " -h --help shows this help message" exit 0 fi @@ -260,7 +260,7 @@ fi # remove short/long motivs, make unique ids (relevant for some splitted tfbs from subtract) and handle maxScorePosition # also creates a small output file with information about the comparison -Rscript $path/compareBed_runinfo.R $min $max $data "$workdir"/filtered.bed "$workdir"/filtered_flagged.bed "$workdir"/FilterMotifs.stats +Rscript $path/compareBed_runinfo.R $min $max $data "$workdir"/filtered.bed "$workdir"/filtered_flagged.bed "$workdir"/compareBed.stats # check if Rscript executed without errors if [ $? -gt 0 ] then @@ -277,14 +277,14 @@ then fp_initial=`expr $fp_initial - 1` fp_final=`cat "$workdir"/filtered.bed | wc -l` fp_final=`expr $fp_final - 1` - echo $fp_initial | sed 's/^/initial number of footprints: /g' >> "$workdir"/FilterMotifs.stats - echo $fp_final | sed 's/^/number of footprints after subtract: /g' >> "$workdir"/FilterMotifs.stats + echo $fp_initial | sed 's/^/initial number of footprints: /g' >> "$workdir"/compareBed.stats + echo $fp_final | sed 's/^/number of footprints after subtract: /g' >> "$workdir"/compareBed.stats else # output will be overwritten if it exists rm -f $output # add some final values to the log file - cat $data | wc -l | sed 's/^/initial number of footprints: /g' >> "$workdir"/FilterMotifs.stats - cat "$workdir"/filtered.bed | wc -l | sed 's/^/number of footprints after subtract: /g' >> "$workdir"/FilterMotifs.stats + cat $data | wc -l | sed 's/^/initial number of footprints: /g' >> "$workdir"/compareBed.stats + cat "$workdir"/filtered.bed | wc -l | sed 's/^/number of footprints after subtract: /g' >> "$workdir"/compareBed.stats fi # add fasta sequences to bed and create fasta file From cb16c3cb842d87eca8bf055a4e6c7442b9077021 Mon Sep 17 00:00:00 2001 From: JannikHamp Date: Fri, 11 Jan 2019 11:24:27 +0100 Subject: [PATCH 14/14] changed name of output compareBed.stats in description --- bin/1.2_filter_motifs/compareBed_runinfo.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/1.2_filter_motifs/compareBed_runinfo.R b/bin/1.2_filter_motifs/compareBed_runinfo.R index f45b4a0..12a73a6 100644 --- a/bin/1.2_filter_motifs/compareBed_runinfo.R +++ b/bin/1.2_filter_motifs/compareBed_runinfo.R @@ -69,7 +69,7 @@ data_filtered[[7]] <- data_filtered[[3]] - data_filtered[[2]] fwrite(data_filtered, output, col.names=FALSE, quote = FALSE, sep = '\t') -# some statistics about the bedtool comparisons are stored in FilterMotifs.stats +# some statistics about the bedtool comparisons are stored in compareBed.stats # number of nucleotides input sum_data <- sum(data[[3]]-data[[2]]) # number of nucleotides after filter