diff --git a/bin/1.2_filter_motifs/compareBed.sh b/bin/1.2_filter_motifs/compareBed.sh index aa2c37e..38027f3 100644 --- a/bin/1.2_filter_motifs/compareBed.sh +++ b/bin/1.2_filter_motifs/compareBed.sh @@ -1,109 +1,111 @@ #!/bin/bash -#data -#motifs -#workdir -#fasta -#min -#max -#output -wrong=() -da=false -mo=false -wo=false -fa=false -mi=false -ma=false -ou=false -he=false +# author: Jannik Hamp +# email: jannik.hamp@googlemail.com + +# desciption: This scripts uses bedtools to find new, non overlapping regions +# between input data in BED-format and a group of known motifs in +# BED-format. + +# For parameter description, run the script without parameters or -h. +# The output is a file with the filtered footprints and the log file FilterMotivs.stats + +# One R script is used, compareBed_runinfo.R, stored in the same directory. + +# default parameters +workdir=$PWD +output="newMotifs.bed" +min=10 +max=200 +help=false + +path=`echo $0 | sed 's/\/[^\/]*$/\//g'` + +# display help when no parameters chosen if [ $# -eq 0 ] then - he=true + help=true fi +# parsing parameters +wrong=() while [[ $# -gt 0 ]] do -key="$1" + key="$1" + if [[ ${2:0:1} == "-" ]] + then + echo "ERROR: Each parameter needs a value (except the help parameter), values must not start with a '-'!" + exit 1 + fi -case $key in - -d|--data) - data="$2" - da=true - shift - shift - ;; - -m|--motifs) - motifs="$2" - mo=true - shift - shift - ;; - -w|--workdir) - workdir="$2" - wo=true - shift - shift - ;; - -f|--fasta) - fasta="$2" - fa=true - shift - shift - ;; - -min|--min) - min=$2 - mi=true - shift - shift - ;; - -max|--max) - max=$2 - ma=true - shift - shift - ;; - -o|--output) - output="$2" - ou=true - shift - shift - ;; - -p|--path) - path="$2" - shift - shift - ;; - -h|--help) - help=true - he=true - shift - ;; - *) - wrong+=("$1") - shift - ;; -esac + case $key in + -d|--data) + data="$2" + shift + shift + ;; + -m|--motifs) + motifs="$2" + shift + shift + ;; + -w|--workdir) + workdir="$2" + shift + shift + ;; + -f|--fasta) + fasta="$2" + shift + shift + ;; + -min|--min) + min=$2 + shift + shift + ;; + -max|--max) + max=$2 + shift + shift + ;; + -o|--output) + output="$2" + shift + shift + ;; + -h|--help) + help=true + shift + ;; + *) + wrong+=("$1") + shift + ;; + esac done +# if wrong parameters were chosen.. count=${#wrong[@]} if [ $count -gt 0 ] then for i in ${wrong[@]} do - echo wrong parameter $i - echo call script without parameters for help or call --help - echo exit + echo ERROR: wrong parameter $i done -exit 1 + + echo call script without parameters for help or call --help + exit 1 fi -if [ $he == true ] +# the help message +if [ $help == true ] then - echo "This script utilies bedtools to select new footprints from data." + echo "This script utilizes bedtools to select new footprints from data." echo "Therefore the data is compared with known footprint motifs." - echo "The output is a new .bed file with added sequence information and a column with flags for better sequences (1)" - echo "Required arguments are data and motifs, both in bed-format, and the fasta genome sequences." - echo "If a parameter is chosen, a value must be provided or an error will occur." + echo "The output is a new BED-file with added sequence information and a flag contains_maxpos (1/0)" + echo "Required arguments are data, motifs, both in bed-format and the fasta genome sequences." + echo "If a parameter is chosen, a value must be provided aswell. (exception -h)" echo "--------------------" echo "usage: compareBed.sh --data --motifs --fasta [optional_parameter ] ..." echo "--------------------" @@ -118,158 +120,170 @@ then echo " default is the current directory" echo " -min --min minimum size of footprints; default is 10" echo " -max --max maximum size of footprints; default is 80" - echo " -o --output output path/file ; default dir is workdir and filename is newMotifs.bed and newMotifs.bed.fasta" + echo " -o --output output path/file ; default dir current directory and filename is newMotifs.bed and newMotifs.bed.fasta" echo " -h --help shows this help message" -exit 0 + exit 0 fi +# summary of parameters echo selected parameters echo ------------------- -echo data: $da \(required\) -echo motifs: $mo \(required\) -echo workdir: $wo -echo fasta: $fa \(required\) -echo min: $mi -echo max: $ma -echo output: $ou -echo help: $he +echo data: $data \(required\) +echo motifs: $motifs \(required\) +echo workdir: $workdir +echo fasta: $fasta \(required\) +echo min: $min +echo max: $max +echo output: $output -if [ $da == false ] || [ $mo == false ] || [ $fa == false ] +# check required parameters +if [ -z $data ] || [ -z $motifs ] || [ -z $fasta ] then + echo ERROR echo required parameters not given. echo required are: --data \ --motifs \ --fasta \ exit 1 fi - -if [ $wo == false ] +if [ ! -f $data ] then - #if [ ! -d workdir1337 ] - #then - # mkdir workdir1337 - #fi - wo=true - workdir=$path + echo ERROR + echo $data does not exist. Please check input parameter -d / --data + exit 1 fi - -if [ $ou == false ] +if [ ! -f $fasta ] then - output=${workdir}/"newMotifs.bed" - ou=true + echo ERROR + echo $fasta does not exist. Please check input parameter -f / --fasta + exit 1 fi - -if [ $mi == false ] +#check other parameters +if [ $min -lt 0 ] then min=10 - mi=true + echo "min can't be negative. Default value of 10 is choosen" fi - -if [ $ma == false ] +if [ $max -lt $min ] then - max=80 - ma=true + max=200 + echo "max must be greater than min. Default value of 200 is choosen" fi - if [ ! -d $workdir ] then - mkdir $workdir + echo ERROR + echo "directory $workdir does not exist. Please check parameter -w / --workdir" + exit 1 fi -#1. first filter. no overlap vs. overlap -echo get sequences with no overlap -cat $data > "$workdir"/pass1Tr.bed -help=true +# remove trailing tabs in footprints +cat $data | sed 's/[ \t]*$//' > "$workdir"/filtered.bed +# motiffiles either from a directory OR comma separated list if [ -d "$motifs" ] then -for i in "$motifs"/*.bed -do - if [ $help == true ] - then - help=false - bedtools intersect -v -a "$workdir"/pass1Tr.bed -b $i > "$workdir"/pass1TrHelp.bed - else - help=true - bedtools intersect -v -a "$workdir"/pass1TrHelp.bed -b $i > "$workdir"/pass1Tr.bed - fi - echo $i -done + # creates an array of all files with bed in its name in the directory $motifs + declare -a motiffiles=(`ls $motifs | grep bed | sed "s|^|$motifs\/|g" | tr '\n' ' ' | sed "s|//|/|g"`) + +# the else case means, that the motiffiles were passed comma separated with no whitespace. else -declare -a motiffiles=(`echo $motifs | sed 's/,/ /g'`) + declare -a motiffiles=(`echo $motifs | sed 's/,/ /g'`) +fi + +# check if files exist and if they are all empty (exiting if all empty) +all_empty=true for i in ${motiffiles[@]} do - if [ $help == true ] + if [ -f $i ] then - help=false - bedtools intersect -v -a "$workdir"/pass1Tr.bed -b $i > "$workdir"/pass1TrHelp.bed + if [ $all_empty == true ] + then + lines=`cat $i | wc -l` + if [ $lines -gt 1 ] + then + all_empty=false + break + fi + fi else - help=true - bedtools intersect -v -a "$workdir"/pass1TrHelp.bed -b $i > "$workdir"/pass1Tr.bed + echo ERROR: file $i does not exist + echo please use correct paths. exiting. + exit 1 fi - echo $i done -fi -if [ $help == false ] -then - cat "$workdir"/pass1TrHelp.bed > "$workdir"/pass1Tr.bed +# error report of rare case of only empty motiffiles +if [ $all_empty == true ] + then + echo ERROR + echo All motiffiles have less than 2 lines! + echo Fix motiffiles and try again. + exit 1 fi -echo get overlapping sequences -bedtools intersect -v -a $data -b "$workdir"/pass1Tr.bed > "$workdir"/pass1Fa.bed -cat "$workdir"/pass1Fa.bed | wc -l - -echo calc maxScore -#2. compute absolut maxscore position -Rscript --vanilla $path/maxScore.R "$workdir"/pass1Fa.bed - -#3 subtract overlapping regions for additional motifs -#echo "also get subsequences with no overlap of overlapping sequences" -help=true - -if [ -d "$motifs" ] -then -for i in "$motifs"/*.bed +# comparing unknown footprints with regions of known motifs +# comparison is done iteratively +# remove overlapping regions in unknown footprints +temp_switch=true +counter=1 +for i in ${motiffiles[@]} do - if [ $help == true ] + echo "$i --- $counter of ${#motiffiles[@]}" + # remove trailing tabs in motiffile, but only if the second line in the file ends with a tab. + # checking all lines for trailing tabs would be time consuming. + secnd_line=`sed -n 2p $i | tr '\t' '#'` + if [[ ${secnd_line: -1} == "#" ]] then - help=false - bedtools subtract -a "$workdir"/pass1Fa.bed -b $i > "$workdir"/pass1FaHelp.bed - else - help=true - bedtools subtract -a "$workdir"/pass1FaHelp.bed -b $i > "$workdir"/pass1Fa.bed + echo trailing tabs have been found. removing trailing tabs. + sed -i 's/[ \t]*$//' $i fi - echo $i -done -else -for i in ${motiffiles[@]} -do - if [ $help == true ] + + # bedtools comparisons + if [ $temp_switch == true ] then - help=false - bedtools subtract -a "$workdir"/pass1Fa.bed -b $i > "$workdir"/pass1FaHelp.bed + temp_switch=false + bedtools subtract -a "$workdir"/filtered.bed -b $i > "$workdir"/filtered_temp.bed else - help=true - bedtools subtract -a "$workdir"/pass1FaHelp.bed -b $i > "$workdir"/pass1Fa.bed + temp_switch=true + bedtools subtract -a "$workdir"/filtered_temp.bed -b $i > "$workdir"/filtered.bed fi - echo $i + counter=`expr $counter + 1` done -fi -if [ $help == false ] +# get file of last iteration an write its content into filtered.bed +if [ $temp_switch == false ] then - cat "$workdir"/pass1FaHelp.bed > "$workdir"/pass2Tr.bed -else - cat "$workdir"/pass1Fa.bed > "$workdir"/pass2Tr.bed + cat "$workdir"/filtered_temp.bed > "$workdir"/filtered.bed fi -#4. remove short/long motivs, make unique ids (relevant for some splitted tfbs from subtract) and handle maxScorePosition +# remove short/long motivs, make unique ids (relevant for some splitted tfbs from subtract) and handle maxScorePosition # also creates a small output file with information about the comparison +Rscript $path/compareBed_runinfo.R $min $max $data "$workdir"/filtered.bed "$workdir"/filtered_flagged.bed "$workdir"/FilterMotifs.stats +# check if Rscript executed without errors +if [ $? -gt 0 ] +then + exit 1 +fi -Rscript --vanilla $path/merge.R $min $max $workdir $data +# check if header existed. If so, final output also has a header. +first_line=`sed -n 1p $data | sed "s/$/\tcontains_maxpos\tsequence/"` +if [[ ${first_line:0:1} == "#" ]] +then + echo "$first_line" > $output + # add some final values to the log file + fp_initial=`cat $data | wc -l` + fp_initial=`expr $fp_initial - 1` + fp_final=`cat "$workdir"/filtered.bed | wc -l` + fp_final=`expr $fp_final - 1` + echo $fp_initial | sed 's/^/initial number of footprints: /g' >> "$workdir"/FilterMotifs.stats + echo $fp_final | sed 's/^/number of footprints after subtract: /g' >> "$workdir"/FilterMotifs.stats +else + # output will be overwritten if it exists + rm -f $output + # add some final values to the log file + cat $data | wc -l | sed 's/^/initial number of footprints: /g' >> "$workdir"/FilterMotifs.stats + cat "$workdir"/filtered.bed | wc -l | sed 's/^/number of footprints after subtract: /g' >> "$workdir"/FilterMotifs.stats +fi -#5. add fasta sequences to bed and create fasta file -bedtools getfasta -fi $fasta -bed "$workdir"/merged.bed -bedOut > $output +# add fasta sequences to bed and create fasta file +echo getting sequences from fasta-file +bedtools getfasta -fi $fasta -bed "$workdir"/filtered_flagged.bed -bedOut >> $output bedtools getfasta -name -fi $fasta -bed "$output" -fo "$output".fasta - -#6 clean up -#rm "$workdir"/pass1Fa.bed "$workdir"/pass1Tr.bed "$workdir"/pass2Tr.bed "$workdir"/merged.bed "$workdir"/pass1FaHelp.bed "$workdir"/pass1TrHelp.bed diff --git a/bin/1.2_filter_motifs/compareBed_runinfo.R b/bin/1.2_filter_motifs/compareBed_runinfo.R new file mode 100644 index 0000000..4888c02 --- /dev/null +++ b/bin/1.2_filter_motifs/compareBed_runinfo.R @@ -0,0 +1,79 @@ +#!/bin/Rscript + +# author: Jannik Hamp +# email: jannik.hamp@googlemail.com + +# desciption: This script gets called by compareBed.sh +# It removes too small/large footprints, makes names unique, +# adds a column with a flag "contains_maxpos" +# and creates a file with information of the bedtool comparison + +# parameters: Parameters are not named. Respect the parameter order. +# min: minimum footprint size threshold +# max: maximum footprint size threshold +# input_raw: unfiltered BED-file +# input_filtered: filtered BED-file (after bedtools subtract) +# output: output path/file +# output_stats: output file with general info of the comparison + +# parsing parameters +library(data.table) +args <- commandArgs(TRUE) +min <- as.numeric(args[1]) +max <- as.numeric(args[2]) +input_raw <- args[3] +input_filtered <- args[4] +output <- args[5] +output_stats <- args[6] + +data_filtered <- fread(input_filtered, sep='\t') + +# check if data has less than 9 columns +if (ncol(data_filtered) < 9) { + stop("footprint file has less than 9 columns. exiting.") +} + +# remove sequences that are smaller than minimum (parameter) +# remove sequences that are longer than maximum (parameter) +data_filtered <- data_filtered[which(data_filtered[[3]] - data_filtered[[2]] >= min),] +data_filtered <- data_filtered[which(data_filtered[[3]] - data_filtered[[2]] <= max),] + +# make unique names and adjust length for splitted footprints +# duplicated names have .0 , .1 , .2 ... added +names <- data_filtered[[4]] +names(data_filtered)[4] <- "name" +# all duplicated names +duplicants <- unique(data_filtered[duplicated(name)][[4]]) +data_filtered[[4]] <- make.unique(as.character(data_filtered[[4]])) +data_filtered[match(duplicants, names), name := paste0(name,".0")] + +# recalculate length of sequences +data_filtered[[7]] <- data_filtered[[3]] - data_filtered[[2]] + +# adding column "contains_maxpos", containing flag (0 or 1) +# max_pos is the position of maximum score of a footprint +data_filtered <- cbind(data_filtered, contains_maxpos = 0) +data_filtered$contains_maxpos[intersect(which(data_filtered[[2]] <= data_filtered[[8]]), which(data_filtered[[3]] > data_filtered[[8]]))] = 1 +data_filtered[[8]] <- data_filtered[[8]] - data_filtered[[2]] + +fwrite(data_filtered, output, col.names=FALSE, quote = FALSE, sep = '\t') + +# data is the initial data before any comparisons have been done (-d parameter of compareBed.sh) +data <- fread(input_raw, sep='\t') + +# some statistics about the bedtool comparisons are stored in FilterMotifs.stats +# number of nucleotides input +sum_data <- sum(data[[3]]-data[[2]]) +# number of nucleotides after filter +sum_filtered <- sum(data_filtered[[7]]) +# quotient: sum_data/sum_filtered +difference_nt <- formatC(sum_data/sum_filtered, digits = 4) +# loss: 1 - sum_filtered/sum_data +loss_nt <- formatC(1 - sum_filtered/sum_data, digits = 2) +# mean length of footprints input +length_data <- formatC(mean(data[[3]]-data[[2]]), digits = 4) +# mean -ength of footprints after filter +length_filtered <- formatC(mean(data_filtered[[7]]), digits = 4) +stats <- data.frame(sum_nt_input = sum_data, sum_nt_filtered = sum_filtered, quotient_of_nt = difference_nt, loss_of_nt = loss_nt, mean_length_input = length_data, mean_length_filtered = length_filtered, flag_1_ratio = length(which(data_filtered$containsMaxpos == 1))/dim(data_filtered)[1]) +stats <- t(stats) +write.table(stats, output_stats, col.names = FALSE, quote = FALSE, sep = '\t') diff --git a/bin/1.2_filter_motifs/maxScore.R b/bin/1.2_filter_motifs/maxScore.R deleted file mode 100644 index 6b90984..0000000 --- a/bin/1.2_filter_motifs/maxScore.R +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/Rscript -library(data.table) -args = commandArgs(TRUE) -file = args[1] - -tab = fread(file) -colnames(tab) = c("chromosome", "start", "stop", "id", "score", "length", "maxpos", "info") - -tab$maxpos = tab$start + tab$maxpos - -fwrite(tab, file, row.names=FALSE, col.names=FALSE, quote=FALSE, sep='\t') diff --git a/bin/1.2_filter_motifs/merge.R b/bin/1.2_filter_motifs/merge.R deleted file mode 100644 index 129e41d..0000000 --- a/bin/1.2_filter_motifs/merge.R +++ /dev/null @@ -1,36 +0,0 @@ -#!/bin/Rscript -library(data.table) -args=commandArgs(TRUE) -min=as.numeric(args[1]) -max=as.numeric(args[2]) -folder=args[3] - -splitted = fread(paste(folder, "/pass2Tr.bed", sep='')) -colnames(splitted) = c("chromosome", "start", "stop", "id", "score", "length", "maxpos", "info") -p1 = fread(paste(folder, "/pass1Tr.bed", sep='')) -colnames(p1) = c("chromosome", "start", "stop", "id", "score", "length", "maxpos", "info") - -p1$maxpos = p1$start + p1$maxpos - -splitted=rbind(splitted, p1) - -splitted=splitted[which(splitted$stop - splitted$start >= min),] -splitted=splitted[which(splitted$stop - splitted$start <= max),] -splitted$id=make.unique(as.character(splitted$id)) -splitted$length=splitted$stop - splitted$start - -splitted=cbind(splitted, containsMaxpos=0) -splitted$containsMaxpos[intersect(which(splitted$start <= splitted$maxpos), which(splitted$stop > splitted$maxpos))] = 1 -splitted$maxpos = splitted$maxpos - splitted$start -data.table::fwrite(splitted, paste(folder, "/merged.bed", sep=''), row.names=FALSE, col.names=FALSE, quote=FALSE, sep='\t') - -before = fread(args[4], header=FALSE) - -sumb=sum(before$V3-before$V2) -suma=sum(splitted$length) -difference = formatC(sumb/suma, digits=4) -loss = formatC(1 - suma/sumb, digits=2) -lengthb = formatC(mean(before$V3-before$V2), digits=4) -lengtha = formatC(mean(splitted$length), digits=4) -stats=data.frame(sum_nt_input=sumb, sum_nt_filtered=suma, factor=difference, loss=loss, mean_length_input=lengthb, mean_length_filtered=lengtha, flag_1_ratio=length(which(splitted$containsMaxpos == 1))/dim(splitted)[1]) -write.table(stats, "../FilterMotifs.stats", row.names=FALSE, quote=FALSE, sep='\t')