diff --git a/bin/1.2_filter_motifs/compareBed.sh b/bin/1.2_filter_motifs/compareBed.sh index 38027f3..d358865 100644 --- a/bin/1.2_filter_motifs/compareBed.sh +++ b/bin/1.2_filter_motifs/compareBed.sh @@ -8,9 +8,9 @@ # BED-format. # For parameter description, run the script without parameters or -h. -# The output is a file with the filtered footprints and the log file FilterMotivs.stats +# The output is a file with the filtered footprints and the log file compareBed.stats -# One R script is used, compareBed_runinfo.R, stored in the same directory. +# One R scripts is used, compareBed_runinfo.R, stored in the same directory. # default parameters workdir=$PWD @@ -173,6 +173,10 @@ then echo "directory $workdir does not exist. Please check parameter -w / --workdir" exit 1 fi +if [[ ${output: -4: -1} != '.bed' ]] +then + output=`echo $output | sed "s|$|.bed|g"` +fi # remove trailing tabs in footprints cat $data | sed 's/[ \t]*$//' > "$workdir"/filtered.bed @@ -181,7 +185,7 @@ cat $data | sed 's/[ \t]*$//' > "$workdir"/filtered.bed if [ -d "$motifs" ] then # creates an array of all files with bed in its name in the directory $motifs - declare -a motiffiles=(`ls $motifs | grep bed | sed "s|^|$motifs\/|g" | tr '\n' ' ' | sed "s|//|/|g"`) + declare -a motiffiles=(`ls $motifs | grep -i '.bed$' | sed "s|^|$motifs\/|g" | tr '\n' ' ' | sed "s|//|/|g"`) # the else case means, that the motiffiles were passed comma separated with no whitespace. else @@ -256,7 +260,7 @@ fi # remove short/long motivs, make unique ids (relevant for some splitted tfbs from subtract) and handle maxScorePosition # also creates a small output file with information about the comparison -Rscript $path/compareBed_runinfo.R $min $max $data "$workdir"/filtered.bed "$workdir"/filtered_flagged.bed "$workdir"/FilterMotifs.stats +Rscript $path/compareBed_runinfo.R $min $max $data "$workdir"/filtered.bed "$workdir"/filtered_flagged.bed "$workdir"/compareBed.stats # check if Rscript executed without errors if [ $? -gt 0 ] then @@ -273,17 +277,18 @@ then fp_initial=`expr $fp_initial - 1` fp_final=`cat "$workdir"/filtered.bed | wc -l` fp_final=`expr $fp_final - 1` - echo $fp_initial | sed 's/^/initial number of footprints: /g' >> "$workdir"/FilterMotifs.stats - echo $fp_final | sed 's/^/number of footprints after subtract: /g' >> "$workdir"/FilterMotifs.stats + echo $fp_initial | sed 's/^/initial number of footprints: /g' >> "$workdir"/compareBed.stats + echo $fp_final | sed 's/^/number of footprints after subtract: /g' >> "$workdir"/compareBed.stats else # output will be overwritten if it exists rm -f $output # add some final values to the log file - cat $data | wc -l | sed 's/^/initial number of footprints: /g' >> "$workdir"/FilterMotifs.stats - cat "$workdir"/filtered.bed | wc -l | sed 's/^/number of footprints after subtract: /g' >> "$workdir"/FilterMotifs.stats + cat $data | wc -l | sed 's/^/initial number of footprints: /g' >> "$workdir"/compareBed.stats + cat "$workdir"/filtered.bed | wc -l | sed 's/^/number of footprints after subtract: /g' >> "$workdir"/compareBed.stats fi # add fasta sequences to bed and create fasta file +out_fasta=`echo $output | sed "s|.bed$|.fasta|g"` echo getting sequences from fasta-file bedtools getfasta -fi $fasta -bed "$workdir"/filtered_flagged.bed -bedOut >> $output -bedtools getfasta -name -fi $fasta -bed "$output" -fo "$output".fasta +bedtools getfasta -name -fi $fasta -bed "$output" -fo "$out_fasta" diff --git a/bin/1.2_filter_motifs/compareBed_runinfo.R b/bin/1.2_filter_motifs/compareBed_runinfo.R index 4888c02..12a73a6 100644 --- a/bin/1.2_filter_motifs/compareBed_runinfo.R +++ b/bin/1.2_filter_motifs/compareBed_runinfo.R @@ -8,7 +8,7 @@ # adds a column with a flag "contains_maxpos" # and creates a file with information of the bedtool comparison -# parameters: Parameters are not named. Respect the parameter order. +# parameters: Parameters are not named, respect the parameter order. # min: minimum footprint size threshold # max: maximum footprint size threshold # input_raw: unfiltered BED-file @@ -27,12 +27,29 @@ output <- args[5] output_stats <- args[6] data_filtered <- fread(input_filtered, sep='\t') +# data is the initial data before any comparisons have been done (-d parameter of compareBed.sh) +data <- fread(input_raw, sep='\t') # check if data has less than 9 columns if (ncol(data_filtered) < 9) { stop("footprint file has less than 9 columns. exiting.") } +# define needed col names +names(data)[2] <- "old_start" +names(data_filtered)[c(2, 3, 8)] <- c("start", "end", "max_pos") +# save corect column order +correct_order <- c(names(data_filtered), names(data)[2]) +# compute new maxpos (relevant for splitted footprints) +data_filtered <- merge(x=data_filtered, y=data[,c(2,4)], by.x=names(data_filtered)[4], by.y=names(data)[4], all.x=TRUE, all.y=FALSE, sort=FALSE) +data_filtered <- data_filtered[, correct_order, with = FALSE] +data_filtered[, max_pos := max_pos + old_start - start][, old_start := NULL] + +# adding column "contains_maxpos", containing flag (0 or 1) +# max_pos is the position of maximum score of a footprint +data_filtered <- cbind(data_filtered, contains_maxpos = 0) +data_filtered[max_pos >= 0 & start + max_pos <= end, contains_maxpos := 1] + # remove sequences that are smaller than minimum (parameter) # remove sequences that are longer than maximum (parameter) data_filtered <- data_filtered[which(data_filtered[[3]] - data_filtered[[2]] >= min),] @@ -50,18 +67,9 @@ data_filtered[match(duplicants, names), name := paste0(name,".0")] # recalculate length of sequences data_filtered[[7]] <- data_filtered[[3]] - data_filtered[[2]] -# adding column "contains_maxpos", containing flag (0 or 1) -# max_pos is the position of maximum score of a footprint -data_filtered <- cbind(data_filtered, contains_maxpos = 0) -data_filtered$contains_maxpos[intersect(which(data_filtered[[2]] <= data_filtered[[8]]), which(data_filtered[[3]] > data_filtered[[8]]))] = 1 -data_filtered[[8]] <- data_filtered[[8]] - data_filtered[[2]] - fwrite(data_filtered, output, col.names=FALSE, quote = FALSE, sep = '\t') -# data is the initial data before any comparisons have been done (-d parameter of compareBed.sh) -data <- fread(input_raw, sep='\t') - -# some statistics about the bedtool comparisons are stored in FilterMotifs.stats +# some statistics about the bedtool comparisons are stored in compareBed.stats # number of nucleotides input sum_data <- sum(data[[3]]-data[[2]]) # number of nucleotides after filter @@ -74,6 +82,6 @@ loss_nt <- formatC(1 - sum_filtered/sum_data, digits = 2) length_data <- formatC(mean(data[[3]]-data[[2]]), digits = 4) # mean -ength of footprints after filter length_filtered <- formatC(mean(data_filtered[[7]]), digits = 4) -stats <- data.frame(sum_nt_input = sum_data, sum_nt_filtered = sum_filtered, quotient_of_nt = difference_nt, loss_of_nt = loss_nt, mean_length_input = length_data, mean_length_filtered = length_filtered, flag_1_ratio = length(which(data_filtered$containsMaxpos == 1))/dim(data_filtered)[1]) +stats <- data.frame(sum_nt_input = sum_data, sum_nt_filtered = sum_filtered, quotient_of_nt = difference_nt, loss_of_nt = loss_nt, mean_length_input = length_data, mean_length_filtered = length_filtered, flag_1_ratio = length(which(data_filtered$contains_maxpos == 1))/dim(data_filtered)[1]) stats <- t(stats) write.table(stats, output_stats, col.names = FALSE, quote = FALSE, sep = '\t')