Merge pull request #54 from loosolab/JannikHamp-patch-1

Jannik hamp patch 1
loosolab · Jan 11, 2019 · f4a745c · f4a745c
2 parents 58355d4 + cb16c3c
commit f4a745c
Show file tree

Hide file tree

Showing 2 changed files with 34 additions and 21 deletions.
diff --git a/bin/1.2_filter_motifs/compareBed.sh b/bin/1.2_filter_motifs/compareBed.sh
@@ -8,9 +8,9 @@
 #             BED-format. 
 
 #             For parameter description, run the script without parameters or -h.
-#             The output is a file with the filtered footprints and the log file FilterMotivs.stats
+#             The output is a file with the filtered footprints and the log file compareBed.stats
 
-# One R script is used, compareBed_runinfo.R, stored in the same directory.
+# One R scripts is used, compareBed_runinfo.R, stored in the same directory.
 
 # default parameters
 workdir=$PWD
@@ -173,6 +173,10 @@ then
 	echo "directory $workdir does not exist. Please check parameter -w / --workdir"
 	exit 1
 fi
+if [[ ${output: -4: -1} != '.bed' ]]
+then
+	output=`echo $output | sed "s|$|.bed|g"`
+fi
 
 # remove trailing tabs in footprints
 cat $data | sed 's/[ \t]*$//' > "$workdir"/filtered.bed
@@ -181,7 +185,7 @@ cat $data | sed 's/[ \t]*$//' > "$workdir"/filtered.bed
 if [ -d "$motifs" ]
 then
 	# creates an array of all files with bed in its name in the directory $motifs  
-	declare -a motiffiles=(`ls $motifs | grep bed | sed "s|^|$motifs\/|g" | tr '\n' ' ' | sed "s|//|/|g"`)
+	declare -a motiffiles=(`ls $motifs | grep -i '.bed$' | sed "s|^|$motifs\/|g" | tr '\n' ' ' | sed "s|//|/|g"`)
 
 # the else case means, that the motiffiles were passed comma separated with no whitespace.
 else
@@ -256,7 +260,7 @@ fi
 
 # remove short/long motivs, make unique ids (relevant for some splitted tfbs from subtract) and handle maxScorePosition
 # also creates a small output file with information about the comparison
-Rscript $path/compareBed_runinfo.R $min $max $data "$workdir"/filtered.bed "$workdir"/filtered_flagged.bed "$workdir"/FilterMotifs.stats
+Rscript $path/compareBed_runinfo.R $min $max $data "$workdir"/filtered.bed "$workdir"/filtered_flagged.bed "$workdir"/compareBed.stats
 # check if Rscript executed without errors
 if [ $? -gt 0 ] 
 then
@@ -273,17 +277,18 @@ then
 	fp_initial=`expr $fp_initial - 1`
 	fp_final=`cat "$workdir"/filtered.bed | wc -l`
 	fp_final=`expr $fp_final - 1`
-	echo $fp_initial | sed 's/^/initial number of footprints: /g' >> "$workdir"/FilterMotifs.stats
-	echo $fp_final | sed 's/^/number of footprints after subtract: /g' >> "$workdir"/FilterMotifs.stats
+	echo $fp_initial | sed 's/^/initial number of footprints: /g' >> "$workdir"/compareBed.stats
+	echo $fp_final | sed 's/^/number of footprints after subtract: /g' >> "$workdir"/compareBed.stats
 else
 	# output will be overwritten if it exists
 	rm -f $output
 	# add some final values to the log file
-	cat $data | wc -l | sed 's/^/initial number of footprints: /g' >> "$workdir"/FilterMotifs.stats
-	cat "$workdir"/filtered.bed | wc -l | sed 's/^/number of footprints after subtract: /g' >> "$workdir"/FilterMotifs.stats
+	cat $data | wc -l | sed 's/^/initial number of footprints: /g' >> "$workdir"/compareBed.stats
+	cat "$workdir"/filtered.bed | wc -l | sed 's/^/number of footprints after subtract: /g' >> "$workdir"/compareBed.stats
 fi
 
 # add fasta sequences to bed and create fasta file
+out_fasta=`echo $output | sed "s|.bed$|.fasta|g"`
 echo getting sequences from fasta-file
 bedtools getfasta -fi $fasta -bed "$workdir"/filtered_flagged.bed -bedOut >> $output
-bedtools getfasta -name -fi $fasta -bed "$output" -fo "$output".fasta
+bedtools getfasta -name -fi $fasta -bed "$output" -fo "$out_fasta"
diff --git a/bin/1.2_filter_motifs/compareBed_runinfo.R b/bin/1.2_filter_motifs/compareBed_runinfo.R
@@ -8,7 +8,7 @@
 #             adds a column with a flag "contains_maxpos"
 #             and creates a file with information of the bedtool comparison
 
-# parameters: Parameters are not named. Respect the parameter order.
+# parameters: Parameters are not named, respect the parameter order.
 #             min:            minimum footprint size threshold 
 #             max:            maximum footprint size threshold
 #             input_raw:      unfiltered BED-file
@@ -27,12 +27,29 @@ output <- args[5]
 output_stats <- args[6]
 
 data_filtered <- fread(input_filtered, sep='\t')
+# data is the initial data before any comparisons have been done (-d parameter of compareBed.sh)
+data <- fread(input_raw, sep='\t')
 
 # check if data has less than 9 columns
 if (ncol(data_filtered) < 9) {
 	stop("footprint file has less than 9 columns. exiting.")
 }
 
+# define needed col names
+names(data)[2] <- "old_start"
+names(data_filtered)[c(2, 3, 8)] <- c("start", "end", "max_pos")
+# save corect column order
+correct_order <- c(names(data_filtered), names(data)[2])
+# compute new maxpos (relevant for splitted footprints)
+data_filtered <- merge(x=data_filtered, y=data[,c(2,4)], by.x=names(data_filtered)[4], by.y=names(data)[4], all.x=TRUE, all.y=FALSE, sort=FALSE)
+data_filtered <- data_filtered[, correct_order, with = FALSE]
+data_filtered[, max_pos := max_pos + old_start - start][, old_start := NULL]
+
+# adding column "contains_maxpos", containing flag (0 or 1)
+# max_pos is the position of maximum score of a footprint
+data_filtered <- cbind(data_filtered, contains_maxpos = 0)
+data_filtered[max_pos >= 0 & start + max_pos <= end, contains_maxpos := 1]
+
 # remove sequences that are smaller than minimum (parameter)
 # remove sequences that are longer than maximum (parameter)
 data_filtered <- data_filtered[which(data_filtered[[3]] - data_filtered[[2]] >= min),]
@@ -50,18 +67,9 @@ data_filtered[match(duplicants, names), name := paste0(name,".0")]
 # recalculate length of sequences
 data_filtered[[7]] <- data_filtered[[3]] - data_filtered[[2]]
 
-# adding column "contains_maxpos", containing flag (0 or 1)
-# max_pos is the position of maximum score of a footprint
-data_filtered <- cbind(data_filtered, contains_maxpos = 0)
-data_filtered$contains_maxpos[intersect(which(data_filtered[[2]] <= data_filtered[[8]]), which(data_filtered[[3]] > data_filtered[[8]]))] = 1
-data_filtered[[8]] <- data_filtered[[8]] - data_filtered[[2]]
-
 fwrite(data_filtered, output, col.names=FALSE, quote = FALSE, sep = '\t')
 
-# data is the initial data before any comparisons have been done (-d parameter of compareBed.sh)
-data <- fread(input_raw, sep='\t')
-
-# some statistics about the bedtool comparisons are stored in FilterMotifs.stats
+# some statistics about the bedtool comparisons are stored in compareBed.stats
 # number of nucleotides input
 sum_data <- sum(data[[3]]-data[[2]])
 # number of nucleotides after filter
@@ -74,6 +82,6 @@ loss_nt <- formatC(1 - sum_filtered/sum_data, digits = 2)
 length_data <- formatC(mean(data[[3]]-data[[2]]), digits = 4)
 # mean -ength of footprints after filter
 length_filtered <- formatC(mean(data_filtered[[7]]), digits = 4)
-stats <- data.frame(sum_nt_input = sum_data, sum_nt_filtered = sum_filtered, quotient_of_nt = difference_nt, loss_of_nt = loss_nt, mean_length_input = length_data, mean_length_filtered = length_filtered, flag_1_ratio = length(which(data_filtered$containsMaxpos == 1))/dim(data_filtered)[1])
+stats <- data.frame(sum_nt_input = sum_data, sum_nt_filtered = sum_filtered, quotient_of_nt = difference_nt, loss_of_nt = loss_nt, mean_length_input = length_data, mean_length_filtered = length_filtered, flag_1_ratio = length(which(data_filtered$contains_maxpos == 1))/dim(data_filtered)[1])
 stats <- t(stats)
 write.table(stats, output_stats, col.names = FALSE, quote = FALSE, sep = '\t')