Skip to content

Commit

Permalink
Merge pull request #54 from loosolab/JannikHamp-patch-1
Browse files Browse the repository at this point in the history
Jannik hamp patch 1
  • Loading branch information
JannikHamp authored Jan 11, 2019
2 parents 58355d4 + cb16c3c commit f4a745c
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 21 deletions.
23 changes: 14 additions & 9 deletions bin/1.2_filter_motifs/compareBed.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@
# BED-format.

# For parameter description, run the script without parameters or -h.
# The output is a file with the filtered footprints and the log file FilterMotivs.stats
# The output is a file with the filtered footprints and the log file compareBed.stats

# One R script is used, compareBed_runinfo.R, stored in the same directory.
# One R scripts is used, compareBed_runinfo.R, stored in the same directory.

# default parameters
workdir=$PWD
Expand Down Expand Up @@ -173,6 +173,10 @@ then
echo "directory $workdir does not exist. Please check parameter -w / --workdir"
exit 1
fi
if [[ ${output: -4: -1} != '.bed' ]]
then
output=`echo $output | sed "s|$|.bed|g"`
fi

# remove trailing tabs in footprints
cat $data | sed 's/[ \t]*$//' > "$workdir"/filtered.bed
Expand All @@ -181,7 +185,7 @@ cat $data | sed 's/[ \t]*$//' > "$workdir"/filtered.bed
if [ -d "$motifs" ]
then
# creates an array of all files with bed in its name in the directory $motifs
declare -a motiffiles=(`ls $motifs | grep bed | sed "s|^|$motifs\/|g" | tr '\n' ' ' | sed "s|//|/|g"`)
declare -a motiffiles=(`ls $motifs | grep -i '.bed$' | sed "s|^|$motifs\/|g" | tr '\n' ' ' | sed "s|//|/|g"`)

# the else case means, that the motiffiles were passed comma separated with no whitespace.
else
Expand Down Expand Up @@ -256,7 +260,7 @@ fi

# remove short/long motivs, make unique ids (relevant for some splitted tfbs from subtract) and handle maxScorePosition
# also creates a small output file with information about the comparison
Rscript $path/compareBed_runinfo.R $min $max $data "$workdir"/filtered.bed "$workdir"/filtered_flagged.bed "$workdir"/FilterMotifs.stats
Rscript $path/compareBed_runinfo.R $min $max $data "$workdir"/filtered.bed "$workdir"/filtered_flagged.bed "$workdir"/compareBed.stats
# check if Rscript executed without errors
if [ $? -gt 0 ]
then
Expand All @@ -273,17 +277,18 @@ then
fp_initial=`expr $fp_initial - 1`
fp_final=`cat "$workdir"/filtered.bed | wc -l`
fp_final=`expr $fp_final - 1`
echo $fp_initial | sed 's/^/initial number of footprints: /g' >> "$workdir"/FilterMotifs.stats
echo $fp_final | sed 's/^/number of footprints after subtract: /g' >> "$workdir"/FilterMotifs.stats
echo $fp_initial | sed 's/^/initial number of footprints: /g' >> "$workdir"/compareBed.stats
echo $fp_final | sed 's/^/number of footprints after subtract: /g' >> "$workdir"/compareBed.stats
else
# output will be overwritten if it exists
rm -f $output
# add some final values to the log file
cat $data | wc -l | sed 's/^/initial number of footprints: /g' >> "$workdir"/FilterMotifs.stats
cat "$workdir"/filtered.bed | wc -l | sed 's/^/number of footprints after subtract: /g' >> "$workdir"/FilterMotifs.stats
cat $data | wc -l | sed 's/^/initial number of footprints: /g' >> "$workdir"/compareBed.stats
cat "$workdir"/filtered.bed | wc -l | sed 's/^/number of footprints after subtract: /g' >> "$workdir"/compareBed.stats
fi

# add fasta sequences to bed and create fasta file
out_fasta=`echo $output | sed "s|.bed$|.fasta|g"`
echo getting sequences from fasta-file
bedtools getfasta -fi $fasta -bed "$workdir"/filtered_flagged.bed -bedOut >> $output
bedtools getfasta -name -fi $fasta -bed "$output" -fo "$output".fasta
bedtools getfasta -name -fi $fasta -bed "$output" -fo "$out_fasta"
32 changes: 20 additions & 12 deletions bin/1.2_filter_motifs/compareBed_runinfo.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
# adds a column with a flag "contains_maxpos"
# and creates a file with information of the bedtool comparison

# parameters: Parameters are not named. Respect the parameter order.
# parameters: Parameters are not named, respect the parameter order.
# min: minimum footprint size threshold
# max: maximum footprint size threshold
# input_raw: unfiltered BED-file
Expand All @@ -27,12 +27,29 @@ output <- args[5]
output_stats <- args[6]

data_filtered <- fread(input_filtered, sep='\t')
# data is the initial data before any comparisons have been done (-d parameter of compareBed.sh)
data <- fread(input_raw, sep='\t')

# check if data has less than 9 columns
if (ncol(data_filtered) < 9) {
stop("footprint file has less than 9 columns. exiting.")
}

# define needed col names
names(data)[2] <- "old_start"
names(data_filtered)[c(2, 3, 8)] <- c("start", "end", "max_pos")
# save corect column order
correct_order <- c(names(data_filtered), names(data)[2])
# compute new maxpos (relevant for splitted footprints)
data_filtered <- merge(x=data_filtered, y=data[,c(2,4)], by.x=names(data_filtered)[4], by.y=names(data)[4], all.x=TRUE, all.y=FALSE, sort=FALSE)
data_filtered <- data_filtered[, correct_order, with = FALSE]
data_filtered[, max_pos := max_pos + old_start - start][, old_start := NULL]

# adding column "contains_maxpos", containing flag (0 or 1)
# max_pos is the position of maximum score of a footprint
data_filtered <- cbind(data_filtered, contains_maxpos = 0)
data_filtered[max_pos >= 0 & start + max_pos <= end, contains_maxpos := 1]

# remove sequences that are smaller than minimum (parameter)
# remove sequences that are longer than maximum (parameter)
data_filtered <- data_filtered[which(data_filtered[[3]] - data_filtered[[2]] >= min),]
Expand All @@ -50,18 +67,9 @@ data_filtered[match(duplicants, names), name := paste0(name,".0")]
# recalculate length of sequences
data_filtered[[7]] <- data_filtered[[3]] - data_filtered[[2]]

# adding column "contains_maxpos", containing flag (0 or 1)
# max_pos is the position of maximum score of a footprint
data_filtered <- cbind(data_filtered, contains_maxpos = 0)
data_filtered$contains_maxpos[intersect(which(data_filtered[[2]] <= data_filtered[[8]]), which(data_filtered[[3]] > data_filtered[[8]]))] = 1
data_filtered[[8]] <- data_filtered[[8]] - data_filtered[[2]]

fwrite(data_filtered, output, col.names=FALSE, quote = FALSE, sep = '\t')

# data is the initial data before any comparisons have been done (-d parameter of compareBed.sh)
data <- fread(input_raw, sep='\t')

# some statistics about the bedtool comparisons are stored in FilterMotifs.stats
# some statistics about the bedtool comparisons are stored in compareBed.stats
# number of nucleotides input
sum_data <- sum(data[[3]]-data[[2]])
# number of nucleotides after filter
Expand All @@ -74,6 +82,6 @@ loss_nt <- formatC(1 - sum_filtered/sum_data, digits = 2)
length_data <- formatC(mean(data[[3]]-data[[2]]), digits = 4)
# mean -ength of footprints after filter
length_filtered <- formatC(mean(data_filtered[[7]]), digits = 4)
stats <- data.frame(sum_nt_input = sum_data, sum_nt_filtered = sum_filtered, quotient_of_nt = difference_nt, loss_of_nt = loss_nt, mean_length_input = length_data, mean_length_filtered = length_filtered, flag_1_ratio = length(which(data_filtered$containsMaxpos == 1))/dim(data_filtered)[1])
stats <- data.frame(sum_nt_input = sum_data, sum_nt_filtered = sum_filtered, quotient_of_nt = difference_nt, loss_of_nt = loss_nt, mean_length_input = length_data, mean_length_filtered = length_filtered, flag_1_ratio = length(which(data_filtered$contains_maxpos == 1))/dim(data_filtered)[1])
stats <- t(stats)
write.table(stats, output_stats, col.names = FALSE, quote = FALSE, sep = '\t')

0 comments on commit f4a745c

Please sign in to comment.