Skip to content

Jannik hamp patch 1 #54

Merged
merged 15 commits into from
Jan 11, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
23 changes: 14 additions & 9 deletions bin/1.2_filter_motifs/compareBed.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@
# BED-format.

# For parameter description, run the script without parameters or -h.
# The output is a file with the filtered footprints and the log file FilterMotivs.stats
# The output is a file with the filtered footprints and the log file compareBed.stats

# One R script is used, compareBed_runinfo.R, stored in the same directory.
# One R scripts is used, compareBed_runinfo.R, stored in the same directory.

# default parameters
workdir=$PWD
Expand Down Expand Up @@ -173,6 +173,10 @@ then
echo "directory $workdir does not exist. Please check parameter -w / --workdir"
exit 1
fi
if [[ ${output: -4: -1} != '.bed' ]]
then
output=`echo $output | sed "s|$|.bed|g"`
fi

# remove trailing tabs in footprints
cat $data | sed 's/[ \t]*$//' > "$workdir"/filtered.bed
Expand All @@ -181,7 +185,7 @@ cat $data | sed 's/[ \t]*$//' > "$workdir"/filtered.bed
if [ -d "$motifs" ]
then
# creates an array of all files with bed in its name in the directory $motifs
declare -a motiffiles=(`ls $motifs | grep bed | sed "s|^|$motifs\/|g" | tr '\n' ' ' | sed "s|//|/|g"`)
declare -a motiffiles=(`ls $motifs | grep -i '.bed$' | sed "s|^|$motifs\/|g" | tr '\n' ' ' | sed "s|//|/|g"`)

# the else case means, that the motiffiles were passed comma separated with no whitespace.
else
Expand Down Expand Up @@ -256,7 +260,7 @@ fi

# remove short/long motivs, make unique ids (relevant for some splitted tfbs from subtract) and handle maxScorePosition
# also creates a small output file with information about the comparison
Rscript $path/compareBed_runinfo.R $min $max $data "$workdir"/filtered.bed "$workdir"/filtered_flagged.bed "$workdir"/FilterMotifs.stats
Rscript $path/compareBed_runinfo.R $min $max $data "$workdir"/filtered.bed "$workdir"/filtered_flagged.bed "$workdir"/compareBed.stats
# check if Rscript executed without errors
if [ $? -gt 0 ]
then
Expand All @@ -273,17 +277,18 @@ then
fp_initial=`expr $fp_initial - 1`
fp_final=`cat "$workdir"/filtered.bed | wc -l`
fp_final=`expr $fp_final - 1`
echo $fp_initial | sed 's/^/initial number of footprints: /g' >> "$workdir"/FilterMotifs.stats
echo $fp_final | sed 's/^/number of footprints after subtract: /g' >> "$workdir"/FilterMotifs.stats
echo $fp_initial | sed 's/^/initial number of footprints: /g' >> "$workdir"/compareBed.stats
echo $fp_final | sed 's/^/number of footprints after subtract: /g' >> "$workdir"/compareBed.stats
else
# output will be overwritten if it exists
rm -f $output
# add some final values to the log file
cat $data | wc -l | sed 's/^/initial number of footprints: /g' >> "$workdir"/FilterMotifs.stats
cat "$workdir"/filtered.bed | wc -l | sed 's/^/number of footprints after subtract: /g' >> "$workdir"/FilterMotifs.stats
cat $data | wc -l | sed 's/^/initial number of footprints: /g' >> "$workdir"/compareBed.stats
cat "$workdir"/filtered.bed | wc -l | sed 's/^/number of footprints after subtract: /g' >> "$workdir"/compareBed.stats
fi

# add fasta sequences to bed and create fasta file
out_fasta=`echo $output | sed "s|.bed$|.fasta|g"`
echo getting sequences from fasta-file
bedtools getfasta -fi $fasta -bed "$workdir"/filtered_flagged.bed -bedOut >> $output
bedtools getfasta -name -fi $fasta -bed "$output" -fo "$output".fasta
bedtools getfasta -name -fi $fasta -bed "$output" -fo "$out_fasta"
32 changes: 20 additions & 12 deletions bin/1.2_filter_motifs/compareBed_runinfo.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
# adds a column with a flag "contains_maxpos"
# and creates a file with information of the bedtool comparison

# parameters: Parameters are not named. Respect the parameter order.
# parameters: Parameters are not named, respect the parameter order.
# min: minimum footprint size threshold
# max: maximum footprint size threshold
# input_raw: unfiltered BED-file
Expand All @@ -27,12 +27,29 @@ output <- args[5]
output_stats <- args[6]

data_filtered <- fread(input_filtered, sep='\t')
# data is the initial data before any comparisons have been done (-d parameter of compareBed.sh)
data <- fread(input_raw, sep='\t')

# check if data has less than 9 columns
if (ncol(data_filtered) < 9) {
stop("footprint file has less than 9 columns. exiting.")
}

# define needed col names
names(data)[2] <- "old_start"
names(data_filtered)[c(2, 3, 8)] <- c("start", "end", "max_pos")
# save corect column order
correct_order <- c(names(data_filtered), names(data)[2])
# compute new maxpos (relevant for splitted footprints)
data_filtered <- merge(x=data_filtered, y=data[,c(2,4)], by.x=names(data_filtered)[4], by.y=names(data)[4], all.x=TRUE, all.y=FALSE, sort=FALSE)
data_filtered <- data_filtered[, correct_order, with = FALSE]
data_filtered[, max_pos := max_pos + old_start - start][, old_start := NULL]

# adding column "contains_maxpos", containing flag (0 or 1)
# max_pos is the position of maximum score of a footprint
data_filtered <- cbind(data_filtered, contains_maxpos = 0)
data_filtered[max_pos >= 0 & start + max_pos <= end, contains_maxpos := 1]

# remove sequences that are smaller than minimum (parameter)
# remove sequences that are longer than maximum (parameter)
data_filtered <- data_filtered[which(data_filtered[[3]] - data_filtered[[2]] >= min),]
Expand All @@ -50,18 +67,9 @@ data_filtered[match(duplicants, names), name := paste0(name,".0")]
# recalculate length of sequences
data_filtered[[7]] <- data_filtered[[3]] - data_filtered[[2]]

# adding column "contains_maxpos", containing flag (0 or 1)
# max_pos is the position of maximum score of a footprint
data_filtered <- cbind(data_filtered, contains_maxpos = 0)
data_filtered$contains_maxpos[intersect(which(data_filtered[[2]] <= data_filtered[[8]]), which(data_filtered[[3]] > data_filtered[[8]]))] = 1
data_filtered[[8]] <- data_filtered[[8]] - data_filtered[[2]]

fwrite(data_filtered, output, col.names=FALSE, quote = FALSE, sep = '\t')

# data is the initial data before any comparisons have been done (-d parameter of compareBed.sh)
data <- fread(input_raw, sep='\t')

# some statistics about the bedtool comparisons are stored in FilterMotifs.stats
# some statistics about the bedtool comparisons are stored in compareBed.stats
# number of nucleotides input
sum_data <- sum(data[[3]]-data[[2]])
# number of nucleotides after filter
Expand All @@ -74,6 +82,6 @@ loss_nt <- formatC(1 - sum_filtered/sum_data, digits = 2)
length_data <- formatC(mean(data[[3]]-data[[2]]), digits = 4)
# mean -ength of footprints after filter
length_filtered <- formatC(mean(data_filtered[[7]]), digits = 4)
stats <- data.frame(sum_nt_input = sum_data, sum_nt_filtered = sum_filtered, quotient_of_nt = difference_nt, loss_of_nt = loss_nt, mean_length_input = length_data, mean_length_filtered = length_filtered, flag_1_ratio = length(which(data_filtered$containsMaxpos == 1))/dim(data_filtered)[1])
stats <- data.frame(sum_nt_input = sum_data, sum_nt_filtered = sum_filtered, quotient_of_nt = difference_nt, loss_of_nt = loss_nt, mean_length_input = length_data, mean_length_filtered = length_filtered, flag_1_ratio = length(which(data_filtered$contains_maxpos == 1))/dim(data_filtered)[1])
stats <- t(stats)
write.table(stats, output_stats, col.names = FALSE, quote = FALSE, sep = '\t')