-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
updated version.. make unique updated, more robust in general
- Loading branch information
1 parent
6802d72
commit b787d14
Showing
2 changed files
with
65 additions
and
57 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
#!/bin/Rscript | ||
|
||
# author: Jannik Hamp | ||
# email: jannik.hamp@googlemail.com | ||
|
||
# desciption: This script gets called by compareBed.sh | ||
# It removes too small/large footprints, makes names unique, | ||
# adds a column with a flag "contains_maxpos" | ||
# and creates a file with information of the bedtool comparison | ||
|
||
# TODO: check number of columns, implement make.unique in a better way | ||
|
||
# parsing parameters | ||
library(data.table) | ||
args = commandArgs(TRUE) | ||
min = as.numeric(args[1]) | ||
max = as.numeric(args[2]) | ||
input_raw = args[3] | ||
input_filtered = args[4] | ||
output = args[5] | ||
|
||
data_filtered = fread(input_filtered) | ||
# check if data has less than 9 columns | ||
if (ncol(data_filtered) < 9) { | ||
stop("footprint file has less than 9 columns. exiting.") | ||
} | ||
|
||
# remove sequences that are smaller than minimum (parameter) | ||
# remove sequences that are longer than maximum (parameter) | ||
data_filtered = data_filtered[which(data_filtered[[3]] - data_filtered[[2]] >= min),] | ||
data_filtered = data_filtered[which(data_filtered[[3]] - data_filtered[[2]] <= max),] | ||
|
||
# make unique names and adjust length for splitted footprints | ||
# duplicated names have .0 , .1 , .2 ... added | ||
names=data_filtered[[4]] | ||
names(data_filtered)[4] = "name" | ||
# all duplicated names | ||
duplicants = unique(data_filtered[duplicated(name)][[4]]) | ||
data_filtered[[4]] = make.unique(as.character(data_filtered[[4]])) | ||
data_filtered[match(duplicants, names), name := paste0(name,".0")] | ||
|
||
# recalculate length of sequences | ||
data_filtered[[7]] = data_filtered[[3]] - data_filtered[[2]] | ||
|
||
# adding column "contains_maxpos", containing flag (0 or 1) | ||
# max_pos is the position of maximum score of a footprint | ||
data_filtered = cbind(data_filtered, contains_maxpos = 0) | ||
data_filtered$contains_maxpos[intersect(which(data_filtered[[2]] <= data_filtered[[8]]), which(data_filtered[[3]] > data_filtered[[8]]))] = 1 | ||
data_filtered[[8]] = data_filtered[[8]] - data_filtered[[2]] | ||
|
||
fwrite(data_filtered, output, col.names=FALSE, quote = FALSE, sep = '\t') | ||
|
||
# data is the initial data before any comparisons have been done (-d parameter of compareBed.sh) | ||
data = fread(input_raw) | ||
|
||
# some statistics about the bedtool comparisons are stored in FilterMotifs.stats | ||
sum_data = sum(data[[3]]-data[[2]]) | ||
sum_filtered = sum(data_filtered[[7]]) | ||
difference_nt = formatC(sum_data/sum_filtered, digits = 4) | ||
loss_nt = formatC(1 - sum_filtered/sum_data, digits = 2) | ||
length_data = formatC(mean(data[[3]]-data[[2]]), digits = 4) | ||
length_filtered = formatC(mean(data_filtered[[7]]), digits = 4) | ||
stats = data.frame(sum_nt_input = sum_data, sum_nt_filtered = sum_filtered, quotient_of_nt = difference_nt, loss_of_nt = loss_nt, mean_length_input = length_data, mean_length_filtered = length_filtered, flag_1_ratio = length(which(data_filtered$containsMaxpos == 1))/dim(data_filtered)[1]) | ||
stats = t(stats) | ||
write.table(stats, "./FilterMotifs.stats", col.names = FALSE, quote = FALSE, sep = '\t') |
This file was deleted.
Oops, something went wrong.