Skip to content

Commit

Permalink
updated version.. make unique updated, more robust in general
Browse files Browse the repository at this point in the history
  • Loading branch information
JannikHamp authored Jan 8, 2019
1 parent 6802d72 commit b787d14
Show file tree
Hide file tree
Showing 2 changed files with 65 additions and 57 deletions.
65 changes: 65 additions & 0 deletions bin/1.2_filter_motifs/compareBed_runinfo.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
#!/bin/Rscript

# author: Jannik Hamp
# email: jannik.hamp@googlemail.com

# desciption: This script gets called by compareBed.sh
# It removes too small/large footprints, makes names unique,
# adds a column with a flag "contains_maxpos"
# and creates a file with information of the bedtool comparison

# TODO: check number of columns, implement make.unique in a better way

# parsing parameters
library(data.table)
args = commandArgs(TRUE)
min = as.numeric(args[1])
max = as.numeric(args[2])
input_raw = args[3]
input_filtered = args[4]
output = args[5]

data_filtered = fread(input_filtered)
# check if data has less than 9 columns
if (ncol(data_filtered) < 9) {
stop("footprint file has less than 9 columns. exiting.")
}

# remove sequences that are smaller than minimum (parameter)
# remove sequences that are longer than maximum (parameter)
data_filtered = data_filtered[which(data_filtered[[3]] - data_filtered[[2]] >= min),]
data_filtered = data_filtered[which(data_filtered[[3]] - data_filtered[[2]] <= max),]

# make unique names and adjust length for splitted footprints
# duplicated names have .0 , .1 , .2 ... added
names=data_filtered[[4]]
names(data_filtered)[4] = "name"
# all duplicated names
duplicants = unique(data_filtered[duplicated(name)][[4]])
data_filtered[[4]] = make.unique(as.character(data_filtered[[4]]))
data_filtered[match(duplicants, names), name := paste0(name,".0")]

# recalculate length of sequences
data_filtered[[7]] = data_filtered[[3]] - data_filtered[[2]]

# adding column "contains_maxpos", containing flag (0 or 1)
# max_pos is the position of maximum score of a footprint
data_filtered = cbind(data_filtered, contains_maxpos = 0)
data_filtered$contains_maxpos[intersect(which(data_filtered[[2]] <= data_filtered[[8]]), which(data_filtered[[3]] > data_filtered[[8]]))] = 1
data_filtered[[8]] = data_filtered[[8]] - data_filtered[[2]]

fwrite(data_filtered, output, col.names=FALSE, quote = FALSE, sep = '\t')

# data is the initial data before any comparisons have been done (-d parameter of compareBed.sh)
data = fread(input_raw)

# some statistics about the bedtool comparisons are stored in FilterMotifs.stats
sum_data = sum(data[[3]]-data[[2]])
sum_filtered = sum(data_filtered[[7]])
difference_nt = formatC(sum_data/sum_filtered, digits = 4)
loss_nt = formatC(1 - sum_filtered/sum_data, digits = 2)
length_data = formatC(mean(data[[3]]-data[[2]]), digits = 4)
length_filtered = formatC(mean(data_filtered[[7]]), digits = 4)
stats = data.frame(sum_nt_input = sum_data, sum_nt_filtered = sum_filtered, quotient_of_nt = difference_nt, loss_of_nt = loss_nt, mean_length_input = length_data, mean_length_filtered = length_filtered, flag_1_ratio = length(which(data_filtered$containsMaxpos == 1))/dim(data_filtered)[1])
stats = t(stats)
write.table(stats, "./FilterMotifs.stats", col.names = FALSE, quote = FALSE, sep = '\t')
57 changes: 0 additions & 57 deletions bin/1.2_filter_motifs/merge.R

This file was deleted.

0 comments on commit b787d14

Please sign in to comment.