Skip to content

Commit

Permalink
more documentation, replace = with <-
Browse files Browse the repository at this point in the history
  • Loading branch information
JannikHamp authored Jan 9, 2019
1 parent 1a35ce5 commit 530627c
Showing 1 changed file with 32 additions and 26 deletions.
58 changes: 32 additions & 26 deletions bin/1.2_filter_motifs/compareBed_runinfo.R
Original file line number Diff line number Diff line change
Expand Up @@ -18,15 +18,15 @@

# parsing parameters
library(data.table)
args = commandArgs(TRUE)
min = as.numeric(args[1])
max = as.numeric(args[2])
input_raw = args[3]
input_filtered = args[4]
output = args[5]
output_stats = args[6]
# trailing command-line arguments, read by position:
#   1: min, 2: max, 3: input_raw, 4: input_filtered, 5: output, 6: output_stats
args <- commandArgs(TRUE)
# lower and upper bound for the sequence length (column 3 - column 2) filter
# NOTE(review): `min` and `max` share their names with base::min()/base::max();
# calls like min(x) still resolve to the functions, but renaming would be clearer
min <- as.numeric(args[1])
max <- as.numeric(args[2])
# path of the initial data (read with fread for the statistics)
input_raw <- args[3]
# path of the tab-separated filtered data that is processed below
input_filtered <- args[4]
# path the processed filtered data is written to (fwrite)
output <- args[5]
# path the statistics table is written to (write.table)
output_stats <- args[6]

data_filtered = fread(input_filtered, sep='\t')
data_filtered <- fread(input_filtered, sep='\t')

# check if data has less than 9 columns
if (ncol(data_filtered) < 9) {
Expand All @@ -35,39 +35,45 @@ if (ncol(data_filtered) < 9) {

# remove sequences that are shorter than the minimum (parameter)
# remove sequences that are longer than the maximum (parameter)
data_filtered = data_filtered[which(data_filtered[[3]] - data_filtered[[2]] >= min),]
data_filtered = data_filtered[which(data_filtered[[3]] - data_filtered[[2]] <= max),]
# keep only rows whose length (column 3 - column 2) lies within [min, max];
# which() drops rows where the comparison is NA
data_filtered <- data_filtered[
  which(
    data_filtered[[3]] - data_filtered[[2]] >= min &
      data_filtered[[3]] - data_filtered[[2]] <= max
  ),
]

# make unique names and adjust length for split footprints
# duplicated names have .0, .1, .2, ... added
names=data_filtered[[4]]
names(data_filtered)[4] = "name"
names <- data_filtered[[4]]
names(data_filtered)[4] <- "name"
# all duplicated names
duplicants = unique(data_filtered[duplicated(name)][[4]])
data_filtered[[4]] = make.unique(as.character(data_filtered[[4]]))
# duplicated() flags the second and later occurrences of a name;
# unique() reduces them to one entry per duplicated name
duplicants <- unique(data_filtered[duplicated(name)][[4]])
# make.unique() leaves the first occurrence of each name unchanged and
# appends .1, .2, ... to the later occurrences
data_filtered[[4]] <- make.unique(as.character(data_filtered[[4]]))
# match() gives the first position of each duplicated name in the original
# name vector `names`, i.e. the rows make.unique() left without a suffix;
# those rows get ".0" appended
data_filtered[match(duplicants, names), name := paste0(name,".0")]

# recalculate length of sequences
data_filtered[[7]] = data_filtered[[3]] - data_filtered[[2]]
data_filtered[[7]] <- data_filtered[[3]] - data_filtered[[2]]

# adding column "contains_maxpos", containing flag (0 or 1)
# max_pos is the position of maximum score of a footprint
data_filtered = cbind(data_filtered, contains_maxpos = 0)
# append the flag column, initialised to 0 for every row
data_filtered <- cbind(data_filtered, contains_maxpos = 0)
# rows where column 2 <= column 8 (max_pos) < column 3; rows with an NA
# comparison are dropped by which() and keep the flag 0
has_maxpos <- which(
  data_filtered[[2]] <= data_filtered[[8]] &
    data_filtered[[3]] > data_filtered[[8]]
)
data_filtered$contains_maxpos[has_maxpos] <- 1
data_filtered[[8]] = data_filtered[[8]] - data_filtered[[2]]
data_filtered[[8]] <- data_filtered[[8]] - data_filtered[[2]]

fwrite(data_filtered, output, col.names=FALSE, quote = FALSE, sep = '\t')

# data is the initial data before any comparisons have been done (-d parameter of compareBed.sh)
data = fread(input_raw, sep='\t')
data <- fread(input_raw, sep='\t')

# some statistics about the bedtool comparisons are stored in FilterMotifs.stats
sum_data = sum(data[[3]]-data[[2]])
sum_filtered = sum(data_filtered[[7]])
difference_nt = formatC(sum_data/sum_filtered, digits = 4)
loss_nt = formatC(1 - sum_filtered/sum_data, digits = 2)
length_data = formatC(mean(data[[3]]-data[[2]]), digits = 4)
length_filtered = formatC(mean(data_filtered[[7]]), digits = 4)
stats = data.frame(sum_nt_input = sum_data, sum_nt_filtered = sum_filtered, quotient_of_nt = difference_nt, loss_of_nt = loss_nt, mean_length_input = length_data, mean_length_filtered = length_filtered, flag_1_ratio = length(which(data_filtered$containsMaxpos == 1))/dim(data_filtered)[1])
stats = t(stats)
# number of nucleotides input
sum_data <- sum(data[[3]] - data[[2]])
# number of nucleotides after filter
sum_filtered <- sum(data_filtered[[7]])
# quotient: sum_data/sum_filtered
difference_nt <- formatC(sum_data / sum_filtered, digits = 4)
# loss: 1 - sum_filtered/sum_data
loss_nt <- formatC(1 - sum_filtered / sum_data, digits = 2)
# mean length of footprints input
length_data <- formatC(mean(data[[3]] - data[[2]]), digits = 4)
# mean length of footprints after filter
length_filtered <- formatC(mean(data_filtered[[7]]), digits = 4)
# share of filtered footprints whose contains_maxpos flag is 1.
# The flag column is named "contains_maxpos"; the former lookup of
# `containsMaxpos` returned NULL, which made this ratio always 0.
flag_1_ratio <- sum(data_filtered[["contains_maxpos"]] == 1) /
  nrow(data_filtered)
stats <- data.frame(
  sum_nt_input = sum_data,
  sum_nt_filtered = sum_filtered,
  quotient_of_nt = difference_nt,
  loss_of_nt = loss_nt,
  mean_length_input = length_data,
  mean_length_filtered = length_filtered,
  flag_1_ratio = flag_1_ratio
)
# transpose so that each statistic becomes one "name<TAB>value" row
stats <- t(stats)
write.table(stats, output_stats, col.names = FALSE, quote = FALSE, sep = '\t')

0 comments on commit 530627c

Please sign in to comment.