From 530627cc4499cd5c176abd4a7e047942a89c91a9 Mon Sep 17 00:00:00 2001 From: JannikHamp Date: Wed, 9 Jan 2019 15:55:52 +0100 Subject: [PATCH] more documentation, replace =, <- --- bin/1.2_filter_motifs/compareBed_runinfo.R | 58 ++++++++++++---------- 1 file changed, 32 insertions(+), 26 deletions(-) diff --git a/bin/1.2_filter_motifs/compareBed_runinfo.R b/bin/1.2_filter_motifs/compareBed_runinfo.R index 4c5686d..4888c02 100644 --- a/bin/1.2_filter_motifs/compareBed_runinfo.R +++ b/bin/1.2_filter_motifs/compareBed_runinfo.R @@ -18,15 +18,15 @@ # parsing parameters library(data.table) -args = commandArgs(TRUE) -min = as.numeric(args[1]) -max = as.numeric(args[2]) -input_raw = args[3] -input_filtered = args[4] -output = args[5] -output_stats = args[6] +args <- commandArgs(TRUE) +min <- as.numeric(args[1]) +max <- as.numeric(args[2]) +input_raw <- args[3] +input_filtered <- args[4] +output <- args[5] +output_stats <- args[6] -data_filtered = fread(input_filtered, sep='\t') +data_filtered <- fread(input_filtered, sep='\t') # check if data has less than 9 columns if (ncol(data_filtered) < 9) { @@ -35,39 +35,45 @@ if (ncol(data_filtered) < 9) { # remove sequences that are smaller than minimum (parameter) # remove sequences that are longer than maximum (parameter) -data_filtered = data_filtered[which(data_filtered[[3]] - data_filtered[[2]] >= min),] -data_filtered = data_filtered[which(data_filtered[[3]] - data_filtered[[2]] <= max),] +data_filtered <- data_filtered[which(data_filtered[[3]] - data_filtered[[2]] >= min),] +data_filtered <- data_filtered[which(data_filtered[[3]] - data_filtered[[2]] <= max),] # make unique names and adjust length for splitted footprints # duplicated names have .0 , .1 , .2 ... 
added -names=data_filtered[[4]] -names(data_filtered)[4] = "name" +names <- data_filtered[[4]] +names(data_filtered)[4] <- "name" # all duplicated names -duplicants = unique(data_filtered[duplicated(name)][[4]]) -data_filtered[[4]] = make.unique(as.character(data_filtered[[4]])) +duplicants <- unique(data_filtered[duplicated(name)][[4]]) +data_filtered[[4]] <- make.unique(as.character(data_filtered[[4]])) data_filtered[match(duplicants, names), name := paste0(name,".0")] # recalculate length of sequences -data_filtered[[7]] = data_filtered[[3]] - data_filtered[[2]] +data_filtered[[7]] <- data_filtered[[3]] - data_filtered[[2]] # adding column "contains_maxpos", containing flag (0 or 1) # max_pos is the position of maximum score of a footprint -data_filtered = cbind(data_filtered, contains_maxpos = 0) +data_filtered <- cbind(data_filtered, contains_maxpos = 0) data_filtered$contains_maxpos[intersect(which(data_filtered[[2]] <= data_filtered[[8]]), which(data_filtered[[3]] > data_filtered[[8]]))] = 1 -data_filtered[[8]] = data_filtered[[8]] - data_filtered[[2]] +data_filtered[[8]] <- data_filtered[[8]] - data_filtered[[2]] fwrite(data_filtered, output, col.names=FALSE, quote = FALSE, sep = '\t') # data is the initial data before any comparisons have been done (-d parameter of compareBed.sh) -data = fread(input_raw, sep='\t') +data <- fread(input_raw, sep='\t') # some statistics about the bedtool comparisons are stored in FilterMotifs.stats -sum_data = sum(data[[3]]-data[[2]]) -sum_filtered = sum(data_filtered[[7]]) -difference_nt = formatC(sum_data/sum_filtered, digits = 4) -loss_nt = formatC(1 - sum_filtered/sum_data, digits = 2) -length_data = formatC(mean(data[[3]]-data[[2]]), digits = 4) -length_filtered = formatC(mean(data_filtered[[7]]), digits = 4) -stats = data.frame(sum_nt_input = sum_data, sum_nt_filtered = sum_filtered, quotient_of_nt = difference_nt, loss_of_nt = loss_nt, mean_length_input = length_data, mean_length_filtered = length_filtered, 
flag_1_ratio = length(which(data_filtered$containsMaxpos == 1))/dim(data_filtered)[1]) -stats = t(stats) +# number of nucleotides input +sum_data <- sum(data[[3]]-data[[2]]) +# number of nucleotides after filter +sum_filtered <- sum(data_filtered[[7]]) +# quotient: sum_data/sum_filtered +difference_nt <- formatC(sum_data/sum_filtered, digits = 4) +# loss: 1 - sum_filtered/sum_data +loss_nt <- formatC(1 - sum_filtered/sum_data, digits = 2) +# mean length of footprints input +length_data <- formatC(mean(data[[3]]-data[[2]]), digits = 4) +# mean length of footprints after filter +length_filtered <- formatC(mean(data_filtered[[7]]), digits = 4) +stats <- data.frame(sum_nt_input = sum_data, sum_nt_filtered = sum_filtered, quotient_of_nt = difference_nt, loss_of_nt = loss_nt, mean_length_input = length_data, mean_length_filtered = length_filtered, flag_1_ratio = length(which(data_filtered$contains_maxpos == 1))/dim(data_filtered)[1]) +stats <- t(stats) write.table(stats, output_stats, col.names = FALSE, quote = FALSE, sep = '\t')