diff --git a/bin/1.2_filter_motifs/compareBed_runinfo.R b/bin/1.2_filter_motifs/compareBed_runinfo.R index b4f180f..f45b4a0 100644 --- a/bin/1.2_filter_motifs/compareBed_runinfo.R +++ b/bin/1.2_filter_motifs/compareBed_runinfo.R @@ -27,12 +27,29 @@ output <- args[5] output_stats <- args[6] data_filtered <- fread(input_filtered, sep='\t') +# data is the initial data before any comparisons have been done (-d parameter of compareBed.sh) +data <- fread(input_raw, sep='\t') # check if data has less than 9 columns if (ncol(data_filtered) < 9) { stop("footprint file has less than 9 columns. exiting.") } +# define needed column names +names(data)[2] <- "old_start" +names(data_filtered)[c(2, 3, 8)] <- c("start", "end", "max_pos") +# save correct column order +correct_order <- c(names(data_filtered), names(data)[2]) +# compute new max_pos (relevant for split footprints) +data_filtered <- merge(x=data_filtered, y=data[,c(2,4)], by.x=names(data_filtered)[4], by.y=names(data)[4], all.x=TRUE, all.y=FALSE, sort=FALSE) +data_filtered <- data_filtered[, correct_order, with = FALSE] +data_filtered[, max_pos := max_pos + old_start - start][, old_start := NULL] + +# adding column "contains_maxpos", containing flag (0 or 1) +# max_pos is the position of maximum score of a footprint +data_filtered <- cbind(data_filtered, contains_maxpos = 0) +data_filtered[max_pos >= 0 & start + max_pos <= end, contains_maxpos := 1] + # remove sequences that are smaller than minimum (parameter) # remove sequences that are longer than maximum (parameter) data_filtered <- data_filtered[which(data_filtered[[3]] - data_filtered[[2]] >= min),] @@ -50,17 +67,8 @@ data_filtered[match(duplicants, names), name := paste0(name,".0")] # recalculate length of sequences data_filtered[[7]] <- data_filtered[[3]] - data_filtered[[2]] -# adding column "contains_maxpos", containing flag (0 or 1) -# max_pos is the position of maximum score of a footprint -data_filtered <- cbind(data_filtered, contains_maxpos = 0) 
-data_filtered$contains_maxpos[intersect(which(data_filtered[[2]] <= data_filtered[[8]]), which(data_filtered[[3]] > data_filtered[[8]]))] = 1 -data_filtered[[8]] <- data_filtered[[8]] - data_filtered[[2]] - fwrite(data_filtered, output, col.names=FALSE, quote = FALSE, sep = '\t') -# data is the initial data before any comparisons have been done (-d parameter of compareBed.sh) -data <- fread(input_raw, sep='\t') - # some statistics about the bedtool comparisons are stored in FilterMotifs.stats # number of nucleotides input sum_data <- sum(data[[3]]-data[[2]])