Skip to content

Commit

Permalink
max_pos calculation is done in one Rscript
Browse files Browse the repository at this point in the history
  • Loading branch information
JannikHamp authored Jan 11, 2019
1 parent f1a0690 commit 9144c1f
Showing 1 changed file with 17 additions and 9 deletions.
26 changes: 17 additions & 9 deletions bin/1.2_filter_motifs/compareBed_runinfo.R
Original file line number Diff line number Diff line change
Expand Up @@ -27,12 +27,29 @@ output <- args[5]
# sixth entry of args; stored as output_stats
output_stats <- args[6]

# read the filtered table (tab-separated) from input_filtered
data_filtered <- fread(input_filtered, sep='\t')
# data is the initial data before any comparisons have been done (-d parameter of compareBed.sh)
data <- fread(input_raw, sep='\t')

# abort if the filtered footprint table (data_filtered) has fewer than 9 columns;
# the steps below address its columns by position (2, 3, 4, 8)
if (ncol(data_filtered) < 9) {
stop("footprint file has less than 9 columns. exiting.")
}

# define the column names needed below:
#   raw data, column 2         -> "old_start"
#   filtered data, columns 2/3/8 -> "start", "end", "max_pos"
names(data)[2] <- "old_start"
names(data_filtered)[c(2, 3, 8)] <- c("start", "end", "max_pos")
# save the correct column order: all columns of data_filtered, followed by "old_start"
correct_order <- c(names(data_filtered), names(data)[2])
# compute the new max_pos (relevant for split footprints)
# left join (all.x=TRUE, all.y=FALSE): every row of data_filtered is kept and gets the
# "old_start" of the raw row whose 4th column equals its own 4th column;
# rows without a match get old_start = NA
data_filtered <- merge(x=data_filtered, y=data[,c(2,4)], by.x=names(data_filtered)[4], by.y=names(data)[4], all.x=TRUE, all.y=FALSE, sort=FALSE)
# merge() places the key column first, so restore the saved column order
data_filtered <- data_filtered[, correct_order, with = FALSE]
# shift max_pos by (old_start - start), then drop the helper column old_start
# presumably max_pos is an offset relative to the original start, so the result is
# the same position expressed relative to the new start -- TODO confirm
data_filtered[, max_pos := max_pos + old_start - start][, old_start := NULL]

# adding column "contains_maxpos", containing flag (0 or 1)
# max_pos is the position of maximum score of a footprint
# the flag is initialised to 0 and set to 1 where max_pos >= 0 and start + max_pos <= end
# NOTE(review): the upper bound is inclusive (<=); confirm this is intended if "end" is an exclusive coordinate
data_filtered <- cbind(data_filtered, contains_maxpos = 0)
data_filtered[max_pos >= 0 & start + max_pos <= end, contains_maxpos := 1]

# remove sequences that are smaller than minimum (parameter)
# remove sequences that are longer than maximum (parameter)
# keeps rows where (column 3 - column 2) >= min; "min" is not defined in this excerpt,
# presumably it is set from the script arguments above -- verify
data_filtered <- data_filtered[which(data_filtered[[3]] - data_filtered[[2]] >= min),]
Expand All @@ -50,17 +67,8 @@ data_filtered[match(duplicants, names), name := paste0(name,".0")]
# recalculate length of sequences
# column 7 is overwritten with (column 3 - column 2)
data_filtered[[7]] <- data_filtered[[3]] - data_filtered[[2]]

# adding column "contains_maxpos", containing flag (0 or 1)
# max_pos is the position of maximum score of a footprint
# NOTE(review): this listing is a rendered diff; a "contains_maxpos" column is also added
# earlier in the listing -- confirm which of the two versions is the current one
data_filtered <- cbind(data_filtered, contains_maxpos = 0)
# flag rows where column 2 <= column 8 and column 8 < column 3
data_filtered$contains_maxpos[intersect(which(data_filtered[[2]] <= data_filtered[[8]]), which(data_filtered[[3]] > data_filtered[[8]]))] = 1
# express column 8 relative to column 2
data_filtered[[8]] <- data_filtered[[8]] - data_filtered[[2]]

# write the table to the path in "output": tab-separated, no header row, no quoting
fwrite(data_filtered, output, col.names=FALSE, quote = FALSE, sep = '\t')

# data is the initial data before any comparisons have been done (-d parameter of compareBed.sh)
# NOTE(review): input_raw is also read into "data" earlier in this listing (rendered diff) --
# confirm that only one of the two reads remains in the current script
data <- fread(input_raw, sep='\t')

# some statistics about the bedtool comparisons are stored in FilterMotifs.stats
# number of nucleotides input
# sum over all rows of (column 3 - column 2) of the raw input
sum_data <- sum(data[[3]]-data[[2]])
Expand Down

0 comments on commit 9144c1f

Please sign in to comment.