Skip to content
Permalink
689331e5f4
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
247 lines (196 sloc) 10.5 KB
#! /bin/Rscript
library("optparse")
option_list <- list(
make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input bed-file. Last column must be sequences.", metavar = "character"),
make_option(opt_str = c("-k", "--kmer"), default = 10, help = "Kmer length. Default = %default", metavar = "integer"),
make_option(opt_str = c("-m", "--motif"), default = 10, help = "Estimated motif length. Default = %default", metavar = "integer"),
make_option(opt_str = c("-o", "--output"), default = "reduced.bed", help = "Output file. Default = %default", metavar = "character"),
make_option(opt_str = c("-t", "--threads"), default = 1, help = "Number of threads to use. Use -1 for all available cores. Default = %default", metavar = "integer"),
make_option(opt_str = c("-c", "--clean"), default = TRUE, help = "Delete all temporary files. Default = %default", metavar = "logical"),
make_option(opt_str = c("-s", "--min_seq_length"), default = NULL, help = "Remove sequences below this length. Defaults to the maximum value of motif and kmer and can not be lower.", metavar = "integer", type = "integer"),
make_option(opt_str = c("-ko", "--minoverlap_kmer"), default = NULL, help = "Minimum required overlap between kmer to merge kmer. Used to create reduced sequence ranges. Can not be greater than kmer length. Default = kmer - 1", metavar = "integer", type = "integer"),
make_option(opt_str = c("-mo", "--minoverlap_motif"), default = NULL, help = "Minimum required overlap between motif and kmer to consider kmer significant. Used for kmer cutoff calculation. Can not be greater than motif and kmer length. Default = ceiling(motif / 2)", metavar = "integer", type = "integer")
# TODO more args
)
opt_parser <- OptionParser(option_list = option_list,
description = "Reduce sequences to frequent regions.")
opt <- parse_args(opt_parser)
#' Reduce bed file to conserved regions
#'
#' @param input bed file
#' @param kmer Length of kmer.
#' @param motif Estimated motif length.
#' @param output Output file
#' @param threads Number of threads.
#' @param clean Delete all temporary files.
#' @param minoverlap_kmer Minimum required overlap between kmer to merge kmer. Used to create reduced sequence ranges. Can not be greater than kmer length. Default = kmer - 1
#' @param minoverlap_motif Minimum required overlap between motif and kmer to consider kmer significant. Used for kmer cutoff calculation. Can not be greater than motif and kmer length. Default = ceiling(motif / 2)
#' @param min_seq_length Must be greater or equal to kmer and motif. Default = max(c(motif, kmer)).
#'
#' @return reduced bed
#' TODO check whether jellyfish is installed
reduce_bed <- function(input, kmer = 10, motif = 10, output = "reduced.bed", threads = NULL, clean = TRUE, minoverlap_kmer = kmer - 1, minoverlap_motif = ceiling(motif / 2), min_seq_length = max(c(motif, kmer))) {
# get number of available cores
if (threads == -1) {
threads <- parallel::detectCores()
}
message("Loading bed...")
# load bed
# columns: chr, start, end, name, ..., sequence
bed_table <- data.table::fread(input = input, header = FALSE)
names(bed_table)[1:4] <- c("chr", "start", "end", "name")
names(bed_table)[ncol(bed_table)] <- "sequence"
# index
data.table::setkey(bed_table, name, physical = FALSE)
# check for duplicated names
if (anyDuplicated(bed_table[, "name"])) {
warning("Found duplicated names. Making names unique.")
bed_table[, name := make.unique(name)]
}
# remove sequences below minimum length
if (min_seq_length < max(c(kmer, motif))) {
stop("Minimum sequence length must be greater or equal to ", max(c(motif, kmer)), " (maximum value of kmer and motif).")
}
total_rows <- nrow(bed_table)
bed_table <- bed_table[nchar(sequence) > min_seq_length]
if (total_rows > nrow(bed_table)) {
message("Removed ", total_rows - nrow(bed_table), " sequence(s) below minimum length of ", min_seq_length)
}
# TODO forward fasta file as parameter so no bed -> fasta conversion is needed.
message("Writing fasta...")
# save as fasta
fasta_file <- paste0(basename(input), ".fasta")
seqinr::write.fasta(sequences = as.list(bed_table[[ncol(bed_table)]]), names = bed_table[[4]], as.string = TRUE, file.out = fasta_file)
message("Counting kmer...")
# count k-mer
hashsize <- 4 ^ kmer
count_output_binary <- "mer_count_binary.jf"
input <- fasta_file
jellyfish_call <- paste("jellyfish count ", "-m", kmer, "-s", hashsize, "-o", count_output_binary, input)
system(command = jellyfish_call, wait = TRUE)
mer_count_table <- "mer_count_table.jf"
jellyfish_dump_call <- paste("jellyfish dump --column --tab --output", mer_count_table, count_output_binary)
system(command = jellyfish_dump_call, wait = TRUE)
message("Reduce kmer.")
# load mer table
# columns: kmer, count
kmer_counts <- data.table::fread(input = mer_count_table, header = FALSE)
# order kmer descending
data.table::setorder(kmer_counts, -V2)
# compute number of hits to keep
keep_hits <- significant_kmer(bed_table, kmer = kmer, motif = motif, minoverlap = minoverlap_motif)
# reduce kmer
reduced_kmer <- reduce_kmer(kmer = kmer_counts, significant = keep_hits)
message("Find kmer in sequences.")
# find k-mer in sequences
# TODO minoverlap as parameter
# columns: name, start, end, width
ranges_table <- find_kmer_regions(bed = bed_table, kmer_counts = reduced_kmer, minoverlap = minoverlap_kmer, threads = threads)
names(ranges_table)[1:2] <- c("relative_start", "relative_end")
# merge ranged_table with bed_table + keep column order
merged <- merge(x = bed_table, y = ranges_table, by = "name", sort = FALSE)[, union(names(bed_table), names(ranges_table)), with = FALSE]
# delete sequences without hit
merged <- na.omit(merged, cols = c("relative_start", "relative_end"))
message("Removed ", nrow(bed_table) - nrow(merged), " sequence(s) without hit.")
message("Reduce sequences.")
# create subsequences
merged[, sequence := stringr::str_sub(sequence, relative_start, relative_end)]
# bed files count from 0
merged[, `:=`(relative_start = relative_start - 1, relative_end = relative_end - 1)]
# change start end location
merged[, `:=`(start = start + relative_start, end = start + relative_end)]
# clean table
merged[, `:=`(relative_start = NULL, relative_end = NULL, width = NULL)]
if (clean) {
file.remove(fasta_file, count_output_binary, mer_count_table)
}
data.table::fwrite(merged, file = output, sep = "\t", col.names = FALSE)
}
#' returns sum of top x kmer frequencies to keep
#'
#' @param bed Bed table with sequences in last column
#' @param kmer Length of kmer
#' @param motif Length of motif
#' @param minoverlap Minimum number of bases overlapping between kmer and motif. Must be <= motif & <= kmer. Defaults to ceiling(motif / 2).
#'
#' @return Number of interesting kmer.
significant_kmer <- function(bed, kmer, motif, minoverlap = ceiling(motif / 2)) {
if (minoverlap > kmer || minoverlap > motif) {
stop("Kmer & motif must be greater or equal to minoverlap!")
}
# minimum sequence length to get all interesting overlaps
min_seq_length <- motif + 2 * (kmer - minoverlap)
seq_lengths <- nchar(bed[[ncol(bed)]])
# reduce to max interesting length
seq_lengths <- ifelse(seq_lengths > min_seq_length, min_seq_length, seq_lengths)
# calculate max possible kmer
topx <- sum(seq_lengths - kmer + 1)
return(topx)
}
#' @param kmer Kmer table
#' @param significant
reduce_kmer <- function(kmer, significant) {
kmer[, cumsum := cumsum(V2)]
return(kmer[cumsum <= significant])
}
#' create list of significant ranges (one for each bed entry)
#'
#' @param bed Data.table of bed with sequence in last column
#' @param kmer_counts Data.table of counted kmer. Column1 = kmer, column2 = count.
#' @param minoverlap Minimum overlapping nucleotides between kmers to be merged. Positive integer. Must be smaller than kmer length.
#' @param threads Number of threads.
#'
#' @return Data.table with relative positions and width (start, end, width).
find_kmer_regions <- function(bed, kmer_counts, minoverlap = 1 , threads = NULL) {
if (nchar(kmer_counts[1, 1]) <= minoverlap) {
stop("Minoverlap must be smaller than kmer length!")
}
names(kmer_counts)[1:2] <- c("kmer", "count")
data.table::setorder(kmer_counts, -count)
seq_ranges <- pbapply::pblapply(seq_len(nrow(bed)), cl = threads, function(x) {
seq <- bed[x][[ncol(bed)]]
#### locate ranges
ranges <- data.table::data.table(do.call(rbind, stringi::stri_locate_all_fixed(seq, pattern = kmer_counts[[1]])))
ranges <- na.omit(ranges, cols = c("start", "end"))
if (nrow(ranges) < 1) {
return(data.table::data.table(start = NA, end = NA, width = NA))
}
# add kmer sequences
ranges[, sub_seq := stringr::str_sub(seq, start, end)]
# add kmer count
ranges[, count := kmer_counts[ranges[["sub_seq"]], "count", on = "kmer"]]
#### reduce ranges
reduced_ranges <- IRanges::IRanges(start = ranges[["start"]], end = ranges[["end"]], names = ranges[["sub_seq"]])
# list of overlapping ranges
edge_list <- as.matrix(IRanges::findOverlaps(reduced_ranges, minoverlap = minoverlap, drop.self = FALSE, drop.redundant = TRUE))
# get components (groups of connected ranges)
graph <- igraph::graph_from_edgelist(edge_list, directed = FALSE)
# vector of node membership (indices correspond to ranges above)
member <- as.factor(igraph::components(graph)$membership)
# list of membership vectors
node_membership <- lapply(levels(member), function(x) {
which(member == x)
})
# calculate component score (= sum of kmer count)
score <- vapply(node_membership, FUN.VALUE = numeric(1), function(x) {
sum(kmer_counts[x, "count"])
})
selected_ranges <- node_membership[[which(score == max(score))[1]]]
# reduce selected ranges
reduced_ranges <- IRanges::reduce(reduced_ranges[selected_ranges])
reduced_ranges <- data.table::as.data.table(reduced_ranges)
return(reduced_ranges)
})
# create ranges table
conserved_regions_table <- data.table::rbindlist(seq_ranges)
conserved_regions_table[, name := bed[[4]]]
return(conserved_regions_table)
}
# call function with given parameter if not in interactive context (e.g. run from shell)
if (!interactive()) {
# show apply progressbar
pbo <- pbapply::pboptions(type = "timer")
# remove last parameter (help param)
params <- opt[-length(opt)]
do.call(reduce_bed, args = params)
}