Skip to content
Permalink
53b0857b4a
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
369 lines (297 sloc) 16.1 KB
#! /bin/Rscript
if (!require(optparse, quietly = TRUE)) install.packages("optparse"); library(optparse)
option_list <- list(
make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input bed-file. Last column must be sequences.", metavar = "character"),
make_option(opt_str = c("-k", "--kmer"), default = 10, help = "K-mer length. Default = %default", metavar = "integer"),
make_option(opt_str = c("-m", "--motif"), default = 10, help = "Estimated motif length. Default = %default", metavar = "integer"),
make_option(opt_str = c("-o", "--output"), default = "reduced.bed", help = "Output file. Default = %default", metavar = "character"),
make_option(opt_str = c("-t", "--threads"), default = 1, help = "Number of threads to use. Use 0 for all available cores. Default = %default", metavar = "integer"),
make_option(opt_str = c("-c", "--clean"), default = TRUE, help = "Delete all temporary files. Default = %default", metavar = "logical"),
make_option(opt_str = c("-s", "--min_seq_length"), default = NULL, help = "Remove sequences below this length. Defaults to the maximum value of motif and k-mer and can not be lower.", metavar = "integer", type = "integer"),
make_option(opt_str = c("-n", "--minoverlap_kmer"), default = NULL, help = "Minimum required overlap between k-mer. Used to create reduced sequence ranges out of merged k-mer. Can not be greater than k-mer length. Default = kmer - 1", metavar = "integer", type = "integer"),
make_option(opt_str = c("-v", "--minoverlap_motif"), default = NULL, help = "Minimum required overlap between motif and k-mer to consider k-mer significant. Used for k-mer cutoff calculation. Can not be greater than motif and k-mer length. Default = ceiling(motif / 2)", metavar = "integer", type = "integer"),
make_option(opt_str = c("-f", "--motif_occurrence"), default = 1, help = "Define how many motifs are expected per sequence. This value is used during k-mer cutoff calculation. Default = %default meaning that there should be approximately one motif per sequence.", metavar = "double"),
make_option(opt_str = c("--summary"), default = NULL, help = "Filepath where a small summary file of the run should be written. Default = %default", metavar = "character")
)
opt_parser <- OptionParser(option_list = option_list,
description = "Reduces each sequence to its most frequent region.",
epilogue = "Author: Hendrik Schultheis <Hendrik.Schultheis@mpi-bn.mpg.de>")
opt <- parse_args(opt_parser)
#' Reduces each sequence to its most frequent region.
#'
#' @param input Input bed-file. Last column must be sequences.
#' @param kmer k-mer length. Default = 10
#' @param motif Estimated motif length. Default = 10
#' @param output Output file. Default = reduced.bed
#' @param threads Number of threads to use. Default = 1. Use 0 for all cores.
#' @param clean Delete all temporary files.
#' @param minoverlap_kmer Minimum required overlap between k-mer. Used to create reduced sequence ranges out of merged k-mer. Can not be greater than k-mer length. Default = kmer - 1
#' @param minoverlap_motif Minimum required overlap between motif and k-mer to consider k-mer significant. Used for k-mer cutoff calculation. Can not be greater than motif and k-mer length. Default = ceiling(motif / 2)
#' @param min_seq_length Remove sequences below this length. Defaults to the maximum value of motif and k-mer and can not be lower.
#' @param motif_occurrence Define how many motifs are expected per sequence. This value is used during k-mer cutoff calculation. Default = 1 meaning that there should be approximately one motif per sequence.
#' @param summary Filepath where a small summary file of the run should be written.
#'
#' @details If there is a header supplied other then the default data.table naming scheme ('V1', 'V2', etc.) it will be kept.
#'
#' @author Hendrik Schultheis <Hendrik.Schultheis@@mpi-bn.mpg.de>
#'
reduce_sequence <- function(input, kmer = 10, motif = 10, output = "reduced.bed", threads = 1, clean = TRUE, minoverlap_kmer = kmer - 1, minoverlap_motif = ceiling(motif / 2), min_seq_length = max(c(motif, kmer)), motif_occurrence = 1, summary = NULL) {
# parameter checks
if (system("which jellyfish", ignore.stdout = TRUE) != 0) {
stop("Required program jellyfish not found! Please check whether it is installed.")
}
if (missing(input)) {
stop("No input specified! Please forward a valid bed-file.")
}
if (!file.exists(input)) {
stop("File ", input, " does not exist!")
}
if (!is.numeric(kmer) || kmer != as.integer(kmer) || kmer <= 0) {
stop("K-mer has to be a positive integer above 0.")
}
if (!is.numeric(motif) || motif != as.integer(motif) || motif <= 0) {
stop("Motif has to be a positive integer above 0.")
}
if (!is.numeric(threads) || threads != as.integer(threads) || threads < 0) {
stop("Threads has to be a positive integer (0 or greater).")
}
if (!is.logical(clean)) {
stop("Parameter clean has to be a boolean value.")
}
if (!is.numeric(minoverlap_kmer) || minoverlap_kmer != as.integer(minoverlap_kmer) || minoverlap_kmer <= 0) {
stop("Minoverlap_kmer has to be a positive integer above 0.")
}
if (!is.numeric(minoverlap_motif) || minoverlap_motif != as.integer(minoverlap_motif) || minoverlap_motif <= 0) {
stop("Minoverlap_motif has to be a positive integer above 0.")
}
if (!is.numeric(min_seq_length) || min_seq_length != as.integer(min_seq_length) || min_seq_length <= 0) {
stop("Min_seq_length hat to be a positive integer above 0.")
}
if (!is.numeric(motif_occurrence) || motif_occurrence < 0 || motif_occurrence > 1) { # TODO remove motif_occurence > 1. See TODO of find_kmer_regions below.
stop("Motif_occurence has to be a numeric value above 0 and can not be greater than 1.")
}
# get number of available cores
if (threads == 0) {
threads <- parallel::detectCores()
}
message("Loading bed...")
# load bed
# columns: chr, start, end, name, ..., sequence
bed_table <- data.table::fread(input = input)
# check for header and save it if provided
default_col_names <- grepl(pattern = "^V+\\d$", names(bed_table), perl = TRUE)
if (!any(default_col_names)) {
keep_col_names <- TRUE
col_names <- names(bed_table)
} else {
keep_col_names <- FALSE
}
names(bed_table)[1:4] <- c("chr", "start", "end", "name")
names(bed_table)[ncol(bed_table)] <- "sequence"
# index
data.table::setkey(bed_table, name, physical = FALSE)
# check for duplicated names
if (anyDuplicated(bed_table[, "name"])) {
warning("Found duplicated names. Making names unique.")
bed_table[, name := make.unique(name)]
}
# remove sequences below minimum length
if (min_seq_length < max(c(kmer, motif))) {
stop("Minimum sequence length must be greater or equal to ", max(c(motif, kmer)), " (maximum value of k-mer and motif).")
}
total_rows <- nrow(bed_table)
bed_table <- bed_table[nchar(sequence) > min_seq_length]
if (total_rows > nrow(bed_table)) {
message("Removed ", total_rows - nrow(bed_table), " sequence(s) below minimum length of ", min_seq_length)
}
# TODO forward fasta file as parameter so no bed -> fasta conversion is needed.
message("Writing fasta...")
# save as fasta
fasta_file <- paste0(basename(input), ".fasta")
seqinr::write.fasta(sequences = as.list(bed_table[[ncol(bed_table)]]), names = bed_table[[4]], as.string = TRUE, file.out = fasta_file)
message("Counting k-mer...")
# count k-mer
hashsize <- 4 ^ kmer
count_output_binary <- "mer_count_binary.jf"
input <- fasta_file
jellyfish_call <- paste("jellyfish count ", "-m", kmer, "-s", hashsize, "-o", count_output_binary, input)
system(command = jellyfish_call, wait = TRUE)
mer_count_table <- "mer_count_table.jf"
jellyfish_dump_call <- paste("jellyfish dump --column --tab --output", mer_count_table, count_output_binary)
system(command = jellyfish_dump_call, wait = TRUE)
message("Reduce k-mer.")
# load mer table
# columns: kmer, count
kmer_counts <- data.table::fread(input = mer_count_table, header = FALSE)
# order k-mer descending
data.table::setorder(kmer_counts, -V2)
# compute number of hits to keep
keep_hits <- significant_kmer(bed_table, kmer = kmer, motif = motif, minoverlap = minoverlap_motif, motif_occurrence = motif_occurrence)
# reduce k-mer
reduced_kmer <- reduce_kmer(kmer = kmer_counts, significant = keep_hits)
message("Reduced kmer to first most frequent ", nrow(reduced_kmer), " out of ", nrow(kmer_counts), ".")
message("Find k-mer in sequences.")
# find k-mer in sequences
# columns: name, start, end, width
ranges_table <- find_kmer_regions(bed = bed_table, kmer_counts = reduced_kmer, minoverlap = minoverlap_kmer, threads = threads)
names(ranges_table)[1:2] <- c("relative_start", "relative_end")
# merge ranged_table with bed_table + keep column order
merged <- merge(x = bed_table, y = ranges_table, by = "name", sort = FALSE)[, union(names(bed_table), names(ranges_table)), with = FALSE]
# delete sequences without hit
merged <- na.omit(merged, cols = c("relative_start", "relative_end"))
message("Removed ", nrow(bed_table) - nrow(merged), " sequence(s) without hit.")
message("Reduce sequences.")
# create subsequences
merged[, sequence := stringr::str_sub(sequence, relative_start, relative_end)]
# bed files count from 0
merged[, `:=`(relative_start = relative_start - 1, relative_end = relative_end - 1)]
# change start end location
merged[, `:=`(start = start + relative_start, end = start + relative_end)]
# clean table
merged[, `:=`(relative_start = NULL, relative_end = NULL, width = NULL)]
if (clean) {
file.remove(fasta_file, count_output_binary, mer_count_table)
}
# write summary
if (!is.null(summary)) {
# script/ function call
file_name <- sub("--file=", "", commandArgs()[grep("--file=", commandArgs())])
call <- ifelse(interactive(), deparse(match.call()),
paste("Rscript", basename(file_name), paste(commandArgs(trailingOnly = TRUE), collapse = " "))
)
# sequence length summary
summary_input <- summary(nchar(bed_table[["sequence"]]))
summary_output <- summary(nchar(merged[["sequence"]]))
# sequences removed
seq_to_small <- total_rows - nrow(bed_table)
no_hit_in_assembly <- nrow(bed_table) - nrow(merged)
total_removed <- seq_to_small + no_hit_in_assembly
summary_data <- c(
"Call:",
paste0("\t", call),
"",
"Sequence length information:",
paste0("\t\t", paste0(names(summary_input), collapse = "\t")),
paste("\tInput", paste0(summary_input, collapse = "\t"), sep = "\t"),
paste("\tOutput", paste0(summary_output, collapse = "\t"), sep = "\t"),
"Total sequences:",
paste("\tInput", total_rows),
paste("\tOutput", nrow(merged)),
"Removed sequences:",
paste("\tBelow minimum size", seq_to_small),
paste("\tNo hit in assembly", no_hit_in_assembly),
paste("\tTotal", total_removed)
)
write(x = summary_data, file = summary)
}
# keep provided column names
if (keep_col_names) {
names(merged) <- col_names
}
data.table::fwrite(merged, file = output, sep = "\t", col.names = keep_col_names)
}
#' Predict how many interesting k-mer are possible for the given data.
#'
#' @param bed Bed table with sequences in last column
#' @param kmer Length of k-mer
#' @param motif Length of motif
#' @param minoverlap Minimum number of bases overlapping between k-mer and motif. Must be <= motif & <= kmer. Defaults to ceiling(motif / 2).
#' @param motif_occurrence Define how many motifs are expected per sequence. Default = 1
#'
#' @return Number of interesting k-mer.
significant_kmer <- function(bed, kmer, motif, minoverlap = ceiling(motif / 2), motif_occurrence = 1) {
if (minoverlap > kmer || minoverlap > motif) {
stop("Kmer & motif must be greater or equal to minoverlap!")
}
if (motif_occurrence <= 0) {
stop("Motif_occurrence must be a numeric value above 0!")
}
# minimum sequence length to get all interesting overlaps
min_seq_length <- motif + 2 * (kmer - minoverlap)
seq_lengths <- nchar(bed[[ncol(bed)]])
# reduce to max interesting length
seq_lengths <- ifelse(seq_lengths > min_seq_length, min_seq_length, seq_lengths)
# calculate max possible k-mer
topx <- sum(seq_lengths - kmer + 1)
return(topx * motif_occurrence)
}
#' Orders k-mer table after count descending and keeps all k-mer with a cumulative sum below the given significance threshold.
#'
#' @param kmer K-mer data.table columns: kmer, count
#' @param significant Value from significant_kmer function.
#'
#' @return reduced data.table
reduce_kmer <- function(kmer, significant) {
data.table::setorderv(kmer, cols = names(kmer)[2], order = -1)
# TODO don't use 'V2'
kmer[, cumsum := cumsum(V2)]
return(kmer[cumsum <= significant])
}
#' create list of significant ranges (one for each bed entry)
#'
#' @param bed Data.table of bed with sequence in last column
#' @param kmer_counts Data.table of counted k-mer. Column1 = kmer, column2 = count.
#' @param minoverlap Minimum overlapping nucleotides between k-mers to be merged. Positive integer. Must be smaller than k-mer length.
#' @param threads Number of threads.
#'
#' @return Data.table with relative positions and width (start, end, width).
#'
#' TODO Include number of motifs per sequence (aka motif_occurrence). Attempt to keep best 2 regions for occurrence = 2? Probably high impact on performance.
find_kmer_regions <- function(bed, kmer_counts, minoverlap = 1 , threads = NULL) {
if (nchar(kmer_counts[1, 1]) <= minoverlap) {
stop("Minoverlap must be smaller than k-mer length!")
}
names(kmer_counts)[1:2] <- c("kmer", "count")
data.table::setorder(kmer_counts, -count)
seq_ranges <- pbapply::pblapply(seq_len(nrow(bed)), cl = threads, function(x) {
seq <- bed[x][[ncol(bed)]]
name <- bed[x][[4]]
#### locate ranges
ranges <- data.table::data.table(do.call(rbind, stringi::stri_locate_all_fixed(seq, pattern = kmer_counts[[1]])))
ranges <- na.omit(ranges, cols = c("start", "end"))
if (nrow(ranges) < 1) {
return(data.table::data.table(start = NA, end = NA, width = NA, name = name))
}
# add k-mer sequences
ranges[, sub_seq := stringr::str_sub(seq, start, end)]
# add k-mer count
ranges[, count := kmer_counts[ranges[["sub_seq"]], "count", on = "kmer"]]
#### reduce ranges
reduced_ranges <- IRanges::IRanges(start = ranges[["start"]], end = ranges[["end"]], names = ranges[["sub_seq"]])
# list of overlapping ranges
edge_list <- as.matrix(IRanges::findOverlaps(reduced_ranges, minoverlap = minoverlap, drop.self = FALSE, drop.redundant = TRUE))
# get components (groups of connected ranges)
graph <- igraph::graph_from_edgelist(edge_list, directed = FALSE)
# vector of node membership (indices correspond to ranges above)
member <- as.factor(igraph::components(graph)$membership)
# list of membership vectors
node_membership <- lapply(levels(member), function(x) {
which(member == x)
})
# calculate component score (= sum of k-mer count)
score <- vapply(node_membership, FUN.VALUE = numeric(1), function(x) {
sum(kmer_counts[x, "count"])
})
selected_ranges <- node_membership[[which(score == max(score))[1]]]
# reduce selected ranges
reduced_ranges <- IRanges::reduce(reduced_ranges[selected_ranges])
reduced_ranges <- data.table::as.data.table(reduced_ranges)[, name := name]
return(reduced_ranges)
})
# create ranges table
conserved_regions_table <- data.table::rbindlist(seq_ranges)
return(conserved_regions_table)
}
# call function with given parameter if not in interactive context (e.g. run from shell)
if (!interactive()) {
# show apply progressbar
pbo <- pbapply::pboptions(type = "timer")
# show help if called without arguments
if (length(commandArgs(trailingOnly = TRUE)) <= 0) {
print_help(opt_parser)
} else {
# remove last parameter (help param)
params <- opt[-length(opt)]
do.call(reduce_sequence, args = params)
}
}