diff --git a/bin/cdhit_wrapper.R b/bin/cdhit_wrapper.R new file mode 100644 index 0000000..ed5ea7f --- /dev/null +++ b/bin/cdhit_wrapper.R @@ -0,0 +1,75 @@ +#! /bin/Rscript +library("optparse") + +option_list <- list( + make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input bed-file. Last column must be sequences.", metavar = "character"), + make_option(opt_str = c("-s", "--similarity"), default = 0.8, help = "Similarity threshold. Default = %default", metavar = "double >= 0.8"), + make_option(opt_str = c("-A", "--coverage"), default = 8, help = "Minimal alignment length for both sequences in nucelotides. Default = %default", metavar = "integer"), + make_option(opt_str = c("-o", "--output"), default = "cluster.bed", help = "Output file. Default = %default", metavar = "character"), + make_option(opt_str = c("-c", "--clean"), default = TRUE, help = "Delete all temporary files. Default = %default", metavar = "logical") + # TODO more args +) + +opt_parser <- OptionParser(option_list = option_list, + description = "CD-HIT-EST Wrapper function.") + +opt <- parse_args(opt_parser) + +#' cd-hit wrapper +#' +#' @param input +#' @param similarity Similarity threshold. +#' @param coverage Minimal alignment length for both sequences in nucelotides. +#' @param output Clustered bedfile. Adds cluster number in last column (lower number = bigger cluster). +#' @param clean Clean up after run. +#' +#' @return bed_table with cluster in last column +#' TODO add all cdhit parameter +#' TODO check whether cdhit is installed +cdhitest <- function(input, similarity = 0.8, coverage = 8, output = "cluster.bed", clean = TRUE) { + # load bed if neccessary + if (!data.table::is.data.table(input)) { + bed_table <- data.table::fread(input = input, header = FALSE) + } else { + bed_table <- input + } + + ### convert bed to fasta + # 4th column = name + # last column = sequence + fasta_file <- "converted_bed.fasta" + seqinr::write.fasta(sequences = as.list(bed_table[[ncol(bed_table)]]), names = bed_table[[4]], as.string = TRUE, file.out = fasta_file) + + ### cd-hit-est + cdhit_output <- "cdhit_output" + cdhit_call <- paste("cd-hit-est -i", fasta_file, "-o", cdhit_output, "-c", similarity, "-A", coverage, "-G 0 -n 3 -g 1 -r 0 -l 5 -sc 1 -d 0") + + system(command = cdhit_call, wait = TRUE) + + # reformat cluster file + # columns: id, clstr, clstr_size, length, clstr_rep, clstr_iden, clstr_cov + cluster_file <- "reformated.clstr" + cluster_call <- paste("clstr2txt.pl", paste0(cdhit_output, ".clstr"), ">", cluster_file) + + system(command = cluster_call, wait = TRUE) + + # load reformated file + cluster <- data.table::fread(cluster_file) + + ### add cluster to bed_table + result <- merge(x = bed_table, y = cluster[, c("id", "clstr")], by.x = "V4", by.y = "id", sort = FALSE)[, union(names(bed_table), names(cluster)[2]), with = FALSE] + + # delete files + if (clean) { + file.remove(fasta_file, paste0(cdhit_output, ".clstr"), cdhit_output, cluster_file) + } + + data.table::fwrite(x = result, file = output, sep = "\t", col.names = FALSE) +} + +# call function with given parameter if not in interactive context (e.g. run from shell) +if (!interactive()) { + # remove last parameter (help param) + params <- opt[-length(opt)] + do.call(cdhitest, args = params) +} diff --git a/bin/reduce_bed.R b/bin/reduce_bed.R new file mode 100644 index 0000000..223c946 --- /dev/null +++ b/bin/reduce_bed.R @@ -0,0 +1,241 @@ +#! /bin/Rscript +library("optparse") + +option_list <- list( + make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input bed-file. Last column must be sequences.", metavar = "character"), + make_option(opt_str = c("-k", "--kmer"), default = 10, help = "Kmer length. Default = %default", metavar = "integer"), + make_option(opt_str = c("-m", "--motif"), default = 10, help = "Estimated motif length. Default = %default", metavar = "integer"), + make_option(opt_str = c("-o", "--output"), default = "reduced.bed", help = "Output file. Default = %default", metavar = "character"), + make_option(opt_str = c("-t", "--threads"), default = 1, help = "Number of threads to use. Use -1 for all available cores. Default = %default", metavar = "integer"), + make_option(opt_str = c("-c", "--clean"), default = TRUE, help = "Delete all temporary files. Default = %default", metavar = "logical"), + make_option(opt_str = c("-s", "--min_seq_length"), default = NULL, help = "Remove sequences below this length. Defaults to motif length.", metavar = "integer", type = "integer") + # TODO more args +) + +opt_parser <- OptionParser(option_list = option_list, + description = "Reduce sequences to frequent regions.") + +opt <- parse_args(opt_parser) + +#' Reduce bed file to conserved regions +#' +#' @param input bed file +#' @param kmer Length of kmer. +#' @param motif Estimated motif length. +#' @param output Output file +#' @param threads Number of threads. +#' @param clean Delete all temporary files. +#' @param minoverlap_kmer Minimum required overlap between kmers. # TODO +#' @param minoverlap_motif Minimum required overlap between motif and kmer. # TODO +#' @param min_seq_length Must be smaller or equal to kmer and motif. Default = motif. +#' +#' @return reduced bed +#' TODO check whether jellyfish is installed +reduce_bed <- function(input, kmer = 10, motif = 10, output = "reduced.bed", threads = NULL, clean = TRUE, minoverlap_kmer, minoverlap_motif, min_seq_length = motif) { + # get number of available cores + if (threads == -1) { + threads <- parallel::detectCores() + } + + message("Loading bed...") + # load bed + # columns: chr, start, end, name, ..., sequence + bed_table <- data.table::fread(input = input, header = FALSE) + names(bed_table)[1:4] <- c("chr", "start", "end", "name") + names(bed_table)[ncol(bed_table)] <- "sequence" + # index + data.table::setkey(bed_table, name, physical = FALSE) + + # check for duplicated names + if (anyDuplicated(bed_table[, "name"])) { + warning("Found duplicated names. Making names unique.") + bed_table[, name := make.unique(name)] + } + + # remove sequences below minimum length + total_rows <- nrow(bed_table) + bed_table <- bed_table[nchar(sequence) > min_seq_length] + if (total_rows > nrow(bed_table)) { + message("Removed ", total_rows - nrow(bed_table), " sequence(s) below minimum length of ", min_seq_length) + } + + # TODO forward fasta file as parameter so no bed -> fasta conversion is needed. + message("Writing fasta...") + # save as fasta + fasta_file <- paste0(basename(input), ".fasta") + seqinr::write.fasta(sequences = as.list(bed_table[[ncol(bed_table)]]), names = bed_table[[4]], as.string = TRUE, file.out = fasta_file) + + message("Counting kmer...") + # count k-mer + hashsize <- 4 ^ kmer + count_output_binary <- "mer_count_binary.jf" + input <- fasta_file + jellyfish_call <- paste("jellyfish count ", "-m", kmer, "-s", hashsize, "-o", count_output_binary, input) + + system(command = jellyfish_call, wait = TRUE) + + mer_count_table <- "mer_count_table.jf" + jellyfish_dump_call <- paste("jellyfish dump --column --tab --output", mer_count_table, count_output_binary) + + system(command = jellyfish_dump_call, wait = TRUE) + + message("Reduce kmer.") + # load mer table + # columns: kmer, count + kmer_counts <- data.table::fread(input = mer_count_table, header = FALSE) + # order kmer descending + data.table::setorder(kmer_counts, -V2) + + # compute number of hits to keep + keep_hits <- significant_kmer(bed_table, kmer = kmer, motif = motif) + + # reduce kmer + reduced_kmer <- reduce_kmer(kmer = kmer_counts, keep_hits) + + message("Find kmer in sequences.") + # find k-mer in sequences + # TODO minoverlap as parameter + # columns: name, start, end, width + ranges_table <- find_kmer_regions(bed = bed_table, kmer_counts = reduced_kmer, minoverlap = kmer - 1, threads = threads) + names(ranges_table)[1:2] <- c("relative_start", "relative_end") + + # merge ranged_table with bed_table + keep column order + merged <- merge(x = bed_table, y = ranges_table, by = "name", sort = FALSE)[, union(names(bed_table), names(ranges_table)), with = FALSE] + + # delete sequences without hit + merged <- na.omit(merged, cols = c("relative_start", "relative_end")) + message("Removed ", nrow(bed_table) - nrow(merged), " sequence(s) without hit.") + + message("Reduce sequences.") + # create subsequences + merged[, sequence := stringr::str_sub(sequence, relative_start, relative_end)] + + # bed files count from 0 + merged[, `:=`(relative_start = relative_start - 1, relative_end = relative_end - 1)] + # change start end location + merged[, `:=`(start = start + relative_start, end = start + relative_end)] + + # clean table + merged[, `:=`(relative_start = NULL, relative_end = NULL, width = NULL)] + + if (clean) { + file.remove(fasta_file, count_output_binary, mer_count_table) + } + + data.table::fwrite(merged, file = output, sep = "\t", col.names = FALSE) +} + +#' returns sum of top x kmer frequencies to keep +#' +#' @param bed Bed table with sequences in last column +#' @param kmer Length of kmer +#' @param motif Length of motif +#' @param minoverlap Minimum number of bases overlapping between kmer and motif. Must be <= motif & <= kmer. Defaults to ceiling(motif / 2). +#' +#' @return Number of interesting kmer. +significant_kmer <- function(bed, kmer, motif, minoverlap = ceiling(motif / 2)) { + if (minoverlap > kmer || minoverlap > motif) { + stop("Kmer & motif must be greater or equal than minoverlap!") + } + + # minimum sequence length to get all interesting overlaps + min_seq_length <- motif + 2 * (kmer - minoverlap) + + seq_lengths <- nchar(bed[[ncol(bed)]]) + + # reduce to max interesting length + seq_lengths <- ifelse(seq_lengths > min_seq_length, min_seq_length, seq_lengths) + + # calculate max possible kmer + topx <- sum(seq_lengths - kmer + 1) + + return(topx) +} + +#' @param kmer Kmer table +#' @param significant +reduce_kmer <- function(kmer, significant) { + kmer[, cumsum := cumsum(V2)] + + return(kmer[cumsum <= significant]) +} + +#' create list of significant ranges (one for each bed entry) +#' +#' @param bed Data.table of bed with sequence in last column +#' @param kmer_counts Data.table of counted kmer. Column1 = kmer, column2 = count. +#' @param minoverlap Minimum overlapping nucleotides between kmers to be merged. Positive integer. Must be smaller than kmer length. +#' @param threads Number of threads. +#' +#' @return Data.table with relative positions and width (start, end, width). +find_kmer_regions <- function(bed, kmer_counts, minoverlap = 1 , threads = NULL) { + if (nchar(kmer_counts[1, 1]) <= minoverlap) { + stop("Minoverlap must be smaller than kmer length!") + } + + names(kmer_counts)[1:2] <- c("kmer", "count") + data.table::setorder(kmer_counts, -count) + + seq_ranges <- pbapply::pblapply(seq_len(nrow(bed)), cl = threads, function(x) { + seq <- bed[x][[ncol(bed)]] + + #### locate ranges + ranges <- data.table::data.table(do.call(rbind, stringi::stri_locate_all_fixed(seq, pattern = kmer_counts[[1]]))) + + ranges <- na.omit(ranges, cols = c("start", "end")) + + if (nrow(ranges) < 1) { + return(data.table::data.table(start = NA, end = NA, width = NA)) + } + + # add kmer sequences + ranges[, sub_seq := stringr::str_sub(seq, start, end)] + # add kmer count + ranges[, count := kmer_counts[ranges[["sub_seq"]], "count", on = "kmer"]] + + #### reduce ranges + reduced_ranges <- IRanges::IRanges(start = ranges[["start"]], end = ranges[["end"]], names = ranges[["sub_seq"]]) + + # list of overlapping ranges + edge_list <- as.matrix(IRanges::findOverlaps(reduced_ranges, minoverlap = minoverlap, drop.self = FALSE, drop.redundant = TRUE)) + + # get components (groups of connected ranges) + graph <- igraph::graph_from_edgelist(edge_list, directed = FALSE) + # vector of node membership (indices correspond to ranges above) + member <- as.factor(igraph::components(graph)$membership) + + # list of membership vectors + node_membership <- lapply(levels(member), function(x) { + which(member == x) + }) + + # calculate component score (= sum of kmer count) + score <- vapply(node_membership, FUN.VALUE = numeric(1), function(x) { + sum(kmer_counts[x, "count"]) + }) + + selected_ranges <- node_membership[[which(score == max(score))[1]]] + + # reduce selected ranges + reduced_ranges <- IRanges::reduce(reduced_ranges[selected_ranges]) + + reduced_ranges <- data.table::as.data.table(reduced_ranges) + + return(reduced_ranges) + }) + + # create ranges table + conserved_regions_table <- data.table::rbindlist(seq_ranges) + conserved_regions_table[, name := bed[[4]]] + + return(conserved_regions_table) +} + +# call function with given parameter if not in interactive context (e.g. run from shell) +if (!interactive()) { + # show apply progressbar + pbo <- pbapply::pboptions(type = "timer") + # remove last parameter (help param) + params <- opt[-length(opt)] + do.call(reduce_bed, args = params) +} diff --git a/masterenv.yml b/masterenv.yml index 175f53a..2c1879f 100644 --- a/masterenv.yml +++ b/masterenv.yml @@ -3,12 +3,19 @@ name: masterenv channels: - bioconda - conda-forge - - defaults - - r dependencies: - bedtools - python - - r-essentials - r-seqinr - numpy - pybigWig + - cd-hit + - jellyfish + - r-base>=3.5.1 + - r-data.table + - r-pbapply + - r-igraph + - r-stringi + - r-stringr + - r-optparse + - bioconductor-iranges