-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #2 from loosolab/cluster
Cluster
- Loading branch information
Showing
3 changed files
with
326 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
#! /bin/Rscript | ||
library("optparse") | ||
|
||
option_list <- list( | ||
make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input bed-file. Last column must be sequences.", metavar = "character"), | ||
make_option(opt_str = c("-s", "--similarity"), default = 0.8, help = "Similarity threshold. Default = %default", metavar = "double >= 0.8"), | ||
make_option(opt_str = c("-A", "--coverage"), default = 8, help = "Minimal alignment length for both sequences in nucelotides. Default = %default", metavar = "integer"), | ||
make_option(opt_str = c("-o", "--output"), default = "cluster.bed", help = "Output file. Default = %default", metavar = "character"), | ||
make_option(opt_str = c("-c", "--clean"), default = TRUE, help = "Delete all temporary files. Default = %default", metavar = "logical") | ||
# TODO more args | ||
) | ||
|
||
opt_parser <- OptionParser(option_list = option_list, | ||
description = "CD-HIT-EST Wrapper function.") | ||
|
||
opt <- parse_args(opt_parser) | ||
|
||
#' cd-hit wrapper | ||
#' | ||
#' @param input | ||
#' @param similarity Similarity threshold. | ||
#' @param coverage Minimal alignment length for both sequences in nucelotides. | ||
#' @param output Clustered bedfile. Adds cluster number in last column (lower number = bigger cluster). | ||
#' @param clean Clean up after run. | ||
#' | ||
#' @return bed_table with cluster in last column | ||
#' TODO add all cdhit parameter | ||
#' TODO check whether cdhit is installed | ||
cdhitest <- function(input, similarity = 0.8, coverage = 8, output = "cluster.bed", clean = TRUE) { | ||
# load bed if neccessary | ||
if (!data.table::is.data.table(input)) { | ||
bed_table <- data.table::fread(input = input, header = FALSE) | ||
} else { | ||
bed_table <- input | ||
} | ||
|
||
### convert bed to fasta | ||
# 4th column = name | ||
# last column = sequence | ||
fasta_file <- "converted_bed.fasta" | ||
seqinr::write.fasta(sequences = as.list(bed_table[[ncol(bed_table)]]), names = bed_table[[4]], as.string = TRUE, file.out = fasta_file) | ||
|
||
### cd-hit-est | ||
cdhit_output <- "cdhit_output" | ||
cdhit_call <- paste("cd-hit-est -i", fasta_file, "-o", cdhit_output, "-c", similarity, "-A", coverage, "-G 0 -n 3 -g 1 -r 0 -l 5 -sc 1 -d 0") | ||
|
||
system(command = cdhit_call, wait = TRUE) | ||
|
||
# reformat cluster file | ||
# columns: id, clstr, clstr_size, length, clstr_rep, clstr_iden, clstr_cov | ||
cluster_file <- "reformated.clstr" | ||
cluster_call <- paste("clstr2txt.pl", paste0(cdhit_output, ".clstr"), ">", cluster_file) | ||
|
||
system(command = cluster_call, wait = TRUE) | ||
|
||
# load reformated file | ||
cluster <- data.table::fread(cluster_file) | ||
|
||
### add cluster to bed_table | ||
result <- merge(x = bed_table, y = cluster[, c("id", "clstr")], by.x = "V4", by.y = "id", sort = FALSE)[, union(names(bed_table), names(cluster)[2]), with = FALSE] | ||
|
||
# delete files | ||
if (clean) { | ||
file.remove(fasta_file, paste0(cdhit_output, ".clstr"), cdhit_output, cluster_file) | ||
} | ||
|
||
data.table::fwrite(x = result, file = output, sep = "\t", col.names = FALSE) | ||
} | ||
|
||
# call function with given parameter if not in interactive context (e.g. run from shell) | ||
if (!interactive()) { | ||
# remove last parameter (help param) | ||
params <- opt[-length(opt)] | ||
do.call(cdhitest, args = params) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,241 @@ | ||
#! /bin/Rscript | ||
library("optparse") | ||
|
||
option_list <- list( | ||
make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input bed-file. Last column must be sequences.", metavar = "character"), | ||
make_option(opt_str = c("-k", "--kmer"), default = 10, help = "Kmer length. Default = %default", metavar = "integer"), | ||
make_option(opt_str = c("-m", "--motif"), default = 10, help = "Estimated motif length. Default = %default", metavar = "integer"), | ||
make_option(opt_str = c("-o", "--output"), default = "reduced.bed", help = "Output file. Default = %default", metavar = "character"), | ||
make_option(opt_str = c("-t", "--threads"), default = 1, help = "Number of threads to use. Use -1 for all available cores. Default = %default", metavar = "integer"), | ||
make_option(opt_str = c("-c", "--clean"), default = TRUE, help = "Delete all temporary files. Default = %default", metavar = "logical"), | ||
make_option(opt_str = c("-s", "--min_seq_length"), default = NULL, help = "Remove sequences below this length. Defaults to motif length.", metavar = "integer", type = "integer") | ||
# TODO more args | ||
) | ||
|
||
opt_parser <- OptionParser(option_list = option_list, | ||
description = "Reduce sequences to frequent regions.") | ||
|
||
opt <- parse_args(opt_parser) | ||
|
||
#' Reduce bed file to conserved regions | ||
#' | ||
#' @param input bed file | ||
#' @param kmer Length of kmer. | ||
#' @param motif Estimated motif length. | ||
#' @param output Output file | ||
#' @param threads Number of threads. | ||
#' @param clean Delete all temporary files. | ||
#' @param minoverlap_kmer Minimum required overlap between kmers. # TODO | ||
#' @param minoverlap_motif Minimum required overlap between motif and kmer. # TODO | ||
#' @param min_seq_length Must be smaller or equal to kmer and motif. Default = motif. | ||
#' | ||
#' @return reduced bed | ||
#' TODO check whether jellyfish is installed | ||
reduce_bed <- function(input, kmer = 10, motif = 10, output = "reduced.bed", threads = NULL, clean = TRUE, minoverlap_kmer, minoverlap_motif, min_seq_length = motif) { | ||
# get number of available cores | ||
if (threads == -1) { | ||
threads <- parallel::detectCores() | ||
} | ||
|
||
message("Loading bed...") | ||
# load bed | ||
# columns: chr, start, end, name, ..., sequence | ||
bed_table <- data.table::fread(input = input, header = FALSE) | ||
names(bed_table)[1:4] <- c("chr", "start", "end", "name") | ||
names(bed_table)[ncol(bed_table)] <- "sequence" | ||
# index | ||
data.table::setkey(bed_table, name, physical = FALSE) | ||
|
||
# check for duplicated names | ||
if (anyDuplicated(bed_table[, "name"])) { | ||
warning("Found duplicated names. Making names unique.") | ||
bed_table[, name := make.unique(name)] | ||
} | ||
|
||
# remove sequences below minimum length | ||
total_rows <- nrow(bed_table) | ||
bed_table <- bed_table[nchar(sequence) > min_seq_length] | ||
if (total_rows > nrow(bed_table)) { | ||
message("Removed ", total_rows - nrow(bed_table), " sequence(s) below minimum length of ", min_seq_length) | ||
} | ||
|
||
# TODO forward fasta file as parameter so no bed -> fasta conversion is needed. | ||
message("Writing fasta...") | ||
# save as fasta | ||
fasta_file <- paste0(basename(input), ".fasta") | ||
seqinr::write.fasta(sequences = as.list(bed_table[[ncol(bed_table)]]), names = bed_table[[4]], as.string = TRUE, file.out = fasta_file) | ||
|
||
message("Counting kmer...") | ||
# count k-mer | ||
hashsize <- 4 ^ kmer | ||
count_output_binary <- "mer_count_binary.jf" | ||
input <- fasta_file | ||
jellyfish_call <- paste("jellyfish count ", "-m", kmer, "-s", hashsize, "-o", count_output_binary, input) | ||
|
||
system(command = jellyfish_call, wait = TRUE) | ||
|
||
mer_count_table <- "mer_count_table.jf" | ||
jellyfish_dump_call <- paste("jellyfish dump --column --tab --output", mer_count_table, count_output_binary) | ||
|
||
system(command = jellyfish_dump_call, wait = TRUE) | ||
|
||
message("Reduce kmer.") | ||
# load mer table | ||
# columns: kmer, count | ||
kmer_counts <- data.table::fread(input = mer_count_table, header = FALSE) | ||
# order kmer descending | ||
data.table::setorder(kmer_counts, -V2) | ||
|
||
# compute number of hits to keep | ||
keep_hits <- significant_kmer(bed_table, kmer = kmer, motif = motif) | ||
|
||
# reduce kmer | ||
reduced_kmer <- reduce_kmer(kmer = kmer_counts, keep_hits) | ||
|
||
message("Find kmer in sequences.") | ||
# find k-mer in sequences | ||
# TODO minoverlap as parameter | ||
# columns: name, start, end, width | ||
ranges_table <- find_kmer_regions(bed = bed_table, kmer_counts = reduced_kmer, minoverlap = kmer - 1, threads = threads) | ||
names(ranges_table)[1:2] <- c("relative_start", "relative_end") | ||
|
||
# merge ranged_table with bed_table + keep column order | ||
merged <- merge(x = bed_table, y = ranges_table, by = "name", sort = FALSE)[, union(names(bed_table), names(ranges_table)), with = FALSE] | ||
|
||
# delete sequences without hit | ||
merged <- na.omit(merged, cols = c("relative_start", "relative_end")) | ||
message("Removed ", nrow(bed_table) - nrow(merged), " sequence(s) without hit.") | ||
|
||
message("Reduce sequences.") | ||
# create subsequences | ||
merged[, sequence := stringr::str_sub(sequence, relative_start, relative_end)] | ||
|
||
# bed files count from 0 | ||
merged[, `:=`(relative_start = relative_start - 1, relative_end = relative_end - 1)] | ||
# change start end location | ||
merged[, `:=`(start = start + relative_start, end = start + relative_end)] | ||
|
||
# clean table | ||
merged[, `:=`(relative_start = NULL, relative_end = NULL, width = NULL)] | ||
|
||
if (clean) { | ||
file.remove(fasta_file, count_output_binary, mer_count_table) | ||
} | ||
|
||
data.table::fwrite(merged, file = output, sep = "\t", col.names = FALSE) | ||
} | ||
|
||
#' returns sum of top x kmer frequencies to keep | ||
#' | ||
#' @param bed Bed table with sequences in last column | ||
#' @param kmer Length of kmer | ||
#' @param motif Length of motif | ||
#' @param minoverlap Minimum number of bases overlapping between kmer and motif. Must be <= motif & <= kmer. Defaults to ceiling(motif / 2). | ||
#' | ||
#' @return Number of interesting kmer. | ||
significant_kmer <- function(bed, kmer, motif, minoverlap = ceiling(motif / 2)) { | ||
if (minoverlap > kmer || minoverlap > motif) { | ||
stop("Kmer & motif must be greater or equal than minoverlap!") | ||
} | ||
|
||
# minimum sequence length to get all interesting overlaps | ||
min_seq_length <- motif + 2 * (kmer - minoverlap) | ||
|
||
seq_lengths <- nchar(bed[[ncol(bed)]]) | ||
|
||
# reduce to max interesting length | ||
seq_lengths <- ifelse(seq_lengths > min_seq_length, min_seq_length, seq_lengths) | ||
|
||
# calculate max possible kmer | ||
topx <- sum(seq_lengths - kmer + 1) | ||
|
||
return(topx) | ||
} | ||
|
||
#' @param kmer Kmer table | ||
#' @param significant | ||
reduce_kmer <- function(kmer, significant) { | ||
kmer[, cumsum := cumsum(V2)] | ||
|
||
return(kmer[cumsum <= significant]) | ||
} | ||
|
||
#' create list of significant ranges (one for each bed entry) | ||
#' | ||
#' @param bed Data.table of bed with sequence in last column | ||
#' @param kmer_counts Data.table of counted kmer. Column1 = kmer, column2 = count. | ||
#' @param minoverlap Minimum overlapping nucleotides between kmers to be merged. Positive integer. Must be smaller than kmer length. | ||
#' @param threads Number of threads. | ||
#' | ||
#' @return Data.table with relative positions and width (start, end, width). | ||
find_kmer_regions <- function(bed, kmer_counts, minoverlap = 1 , threads = NULL) { | ||
if (nchar(kmer_counts[1, 1]) <= minoverlap) { | ||
stop("Minoverlap must be smaller than kmer length!") | ||
} | ||
|
||
names(kmer_counts)[1:2] <- c("kmer", "count") | ||
data.table::setorder(kmer_counts, -count) | ||
|
||
seq_ranges <- pbapply::pblapply(seq_len(nrow(bed)), cl = threads, function(x) { | ||
seq <- bed[x][[ncol(bed)]] | ||
|
||
#### locate ranges | ||
ranges <- data.table::data.table(do.call(rbind, stringi::stri_locate_all_fixed(seq, pattern = kmer_counts[[1]]))) | ||
|
||
ranges <- na.omit(ranges, cols = c("start", "end")) | ||
|
||
if (nrow(ranges) < 1) { | ||
return(data.table::data.table(start = NA, end = NA, width = NA)) | ||
} | ||
|
||
# add kmer sequences | ||
ranges[, sub_seq := stringr::str_sub(seq, start, end)] | ||
# add kmer count | ||
ranges[, count := kmer_counts[ranges[["sub_seq"]], "count", on = "kmer"]] | ||
|
||
#### reduce ranges | ||
reduced_ranges <- IRanges::IRanges(start = ranges[["start"]], end = ranges[["end"]], names = ranges[["sub_seq"]]) | ||
|
||
# list of overlapping ranges | ||
edge_list <- as.matrix(IRanges::findOverlaps(reduced_ranges, minoverlap = minoverlap, drop.self = FALSE, drop.redundant = TRUE)) | ||
|
||
# get components (groups of connected ranges) | ||
graph <- igraph::graph_from_edgelist(edge_list, directed = FALSE) | ||
# vector of node membership (indices correspond to ranges above) | ||
member <- as.factor(igraph::components(graph)$membership) | ||
|
||
# list of membership vectors | ||
node_membership <- lapply(levels(member), function(x) { | ||
which(member == x) | ||
}) | ||
|
||
# calculate component score (= sum of kmer count) | ||
score <- vapply(node_membership, FUN.VALUE = numeric(1), function(x) { | ||
sum(kmer_counts[x, "count"]) | ||
}) | ||
|
||
selected_ranges <- node_membership[[which(score == max(score))[1]]] | ||
|
||
# reduce selected ranges | ||
reduced_ranges <- IRanges::reduce(reduced_ranges[selected_ranges]) | ||
|
||
reduced_ranges <- data.table::as.data.table(reduced_ranges) | ||
|
||
return(reduced_ranges) | ||
}) | ||
|
||
# create ranges table | ||
conserved_regions_table <- data.table::rbindlist(seq_ranges) | ||
conserved_regions_table[, name := bed[[4]]] | ||
|
||
return(conserved_regions_table) | ||
} | ||
|
||
# call function with given parameter if not in interactive context (e.g. run from shell) | ||
if (!interactive()) { | ||
# show apply progressbar | ||
pbo <- pbapply::pboptions(type = "timer") | ||
# remove last parameter (help param) | ||
params <- opt[-length(opt)] | ||
do.call(reduce_bed, args = params) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters