Skip to content

Commit

Permalink
Merge pull request #2 from loosolab/cluster
Browse files Browse the repository at this point in the history
Cluster
  • Loading branch information
renewiegandt authored Dec 4, 2018
2 parents 8c843fb + 0f2822d commit 3faf37a
Show file tree
Hide file tree
Showing 3 changed files with 326 additions and 3 deletions.
75 changes: 75 additions & 0 deletions bin/cdhit_wrapper.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
#! /bin/Rscript
library("optparse")

option_list <- list(
make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input bed-file. Last column must be sequences.", metavar = "character"),
make_option(opt_str = c("-s", "--similarity"), default = 0.8, help = "Similarity threshold. Default = %default", metavar = "double >= 0.8"),
make_option(opt_str = c("-A", "--coverage"), default = 8, help = "Minimal alignment length for both sequences in nucelotides. Default = %default", metavar = "integer"),
make_option(opt_str = c("-o", "--output"), default = "cluster.bed", help = "Output file. Default = %default", metavar = "character"),
make_option(opt_str = c("-c", "--clean"), default = TRUE, help = "Delete all temporary files. Default = %default", metavar = "logical")
# TODO more args
)

opt_parser <- OptionParser(option_list = option_list,
description = "CD-HIT-EST Wrapper function.")

opt <- parse_args(opt_parser)

#' cd-hit wrapper
#'
#' @param input
#' @param similarity Similarity threshold.
#' @param coverage Minimal alignment length for both sequences in nucelotides.
#' @param output Clustered bedfile. Adds cluster number in last column (lower number = bigger cluster).
#' @param clean Clean up after run.
#'
#' @return bed_table with cluster in last column
#' TODO add all cdhit parameter
#' TODO check whether cdhit is installed
cdhitest <- function(input, similarity = 0.8, coverage = 8, output = "cluster.bed", clean = TRUE) {
# load bed if neccessary
if (!data.table::is.data.table(input)) {
bed_table <- data.table::fread(input = input, header = FALSE)
} else {
bed_table <- input
}

### convert bed to fasta
# 4th column = name
# last column = sequence
fasta_file <- "converted_bed.fasta"
seqinr::write.fasta(sequences = as.list(bed_table[[ncol(bed_table)]]), names = bed_table[[4]], as.string = TRUE, file.out = fasta_file)

### cd-hit-est
cdhit_output <- "cdhit_output"
cdhit_call <- paste("cd-hit-est -i", fasta_file, "-o", cdhit_output, "-c", similarity, "-A", coverage, "-G 0 -n 3 -g 1 -r 0 -l 5 -sc 1 -d 0")

system(command = cdhit_call, wait = TRUE)

# reformat cluster file
# columns: id, clstr, clstr_size, length, clstr_rep, clstr_iden, clstr_cov
cluster_file <- "reformated.clstr"
cluster_call <- paste("clstr2txt.pl", paste0(cdhit_output, ".clstr"), ">", cluster_file)

system(command = cluster_call, wait = TRUE)

# load reformated file
cluster <- data.table::fread(cluster_file)

### add cluster to bed_table
result <- merge(x = bed_table, y = cluster[, c("id", "clstr")], by.x = "V4", by.y = "id", sort = FALSE)[, union(names(bed_table), names(cluster)[2]), with = FALSE]

# delete files
if (clean) {
file.remove(fasta_file, paste0(cdhit_output, ".clstr"), cdhit_output, cluster_file)
}

data.table::fwrite(x = result, file = output, sep = "\t", col.names = FALSE)
}

# call function with given parameter if not in interactive context (e.g. run from shell)
if (!interactive()) {
# remove last parameter (help param)
params <- opt[-length(opt)]
do.call(cdhitest, args = params)
}
241 changes: 241 additions & 0 deletions bin/reduce_bed.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,241 @@
#! /bin/Rscript
library("optparse")

option_list <- list(
make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input bed-file. Last column must be sequences.", metavar = "character"),
make_option(opt_str = c("-k", "--kmer"), default = 10, help = "Kmer length. Default = %default", metavar = "integer"),
make_option(opt_str = c("-m", "--motif"), default = 10, help = "Estimated motif length. Default = %default", metavar = "integer"),
make_option(opt_str = c("-o", "--output"), default = "reduced.bed", help = "Output file. Default = %default", metavar = "character"),
make_option(opt_str = c("-t", "--threads"), default = 1, help = "Number of threads to use. Use -1 for all available cores. Default = %default", metavar = "integer"),
make_option(opt_str = c("-c", "--clean"), default = TRUE, help = "Delete all temporary files. Default = %default", metavar = "logical"),
make_option(opt_str = c("-s", "--min_seq_length"), default = NULL, help = "Remove sequences below this length. Defaults to motif length.", metavar = "integer", type = "integer")
# TODO more args
)

opt_parser <- OptionParser(option_list = option_list,
description = "Reduce sequences to frequent regions.")

opt <- parse_args(opt_parser)

#' Reduce bed file to conserved regions
#'
#' @param input bed file
#' @param kmer Length of kmer.
#' @param motif Estimated motif length.
#' @param output Output file
#' @param threads Number of threads.
#' @param clean Delete all temporary files.
#' @param minoverlap_kmer Minimum required overlap between kmers. # TODO
#' @param minoverlap_motif Minimum required overlap between motif and kmer. # TODO
#' @param min_seq_length Must be smaller or equal to kmer and motif. Default = motif.
#'
#' @return reduced bed
#' TODO check whether jellyfish is installed
reduce_bed <- function(input, kmer = 10, motif = 10, output = "reduced.bed", threads = NULL, clean = TRUE, minoverlap_kmer, minoverlap_motif, min_seq_length = motif) {
# get number of available cores
if (threads == -1) {
threads <- parallel::detectCores()
}

message("Loading bed...")
# load bed
# columns: chr, start, end, name, ..., sequence
bed_table <- data.table::fread(input = input, header = FALSE)
names(bed_table)[1:4] <- c("chr", "start", "end", "name")
names(bed_table)[ncol(bed_table)] <- "sequence"
# index
data.table::setkey(bed_table, name, physical = FALSE)

# check for duplicated names
if (anyDuplicated(bed_table[, "name"])) {
warning("Found duplicated names. Making names unique.")
bed_table[, name := make.unique(name)]
}

# remove sequences below minimum length
total_rows <- nrow(bed_table)
bed_table <- bed_table[nchar(sequence) > min_seq_length]
if (total_rows > nrow(bed_table)) {
message("Removed ", total_rows - nrow(bed_table), " sequence(s) below minimum length of ", min_seq_length)
}

# TODO forward fasta file as parameter so no bed -> fasta conversion is needed.
message("Writing fasta...")
# save as fasta
fasta_file <- paste0(basename(input), ".fasta")
seqinr::write.fasta(sequences = as.list(bed_table[[ncol(bed_table)]]), names = bed_table[[4]], as.string = TRUE, file.out = fasta_file)

message("Counting kmer...")
# count k-mer
hashsize <- 4 ^ kmer
count_output_binary <- "mer_count_binary.jf"
input <- fasta_file
jellyfish_call <- paste("jellyfish count ", "-m", kmer, "-s", hashsize, "-o", count_output_binary, input)

system(command = jellyfish_call, wait = TRUE)

mer_count_table <- "mer_count_table.jf"
jellyfish_dump_call <- paste("jellyfish dump --column --tab --output", mer_count_table, count_output_binary)

system(command = jellyfish_dump_call, wait = TRUE)

message("Reduce kmer.")
# load mer table
# columns: kmer, count
kmer_counts <- data.table::fread(input = mer_count_table, header = FALSE)
# order kmer descending
data.table::setorder(kmer_counts, -V2)

# compute number of hits to keep
keep_hits <- significant_kmer(bed_table, kmer = kmer, motif = motif)

# reduce kmer
reduced_kmer <- reduce_kmer(kmer = kmer_counts, keep_hits)

message("Find kmer in sequences.")
# find k-mer in sequences
# TODO minoverlap as parameter
# columns: name, start, end, width
ranges_table <- find_kmer_regions(bed = bed_table, kmer_counts = reduced_kmer, minoverlap = kmer - 1, threads = threads)
names(ranges_table)[1:2] <- c("relative_start", "relative_end")

# merge ranged_table with bed_table + keep column order
merged <- merge(x = bed_table, y = ranges_table, by = "name", sort = FALSE)[, union(names(bed_table), names(ranges_table)), with = FALSE]

# delete sequences without hit
merged <- na.omit(merged, cols = c("relative_start", "relative_end"))
message("Removed ", nrow(bed_table) - nrow(merged), " sequence(s) without hit.")

message("Reduce sequences.")
# create subsequences
merged[, sequence := stringr::str_sub(sequence, relative_start, relative_end)]

# bed files count from 0
merged[, `:=`(relative_start = relative_start - 1, relative_end = relative_end - 1)]
# change start end location
merged[, `:=`(start = start + relative_start, end = start + relative_end)]

# clean table
merged[, `:=`(relative_start = NULL, relative_end = NULL, width = NULL)]

if (clean) {
file.remove(fasta_file, count_output_binary, mer_count_table)
}

data.table::fwrite(merged, file = output, sep = "\t", col.names = FALSE)
}

#' returns sum of top x kmer frequencies to keep
#'
#' @param bed Bed table with sequences in last column
#' @param kmer Length of kmer
#' @param motif Length of motif
#' @param minoverlap Minimum number of bases overlapping between kmer and motif. Must be <= motif & <= kmer. Defaults to ceiling(motif / 2).
#'
#' @return Number of interesting kmer.
significant_kmer <- function(bed, kmer, motif, minoverlap = ceiling(motif / 2)) {
if (minoverlap > kmer || minoverlap > motif) {
stop("Kmer & motif must be greater or equal than minoverlap!")
}

# minimum sequence length to get all interesting overlaps
min_seq_length <- motif + 2 * (kmer - minoverlap)

seq_lengths <- nchar(bed[[ncol(bed)]])

# reduce to max interesting length
seq_lengths <- ifelse(seq_lengths > min_seq_length, min_seq_length, seq_lengths)

# calculate max possible kmer
topx <- sum(seq_lengths - kmer + 1)

return(topx)
}

#' @param kmer Kmer table
#' @param significant
reduce_kmer <- function(kmer, significant) {
kmer[, cumsum := cumsum(V2)]

return(kmer[cumsum <= significant])
}

#' create list of significant ranges (one for each bed entry)
#'
#' @param bed Data.table of bed with sequence in last column
#' @param kmer_counts Data.table of counted kmer. Column1 = kmer, column2 = count.
#' @param minoverlap Minimum overlapping nucleotides between kmers to be merged. Positive integer. Must be smaller than kmer length.
#' @param threads Number of threads.
#'
#' @return Data.table with relative positions and width (start, end, width).
find_kmer_regions <- function(bed, kmer_counts, minoverlap = 1 , threads = NULL) {
if (nchar(kmer_counts[1, 1]) <= minoverlap) {
stop("Minoverlap must be smaller than kmer length!")
}

names(kmer_counts)[1:2] <- c("kmer", "count")
data.table::setorder(kmer_counts, -count)

seq_ranges <- pbapply::pblapply(seq_len(nrow(bed)), cl = threads, function(x) {
seq <- bed[x][[ncol(bed)]]

#### locate ranges
ranges <- data.table::data.table(do.call(rbind, stringi::stri_locate_all_fixed(seq, pattern = kmer_counts[[1]])))

ranges <- na.omit(ranges, cols = c("start", "end"))

if (nrow(ranges) < 1) {
return(data.table::data.table(start = NA, end = NA, width = NA))
}

# add kmer sequences
ranges[, sub_seq := stringr::str_sub(seq, start, end)]
# add kmer count
ranges[, count := kmer_counts[ranges[["sub_seq"]], "count", on = "kmer"]]

#### reduce ranges
reduced_ranges <- IRanges::IRanges(start = ranges[["start"]], end = ranges[["end"]], names = ranges[["sub_seq"]])

# list of overlapping ranges
edge_list <- as.matrix(IRanges::findOverlaps(reduced_ranges, minoverlap = minoverlap, drop.self = FALSE, drop.redundant = TRUE))

# get components (groups of connected ranges)
graph <- igraph::graph_from_edgelist(edge_list, directed = FALSE)
# vector of node membership (indices correspond to ranges above)
member <- as.factor(igraph::components(graph)$membership)

# list of membership vectors
node_membership <- lapply(levels(member), function(x) {
which(member == x)
})

# calculate component score (= sum of kmer count)
score <- vapply(node_membership, FUN.VALUE = numeric(1), function(x) {
sum(kmer_counts[x, "count"])
})

selected_ranges <- node_membership[[which(score == max(score))[1]]]

# reduce selected ranges
reduced_ranges <- IRanges::reduce(reduced_ranges[selected_ranges])

reduced_ranges <- data.table::as.data.table(reduced_ranges)

return(reduced_ranges)
})

# create ranges table
conserved_regions_table <- data.table::rbindlist(seq_ranges)
conserved_regions_table[, name := bed[[4]]]

return(conserved_regions_table)
}

# call function with given parameter if not in interactive context (e.g. run from shell)
if (!interactive()) {
# show apply progressbar
pbo <- pbapply::pboptions(type = "timer")
# remove last parameter (help param)
params <- opt[-length(opt)]
do.call(reduce_bed, args = params)
}
13 changes: 10 additions & 3 deletions masterenv.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,19 @@ name: masterenv
channels:
- bioconda
- conda-forge
- defaults
- r
dependencies:
- bedtools
- python
- r-essentials
- r-seqinr
- numpy
- pybigWig
- cd-hit
- jellyfish
- r-base>=3.5.1
- r-data.table
- r-pbapply
- r-igraph
- r-stringi
- r-stringr
- r-optparse
- bioconductor-iranges

0 comments on commit 3faf37a

Please sign in to comment.