Skip to content

Cluster #43

Merged
merged 9 commits into from
Jan 9, 2019
21 changes: 18 additions & 3 deletions bin/2.1_clustering/cdhit_wrapper.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#! /bin/Rscript
if (!require(optparse)) install.packages("optparse"); library(optparse)
if (!require(optparse, quietly = TRUE)) install.packages("optparse"); library(optparse)

option_list <- list(
make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input bed-file. Fourth column is expected to contain names, last column must be sequences.", metavar = "character"),
Expand Down Expand Up @@ -74,6 +74,7 @@ opt <- parse_args(opt_parser)
#' @author Hendrik Schultheis <Hendrik.Schultheis@@mpi-bn.mpg.de>
#'
cdhitest <- function(input, identity = 0.8, coverage = 8, output = "cluster.bed", clean = TRUE, threads = 1, global = 0, band_width = 20, memory = 800, word_length = 3, throw_away_sequences = 5, length_dif_cutoff_shorter_p = 0, length_dif_cutoff_shorter_n = 999999, alignment_coverage_longer_p = 0, alignment_coverage_longer_n = 99999999, alignment_coverage_shorter_p = 0, alignment_coverage_shorter_n = 99999999, max_unmatched_longer_p = 1, max_unmatched_shorter_p = 1, max_unmatched_both_n = 99999999, fast_cluster = 1, strand = 0, match = 2, mismatch = -2, gap = -6, gap_ext = -1, sort_cluster_by_size = 1) {
# parameter checks
if (system("which cd-hit-est", ignore.stdout = FALSE) != 0) {
stop("Required program CD-HIT not found! Please check whether it is installed.")
}
Expand All @@ -82,6 +83,14 @@ cdhitest <- function(input, identity = 0.8, coverage = 8, output = "cluster.bed"
stop("No input specified! Please forward a valid bed-file.")
}

if (!file.exists(input)) {
stop("File ", input, " does not exist!")
}

if (!is.logical(clean)) {
stop("Parameter clean has to be a boolean value.")
}

message("Loading bed.")
# load bed if necessary
if (!data.table::is.data.table(input)) {
Expand Down Expand Up @@ -166,6 +175,12 @@ cdhitest <- function(input, identity = 0.8, coverage = 8, output = "cluster.bed"
# Command-line entry point: only runs when the script is executed
# non-interactively (e.g. via Rscript from a shell).
if (!interactive()) {
  if (length(commandArgs(trailingOnly = TRUE)) <= 0) {
    # called without any arguments: show the usage text instead of running
    print_help(opt_parser)
  } else {
    # Drop the last parsed option (the help param) so that only arguments
    # understood by cdhitest are forwarded.
    # NOTE(review): relies on the help flag being the last element of opt.
    params <- opt[-length(opt)]
    # Invoke the wrapper exactly once, and only after the no-argument case
    # has been ruled out.
    do.call(cdhitest, args = params)
  }
}
52 changes: 48 additions & 4 deletions bin/2.1_clustering/reduce_sequence.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#! /bin/Rscript
if (!require(optparse)) install.packages("optparse"); library(optparse)
if (!require(optparse, quietly = TRUE)) install.packages("optparse"); library(optparse)

option_list <- list(
make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input bed-file. Last column must be sequences.", metavar = "character"),
Expand Down Expand Up @@ -38,6 +38,7 @@ opt <- parse_args(opt_parser)
#' @author Hendrik Schultheis <Hendrik.Schultheis@@mpi-bn.mpg.de>
#'
reduce_sequence <- function(input, kmer = 10, motif = 10, output = "reduced.bed", threads = NULL, clean = TRUE, minoverlap_kmer = kmer - 1, minoverlap_motif = ceiling(motif / 2), min_seq_length = max(c(motif, kmer)), motif_occurrence = 1) {
# parameter checks
if (system("which jellyfish", ignore.stdout = TRUE) != 0) {
stop("Required program jellyfish not found! Please check whether it is installed.")
}
Expand All @@ -46,6 +47,42 @@ reduce_sequence <- function(input, kmer = 10, motif = 10, output = "reduced.bed"
stop("No input specified! Please forward a valid bed-file.")
}

if (!file.exists(input)) {
stop("File ", input, " does not exist!")
}

if (!is.numeric(kmer) || kmer != as.integer(kmer) || kmer <= 0) {
stop("K-mer has to be a positive integer above 0.")
}

if (!is.numeric(motif) || motif != as.integer(motif) || motif <= 0) {
stop("Motif has to be a positive integer above 0.")
}

if (!is.numeric(threads) || threads != as.integer(threads) || threads < 0) {
stop("Threads has to be a positive integer (0 or greater).")
}

if (!is.logical(clean)) {
stop("Parameter clean has to be a boolean value.")
}

if (!is.numeric(minoverlap_kmer) || minoverlap_kmer != as.integer(minoverlap_kmer) || minoverlap_kmer <= 0) {
stop("Minoverlap_kmer has to be a positive integer above 0.")
}

if (!is.numeric(minoverlap_motif) || minoverlap_motif != as.integer(minoverlap_motif) || minoverlap_motif <= 0) {
stop("Minoverlap_motif has to be a positive integer above 0.")
}

if (!is.numeric(min_seq_length) || min_seq_length != as.integer(min_seq_length) || min_seq_length <= 0) {
stop("Min_seq_length hat to be a positive integer above 0.")
}

if (!is.numeric(motif_occurrence) || motif_occurrence < 0 || motif_occurrence > 1) { # TODO remove motif_occurence > 1. See TODO of find_kmer_regions below.
stop("Motif_occurence has to be a numeric value above 0 and can not be greater than 1.")
}

# get number of available cores
if (threads == 0) {
threads <- parallel::detectCores()
Expand Down Expand Up @@ -119,6 +156,7 @@ reduce_sequence <- function(input, kmer = 10, motif = 10, output = "reduced.bed"

# reduce k-mer
reduced_kmer <- reduce_kmer(kmer = kmer_counts, significant = keep_hits)
message("Reduced kmer to first most frequent ", nrow(reduced_kmer), " out of ", nrow(kmer_counts), ".")

message("Find k-mer in sequences.")
# find k-mer in sequences
Expand Down Expand Up @@ -280,7 +318,13 @@ find_kmer_regions <- function(bed, kmer_counts, minoverlap = 1 , threads = NULL)
# Command-line entry point: only runs when the script is executed
# non-interactively (e.g. via Rscript from a shell).
if (!interactive()) {
  # show apply progressbar
  pbo <- pbapply::pboptions(type = "timer")

  if (length(commandArgs(trailingOnly = TRUE)) <= 0) {
    # called without any arguments: show the usage text instead of running
    print_help(opt_parser)
  } else {
    # Drop the last parsed option (the help param) so that only arguments
    # understood by reduce_sequence are forwarded.
    # NOTE(review): relies on the help flag being the last element of opt.
    params <- opt[-length(opt)]
    # Invoke the function exactly once, and only after the no-argument case
    # has been ruled out.
    do.call(reduce_sequence, args = params)
  }
}