From 24a5a063c0f5d5305f2857fb29229f5c73c78b35 Mon Sep 17 00:00:00 2001 From: Schultheis Date: Wed, 5 Dec 2018 09:49:48 +0100 Subject: [PATCH 01/14] fixed minimum sequence length and added check --- bin/reduce_bed.R | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/bin/reduce_bed.R b/bin/reduce_bed.R index 223c946..ea2c017 100644 --- a/bin/reduce_bed.R +++ b/bin/reduce_bed.R @@ -8,7 +8,7 @@ option_list <- list( make_option(opt_str = c("-o", "--output"), default = "reduced.bed", help = "Output file. Default = %default", metavar = "character"), make_option(opt_str = c("-t", "--threads"), default = 1, help = "Number of threads to use. Use -1 for all available cores. Default = %default", metavar = "integer"), make_option(opt_str = c("-c", "--clean"), default = TRUE, help = "Delete all temporary files. Default = %default", metavar = "logical"), - make_option(opt_str = c("-s", "--min_seq_length"), default = NULL, help = "Remove sequences below this length. Defaults to motif length.", metavar = "integer", type = "integer") + make_option(opt_str = c("-s", "--min_seq_length"), default = NULL, help = "Remove sequences below this length. Defaults to the maximum value of motif and kmer and can not be lower.", metavar = "integer", type = "integer") # TODO more args ) @@ -25,13 +25,13 @@ opt <- parse_args(opt_parser) #' @param output Output file #' @param threads Number of threads. #' @param clean Delete all temporary files. -#' @param minoverlap_kmer Minimum required overlap between kmers. # TODO -#' @param minoverlap_motif Minimum required overlap between motif and kmer. # TODO -#' @param min_seq_length Must be smaller or equal to kmer and motif. Default = motif. +#' @param minoverlap_kmer Minimum required overlap between kmer to merge kmer. Used to create reduced sequence ranges.# TODO +#' @param minoverlap_motif Minimum required overlap between motif and kmer to consider kmer significant. Used for kmer cutoff calculation.# TODO +#' @param min_seq_length Must be greater or equal to kmer and motif. Default = max(c(motif, kmer)). #' #' @return reduced bed #' TODO check whether jellyfish is installed -reduce_bed <- function(input, kmer = 10, motif = 10, output = "reduced.bed", threads = NULL, clean = TRUE, minoverlap_kmer, minoverlap_motif, min_seq_length = motif) { +reduce_bed <- function(input, kmer = 10, motif = 10, output = "reduced.bed", threads = NULL, clean = TRUE, minoverlap_kmer, minoverlap_motif, min_seq_length = max(c(motif, kmer))) { # get number of available cores if (threads == -1) { threads <- parallel::detectCores() @@ -53,6 +53,10 @@ reduce_bed <- function(input, kmer = 10, motif = 10, output = "reduced.bed", thr } # remove sequences below minimum length + if (min_seq_length < max(c(kmer, motif))) { + stop("Minimum sequence length must be greater or equal to ", max(c(motif, kmer)), " (maximum value of kmer and motif).") + } + total_rows <- nrow(bed_table) bed_table <- bed_table[nchar(sequence) > min_seq_length] if (total_rows > nrow(bed_table)) { From 689331e5f4cf7156f118db1f2aa5ebb3041eb817 Mon Sep 17 00:00:00 2001 From: Schultheis Date: Wed, 5 Dec 2018 10:12:45 +0100 Subject: [PATCH 02/14] implemented minoverlap_kmer & minoverlap_motif --- bin/reduce_bed.R | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/bin/reduce_bed.R b/bin/reduce_bed.R index ea2c017..c0b1ae7 100644 --- a/bin/reduce_bed.R +++ b/bin/reduce_bed.R @@ -8,7 +8,9 @@ option_list <- list( make_option(opt_str = c("-o", "--output"), default = "reduced.bed", help = "Output file. Default = %default", metavar = "character"), make_option(opt_str = c("-t", "--threads"), default = 1, help = "Number of threads to use. Use -1 for all available cores. Default = %default", metavar = "integer"), make_option(opt_str = c("-c", "--clean"), default = TRUE, help = "Delete all temporary files. Default = %default", metavar = "logical"), - make_option(opt_str = c("-s", "--min_seq_length"), default = NULL, help = "Remove sequences below this length. Defaults to the maximum value of motif and kmer and can not be lower.", metavar = "integer", type = "integer") + make_option(opt_str = c("-s", "--min_seq_length"), default = NULL, help = "Remove sequences below this length. Defaults to the maximum value of motif and kmer and can not be lower.", metavar = "integer", type = "integer"), + make_option(opt_str = c("-ko", "--minoverlap_kmer"), default = NULL, help = "Minimum required overlap between kmer to merge kmer. Used to create reduced sequence ranges. Can not be greater than kmer length. Default = kmer - 1", metavar = "integer", type = "integer"), + make_option(opt_str = c("-mo", "--minoverlap_motif"), default = NULL, help = "Minimum required overlap between motif and kmer to consider kmer significant. Used for kmer cutoff calculation. Can not be greater than motif and kmer length. Default = ceiling(motif / 2)", metavar = "integer", type = "integer") # TODO more args ) @@ -25,13 +27,13 @@ opt <- parse_args(opt_parser) #' @param output Output file #' @param threads Number of threads. #' @param clean Delete all temporary files. -#' @param minoverlap_kmer Minimum required overlap between kmer to merge kmer. Used to create reduced sequence ranges.# TODO -#' @param minoverlap_motif Minimum required overlap between motif and kmer to consider kmer significant. Used for kmer cutoff calculation.# TODO +#' @param minoverlap_kmer Minimum required overlap between kmer to merge kmer. Used to create reduced sequence ranges. Can not be greater than kmer length. Default = kmer - 1 +#' @param minoverlap_motif Minimum required overlap between motif and kmer to consider kmer significant. Used for kmer cutoff calculation. Can not be greater than motif and kmer length. Default = ceiling(motif / 2) #' @param min_seq_length Must be greater or equal to kmer and motif. Default = max(c(motif, kmer)). #' #' @return reduced bed #' TODO check whether jellyfish is installed -reduce_bed <- function(input, kmer = 10, motif = 10, output = "reduced.bed", threads = NULL, clean = TRUE, minoverlap_kmer, minoverlap_motif, min_seq_length = max(c(motif, kmer))) { +reduce_bed <- function(input, kmer = 10, motif = 10, output = "reduced.bed", threads = NULL, clean = TRUE, minoverlap_kmer = kmer - 1, minoverlap_motif = ceiling(motif / 2), min_seq_length = max(c(motif, kmer))) { # get number of available cores if (threads == -1) { threads <- parallel::detectCores() @@ -91,16 +93,16 @@ reduce_bed <- function(input, kmer = 10, motif = 10, output = "reduced.bed", thr data.table::setorder(kmer_counts, -V2) # compute number of hits to keep - keep_hits <- significant_kmer(bed_table, kmer = kmer, motif = motif) + keep_hits <- significant_kmer(bed_table, kmer = kmer, motif = motif, minoverlap = minoverlap_motif) # reduce kmer - reduced_kmer <- reduce_kmer(kmer = kmer_counts, keep_hits) + reduced_kmer <- reduce_kmer(kmer = kmer_counts, significant = keep_hits) message("Find kmer in sequences.") # find k-mer in sequences # TODO minoverlap as parameter # columns: name, start, end, width - ranges_table <- find_kmer_regions(bed = bed_table, kmer_counts = reduced_kmer, minoverlap = kmer - 1, threads = threads) + ranges_table <- find_kmer_regions(bed = bed_table, kmer_counts = reduced_kmer, minoverlap = minoverlap_kmer, threads = threads) names(ranges_table)[1:2] <- c("relative_start", "relative_end") # merge ranged_table with bed_table + keep column order @@ -139,7 +141,7 @@ reduce_bed <- function(input, kmer = 10, motif = 10, output = "reduced.bed", thr #' @return Number of interesting kmer. significant_kmer <- function(bed, kmer, motif, minoverlap = ceiling(motif / 2)) { if (minoverlap > kmer || minoverlap > motif) { - stop("Kmer & motif must be greater or equal than minoverlap!") + stop("Kmer & motif must be greater or equal to minoverlap!") } # minimum sequence length to get all interesting overlaps From 617ff9f56b7fb3074e24ae4985ef8b8556081ea9 Mon Sep 17 00:00:00 2001 From: Schultheis Date: Wed, 5 Dec 2018 11:25:04 +0100 Subject: [PATCH 03/14] better docu; reorder data in reduce_kmer() --- bin/reduce_bed.R | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/bin/reduce_bed.R b/bin/reduce_bed.R index c0b1ae7..d539399 100644 --- a/bin/reduce_bed.R +++ b/bin/reduce_bed.R @@ -100,7 +100,6 @@ reduce_bed <- function(input, kmer = 10, motif = 10, output = "reduced.bed", thr message("Find kmer in sequences.") # find k-mer in sequences - # TODO minoverlap as parameter # columns: name, start, end, width ranges_table <- find_kmer_regions(bed = bed_table, kmer_counts = reduced_kmer, minoverlap = minoverlap_kmer, threads = threads) names(ranges_table)[1:2] <- c("relative_start", "relative_end") @@ -131,7 +130,7 @@ reduce_bed <- function(input, kmer = 10, motif = 10, output = "reduced.bed", thr data.table::fwrite(merged, file = output, sep = "\t", col.names = FALSE) } -#' returns sum of top x kmer frequencies to keep +#' Predict how many interesting kmer are possible for the given data. #' #' @param bed Bed table with sequences in last column #' @param kmer Length of kmer @@ -158,9 +157,15 @@ significant_kmer <- function(bed, kmer, motif, minoverlap = ceiling(motif / 2)) return(topx) } -#' @param kmer Kmer table -#' @param significant +#' Orders kmer table after count descending and keeps all kmer with a cumulative sum below the given significance threshold. +#' +#' @param kmer Kmer data.table columns: kmer, count +#' @param significant Value from significant_kmer function. +#' +#' @return reduced data.table reduce_kmer <- function(kmer, significant) { + data.table::setorderv(kmer, cols = names(kmer)[2], order = -1) + kmer[, cumsum := cumsum(V2)] return(kmer[cumsum <= significant]) From a584817a1409eeda3aefb4acb93feb887c09210a Mon Sep 17 00:00:00 2001 From: Schultheis Date: Wed, 5 Dec 2018 15:10:57 +0100 Subject: [PATCH 04/14] added progress messages; thread param; documentation --- bin/cdhit_wrapper.R | 49 +++++++++++++++++++++++++++++++++++---------- 1 file changed, 38 insertions(+), 11 deletions(-) diff --git a/bin/cdhit_wrapper.R b/bin/cdhit_wrapper.R index ed5ea7f..f26a71a 100644 --- a/bin/cdhit_wrapper.R +++ b/bin/cdhit_wrapper.R @@ -2,11 +2,34 @@ library("optparse") option_list <- list( - make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input bed-file. Last column must be sequences.", metavar = "character"), - make_option(opt_str = c("-s", "--similarity"), default = 0.8, help = "Similarity threshold. Default = %default", metavar = "double >= 0.8"), + make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input bed-file. Fourth column is expected to contain names, last column must be sequences.", metavar = "character"), + make_option(opt_str = c("-c", "--identity"), default = 0.8, help = "Identity threshold. Default = %default", metavar = "double >= 0.8"), make_option(opt_str = c("-A", "--coverage"), default = 8, help = "Minimal alignment length for both sequences in nucelotides. Default = %default", metavar = "integer"), - make_option(opt_str = c("-o", "--output"), default = "cluster.bed", help = "Output file. Default = %default", metavar = "character"), - make_option(opt_str = c("-c", "--clean"), default = TRUE, help = "Delete all temporary files. Default = %default", metavar = "logical") + make_option(opt_str = c("-o", "--output"), default = "cluster.bed", help = "Output file same as input but with appended column of cluster numbers. Default = %default", metavar = "character"), + make_option(opt_str = c("--clean"), default = TRUE, help = "Delete all temporary files. Default = %default", metavar = "logical"), + # make_option(opt_str = c("-G", "--global"), default = FALSE), + # make_option(opt_str = c("-b", "--band_width"), default = 20), + # make_option(opt_str = c("-M", "--memory"), default = 800), + make_option(opt_str = c("-T", "--threads"), default = 1)#, + # make_option(opt_str = c("-n", "--word_length"), default = 3), + # make_option(opt_str = c("-l", "--throw_away_sequences"), default = 5), + # make_option(opt_str = c("-d", "--description")), # can not produce bed if this is != 0 + # make_option(opt_str = c("-s", "--length_dif_cutoff_shorter_p"), default = 0.0), + # make_option(opt_str = c("-S", "--length_dif_cutoff_shorter_n"), default = 999999), + # make_option(opt_str = c("-aL", "--alignment_coverage_longer_p"), default = 0.0), + # make_option(opt_str = c("-AL", "--alignment_coverage_longer_n"), default = 99999999), + # make_option(opt_str = c("-aS", "--alignment_coverage_shorter_p"), default = 0.0), + # make_option(opt_str = c("-AS", "--alignment_coverage_shorter_n"), default = 99999999), + # make_option(opt_str = c("-uL", "--max_unmatched_longer_p"), default = 1.0), + # make_option(opt_str = c("-uS", "--max_unmatched_shorter_p"), default = 99999999), + # make_option(opt_str = c("-U", "--max_unmatched_both_n"), default = 99999999), + # make_option(opt_str = c("-g", "--fast_cluster"), default = FALSE), + # make_option(opt_str = c("-r", "--ignore_strand"), default = TRUE), + # make_option(opt_str = c("--match"), default = 2), + # make_option(opt_str = c("--mismatch"), default = -2), + # make_option(opt_str = c("--gap"), default = -6), + # make_option(opt_str = c("--gap_ext"), default = -1), + # make_option(opt_str = c("-sc", "--sort_cluster_by_size"), default = TRUE) # TODO more args ) @@ -17,16 +40,17 @@ opt <- parse_args(opt_parser) #' cd-hit wrapper #' -#' @param input -#' @param similarity Similarity threshold. +#' @param input Data.table or file in bed format (requires names in fourth column and sequences in last column). +#' @param identity Identity threshold. #' @param coverage Minimal alignment length for both sequences in nucelotides. #' @param output Clustered bedfile. Adds cluster number in last column (lower number = bigger cluster). #' @param clean Clean up after run. +#' @param threads Number of threads to use in cd-hit (0 = all cpus). #' -#' @return bed_table with cluster in last column -#' TODO add all cdhit parameter +#' TODO add all cdhit parameter (necessary ?) #' TODO check whether cdhit is installed -cdhitest <- function(input, similarity = 0.8, coverage = 8, output = "cluster.bed", clean = TRUE) { +cdhitest <- function(input, identity = 0.8, coverage = 8, output = "cluster.bed", clean = TRUE, threads = 1) { + message("Loading bed.") # load bed if neccessary if (!data.table::is.data.table(input)) { bed_table <- data.table::fread(input = input, header = FALSE) @@ -34,18 +58,21 @@ cdhitest <- function(input, similarity = 0.8, coverage = 8, output = "cluster.be bed_table <- input } + message("Convert to fasta.") ### convert bed to fasta # 4th column = name # last column = sequence fasta_file <- "converted_bed.fasta" seqinr::write.fasta(sequences = as.list(bed_table[[ncol(bed_table)]]), names = bed_table[[4]], as.string = TRUE, file.out = fasta_file) + message("Clustering.") ### cd-hit-est cdhit_output <- "cdhit_output" - cdhit_call <- paste("cd-hit-est -i", fasta_file, "-o", cdhit_output, "-c", similarity, "-A", coverage, "-G 0 -n 3 -g 1 -r 0 -l 5 -sc 1 -d 0") + cdhit_call <- paste("cd-hit-est -i", fasta_file, "-o", cdhit_output, "-c", identity, "-A", coverage, "-T ", threads, "-G 0 -n 3 -g 1 -r 0 -l 5 -sc 1 -d 0") system(command = cdhit_call, wait = TRUE) + message("Formatting/ writing results.") # reformat cluster file # columns: id, clstr, clstr_size, length, clstr_rep, clstr_iden, clstr_cov cluster_file <- "reformated.clstr" @@ -71,5 +98,5 @@ cdhitest <- function(input, similarity = 0.8, coverage = 8, output = "cluster.be if (!interactive()) { # remove last parameter (help param) params <- opt[-length(opt)] - do.call(cdhitest, args = params) + do.call(cdhitest, args = params) } From 8476db5a21a5891f34574ef55375289f25caed66 Mon Sep 17 00:00:00 2001 From: Schultheis Date: Thu, 6 Dec 2018 09:51:41 +0100 Subject: [PATCH 05/14] implemented motif_occurence for cutoff computation --- bin/reduce_bed.R | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/bin/reduce_bed.R b/bin/reduce_bed.R index d539399..b3c1c6d 100644 --- a/bin/reduce_bed.R +++ b/bin/reduce_bed.R @@ -10,8 +10,8 @@ option_list <- list( make_option(opt_str = c("-c", "--clean"), default = TRUE, help = "Delete all temporary files. Default = %default", metavar = "logical"), make_option(opt_str = c("-s", "--min_seq_length"), default = NULL, help = "Remove sequences below this length. Defaults to the maximum value of motif and kmer and can not be lower.", metavar = "integer", type = "integer"), make_option(opt_str = c("-ko", "--minoverlap_kmer"), default = NULL, help = "Minimum required overlap between kmer to merge kmer. Used to create reduced sequence ranges. Can not be greater than kmer length. Default = kmer - 1", metavar = "integer", type = "integer"), - make_option(opt_str = c("-mo", "--minoverlap_motif"), default = NULL, help = "Minimum required overlap between motif and kmer to consider kmer significant. Used for kmer cutoff calculation. Can not be greater than motif and kmer length. Default = ceiling(motif / 2)", metavar = "integer", type = "integer") - # TODO more args + make_option(opt_str = c("-mo", "--minoverlap_motif"), default = NULL, help = "Minimum required overlap between motif and kmer to consider kmer significant. Used for kmer cutoff calculation. Can not be greater than motif and kmer length. Default = ceiling(motif / 2)", metavar = "integer", type = "integer"), + make_option(opt_str = c("-f", "--motif_occurence"), default = 1, help = "Number of motifs per sequence any value above 0. Default = %default.") ) opt_parser <- OptionParser(option_list = option_list, @@ -30,10 +30,11 @@ opt <- parse_args(opt_parser) #' @param minoverlap_kmer Minimum required overlap between kmer to merge kmer. Used to create reduced sequence ranges. Can not be greater than kmer length. Default = kmer - 1 #' @param minoverlap_motif Minimum required overlap between motif and kmer to consider kmer significant. Used for kmer cutoff calculation. Can not be greater than motif and kmer length. Default = ceiling(motif / 2) #' @param min_seq_length Must be greater or equal to kmer and motif. Default = max(c(motif, kmer)). +#' @param motif_occurence Define how many motifs are expected per sequence. This value is used during kmer cutoff calculation. Default = 1 meaning that there should be approximately one motif per sequence. #' #' @return reduced bed #' TODO check whether jellyfish is installed -reduce_bed <- function(input, kmer = 10, motif = 10, output = "reduced.bed", threads = NULL, clean = TRUE, minoverlap_kmer = kmer - 1, minoverlap_motif = ceiling(motif / 2), min_seq_length = max(c(motif, kmer))) { +reduce_bed <- function(input, kmer = 10, motif = 10, output = "reduced.bed", threads = NULL, clean = TRUE, minoverlap_kmer = kmer - 1, minoverlap_motif = ceiling(motif / 2), min_seq_length = max(c(motif, kmer)), motif_occurence = 1) { # get number of available cores if (threads == -1) { threads <- parallel::detectCores() @@ -93,7 +94,7 @@ reduce_bed <- function(input, kmer = 10, motif = 10, output = "reduced.bed", thr data.table::setorder(kmer_counts, -V2) # compute number of hits to keep - keep_hits <- significant_kmer(bed_table, kmer = kmer, motif = motif, minoverlap = minoverlap_motif) + keep_hits <- significant_kmer(bed_table, kmer = kmer, motif = motif, minoverlap = minoverlap_motif, motif_occurence = motif_occurence) # reduce kmer reduced_kmer <- reduce_kmer(kmer = kmer_counts, significant = keep_hits) @@ -136,12 +137,16 @@ reduce_bed <- function(input, kmer = 10, motif = 10, output = "reduced.bed", thr #' @param kmer Length of kmer #' @param motif Length of motif #' @param minoverlap Minimum number of bases overlapping between kmer and motif. Must be <= motif & <= kmer. Defaults to ceiling(motif / 2). +#' @param motif_occurence Define how many motifs are expected per sequence. Default = 1 #' #' @return Number of interesting kmer. -significant_kmer <- function(bed, kmer, motif, minoverlap = ceiling(motif / 2)) { +significant_kmer <- function(bed, kmer, motif, minoverlap = ceiling(motif / 2), motif_occurence = 1) { if (minoverlap > kmer || minoverlap > motif) { stop("Kmer & motif must be greater or equal to minoverlap!") } + if (motif_occurence <= 0) { + stop("Motif_occurence must be a numeric value above 0!") + } # minimum sequence length to get all interesting overlaps min_seq_length <- motif + 2 * (kmer - minoverlap) @@ -154,7 +159,7 @@ significant_kmer <- function(bed, kmer, motif, minoverlap = ceiling(motif / 2)) # calculate max possible kmer topx <- sum(seq_lengths - kmer + 1) - return(topx) + return(topx * motif_occurence) } #' Orders kmer table after count descending and keeps all kmer with a cumulative sum below the given significance threshold. @@ -179,6 +184,8 @@ reduce_kmer <- function(kmer, significant) { #' @param threads Number of threads. #' #' @return Data.table with relative positions and width (start, end, width). +#' +#' TODO Include number of motifs per sequence (aka motif_occurence). Attempt to keep best 2 regions for occurence = 2? Probably high impact on performance. find_kmer_regions <- function(bed, kmer_counts, minoverlap = 1 , threads = NULL) { if (nchar(kmer_counts[1, 1]) <= minoverlap) { stop("Minoverlap must be smaller than kmer length!") From f0e2cef3c6fe4a0a81144b6dfbbadfa651f753f7 Mon Sep 17 00:00:00 2001 From: Schultheis Date: Thu, 6 Dec 2018 09:56:06 +0100 Subject: [PATCH 06/14] threads = 0 for all cores to match cd-hit --- bin/reduce_bed.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bin/reduce_bed.R b/bin/reduce_bed.R index b3c1c6d..c022b43 100644 --- a/bin/reduce_bed.R +++ b/bin/reduce_bed.R @@ -6,7 +6,7 @@ option_list <- list( make_option(opt_str = c("-k", "--kmer"), default = 10, help = "Kmer length. Default = %default", metavar = "integer"), make_option(opt_str = c("-m", "--motif"), default = 10, help = "Estimated motif length. Default = %default", metavar = "integer"), make_option(opt_str = c("-o", "--output"), default = "reduced.bed", help = "Output file. Default = %default", metavar = "character"), - make_option(opt_str = c("-t", "--threads"), default = 1, help = "Number of threads to use. Use -1 for all available cores. Default = %default", metavar = "integer"), + make_option(opt_str = c("-t", "--threads"), default = 1, help = "Number of threads to use. Use 0 for all available cores. Default = %default", metavar = "integer"), make_option(opt_str = c("-c", "--clean"), default = TRUE, help = "Delete all temporary files. Default = %default", metavar = "logical"), make_option(opt_str = c("-s", "--min_seq_length"), default = NULL, help = "Remove sequences below this length. Defaults to the maximum value of motif and kmer and can not be lower.", metavar = "integer", type = "integer"), make_option(opt_str = c("-ko", "--minoverlap_kmer"), default = NULL, help = "Minimum required overlap between kmer to merge kmer. Used to create reduced sequence ranges. Can not be greater than kmer length. Default = kmer - 1", metavar = "integer", type = "integer"), @@ -25,7 +25,7 @@ opt <- parse_args(opt_parser) #' @param kmer Length of kmer. #' @param motif Estimated motif length. #' @param output Output file -#' @param threads Number of threads. +#' @param threads Number of threads. Default = 1. 0 for all cores. #' @param clean Delete all temporary files. #' @param minoverlap_kmer Minimum required overlap between kmer to merge kmer. Used to create reduced sequence ranges. Can not be greater than kmer length. Default = kmer - 1 #' @param minoverlap_motif Minimum required overlap between motif and kmer to consider kmer significant. Used for kmer cutoff calculation. Can not be greater than motif and kmer length. Default = ceiling(motif / 2) @@ -36,7 +36,7 @@ opt <- parse_args(opt_parser) #' TODO check whether jellyfish is installed reduce_bed <- function(input, kmer = 10, motif = 10, output = "reduced.bed", threads = NULL, clean = TRUE, minoverlap_kmer = kmer - 1, minoverlap_motif = ceiling(motif / 2), min_seq_length = max(c(motif, kmer)), motif_occurence = 1) { # get number of available cores - if (threads == -1) { + if (threads == 0) { threads <- parallel::detectCores() } From 0467d7b581680773dd23ea119cfd710897136182 Mon Sep 17 00:00:00 2001 From: Schultheis Date: Thu, 6 Dec 2018 11:31:00 +0100 Subject: [PATCH 07/14] added more cdhit parameter --- bin/cdhit_wrapper.R | 126 ++++++++++++++++++++++++++++++++------------ 1 file changed, 91 insertions(+), 35 deletions(-) diff --git a/bin/cdhit_wrapper.R b/bin/cdhit_wrapper.R index f26a71a..9c62564 100644 --- a/bin/cdhit_wrapper.R +++ b/bin/cdhit_wrapper.R @@ -3,53 +3,77 @@ library("optparse") option_list <- list( make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input bed-file. Fourth column is expected to contain names, last column must be sequences.", metavar = "character"), - make_option(opt_str = c("-c", "--identity"), default = 0.8, help = "Identity threshold. Default = %default", metavar = "double >= 0.8"), - make_option(opt_str = c("-A", "--coverage"), default = 8, help = "Minimal alignment length for both sequences in nucelotides. Default = %default", metavar = "integer"), + make_option(opt_str = c("-c", "--identity"), default = 0.8, help = "Identity threshold. Default = %default (CD-HIT parameter)", metavar = "double >= 0.8"), + make_option(opt_str = c("-A", "--coverage"), default = 8, help = "Minimal alignment length for both sequences in nucelotides. Default = %default (CD-HIT parameter)", metavar = "integer"), make_option(opt_str = c("-o", "--output"), default = "cluster.bed", help = "Output file same as input but with appended column of cluster numbers. Default = %default", metavar = "character"), make_option(opt_str = c("--clean"), default = TRUE, help = "Delete all temporary files. Default = %default", metavar = "logical"), - # make_option(opt_str = c("-G", "--global"), default = FALSE), - # make_option(opt_str = c("-b", "--band_width"), default = 20), - # make_option(opt_str = c("-M", "--memory"), default = 800), - make_option(opt_str = c("-T", "--threads"), default = 1)#, - # make_option(opt_str = c("-n", "--word_length"), default = 3), - # make_option(opt_str = c("-l", "--throw_away_sequences"), default = 5), + make_option(opt_str = c("-G", "--global"), default = 0, help = "Global sequence identity = 1, local = 0. Default = %default (CD-HIT parameter)", metavar = "integer"), + make_option(opt_str = c("-b", "--band_width"), default = 20, help = "Alignment band width. Default = %default (CD-HIT parameter)", metavar = "integer"), + make_option(opt_str = c("-M", "--memory"), default = 800, help = "Memory limit in MB. 0 for unlimited. Default = %default (CD-HIT parameter)", metavar = "integer"), + make_option(opt_str = c("-T", "--threads"), default = 1, help = "Number of threads. Default = %default (CD-HIT parameter)", metavar = "integer"), + make_option(opt_str = c("-n", "--word_length"), default = 3, help = "Default = %default (CD-HIT parameter)", metavar = "integer"), + make_option(opt_str = c("-l", "--throw_away_sequences"), default = 5, help = "Maximum length of sequences thrown away. Default = %default (CD-HIT parameter)", metavar = "integer"), # make_option(opt_str = c("-d", "--description")), # can not produce bed if this is != 0 - # make_option(opt_str = c("-s", "--length_dif_cutoff_shorter_p"), default = 0.0), - # make_option(opt_str = c("-S", "--length_dif_cutoff_shorter_n"), default = 999999), - # make_option(opt_str = c("-aL", "--alignment_coverage_longer_p"), default = 0.0), - # make_option(opt_str = c("-AL", "--alignment_coverage_longer_n"), default = 99999999), - # make_option(opt_str = c("-aS", "--alignment_coverage_shorter_p"), default = 0.0), - # make_option(opt_str = c("-AS", "--alignment_coverage_shorter_n"), default = 99999999), - # make_option(opt_str = c("-uL", "--max_unmatched_longer_p"), default = 1.0), - # make_option(opt_str = c("-uS", "--max_unmatched_shorter_p"), default = 99999999), - # make_option(opt_str = c("-U", "--max_unmatched_both_n"), default = 99999999), - # make_option(opt_str = c("-g", "--fast_cluster"), default = FALSE), - # make_option(opt_str = c("-r", "--ignore_strand"), default = TRUE), - # make_option(opt_str = c("--match"), default = 2), - # make_option(opt_str = c("--mismatch"), default = -2), - # make_option(opt_str = c("--gap"), default = -6), - # make_option(opt_str = c("--gap_ext"), default = -1), - # make_option(opt_str = c("-sc", "--sort_cluster_by_size"), default = TRUE) - # TODO more args + make_option(opt_str = c("-s", "--length_dif_cutoff_shorter_p"), default = 0.0, help = "Shorter sequences length must be at least x percent of longer sequenecs length. Default = %default (CD-HIT parameter)", metavar = "double"), + make_option(opt_str = c("-S", "--length_dif_cutoff_shorter_n"), default = 999999, help = "Length difference between sequences can not be higher than x nucleotides. Default = %default (CD-HIT parameter)", metavar = "integer"), + make_option(opt_str = c("-aL", "--alignment_coverage_longer_p"), default = 0.0, help = "Alignment must cover x percent of longer sequence. Default = %default (CD-HIT parameter)", metavar = "double"), + make_option(opt_str = c("-AL", "--alignment_coverage_longer_n"), default = 99999999, help = "There can not be more than x unaligned nucleotides on the longer sequence. Default = %default (CD-HIT parameter)", metavar = "integer"), + make_option(opt_str = c("-aS", "--alignment_coverage_shorter_p"), default = 0.0, help = "Alignment must cover x percent of shorter sequence. Default = %default (CD-HIT parameter)", metavar = "double"), + make_option(opt_str = c("-AS", "--alignment_coverage_shorter_n"), default = 99999999, help = "There can not be more than x unaligned nucleotides on the longer sequence. Default = %default (CD-HIT parameter)", metavar = "integer"), + make_option(opt_str = c("-uL", "--max_unmatched_longer_p"), default = 1.0, help = "Maximum unmatched percentage on longer sequence (excludes leading tailing gap). Default = %default (CD-HIT parameter)", metavar = "double"), + make_option(opt_str = c("-uS", "--max_unmatched_shorter_p"), default = 1.0, help = "Maximum unmatched percentage on shorter sequence (excludes leading tailing gap). Default = %default (CD-HIT parameter)", metavar = "double"), + make_option(opt_str = c("-U", "--max_unmatched_both_n"), default = 99999999, help = "Maximum unmatched nucleotied on both sequences (excludes leading tailing gap). Default = %default (CD-HIT parameter)", metavar = "integer"), + make_option(opt_str = c("-g", "--fast_cluster"), default = 1, help = "Cluster sequence in first cluster that meets threshold = 0 (fast). Or cluster in best Cluster = 1 (accurate). Default = %default (CD-HIT parameter)", metavar = "integer"), + make_option(opt_str = c("-r", "--ignore_strand"), default = 0, help = "Align +/+ & +/- (= 1). Or align only +/+ (= 0). Default = %default (CD-HIT parameter)", metavar = "integer"), + make_option(opt_str = c("--match"), default = 2, help = "Matching score. Default = %default (CD-HIT parameter)", metavar = "integer"), + make_option(opt_str = c("--mismatch"), default = -2, help = "Mismatch score. Default = %default (CD-HIT parameter)", metavar = "integer"), + make_option(opt_str = c("--gap"), default = -6, help = "Gap score. Default = %default (CD-HIT parameter)", metavar = "integer"), + make_option(opt_str = c("--gap_ext"), default = -1, help = "Gap extension score. Default = %default (CD-HIT parameter)", metavar = "integer"), + make_option(opt_str = c("-sc", "--sort_cluster_by_size"), default = 1, help = "Either sort cluster by decreasing length (= 0) or by decreasing size (= 1). Default = %default (CD-HIT parameter)", metavar = "integer") ) opt_parser <- OptionParser(option_list = option_list, - description = "CD-HIT-EST Wrapper function.") + description = "CD-HIT-EST Wrapper function. See https://github.com/weizhongli/cdhit/wiki for more information regarding CD-HITs parameters.") opt <- parse_args(opt_parser) -#' cd-hit wrapper +#' cd-hit-est wrapper +#' Receives bed with sequences in last column and writes bed with added cluster numbers. #' #' @param input Data.table or file in bed format (requires names in fourth column and sequences in last column). -#' @param identity Identity threshold. -#' @param coverage Minimal alignment length for both sequences in nucelotides. -#' @param output Clustered bedfile. Adds cluster number in last column (lower number = bigger cluster). -#' @param clean Clean up after run. -#' @param threads Number of threads to use in cd-hit (0 = all cpus). +#' @param identity Identity threshold. Default = 0.8 (CD-HIT parameter) +#' @param coverage Minimal alignment length for both sequences in nucelotides. Default = 8 (CD-HIT parameter) +#' @param output Clustered bedfile. Adds cluster number in last column (numbering depend on sort_by_cluster_size parameter). Default = cluster.bed +#' @param clean Clean up after run. Default = TRUE +#' @param threads Number of threads to use (0 = all cores). Default = 1 (CD-HIT parameter) +#' @param global Global sequence identity = 1, local = 0. Default = 0 (CD-HIT parameter) +#' @param band_width "Alignment band width. Default = 20 (CD-HIT parameter)" +#' @param memory Memory limit in MB. 0 for unlimited. Default = 800 (CD-HIT parameter) +#' @param word_length Default = 3 (CD-HIT parameter) +#' @param throw_away_sequences Maximum length of sequences thrown away. Default = %default (CD-HIT parameter) +#' @param length_dif_cutoff_shorter_p Shorter sequences length must be at least x percent of longer sequenecs length. Default = 0 (CD-HIT parameter) +#' @param length_dif_cutoff_shorter_n Length difference between sequences can not be higher than x nucleotides. Default = 999999 (CD-HIT parameter) +#' @param alignment_coverage_longer_p Alignment must cover x percent of longer sequence. Default = 0 (CD-HIT parameter) +#' @param alignment_coverage_longer_n There can not be more than x unaligned nucleotides on the longer sequence. Default = 99999999 (CD-HIT parameter) +#' @param alignment_coverage_shorter_p Alignment must cover x percent of shorter sequence. Default = 0 (CD-HIT parameter) +#' @param alignment_coverage_shorter_n There can not be more than x unaligned nucleotides on the longer sequence. Default = 99999999 (CD-HIT parameter) +#' @param max_unmatched_longer_p Maximum unmatched percentage on longer sequence (excludes leading tailing gap). Default = 1 (CD-HIT parameter) +#' @param max_unmatched_shorter_p Maximum unmatched percentage on shorter sequence (excludes leading tailing gap). Default = 1 (CD-HIT parameter) +#' @param max_unmatched_both_n Maximum unmatched nucleotied on both sequences (excludes leading tailing gap). Default = 99999999 (CD-HIT parameter) +#' @param fast_cluster Cluster sequence in first cluster that meets threshold = 0 (fast). Or cluster in best Cluster = 1 (accurate). Default = 1 (CD-HIT parameter) +#' @param ignore_strand Align +/+ & +/- (= 1). Or align only +/+ (= 0). Default = 0 (CD-HIT parameter) +#' @param match Matching score. Default = 2 (CD-HIT parameter) +#' @param mismatch Mismatch score. Default = -2 (CD-HIT parameter) +#' @param gap Gap score. Default = -6 (CD-HIT parameter) +#' @param gat_ext Gap extension score. Default = -1 (CD-HIT parameter) +#' @param sort_cluster_by_size Either sort cluster by decreasing length (= 0) or by decreasing size (= 1). Default = 1 (CD-HIT parameter) #' -#' TODO add all cdhit parameter (necessary ?) #' TODO check whether cdhit is installed -cdhitest <- function(input, identity = 0.8, coverage = 8, output = "cluster.bed", clean = TRUE, threads = 1) { +cdhitest <- function(input, identity = 0.8, coverage = 8, output = "cluster.bed", clean = TRUE, threads = 1, global = 0, band_width = 20, memory = 800, word_length = 3, throw_away_sequences = 5, length_dif_cutoff_shorter_p = 0, length_dif_cutoff_shorter_n = 999999, alignment_coverage_longer_p = 0, alignment_coverage_longer_n = 99999999, alignment_coverage_shorter_p = 0, alignment_coverage_shorter_n = 99999999, max_unmatched_longer_p = 1, max_unmatched_shorter_p = 1, max_unmatched_both_n = 99999999, fast_cluster = 1, ignore_strand = 0, match = 2, mismatch = -2, gap = -6, gap_ext = -1, sort_cluster_by_size = 1) { + if (missing(input)) { + stop("Input parameter missing!") + } + message("Loading bed.") # load bed if neccessary if (!data.table::is.data.table(input)) { @@ -58,6 +82,12 @@ cdhitest <- function(input, identity = 0.8, coverage = 8, output = "cluster.bed" bed_table <- input } + # check for duplicated names + if (anyDuplicated(bed_table[, "V4"])) { + warning("Found duplicated names. Making names unique.") + bed_table[, V4 := make.unique(V4)] + } + message("Convert to fasta.") ### convert bed to fasta # 4th column = name @@ -68,7 +98,33 @@ cdhitest <- function(input, identity = 0.8, coverage = 8, output = "cluster.bed" message("Clustering.") ### cd-hit-est cdhit_output <- "cdhit_output" - cdhit_call <- paste("cd-hit-est -i", fasta_file, "-o", cdhit_output, "-c", identity, "-A", coverage, "-T ", threads, "-G 0 -n 3 -g 1 -r 0 -l 5 -sc 1 -d 0") + cdhit_call <- paste("cd-hit-est -i", fasta_file, + "-o", cdhit_output, + "-c", identity, + "-A", coverage, + "-T", threads, + "-G", global, + "-b", band_width, + "-M", memory, + "-n", word_length, + "-l", throw_away_sequences, + "-s", length_dif_cutoff_shorter_p, + "-S", length_dif_cutoff_shorter_n, + "-aL", alignment_coverage_longer_p, + "-AL", alignment_coverage_longer_n, + "-aS", alignment_coverage_shorter_p, + "-AS", alignment_coverage_shorter_n, + "-uL", max_unmatched_longer_p, + "-uS", max_unmatched_shorter_p, + "-U", max_unmatched_both_n, + "-g", fast_cluster, + "-r", ignore_strand, + "-match", match, + "-mismatch", mismatch, + "-gap", gap, + "-gap-ext", gap_ext, + "-sc", sort_cluster_by_size, + "-d 0") system(command = cdhit_call, wait = TRUE) From 4b44ca59e7b7cb0b06c44352eb15677cc5020511 Mon Sep 17 00:00:00 2001 From: Schultheis Date: Thu, 6 Dec 2018 12:15:38 +0100 Subject: [PATCH 08/14] changed type of motif_occurence to double --- bin/reduce_bed.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/reduce_bed.R b/bin/reduce_bed.R index c022b43..efe5bf5 100644 --- a/bin/reduce_bed.R +++ b/bin/reduce_bed.R @@ -11,7 +11,7 @@ option_list <- list( make_option(opt_str = c("-s", "--min_seq_length"), default = NULL, help = "Remove sequences below this length. Defaults to the maximum value of motif and kmer and can not be lower.", metavar = "integer", type = "integer"), make_option(opt_str = c("-ko", "--minoverlap_kmer"), default = NULL, help = "Minimum required overlap between kmer to merge kmer. Used to create reduced sequence ranges. Can not be greater than kmer length. Default = kmer - 1", metavar = "integer", type = "integer"), make_option(opt_str = c("-mo", "--minoverlap_motif"), default = NULL, help = "Minimum required overlap between motif and kmer to consider kmer significant. Used for kmer cutoff calculation. Can not be greater than motif and kmer length. Default = ceiling(motif / 2)", metavar = "integer", type = "integer"), - make_option(opt_str = c("-f", "--motif_occurence"), default = 1, help = "Number of motifs per sequence any value above 0. Default = %default.") + make_option(opt_str = c("-f", "--motif_occurence"), default = 1, help = "Number of motifs per sequence any value above 0. Default = %default.", metavar = "double") ) opt_parser <- OptionParser(option_list = option_list, From c6c42ca1e19e6a3f838a3960cd732e86ae96ff21 Mon Sep 17 00:00:00 2001 From: Schultheis Date: Thu, 6 Dec 2018 12:30:21 +0100 Subject: [PATCH 09/14] renamed ignore_strand to strand --- bin/cdhit_wrapper.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/bin/cdhit_wrapper.R b/bin/cdhit_wrapper.R index 9c62564..c9a8241 100644 --- a/bin/cdhit_wrapper.R +++ b/bin/cdhit_wrapper.R @@ -24,7 +24,7 @@ option_list <- list( make_option(opt_str = c("-uS", "--max_unmatched_shorter_p"), default = 1.0, help = "Maximum unmatched percentage on shorter sequence (excludes leading tailing gap). Default = %default (CD-HIT parameter)", metavar = "double"), make_option(opt_str = c("-U", "--max_unmatched_both_n"), default = 99999999, help = "Maximum unmatched nucleotied on both sequences (excludes leading tailing gap). Default = %default (CD-HIT parameter)", metavar = "integer"), make_option(opt_str = c("-g", "--fast_cluster"), default = 1, help = "Cluster sequence in first cluster that meets threshold = 0 (fast). Or cluster in best Cluster = 1 (accurate). Default = %default (CD-HIT parameter)", metavar = "integer"), - make_option(opt_str = c("-r", "--ignore_strand"), default = 0, help = "Align +/+ & +/- (= 1). Or align only +/+ (= 0). Default = %default (CD-HIT parameter)", metavar = "integer"), + make_option(opt_str = c("-r", "--strand"), default = 0, help = "Align +/+ & +/- (= 1). Or align only +/+ (= 0). Default = %default (CD-HIT parameter)", metavar = "integer"), make_option(opt_str = c("--match"), default = 2, help = "Matching score. Default = %default (CD-HIT parameter)", metavar = "integer"), make_option(opt_str = c("--mismatch"), default = -2, help = "Mismatch score. Default = %default (CD-HIT parameter)", metavar = "integer"), make_option(opt_str = c("--gap"), default = -6, help = "Gap score. Default = %default (CD-HIT parameter)", metavar = "integer"), @@ -61,7 +61,7 @@ opt <- parse_args(opt_parser) #' @param max_unmatched_shorter_p Maximum unmatched percentage on shorter sequence (excludes leading tailing gap). Default = 1 (CD-HIT parameter) #' @param max_unmatched_both_n Maximum unmatched nucleotied on both sequences (excludes leading tailing gap). Default = 99999999 (CD-HIT parameter) #' @param fast_cluster Cluster sequence in first cluster that meets threshold = 0 (fast). Or cluster in best Cluster = 1 (accurate). Default = 1 (CD-HIT parameter) -#' @param ignore_strand Align +/+ & +/- (= 1). Or align only +/+ (= 0). Default = 0 (CD-HIT parameter) +#' @param strand Align +/+ & +/- (= 1). Or align only +/+ (= 0). Default = 0 (CD-HIT parameter) #' @param match Matching score. Default = 2 (CD-HIT parameter) #' @param mismatch Mismatch score. Default = -2 (CD-HIT parameter) #' @param gap Gap score. Default = -6 (CD-HIT parameter) @@ -69,7 +69,7 @@ opt <- parse_args(opt_parser) #' @param sort_cluster_by_size Either sort cluster by decreasing length (= 0) or by decreasing size (= 1). Default = 1 (CD-HIT parameter) #' #' TODO check whether cdhit is installed -cdhitest <- function(input, identity = 0.8, coverage = 8, output = "cluster.bed", clean = TRUE, threads = 1, global = 0, band_width = 20, memory = 800, word_length = 3, throw_away_sequences = 5, length_dif_cutoff_shorter_p = 0, length_dif_cutoff_shorter_n = 999999, alignment_coverage_longer_p = 0, alignment_coverage_longer_n = 99999999, alignment_coverage_shorter_p = 0, alignment_coverage_shorter_n = 99999999, max_unmatched_longer_p = 1, max_unmatched_shorter_p = 1, max_unmatched_both_n = 99999999, fast_cluster = 1, ignore_strand = 0, match = 2, mismatch = -2, gap = -6, gap_ext = -1, sort_cluster_by_size = 1) { +cdhitest <- function(input, identity = 0.8, coverage = 8, output = "cluster.bed", clean = TRUE, threads = 1, global = 0, band_width = 20, memory = 800, word_length = 3, throw_away_sequences = 5, length_dif_cutoff_shorter_p = 0, length_dif_cutoff_shorter_n = 999999, alignment_coverage_longer_p = 0, alignment_coverage_longer_n = 99999999, alignment_coverage_shorter_p = 0, alignment_coverage_shorter_n = 99999999, max_unmatched_longer_p = 1, max_unmatched_shorter_p = 1, max_unmatched_both_n = 99999999, fast_cluster = 1, strand = 0, match = 2, mismatch = -2, gap = -6, gap_ext = -1, sort_cluster_by_size = 1) { if (missing(input)) { stop("Input parameter missing!") } @@ -118,7 +118,7 @@ cdhitest <- function(input, identity = 0.8, coverage = 8, output = "cluster.bed" "-uS", max_unmatched_shorter_p, "-U", max_unmatched_both_n, "-g", fast_cluster, - "-r", ignore_strand, + "-r", strand, "-match", match, "-mismatch", mismatch, "-gap", gap, From b769d49f6506b47915bf32821a00aed1603f9f82 Mon Sep 17 00:00:00 2001 From: Schultheis Date: Thu, 6 Dec 2018 12:43:58 +0100 Subject: [PATCH 10/14] new clustering parameter added --- config/cluster.config | 14 ++++++++++++-- pipeline.nf | 4 ++-- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/config/cluster.config b/config/cluster.config index 3dce063..2a1828c 100644 --- a/config/cluster.config +++ b/config/cluster.config @@ -1,7 +1,17 @@ params{ - sequence_coverage=8 + threads=1 + //reduce_bed kmer=10 aprox_motif_len=10 - threads=1 + motif_occurence=1 + min_seq_length=10 + + //cdhit_wrapper + global=0 + identity=0.8 + sequence_coverage=8 + memory=800 + throw_away_seq=9 + strand=0 } diff --git a/pipeline.nf b/pipeline.nf index 5ed2eac..1fabdc3 100644 --- a/pipeline.nf +++ b/pipeline.nf @@ -75,7 +75,7 @@ process reduce_bed { script: """ - Rscript ${path_bin}/reduce_bed.R -i ${bed} -k ${params.kmer} -m ${params.aprox_motif_len} -o ${name}_reduced.bed -t ${params.threads} + Rscript ${path_bin}/reduce_bed.R -i ${bed} -k ${params.kmer} -m ${params.aprox_motif_len} -o ${name}_reduced.bed -t ${params.threads} -f ${params.motif_occurence} -s ${params.min_seq_length} """ } @@ -91,7 +91,7 @@ process clustering { script: """ - Rscript ${path_bin}/cdhit_wrapper.R -i ${bed} -A ${params.sequence_coverage} -o ${name}_clusterd.bed + Rscript ${path_bin}/cdhit_wrapper.R -i ${bed} -A ${params.sequence_coverage} -o ${name}_clusterd.bed -c ${params.identity} -G ${params.global} -M ${params.memory} -l ${params.throw_away_seq} -r ${params.strand} -T ${params.threads} """ } From 1a457039b1aff1cff51256fd4117b4a9cbd74866 Mon Sep 17 00:00:00 2001 From: Schultheis Date: Thu, 6 Dec 2018 13:55:35 +0100 Subject: [PATCH 11/14] fixed ambiguous flags --- bin/cdhit_wrapper.R | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/bin/cdhit_wrapper.R b/bin/cdhit_wrapper.R index c9a8241..e5ed3e4 100644 --- a/bin/cdhit_wrapper.R +++ b/bin/cdhit_wrapper.R @@ -16,12 +16,12 @@ option_list <- list( # make_option(opt_str = c("-d", "--description")), # can not produce bed if this is != 0 make_option(opt_str = c("-s", "--length_dif_cutoff_shorter_p"), default = 0.0, help = "Shorter sequences length must be at least x percent of longer sequenecs length. Default = %default (CD-HIT parameter)", metavar = "double"), make_option(opt_str = c("-S", "--length_dif_cutoff_shorter_n"), default = 999999, help = "Length difference between sequences can not be higher than x nucleotides. Default = %default (CD-HIT parameter)", metavar = "integer"), - make_option(opt_str = c("-aL", "--alignment_coverage_longer_p"), default = 0.0, help = "Alignment must cover x percent of longer sequence. Default = %default (CD-HIT parameter)", metavar = "double"), - make_option(opt_str = c("-AL", "--alignment_coverage_longer_n"), default = 99999999, help = "There can not be more than x unaligned nucleotides on the longer sequence. Default = %default (CD-HIT parameter)", metavar = "integer"), - make_option(opt_str = c("-aS", "--alignment_coverage_shorter_p"), default = 0.0, help = "Alignment must cover x percent of shorter sequence. Default = %default (CD-HIT parameter)", metavar = "double"), - make_option(opt_str = c("-AS", "--alignment_coverage_shorter_n"), default = 99999999, help = "There can not be more than x unaligned nucleotides on the longer sequence. Default = %default (CD-HIT parameter)", metavar = "integer"), - make_option(opt_str = c("-uL", "--max_unmatched_longer_p"), default = 1.0, help = "Maximum unmatched percentage on longer sequence (excludes leading tailing gap). Default = %default (CD-HIT parameter)", metavar = "double"), - make_option(opt_str = c("-uS", "--max_unmatched_shorter_p"), default = 1.0, help = "Maximum unmatched percentage on shorter sequence (excludes leading tailing gap). Default = %default (CD-HIT parameter)", metavar = "double"), + make_option(opt_str = c("-e", "--alignment_coverage_longer_p"), default = 0.0, help = "Alignment must cover x percent of longer sequence. Default = %default (CD-HIT parameter: -aL)", metavar = "double"), + make_option(opt_str = c("-E", "--alignment_coverage_longer_n"), default = 99999999, help = "There can not be more than x unaligned nucleotides on the longer sequence. Default = %default (CD-HIT parameter: -AL)", metavar = "integer"), + make_option(opt_str = c("-f", "--alignment_coverage_shorter_p"), default = 0.0, help = "Alignment must cover x percent of shorter sequence. Default = %default (CD-HIT parameter: -aS)", metavar = "double"), + make_option(opt_str = c("-F", "--alignment_coverage_shorter_n"), default = 99999999, help = "There can not be more than x unaligned nucleotides on the longer sequence. Default = %default (CD-HIT parameter: -AS)", metavar = "integer"), + make_option(opt_str = c("-w", "--max_unmatched_longer_p"), default = 1.0, help = "Maximum unmatched percentage on longer sequence (excludes leading tailing gap). Default = %default (CD-HIT parameter: -uL)", metavar = "double"), + make_option(opt_str = c("-W", "--max_unmatched_shorter_p"), default = 1.0, help = "Maximum unmatched percentage on shorter sequence (excludes leading tailing gap). Default = %default (CD-HIT parameter: -uS)", metavar = "double"), make_option(opt_str = c("-U", "--max_unmatched_both_n"), default = 99999999, help = "Maximum unmatched nucleotied on both sequences (excludes leading tailing gap). Default = %default (CD-HIT parameter)", metavar = "integer"), make_option(opt_str = c("-g", "--fast_cluster"), default = 1, help = "Cluster sequence in first cluster that meets threshold = 0 (fast). Or cluster in best Cluster = 1 (accurate). Default = %default (CD-HIT parameter)", metavar = "integer"), make_option(opt_str = c("-r", "--strand"), default = 0, help = "Align +/+ & +/- (= 1). Or align only +/+ (= 0). Default = %default (CD-HIT parameter)", metavar = "integer"), @@ -29,7 +29,7 @@ option_list <- list( make_option(opt_str = c("--mismatch"), default = -2, help = "Mismatch score. Default = %default (CD-HIT parameter)", metavar = "integer"), make_option(opt_str = c("--gap"), default = -6, help = "Gap score. Default = %default (CD-HIT parameter)", metavar = "integer"), make_option(opt_str = c("--gap_ext"), default = -1, help = "Gap extension score. Default = %default (CD-HIT parameter)", metavar = "integer"), - make_option(opt_str = c("-sc", "--sort_cluster_by_size"), default = 1, help = "Either sort cluster by decreasing length (= 0) or by decreasing size (= 1). Default = %default (CD-HIT parameter)", metavar = "integer") + make_option(opt_str = c("-x", "--sort_cluster_by_size"), default = 1, help = "Either sort cluster by decreasing length (= 0) or by decreasing size (= 1). Default = %default (CD-HIT parameter: -sc)", metavar = "integer") ) opt_parser <- OptionParser(option_list = option_list, From cd53d3c4dc58f4eaadfdbe556dbb8532c2ea46af Mon Sep 17 00:00:00 2001 From: Schultheis Date: Thu, 6 Dec 2018 13:56:20 +0100 Subject: [PATCH 12/14] fixed ambiguous flags + nameing bug in find_kmer_regions --- bin/reduce_bed.R | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/bin/reduce_bed.R b/bin/reduce_bed.R index efe5bf5..2f8e8a2 100644 --- a/bin/reduce_bed.R +++ b/bin/reduce_bed.R @@ -9,8 +9,8 @@ option_list <- list( make_option(opt_str = c("-t", "--threads"), default = 1, help = "Number of threads to use. Use 0 for all available cores. Default = %default", metavar = "integer"), make_option(opt_str = c("-c", "--clean"), default = TRUE, help = "Delete all temporary files. Default = %default", metavar = "logical"), make_option(opt_str = c("-s", "--min_seq_length"), default = NULL, help = "Remove sequences below this length. Defaults to the maximum value of motif and kmer and can not be lower.", metavar = "integer", type = "integer"), - make_option(opt_str = c("-ko", "--minoverlap_kmer"), default = NULL, help = "Minimum required overlap between kmer to merge kmer. Used to create reduced sequence ranges. Can not be greater than kmer length. Default = kmer - 1", metavar = "integer", type = "integer"), - make_option(opt_str = c("-mo", "--minoverlap_motif"), default = NULL, help = "Minimum required overlap between motif and kmer to consider kmer significant. Used for kmer cutoff calculation. Can not be greater than motif and kmer length. Default = ceiling(motif / 2)", metavar = "integer", type = "integer"), + make_option(opt_str = c("-n", "--minoverlap_kmer"), default = NULL, help = "Minimum required overlap between kmer to merge kmer. Used to create reduced sequence ranges. Can not be greater than kmer length. Default = kmer - 1", metavar = "integer", type = "integer"), + make_option(opt_str = c("-v", "--minoverlap_motif"), default = NULL, help = "Minimum required overlap between motif and kmer to consider kmer significant. Used for kmer cutoff calculation. Can not be greater than motif and kmer length. Default = ceiling(motif / 2)", metavar = "integer", type = "integer"), make_option(opt_str = c("-f", "--motif_occurence"), default = 1, help = "Number of motifs per sequence any value above 0. Default = %default.", metavar = "double") ) @@ -196,6 +196,7 @@ find_kmer_regions <- function(bed, kmer_counts, minoverlap = 1 , threads = NULL) seq_ranges <- pbapply::pblapply(seq_len(nrow(bed)), cl = threads, function(x) { seq <- bed[x][[ncol(bed)]] + name <- bed[x][[4]] #### locate ranges ranges <- data.table::data.table(do.call(rbind, stringi::stri_locate_all_fixed(seq, pattern = kmer_counts[[1]]))) @@ -203,7 +204,7 @@ find_kmer_regions <- function(bed, kmer_counts, minoverlap = 1 , threads = NULL) ranges <- na.omit(ranges, cols = c("start", "end")) if (nrow(ranges) < 1) { - return(data.table::data.table(start = NA, end = NA, width = NA)) + return(data.table::data.table(start = NA, end = NA, width = NA, name = name)) } # add kmer sequences @@ -237,14 +238,13 @@ find_kmer_regions <- function(bed, kmer_counts, minoverlap = 1 , threads = NULL) # reduce selected ranges reduced_ranges <- IRanges::reduce(reduced_ranges[selected_ranges]) - reduced_ranges <- data.table::as.data.table(reduced_ranges) + reduced_ranges <- data.table::as.data.table(reduced_ranges)[, name := name] return(reduced_ranges) }) # create ranges table conserved_regions_table <- data.table::rbindlist(seq_ranges) - conserved_regions_table[, name := bed[[4]]] return(conserved_regions_table) } From e68a89dcabd401a8108c86be987433b55acaebce Mon Sep 17 00:00:00 2001 From: Schultheis Date: Thu, 6 Dec 2018 13:57:27 +0100 Subject: [PATCH 13/14] removed param threads --- config/cluster.config | 2 -- 1 file changed, 2 deletions(-) diff --git a/config/cluster.config b/config/cluster.config index 2a1828c..1b444e2 100644 --- a/config/cluster.config +++ b/config/cluster.config @@ -1,6 +1,4 @@ params{ - threads=1 - //reduce_bed kmer=10 aprox_motif_len=10 From 92baa4d08228b23ff96972a0cbc9a54c3c92a59d Mon Sep 17 00:00:00 2001 From: Schultheis Date: Thu, 6 Dec 2018 14:32:46 +0100 Subject: [PATCH 14/14] added parameter to list and description --- pipeline.nf | 33 ++++++++++++++++++++++++++------- 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/pipeline.nf b/pipeline.nf index 0ebc301..f12edad 100644 --- a/pipeline.nf +++ b/pipeline.nf @@ -23,10 +23,19 @@ params.config="" params.max_size_fp=100 //clustering - params.sequence_coverage=8 - - params.kmer=10 - params.aprox_motif_len=10 + //reduce_bed + params.kmer=10 + params.aprox_motif_len=10 + params.motif_occurence=1 + params.min_seq_length=10 + + //cdhit_wrapper + params.global=0 + params.identity=0.8 + params.sequence_coverage=8 + params.memory=800 + params.throw_away_seq=9 + params.strand=0 //motif_estimation //bed_to_clustered_fasta @@ -67,9 +76,19 @@ Optional arguments: --max_size_fp INT (Default: 100) Clustering: - --sequence_coverage INT (Default: 8) - --kmer INT (Default: 10) - --aprox_motif_len INT (Default: 10) + Sequence preparation/ reduction: + --kmer INT Kmer length (Default: 10) + --aprox_motif_len INT Motif length (Default: 10) + --motif_occurence FLOAT Percentage of motifs over all sequences. Use 1 (Default) to assume every sequence contains a motif. + --min_seq_length INT Remove all sequences below this value. (Default: 10) + + Clustering: + --global INT Global (=1) or local (=0) alignment. (Default: 0) + --identity FLOAT Identity threshold. (Default: 0.8) + --sequence_coverage INT Minimum aligned nucleotides on both sequences. (Default: 8) + --memory INT Memory limit in MB. 0 for unlimited. (Default: 800) + --throw_away_seq INT Remove all sequences equal or below this length before clustering. (Default: 9) + --strand INT Align +/+ & +/- (= 1). Or align only +/+ (= 0). (Default: 0) Motif estimation: --motif_min_len INT Minimum length of Motif (Default: 8)