diff --git a/bin/cdhit_wrapper.R b/bin/cdhit_wrapper.R index ed5ea7f..e5ed3e4 100644 --- a/bin/cdhit_wrapper.R +++ b/bin/cdhit_wrapper.R @@ -2,31 +2,79 @@ library("optparse") option_list <- list( - make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input bed-file. Last column must be sequences.", metavar = "character"), - make_option(opt_str = c("-s", "--similarity"), default = 0.8, help = "Similarity threshold. Default = %default", metavar = "double >= 0.8"), - make_option(opt_str = c("-A", "--coverage"), default = 8, help = "Minimal alignment length for both sequences in nucelotides. Default = %default", metavar = "integer"), - make_option(opt_str = c("-o", "--output"), default = "cluster.bed", help = "Output file. Default = %default", metavar = "character"), - make_option(opt_str = c("-c", "--clean"), default = TRUE, help = "Delete all temporary files. Default = %default", metavar = "logical") - # TODO more args + make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input bed-file. Fourth column is expected to contain names, last column must be sequences.", metavar = "character"), + make_option(opt_str = c("-c", "--identity"), default = 0.8, help = "Identity threshold. Default = %default (CD-HIT parameter)", metavar = "double >= 0.8"), + make_option(opt_str = c("-A", "--coverage"), default = 8, help = "Minimal alignment length for both sequences in nucelotides. Default = %default (CD-HIT parameter)", metavar = "integer"), + make_option(opt_str = c("-o", "--output"), default = "cluster.bed", help = "Output file same as input but with appended column of cluster numbers. Default = %default", metavar = "character"), + make_option(opt_str = c("--clean"), default = TRUE, help = "Delete all temporary files. Default = %default", metavar = "logical"), + make_option(opt_str = c("-G", "--global"), default = 0, help = "Global sequence identity = 1, local = 0. Default = %default (CD-HIT parameter)", metavar = "integer"), + make_option(opt_str = c("-b", "--band_width"), default = 20, help = "Alignment band width. Default = %default (CD-HIT parameter)", metavar = "integer"), + make_option(opt_str = c("-M", "--memory"), default = 800, help = "Memory limit in MB. 0 for unlimited. Default = %default (CD-HIT parameter)", metavar = "integer"), + make_option(opt_str = c("-T", "--threads"), default = 1, help = "Number of threads. Default = %default (CD-HIT parameter)", metavar = "integer"), + make_option(opt_str = c("-n", "--word_length"), default = 3, help = "Default = %default (CD-HIT parameter)", metavar = "integer"), + make_option(opt_str = c("-l", "--throw_away_sequences"), default = 5, help = "Maximum length of sequences thrown away. Default = %default (CD-HIT parameter)", metavar = "integer"), + # make_option(opt_str = c("-d", "--description")), # can not produce bed if this is != 0 + make_option(opt_str = c("-s", "--length_dif_cutoff_shorter_p"), default = 0.0, help = "Shorter sequences length must be at least x percent of longer sequenecs length. Default = %default (CD-HIT parameter)", metavar = "double"), + make_option(opt_str = c("-S", "--length_dif_cutoff_shorter_n"), default = 999999, help = "Length difference between sequences can not be higher than x nucleotides. Default = %default (CD-HIT parameter)", metavar = "integer"), + make_option(opt_str = c("-e", "--alignment_coverage_longer_p"), default = 0.0, help = "Alignment must cover x percent of longer sequence. Default = %default (CD-HIT parameter: -aL)", metavar = "double"), + make_option(opt_str = c("-E", "--alignment_coverage_longer_n"), default = 99999999, help = "There can not be more than x unaligned nucleotides on the longer sequence. Default = %default (CD-HIT parameter: -AL)", metavar = "integer"), + make_option(opt_str = c("-f", "--alignment_coverage_shorter_p"), default = 0.0, help = "Alignment must cover x percent of shorter sequence. Default = %default (CD-HIT parameter: -aS)", metavar = "double"), + make_option(opt_str = c("-F", "--alignment_coverage_shorter_n"), default = 99999999, help = "There can not be more than x unaligned nucleotides on the longer sequence. Default = %default (CD-HIT parameter: -AS)", metavar = "integer"), + make_option(opt_str = c("-w", "--max_unmatched_longer_p"), default = 1.0, help = "Maximum unmatched percentage on longer sequence (excludes leading tailing gap). Default = %default (CD-HIT parameter: -uL)", metavar = "double"), + make_option(opt_str = c("-W", "--max_unmatched_shorter_p"), default = 1.0, help = "Maximum unmatched percentage on shorter sequence (excludes leading tailing gap). Default = %default (CD-HIT parameter: -uS)", metavar = "double"), + make_option(opt_str = c("-U", "--max_unmatched_both_n"), default = 99999999, help = "Maximum unmatched nucleotied on both sequences (excludes leading tailing gap). Default = %default (CD-HIT parameter)", metavar = "integer"), + make_option(opt_str = c("-g", "--fast_cluster"), default = 1, help = "Cluster sequence in first cluster that meets threshold = 0 (fast). Or cluster in best Cluster = 1 (accurate). Default = %default (CD-HIT parameter)", metavar = "integer"), + make_option(opt_str = c("-r", "--strand"), default = 0, help = "Align +/+ & +/- (= 1). Or align only +/+ (= 0). Default = %default (CD-HIT parameter)", metavar = "integer"), + make_option(opt_str = c("--match"), default = 2, help = "Matching score. Default = %default (CD-HIT parameter)", metavar = "integer"), + make_option(opt_str = c("--mismatch"), default = -2, help = "Mismatch score. Default = %default (CD-HIT parameter)", metavar = "integer"), + make_option(opt_str = c("--gap"), default = -6, help = "Gap score. Default = %default (CD-HIT parameter)", metavar = "integer"), + make_option(opt_str = c("--gap_ext"), default = -1, help = "Gap extension score. Default = %default (CD-HIT parameter)", metavar = "integer"), + make_option(opt_str = c("-x", "--sort_cluster_by_size"), default = 1, help = "Either sort cluster by decreasing length (= 0) or by decreasing size (= 1). Default = %default (CD-HIT parameter: -sc)", metavar = "integer") ) opt_parser <- OptionParser(option_list = option_list, - description = "CD-HIT-EST Wrapper function.") + description = "CD-HIT-EST Wrapper function. See https://github.com/weizhongli/cdhit/wiki for more information regarding CD-HITs parameters.") opt <- parse_args(opt_parser) -#' cd-hit wrapper +#' cd-hit-est wrapper +#' Receives bed with sequences in last column and writes bed with added cluster numbers. #' -#' @param input -#' @param similarity Similarity threshold. -#' @param coverage Minimal alignment length for both sequences in nucelotides. -#' @param output Clustered bedfile. Adds cluster number in last column (lower number = bigger cluster). -#' @param clean Clean up after run. +#' @param input Data.table or file in bed format (requires names in fourth column and sequences in last column). +#' @param identity Identity threshold. Default = 0.8 (CD-HIT parameter) +#' @param coverage Minimal alignment length for both sequences in nucelotides. Default = 8 (CD-HIT parameter) +#' @param output Clustered bedfile. Adds cluster number in last column (numbering depend on sort_by_cluster_size parameter). Default = cluster.bed +#' @param clean Clean up after run. Default = TRUE +#' @param threads Number of threads to use (0 = all cores). Default = 1 (CD-HIT parameter) +#' @param global Global sequence identity = 1, local = 0. Default = 0 (CD-HIT parameter) +#' @param band_width "Alignment band width. Default = 20 (CD-HIT parameter)" +#' @param memory Memory limit in MB. 0 for unlimited. Default = 800 (CD-HIT parameter) +#' @param word_length Default = 3 (CD-HIT parameter) +#' @param throw_away_sequences Maximum length of sequences thrown away. Default = %default (CD-HIT parameter) +#' @param length_dif_cutoff_shorter_p Shorter sequences length must be at least x percent of longer sequenecs length. Default = 0 (CD-HIT parameter) +#' @param length_dif_cutoff_shorter_n Length difference between sequences can not be higher than x nucleotides. Default = 999999 (CD-HIT parameter) +#' @param alignment_coverage_longer_p Alignment must cover x percent of longer sequence. Default = 0 (CD-HIT parameter) +#' @param alignment_coverage_longer_n There can not be more than x unaligned nucleotides on the longer sequence. Default = 99999999 (CD-HIT parameter) +#' @param alignment_coverage_shorter_p Alignment must cover x percent of shorter sequence. Default = 0 (CD-HIT parameter) +#' @param alignment_coverage_shorter_n There can not be more than x unaligned nucleotides on the longer sequence. Default = 99999999 (CD-HIT parameter) +#' @param max_unmatched_longer_p Maximum unmatched percentage on longer sequence (excludes leading tailing gap). Default = 1 (CD-HIT parameter) +#' @param max_unmatched_shorter_p Maximum unmatched percentage on shorter sequence (excludes leading tailing gap). Default = 1 (CD-HIT parameter) +#' @param max_unmatched_both_n Maximum unmatched nucleotied on both sequences (excludes leading tailing gap). Default = 99999999 (CD-HIT parameter) +#' @param fast_cluster Cluster sequence in first cluster that meets threshold = 0 (fast). Or cluster in best Cluster = 1 (accurate). Default = 1 (CD-HIT parameter) +#' @param strand Align +/+ & +/- (= 1). Or align only +/+ (= 0). Default = 0 (CD-HIT parameter) +#' @param match Matching score. Default = 2 (CD-HIT parameter) +#' @param mismatch Mismatch score. Default = -2 (CD-HIT parameter) +#' @param gap Gap score. Default = -6 (CD-HIT parameter) +#' @param gat_ext Gap extension score. Default = -1 (CD-HIT parameter) +#' @param sort_cluster_by_size Either sort cluster by decreasing length (= 0) or by decreasing size (= 1). Default = 1 (CD-HIT parameter) #' -#' @return bed_table with cluster in last column -#' TODO add all cdhit parameter #' TODO check whether cdhit is installed -cdhitest <- function(input, similarity = 0.8, coverage = 8, output = "cluster.bed", clean = TRUE) { +cdhitest <- function(input, identity = 0.8, coverage = 8, output = "cluster.bed", clean = TRUE, threads = 1, global = 0, band_width = 20, memory = 800, word_length = 3, throw_away_sequences = 5, length_dif_cutoff_shorter_p = 0, length_dif_cutoff_shorter_n = 999999, alignment_coverage_longer_p = 0, alignment_coverage_longer_n = 99999999, alignment_coverage_shorter_p = 0, alignment_coverage_shorter_n = 99999999, max_unmatched_longer_p = 1, max_unmatched_shorter_p = 1, max_unmatched_both_n = 99999999, fast_cluster = 1, strand = 0, match = 2, mismatch = -2, gap = -6, gap_ext = -1, sort_cluster_by_size = 1) { + if (missing(input)) { + stop("Input parameter missing!") + } + + message("Loading bed.") # load bed if neccessary if (!data.table::is.data.table(input)) { bed_table <- data.table::fread(input = input, header = FALSE) @@ -34,18 +82,53 @@ cdhitest <- function(input, similarity = 0.8, coverage = 8, output = "cluster.be bed_table <- input } + # check for duplicated names + if (anyDuplicated(bed_table[, "V4"])) { + warning("Found duplicated names. Making names unique.") + bed_table[, V4 := make.unique(V4)] + } + + message("Convert to fasta.") ### convert bed to fasta # 4th column = name # last column = sequence fasta_file <- "converted_bed.fasta" seqinr::write.fasta(sequences = as.list(bed_table[[ncol(bed_table)]]), names = bed_table[[4]], as.string = TRUE, file.out = fasta_file) + message("Clustering.") ### cd-hit-est cdhit_output <- "cdhit_output" - cdhit_call <- paste("cd-hit-est -i", fasta_file, "-o", cdhit_output, "-c", similarity, "-A", coverage, "-G 0 -n 3 -g 1 -r 0 -l 5 -sc 1 -d 0") + cdhit_call <- paste("cd-hit-est -i", fasta_file, + "-o", cdhit_output, + "-c", identity, + "-A", coverage, + "-T", threads, + "-G", global, + "-b", band_width, + "-M", memory, + "-n", word_length, + "-l", throw_away_sequences, + "-s", length_dif_cutoff_shorter_p, + "-S", length_dif_cutoff_shorter_n, + "-aL", alignment_coverage_longer_p, + "-AL", alignment_coverage_longer_n, + "-aS", alignment_coverage_shorter_p, + "-AS", alignment_coverage_shorter_n, + "-uL", max_unmatched_longer_p, + "-uS", max_unmatched_shorter_p, + "-U", max_unmatched_both_n, + "-g", fast_cluster, + "-r", strand, + "-match", match, + "-mismatch", mismatch, + "-gap", gap, + "-gap-ext", gap_ext, + "-sc", sort_cluster_by_size, + "-d 0") system(command = cdhit_call, wait = TRUE) + message("Formatting/ writing results.") # reformat cluster file # columns: id, clstr, clstr_size, length, clstr_rep, clstr_iden, clstr_cov cluster_file <- "reformated.clstr" @@ -71,5 +154,5 @@ cdhitest <- function(input, similarity = 0.8, coverage = 8, output = "cluster.be if (!interactive()) { # remove last parameter (help param) params <- opt[-length(opt)] - do.call(cdhitest, args = params) + do.call(cdhitest, args = params) } diff --git a/bin/reduce_bed.R b/bin/reduce_bed.R index 223c946..2f8e8a2 100644 --- a/bin/reduce_bed.R +++ b/bin/reduce_bed.R @@ -6,10 +6,12 @@ option_list <- list( make_option(opt_str = c("-k", "--kmer"), default = 10, help = "Kmer length. Default = %default", metavar = "integer"), make_option(opt_str = c("-m", "--motif"), default = 10, help = "Estimated motif length. Default = %default", metavar = "integer"), make_option(opt_str = c("-o", "--output"), default = "reduced.bed", help = "Output file. Default = %default", metavar = "character"), - make_option(opt_str = c("-t", "--threads"), default = 1, help = "Number of threads to use. Use -1 for all available cores. Default = %default", metavar = "integer"), + make_option(opt_str = c("-t", "--threads"), default = 1, help = "Number of threads to use. Use 0 for all available cores. Default = %default", metavar = "integer"), make_option(opt_str = c("-c", "--clean"), default = TRUE, help = "Delete all temporary files. Default = %default", metavar = "logical"), - make_option(opt_str = c("-s", "--min_seq_length"), default = NULL, help = "Remove sequences below this length. Defaults to motif length.", metavar = "integer", type = "integer") - # TODO more args + make_option(opt_str = c("-s", "--min_seq_length"), default = NULL, help = "Remove sequences below this length. Defaults to the maximum value of motif and kmer and can not be lower.", metavar = "integer", type = "integer"), + make_option(opt_str = c("-n", "--minoverlap_kmer"), default = NULL, help = "Minimum required overlap between kmer to merge kmer. Used to create reduced sequence ranges. Can not be greater than kmer length. Default = kmer - 1", metavar = "integer", type = "integer"), + make_option(opt_str = c("-v", "--minoverlap_motif"), default = NULL, help = "Minimum required overlap between motif and kmer to consider kmer significant. Used for kmer cutoff calculation. Can not be greater than motif and kmer length. Default = ceiling(motif / 2)", metavar = "integer", type = "integer"), + make_option(opt_str = c("-f", "--motif_occurence"), default = 1, help = "Number of motifs per sequence any value above 0. Default = %default.", metavar = "double") ) opt_parser <- OptionParser(option_list = option_list, @@ -23,17 +25,18 @@ opt <- parse_args(opt_parser) #' @param kmer Length of kmer. #' @param motif Estimated motif length. #' @param output Output file -#' @param threads Number of threads. +#' @param threads Number of threads. Default = 1. 0 for all cores. #' @param clean Delete all temporary files. -#' @param minoverlap_kmer Minimum required overlap between kmers. # TODO -#' @param minoverlap_motif Minimum required overlap between motif and kmer. # TODO -#' @param min_seq_length Must be smaller or equal to kmer and motif. Default = motif. +#' @param minoverlap_kmer Minimum required overlap between kmer to merge kmer. Used to create reduced sequence ranges. Can not be greater than kmer length. Default = kmer - 1 +#' @param minoverlap_motif Minimum required overlap between motif and kmer to consider kmer significant. Used for kmer cutoff calculation. Can not be greater than motif and kmer length. Default = ceiling(motif / 2) +#' @param min_seq_length Must be greater or equal to kmer and motif. Default = max(c(motif, kmer)). +#' @param motif_occurence Define how many motifs are expected per sequence. This value is used during kmer cutoff calculation. Default = 1 meaning that there should be approximately one motif per sequence. #' #' @return reduced bed #' TODO check whether jellyfish is installed -reduce_bed <- function(input, kmer = 10, motif = 10, output = "reduced.bed", threads = NULL, clean = TRUE, minoverlap_kmer, minoverlap_motif, min_seq_length = motif) { +reduce_bed <- function(input, kmer = 10, motif = 10, output = "reduced.bed", threads = NULL, clean = TRUE, minoverlap_kmer = kmer - 1, minoverlap_motif = ceiling(motif / 2), min_seq_length = max(c(motif, kmer)), motif_occurence = 1) { # get number of available cores - if (threads == -1) { + if (threads == 0) { threads <- parallel::detectCores() } @@ -53,6 +56,10 @@ reduce_bed <- function(input, kmer = 10, motif = 10, output = "reduced.bed", thr } # remove sequences below minimum length + if (min_seq_length < max(c(kmer, motif))) { + stop("Minimum sequence length must be greater or equal to ", max(c(motif, kmer)), " (maximum value of kmer and motif).") + } + total_rows <- nrow(bed_table) bed_table <- bed_table[nchar(sequence) > min_seq_length] if (total_rows > nrow(bed_table)) { @@ -87,16 +94,15 @@ reduce_bed <- function(input, kmer = 10, motif = 10, output = "reduced.bed", thr data.table::setorder(kmer_counts, -V2) # compute number of hits to keep - keep_hits <- significant_kmer(bed_table, kmer = kmer, motif = motif) + keep_hits <- significant_kmer(bed_table, kmer = kmer, motif = motif, minoverlap = minoverlap_motif, motif_occurence = motif_occurence) # reduce kmer - reduced_kmer <- reduce_kmer(kmer = kmer_counts, keep_hits) + reduced_kmer <- reduce_kmer(kmer = kmer_counts, significant = keep_hits) message("Find kmer in sequences.") # find k-mer in sequences - # TODO minoverlap as parameter # columns: name, start, end, width - ranges_table <- find_kmer_regions(bed = bed_table, kmer_counts = reduced_kmer, minoverlap = kmer - 1, threads = threads) + ranges_table <- find_kmer_regions(bed = bed_table, kmer_counts = reduced_kmer, minoverlap = minoverlap_kmer, threads = threads) names(ranges_table)[1:2] <- c("relative_start", "relative_end") # merge ranged_table with bed_table + keep column order @@ -125,17 +131,21 @@ reduce_bed <- function(input, kmer = 10, motif = 10, output = "reduced.bed", thr data.table::fwrite(merged, file = output, sep = "\t", col.names = FALSE) } -#' returns sum of top x kmer frequencies to keep +#' Predict how many interesting kmer are possible for the given data. #' #' @param bed Bed table with sequences in last column #' @param kmer Length of kmer #' @param motif Length of motif #' @param minoverlap Minimum number of bases overlapping between kmer and motif. Must be <= motif & <= kmer. Defaults to ceiling(motif / 2). +#' @param motif_occurence Define how many motifs are expected per sequence. Default = 1 #' #' @return Number of interesting kmer. -significant_kmer <- function(bed, kmer, motif, minoverlap = ceiling(motif / 2)) { +significant_kmer <- function(bed, kmer, motif, minoverlap = ceiling(motif / 2), motif_occurence = 1) { if (minoverlap > kmer || minoverlap > motif) { - stop("Kmer & motif must be greater or equal than minoverlap!") + stop("Kmer & motif must be greater or equal to minoverlap!") + } + if (motif_occurence <= 0) { + stop("Motif_occurence must be a numeric value above 0!") } # minimum sequence length to get all interesting overlaps @@ -149,12 +159,18 @@ significant_kmer <- function(bed, kmer, motif, minoverlap = ceiling(motif / 2)) # calculate max possible kmer topx <- sum(seq_lengths - kmer + 1) - return(topx) + return(topx * motif_occurence) } -#' @param kmer Kmer table -#' @param significant +#' Orders kmer table after count descending and keeps all kmer with a cumulative sum below the given significance threshold. +#' +#' @param kmer Kmer data.table columns: kmer, count +#' @param significant Value from significant_kmer function. +#' +#' @return reduced data.table reduce_kmer <- function(kmer, significant) { + data.table::setorderv(kmer, cols = names(kmer)[2], order = -1) + kmer[, cumsum := cumsum(V2)] return(kmer[cumsum <= significant]) @@ -168,6 +184,8 @@ reduce_kmer <- function(kmer, significant) { #' @param threads Number of threads. #' #' @return Data.table with relative positions and width (start, end, width). +#' +#' TODO Include number of motifs per sequence (aka motif_occurence). Attempt to keep best 2 regions for occurence = 2? Probably high impact on performance. find_kmer_regions <- function(bed, kmer_counts, minoverlap = 1 , threads = NULL) { if (nchar(kmer_counts[1, 1]) <= minoverlap) { stop("Minoverlap must be smaller than kmer length!") @@ -178,6 +196,7 @@ find_kmer_regions <- function(bed, kmer_counts, minoverlap = 1 , threads = NULL) seq_ranges <- pbapply::pblapply(seq_len(nrow(bed)), cl = threads, function(x) { seq <- bed[x][[ncol(bed)]] + name <- bed[x][[4]] #### locate ranges ranges <- data.table::data.table(do.call(rbind, stringi::stri_locate_all_fixed(seq, pattern = kmer_counts[[1]]))) @@ -185,7 +204,7 @@ find_kmer_regions <- function(bed, kmer_counts, minoverlap = 1 , threads = NULL) ranges <- na.omit(ranges, cols = c("start", "end")) if (nrow(ranges) < 1) { - return(data.table::data.table(start = NA, end = NA, width = NA)) + return(data.table::data.table(start = NA, end = NA, width = NA, name = name)) } # add kmer sequences @@ -219,14 +238,13 @@ find_kmer_regions <- function(bed, kmer_counts, minoverlap = 1 , threads = NULL) # reduce selected ranges reduced_ranges <- IRanges::reduce(reduced_ranges[selected_ranges]) - reduced_ranges <- data.table::as.data.table(reduced_ranges) + reduced_ranges <- data.table::as.data.table(reduced_ranges)[, name := name] return(reduced_ranges) }) # create ranges table conserved_regions_table <- data.table::rbindlist(seq_ranges) - conserved_regions_table[, name := bed[[4]]] return(conserved_regions_table) } diff --git a/config/cluster.config b/config/cluster.config index 0a21a05..1b444e2 100644 --- a/config/cluster.config +++ b/config/cluster.config @@ -1,6 +1,15 @@ params{ - sequence_coverage=8 - + //reduce_bed kmer=10 aprox_motif_len=10 + motif_occurence=1 + min_seq_length=10 + + //cdhit_wrapper + global=0 + identity=0.8 + sequence_coverage=8 + memory=800 + throw_away_seq=9 + strand=0 } diff --git a/pipeline.nf b/pipeline.nf index e694ba0..f12edad 100644 --- a/pipeline.nf +++ b/pipeline.nf @@ -23,10 +23,19 @@ params.config="" params.max_size_fp=100 //clustering - params.sequence_coverage=8 - - params.kmer=10 - params.aprox_motif_len=10 + //reduce_bed + params.kmer=10 + params.aprox_motif_len=10 + params.motif_occurence=1 + params.min_seq_length=10 + + //cdhit_wrapper + params.global=0 + params.identity=0.8 + params.sequence_coverage=8 + params.memory=800 + params.throw_away_seq=9 + params.strand=0 //motif_estimation //bed_to_clustered_fasta @@ -67,9 +76,19 @@ Optional arguments: --max_size_fp INT (Default: 100) Clustering: - --sequence_coverage INT (Default: 8) - --kmer INT (Default: 10) - --aprox_motif_len INT (Default: 10) + Sequence preparation/ reduction: + --kmer INT Kmer length (Default: 10) + --aprox_motif_len INT Motif length (Default: 10) + --motif_occurence FLOAT Percentage of motifs over all sequences. Use 1 (Default) to assume every sequence contains a motif. + --min_seq_length INT Remove all sequences below this value. (Default: 10) + + Clustering: + --global INT Global (=1) or local (=0) alignment. (Default: 0) + --identity FLOAT Identity threshold. (Default: 0.8) + --sequence_coverage INT Minimum aligned nucleotides on both sequences. (Default: 8) + --memory INT Memory limit in MB. 0 for unlimited. (Default: 800) + --throw_away_seq INT Remove all sequences equal or below this length before clustering. (Default: 9) + --strand INT Align +/+ & +/- (= 1). Or align only +/+ (= 0). (Default: 0) Motif estimation: --motif_min_len INT Minimum length of Motif (Default: 8) @@ -157,7 +176,7 @@ process reduce_bed { script: """ - Rscript ${path_bin}/reduce_bed.R -i ${bed} -k ${params.kmer} -m ${params.aprox_motif_len} -o ${name}_reduced.bed -t ${params.threads} + Rscript ${path_bin}/reduce_bed.R -i ${bed} -k ${params.kmer} -m ${params.aprox_motif_len} -o ${name}_reduced.bed -t ${params.threads} -f ${params.motif_occurence} -s ${params.min_seq_length} """ } @@ -173,7 +192,7 @@ process clustering { script: """ - Rscript ${path_bin}/cdhit_wrapper.R -i ${bed} -A ${params.sequence_coverage} -o ${name}_clusterd.bed + Rscript ${path_bin}/cdhit_wrapper.R -i ${bed} -A ${params.sequence_coverage} -o ${name}_clusterd.bed -c ${params.identity} -G ${params.global} -M ${params.memory} -l ${params.throw_away_seq} -r ${params.strand} -T ${params.threads} """ }