From 98985d13512ede1d8c352207ea002d84522a2766 Mon Sep 17 00:00:00 2001 From: Schultheis Date: Wed, 19 Dec 2018 13:47:43 +0100 Subject: [PATCH 01/14] refactoring; renamed reduce_bed to reduce_sequence --- bin/{reduce_bed.R => reduce_sequence.R} | 38 ++++++++++++++----------- 1 file changed, 21 insertions(+), 17 deletions(-) rename bin/{reduce_bed.R => reduce_sequence.R} (84%) diff --git a/bin/reduce_bed.R b/bin/reduce_sequence.R similarity index 84% rename from bin/reduce_bed.R rename to bin/reduce_sequence.R index 2f8e8a2..586b074 100644 --- a/bin/reduce_bed.R +++ b/bin/reduce_sequence.R @@ -9,32 +9,35 @@ option_list <- list( make_option(opt_str = c("-t", "--threads"), default = 1, help = "Number of threads to use. Use 0 for all available cores. Default = %default", metavar = "integer"), make_option(opt_str = c("-c", "--clean"), default = TRUE, help = "Delete all temporary files. Default = %default", metavar = "logical"), make_option(opt_str = c("-s", "--min_seq_length"), default = NULL, help = "Remove sequences below this length. Defaults to the maximum value of motif and kmer and can not be lower.", metavar = "integer", type = "integer"), - make_option(opt_str = c("-n", "--minoverlap_kmer"), default = NULL, help = "Minimum required overlap between kmer to merge kmer. Used to create reduced sequence ranges. Can not be greater than kmer length. Default = kmer - 1", metavar = "integer", type = "integer"), + make_option(opt_str = c("-n", "--minoverlap_kmer"), default = NULL, help = "Minimum required overlap between kmer. Used to create reduced sequence ranges out of merged kmer. Can not be greater than kmer length. Default = kmer - 1", metavar = "integer", type = "integer"), make_option(opt_str = c("-v", "--minoverlap_motif"), default = NULL, help = "Minimum required overlap between motif and kmer to consider kmer significant. Used for kmer cutoff calculation. Can not be greater than motif and kmer length. Default = ceiling(motif / 2)", metavar = "integer", type = "integer"), - make_option(opt_str = c("-f", "--motif_occurence"), default = 1, help = "Number of motifs per sequence any value above 0. Default = %default.", metavar = "double") + make_option(opt_str = c("-f", "--motif_occurence"), default = 1, help = "Define how many motifs are expected per sequence. This value is used during kmer cutoff calculation. Default = %default meaning that there should be approximately one motif per sequence.", metavar = "double") ) opt_parser <- OptionParser(option_list = option_list, - description = "Reduce sequences to frequent regions.") + description = "Reduces each sequence to its most frequent region.") opt <- parse_args(opt_parser) -#' Reduce bed file to conserved regions +#' Reduces each sequence to its most frequent region. #' -#' @param input bed file -#' @param kmer Length of kmer. -#' @param motif Estimated motif length. -#' @param output Output file -#' @param threads Number of threads. Default = 1. 0 for all cores. +#' @param input Input bed-file. Last column must be sequences. +#' @param kmer Kmer length. Default = 10 +#' @param motif Estimated motif length. Default = 10 +#' @param output Output file. Default = reduced.bed +#' @param threads Number of threads to use. Default = 1. Use 0 for all cores. #' @param clean Delete all temporary files. -#' @param minoverlap_kmer Minimum required overlap between kmer to merge kmer. Used to create reduced sequence ranges. Can not be greater than kmer length. Default = kmer - 1 +#' @param minoverlap_kmer Minimum required overlap between kmer. Used to create reduced sequence ranges out of merged kmer. Can not be greater than kmer length . Default = kmer - 1 #' @param minoverlap_motif Minimum required overlap between motif and kmer to consider kmer significant. Used for kmer cutoff calculation. Can not be greater than motif and kmer length. Default = ceiling(motif / 2) -#' @param min_seq_length Must be greater or equal to kmer and motif. Default = max(c(motif, kmer)). +#' @param min_seq_length Remove sequences below this length. Defaults to the maximum value of motif and kmer and can not be lower. #' @param motif_occurence Define how many motifs are expected per sequence. This value is used during kmer cutoff calculation. Default = 1 meaning that there should be approximately one motif per sequence. #' -#' @return reduced bed #' TODO check whether jellyfish is installed -reduce_bed <- function(input, kmer = 10, motif = 10, output = "reduced.bed", threads = NULL, clean = TRUE, minoverlap_kmer = kmer - 1, minoverlap_motif = ceiling(motif / 2), min_seq_length = max(c(motif, kmer)), motif_occurence = 1) { +reduce_sequence <- function(input, kmer = 10, motif = 10, output = "reduced.bed", threads = NULL, clean = TRUE, minoverlap_kmer = kmer - 1, minoverlap_motif = ceiling(motif / 2), min_seq_length = max(c(motif, kmer)), motif_occurence = 1) { + if (missing(input)) { + stop("No input specified! Please forward a valid bed-file.") + } + # get number of available cores if (threads == 0) { threads <- parallel::detectCores() @@ -117,12 +120,12 @@ reduce_bed <- function(input, kmer = 10, motif = 10, output = "reduced.bed", thr merged[, sequence := stringr::str_sub(sequence, relative_start, relative_end)] # bed files count from 0 - merged[, `:=`(relative_start = relative_start - 1, relative_end = relative_end - 1)] + merged[, data.table::`:=`(relative_start = relative_start - 1, relative_end = relative_end - 1)] # change start end location - merged[, `:=`(start = start + relative_start, end = start + relative_end)] + merged[, data.table::`:=`(start = start + relative_start, end = start + relative_end)] # clean table - merged[, `:=`(relative_start = NULL, relative_end = NULL, width = NULL)] + merged[, data.table::`:=`(relative_start = NULL, relative_end = NULL, width = NULL)] if (clean) { file.remove(fasta_file, count_output_binary, mer_count_table) @@ -171,6 +174,7 @@ significant_kmer <- function(bed, kmer, motif, minoverlap = ceiling(motif / 2), reduce_kmer <- function(kmer, significant) { data.table::setorderv(kmer, cols = names(kmer)[2], order = -1) + # TODO don't use 'V2' kmer[, cumsum := cumsum(V2)] return(kmer[cumsum <= significant]) @@ -255,5 +259,5 @@ if (!interactive()) { pbo <- pbapply::pboptions(type = "timer") # remove last parameter (help param) params <- opt[-length(opt)] - do.call(reduce_bed, args = params) + do.call(reduce_sequence, args = params) } From e0b9d38a9dc5cf978eed2fef436ee7be8ed2b0c9 Mon Sep 17 00:00:00 2001 From: Schultheis Date: Wed, 19 Dec 2018 14:02:54 +0100 Subject: [PATCH 02/14] check whether jellyfish is installed --- bin/reduce_sequence.R | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/bin/reduce_sequence.R b/bin/reduce_sequence.R index 586b074..69fc764 100644 --- a/bin/reduce_sequence.R +++ b/bin/reduce_sequence.R @@ -32,8 +32,11 @@ opt <- parse_args(opt_parser) #' @param min_seq_length Remove sequences below this length. Defaults to the maximum value of motif and kmer and can not be lower. #' @param motif_occurence Define how many motifs are expected per sequence. This value is used during kmer cutoff calculation. Default = 1 meaning that there should be approximately one motif per sequence. #' -#' TODO check whether jellyfish is installed reduce_sequence <- function(input, kmer = 10, motif = 10, output = "reduced.bed", threads = NULL, clean = TRUE, minoverlap_kmer = kmer - 1, minoverlap_motif = ceiling(motif / 2), min_seq_length = max(c(motif, kmer)), motif_occurence = 1) { + if (system("jellyfish count --version") != 0) { + stop("Required programm jellyfish not found! Please check whether it is installed.") + } + if (missing(input)) { stop("No input specified! Please forward a valid bed-file.") } From 17308687327a8cd17e0b7af3916a58073152f344 Mon Sep 17 00:00:00 2001 From: Schultheis Date: Wed, 19 Dec 2018 14:06:13 +0100 Subject: [PATCH 03/14] reduce_bed renamed to reduce_sequence --- pipeline.nf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pipeline.nf b/pipeline.nf index a39616a..db3bbdf 100644 --- a/pipeline.nf +++ b/pipeline.nf @@ -21,7 +21,7 @@ params.max_size_fp=100 //clustering - //reduce_bed + //reduce_sequence params.kmer=10 params.aprox_motif_len=10 params.motif_occurence=1 @@ -268,7 +268,7 @@ process overlap_with_known_TFBS { /* */ -process reduce_bed { +process reduce_sequence { conda "${path_env}" echo true publishDir "${params.out}/cluster/reduced_bed/", mode: 'copy' @@ -281,7 +281,7 @@ process reduce_bed { script: """ - Rscript ${path_bin}/reduce_bed.R -i ${bed} -k ${params.kmer} -m ${params.aprox_motif_len} -o ${name}_reduced.bed -t ${params.threads} -f ${params.motif_occurence} -s ${params.min_seq_length} + Rscript ${path_bin}/reduce_sequence.R -i ${bed} -k ${params.kmer} -m ${params.aprox_motif_len} -o ${name}_reduced.bed -t ${params.threads} -f ${params.motif_occurence} -s ${params.min_seq_length} """ } From 88fa2980f9b7345bf02c65d0539c88f019dc7a42 Mon Sep 17 00:00:00 2001 From: Schultheis Date: Wed, 19 Dec 2018 15:03:38 +0100 Subject: [PATCH 04/14] check whether jellyfish is installed --- bin/reduce_sequence.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/reduce_sequence.R b/bin/reduce_sequence.R index 69fc764..d13f899 100644 --- a/bin/reduce_sequence.R +++ b/bin/reduce_sequence.R @@ -33,8 +33,8 @@ opt <- parse_args(opt_parser) #' @param motif_occurence Define how many motifs are expected per sequence. This value is used during kmer cutoff calculation. Default = 1 meaning that there should be approximately one motif per sequence. #' reduce_sequence <- function(input, kmer = 10, motif = 10, output = "reduced.bed", threads = NULL, clean = TRUE, minoverlap_kmer = kmer - 1, minoverlap_motif = ceiling(motif / 2), min_seq_length = max(c(motif, kmer)), motif_occurence = 1) { - if (system("jellyfish count --version") != 0) { - stop("Required programm jellyfish not found! Please check whether it is installed.") + if (system("which jellyfish", ignore.stdout = TRUE) != 0) { + stop("Required program jellyfish not found! Please check whether it is installed.") } if (missing(input)) { From e17d1dbbe25b8604e2ab5ffe1c18ee8d1c0a81a9 Mon Sep 17 00:00:00 2001 From: Schultheis Date: Wed, 19 Dec 2018 15:03:56 +0100 Subject: [PATCH 05/14] check whether cdhit is installed --- bin/cdhit_wrapper.R | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/bin/cdhit_wrapper.R b/bin/cdhit_wrapper.R index e5ed3e4..cf53ffc 100644 --- a/bin/cdhit_wrapper.R +++ b/bin/cdhit_wrapper.R @@ -70,6 +70,10 @@ opt <- parse_args(opt_parser) #' #' TODO check whether cdhit is installed cdhitest <- function(input, identity = 0.8, coverage = 8, output = "cluster.bed", clean = TRUE, threads = 1, global = 0, band_width = 20, memory = 800, word_length = 3, throw_away_sequences = 5, length_dif_cutoff_shorter_p = 0, length_dif_cutoff_shorter_n = 999999, alignment_coverage_longer_p = 0, alignment_coverage_longer_n = 99999999, alignment_coverage_shorter_p = 0, alignment_coverage_shorter_n = 99999999, max_unmatched_longer_p = 1, max_unmatched_shorter_p = 1, max_unmatched_both_n = 99999999, fast_cluster = 1, strand = 0, match = 2, mismatch = -2, gap = -6, gap_ext = -1, sort_cluster_by_size = 1) { + if (system("which cd-hit-est", ignore.stdout = FALSE) != 0) { + stop("Required program CD-HIT not found! Please check whether it is installed.") + } + if (missing(input)) { stop("Input parameter missing!") } From dcd185e80177a6fc873aaaf9c0347f1e7c68769c Mon Sep 17 00:00:00 2001 From: Schultheis Date: Wed, 19 Dec 2018 16:20:54 +0100 Subject: [PATCH 06/14] omit TODO --- bin/cdhit_wrapper.R | 1 - 1 file changed, 1 deletion(-) diff --git a/bin/cdhit_wrapper.R b/bin/cdhit_wrapper.R index cf53ffc..71bf881 100644 --- a/bin/cdhit_wrapper.R +++ b/bin/cdhit_wrapper.R @@ -68,7 +68,6 @@ opt <- parse_args(opt_parser) #' @param gat_ext Gap extension score. Default = -1 (CD-HIT parameter) #' @param sort_cluster_by_size Either sort cluster by decreasing length (= 0) or by decreasing size (= 1). Default = 1 (CD-HIT parameter) #' -#' TODO check whether cdhit is installed cdhitest <- function(input, identity = 0.8, coverage = 8, output = "cluster.bed", clean = TRUE, threads = 1, global = 0, band_width = 20, memory = 800, word_length = 3, throw_away_sequences = 5, length_dif_cutoff_shorter_p = 0, length_dif_cutoff_shorter_n = 999999, alignment_coverage_longer_p = 0, alignment_coverage_longer_n = 99999999, alignment_coverage_shorter_p = 0, alignment_coverage_shorter_n = 99999999, max_unmatched_longer_p = 1, max_unmatched_shorter_p = 1, max_unmatched_both_n = 99999999, fast_cluster = 1, strand = 0, match = 2, mismatch = -2, gap = -6, gap_ext = -1, sort_cluster_by_size = 1) { if (system("which cd-hit-est", ignore.stdout = FALSE) != 0) { stop("Required program CD-HIT not found! Please check whether it is installed.") From 4c16f6f796238619700711f5030c5ae983e8e74a Mon Sep 17 00:00:00 2001 From: Schultheis Date: Wed, 19 Dec 2018 16:55:13 +0100 Subject: [PATCH 07/14] check for header and forward it if provided --- bin/cdhit_wrapper.R | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/bin/cdhit_wrapper.R b/bin/cdhit_wrapper.R index 71bf881..c5f7230 100644 --- a/bin/cdhit_wrapper.R +++ b/bin/cdhit_wrapper.R @@ -68,6 +68,8 @@ opt <- parse_args(opt_parser) #' @param gat_ext Gap extension score. Default = -1 (CD-HIT parameter) #' @param sort_cluster_by_size Either sort cluster by decreasing length (= 0) or by decreasing size (= 1). Default = 1 (CD-HIT parameter) #' +#' @details If there is a header supplied other then the default data.table nameing scheme ('V1', 'V2', etc.) it will be kept and extended. +#' cdhitest <- function(input, identity = 0.8, coverage = 8, output = "cluster.bed", clean = TRUE, threads = 1, global = 0, band_width = 20, memory = 800, word_length = 3, throw_away_sequences = 5, length_dif_cutoff_shorter_p = 0, length_dif_cutoff_shorter_n = 999999, alignment_coverage_longer_p = 0, alignment_coverage_longer_n = 99999999, alignment_coverage_shorter_p = 0, alignment_coverage_shorter_n = 99999999, max_unmatched_longer_p = 1, max_unmatched_shorter_p = 1, max_unmatched_both_n = 99999999, fast_cluster = 1, strand = 0, match = 2, mismatch = -2, gap = -6, gap_ext = -1, sort_cluster_by_size = 1) { if (system("which cd-hit-est", ignore.stdout = FALSE) != 0) { stop("Required program CD-HIT not found! Please check whether it is installed.") @@ -80,15 +82,19 @@ cdhitest <- function(input, identity = 0.8, coverage = 8, output = "cluster.bed" message("Loading bed.") # load bed if neccessary if (!data.table::is.data.table(input)) { - bed_table <- data.table::fread(input = input, header = FALSE) + bed_table <- data.table::fread(input = input) } else { bed_table <- input } + # check if there are column names to keep them + default_col_names <- grepl(pattern = "^V+\\d$", names(bed_table), perl = TRUE) + keep_col_names <- ifelse(any(default_col_names), FALSE, TRUE) + # check for duplicated names - if (anyDuplicated(bed_table[, "V4"])) { + if (anyDuplicated(bed_table[, 4])) { warning("Found duplicated names. Making names unique.") - bed_table[, V4 := make.unique(V4)] + bed_table[, 4 := make.unique(names(bed_table)[4])] } message("Convert to fasta.") @@ -140,17 +146,18 @@ cdhitest <- function(input, identity = 0.8, coverage = 8, output = "cluster.bed" system(command = cluster_call, wait = TRUE) # load reformated file - cluster <- data.table::fread(cluster_file) + cluster <- data.table::fread(cluster_file)[, c("id", "clstr")] + names(cluster) <- c("id", "cluster") ### add cluster to bed_table - result <- merge(x = bed_table, y = cluster[, c("id", "clstr")], by.x = "V4", by.y = "id", sort = FALSE)[, union(names(bed_table), names(cluster)[2]), with = FALSE] + result <- merge(x = bed_table, y = cluster, by.x = names(bed_table)[4], by.y = "id", sort = FALSE)[, union(names(bed_table), names(cluster)[2]), with = FALSE] # delete files if (clean) { file.remove(fasta_file, paste0(cdhit_output, ".clstr"), cdhit_output, cluster_file) } - data.table::fwrite(x = result, file = output, sep = "\t", col.names = FALSE) + data.table::fwrite(x = result, file = output, sep = "\t", col.names = keep_col_names) } # call function with given parameter if not in interactive context (e.g. run from shell) From 5a7c84eb969aed0d111ba320a0b15310c275508d Mon Sep 17 00:00:00 2001 From: Schultheis Date: Wed, 19 Dec 2018 17:11:46 +0100 Subject: [PATCH 08/14] automatically detect and keep column names if provided --- bin/reduce_sequence.R | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/bin/reduce_sequence.R b/bin/reduce_sequence.R index d13f899..f9e8340 100644 --- a/bin/reduce_sequence.R +++ b/bin/reduce_sequence.R @@ -32,6 +32,8 @@ opt <- parse_args(opt_parser) #' @param min_seq_length Remove sequences below this length. Defaults to the maximum value of motif and kmer and can not be lower. #' @param motif_occurence Define how many motifs are expected per sequence. This value is used during kmer cutoff calculation. Default = 1 meaning that there should be approximately one motif per sequence. #' +#' @details If there is a header supplied other then the default data.table nameing scheme ('V1', 'V2', etc.) it will be kept. +#' reduce_sequence <- function(input, kmer = 10, motif = 10, output = "reduced.bed", threads = NULL, clean = TRUE, minoverlap_kmer = kmer - 1, minoverlap_motif = ceiling(motif / 2), min_seq_length = max(c(motif, kmer)), motif_occurence = 1) { if (system("which jellyfish", ignore.stdout = TRUE) != 0) { stop("Required program jellyfish not found! Please check whether it is installed.") @@ -49,7 +51,17 @@ reduce_sequence <- function(input, kmer = 10, motif = 10, output = "reduced.bed" message("Loading bed...") # load bed # columns: chr, start, end, name, ..., sequence - bed_table <- data.table::fread(input = input, header = FALSE) + bed_table <- data.table::fread(input = input) + + # check for header and save it if provided + default_col_names <- grepl(pattern = "^V+\\d$", names(bed_table), perl = TRUE) + if (!any(default_col_names)) { + keep_col_names <- TRUE + col_names <- names(bed_table) + } else { + keep_col_names <- FALSE + } + names(bed_table)[1:4] <- c("chr", "start", "end", "name") names(bed_table)[ncol(bed_table)] <- "sequence" # index @@ -123,18 +135,23 @@ reduce_sequence <- function(input, kmer = 10, motif = 10, output = "reduced.bed" merged[, sequence := stringr::str_sub(sequence, relative_start, relative_end)] # bed files count from 0 - merged[, data.table::`:=`(relative_start = relative_start - 1, relative_end = relative_end - 1)] + merged[, `:=`(relative_start = relative_start - 1, relative_end = relative_end - 1)] # change start end location - merged[, data.table::`:=`(start = start + relative_start, end = start + relative_end)] + merged[, `:=`(start = start + relative_start, end = start + relative_end)] # clean table - merged[, data.table::`:=`(relative_start = NULL, relative_end = NULL, width = NULL)] + merged[, `:=`(relative_start = NULL, relative_end = NULL, width = NULL)] if (clean) { file.remove(fasta_file, count_output_binary, mer_count_table) } - data.table::fwrite(merged, file = output, sep = "\t", col.names = FALSE) + # keep provided column names + if (keep_col_names) { + names(merged) <- col_names + } + + data.table::fwrite(merged, file = output, sep = "\t", col.names = keep_col_names) } #' Predict how many interesting kmer are possible for the given data. From 97464ca036c2f0830af7b2497e3c7d8f72e94e47 Mon Sep 17 00:00:00 2001 From: Schultheis Date: Wed, 19 Dec 2018 17:17:12 +0100 Subject: [PATCH 09/14] added author; better missing input error --- bin/cdhit_wrapper.R | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/bin/cdhit_wrapper.R b/bin/cdhit_wrapper.R index c5f7230..fcbd5eb 100644 --- a/bin/cdhit_wrapper.R +++ b/bin/cdhit_wrapper.R @@ -33,7 +33,8 @@ option_list <- list( ) opt_parser <- OptionParser(option_list = option_list, - description = "CD-HIT-EST Wrapper function. See https://github.com/weizhongli/cdhit/wiki for more information regarding CD-HITs parameters.") + description = "CD-HIT-EST Wrapper function. See https://github.com/weizhongli/cdhit/wiki for more information regarding CD-HITs parameters.", + epilogue = "Author: Hendrik Schultheis ") opt <- parse_args(opt_parser) @@ -76,7 +77,7 @@ cdhitest <- function(input, identity = 0.8, coverage = 8, output = "cluster.bed" } if (missing(input)) { - stop("Input parameter missing!") + stop("No input specified! Please forward a valid bed-file.") } message("Loading bed.") From cc532bf00dc6d0a967e0fced688757f7f67a0fb1 Mon Sep 17 00:00:00 2001 From: Schultheis Date: Wed, 19 Dec 2018 17:17:29 +0100 Subject: [PATCH 10/14] added author --- bin/reduce_sequence.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bin/reduce_sequence.R b/bin/reduce_sequence.R index f9e8340..6df5bdc 100644 --- a/bin/reduce_sequence.R +++ b/bin/reduce_sequence.R @@ -15,7 +15,8 @@ option_list <- list( ) opt_parser <- OptionParser(option_list = option_list, - description = "Reduces each sequence to its most frequent region.") + description = "Reduces each sequence to its most frequent region.", + epilogue = "Author: Hendrik Schultheis ") opt <- parse_args(opt_parser) From d60faa704e974a4b66af3bfb582268e4817397d7 Mon Sep 17 00:00:00 2001 From: Schultheis Date: Thu, 20 Dec 2018 13:21:35 +0100 Subject: [PATCH 11/14] spell check --- bin/cdhit_wrapper.R | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/bin/cdhit_wrapper.R b/bin/cdhit_wrapper.R index fcbd5eb..38b863a 100644 --- a/bin/cdhit_wrapper.R +++ b/bin/cdhit_wrapper.R @@ -4,7 +4,7 @@ library("optparse") option_list <- list( make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input bed-file. Fourth column is expected to contain names, last column must be sequences.", metavar = "character"), make_option(opt_str = c("-c", "--identity"), default = 0.8, help = "Identity threshold. Default = %default (CD-HIT parameter)", metavar = "double >= 0.8"), - make_option(opt_str = c("-A", "--coverage"), default = 8, help = "Minimal alignment length for both sequences in nucelotides. Default = %default (CD-HIT parameter)", metavar = "integer"), + make_option(opt_str = c("-A", "--coverage"), default = 8, help = "Minimal alignment length for both sequences in nucleotides. Default = %default (CD-HIT parameter)", metavar = "integer"), make_option(opt_str = c("-o", "--output"), default = "cluster.bed", help = "Output file same as input but with appended column of cluster numbers. Default = %default", metavar = "character"), make_option(opt_str = c("--clean"), default = TRUE, help = "Delete all temporary files. Default = %default", metavar = "logical"), make_option(opt_str = c("-G", "--global"), default = 0, help = "Global sequence identity = 1, local = 0. Default = %default (CD-HIT parameter)", metavar = "integer"), @@ -14,7 +14,7 @@ option_list <- list( make_option(opt_str = c("-n", "--word_length"), default = 3, help = "Default = %default (CD-HIT parameter)", metavar = "integer"), make_option(opt_str = c("-l", "--throw_away_sequences"), default = 5, help = "Maximum length of sequences thrown away. Default = %default (CD-HIT parameter)", metavar = "integer"), # make_option(opt_str = c("-d", "--description")), # can not produce bed if this is != 0 - make_option(opt_str = c("-s", "--length_dif_cutoff_shorter_p"), default = 0.0, help = "Shorter sequences length must be at least x percent of longer sequenecs length. Default = %default (CD-HIT parameter)", metavar = "double"), + make_option(opt_str = c("-s", "--length_dif_cutoff_shorter_p"), default = 0.0, help = "Shorter sequences length must be at least x percent of longer sequences length. Default = %default (CD-HIT parameter)", metavar = "double"), make_option(opt_str = c("-S", "--length_dif_cutoff_shorter_n"), default = 999999, help = "Length difference between sequences can not be higher than x nucleotides. Default = %default (CD-HIT parameter)", metavar = "integer"), make_option(opt_str = c("-e", "--alignment_coverage_longer_p"), default = 0.0, help = "Alignment must cover x percent of longer sequence. Default = %default (CD-HIT parameter: -aL)", metavar = "double"), make_option(opt_str = c("-E", "--alignment_coverage_longer_n"), default = 99999999, help = "There can not be more than x unaligned nucleotides on the longer sequence. Default = %default (CD-HIT parameter: -AL)", metavar = "integer"), @@ -22,7 +22,7 @@ option_list <- list( make_option(opt_str = c("-F", "--alignment_coverage_shorter_n"), default = 99999999, help = "There can not be more than x unaligned nucleotides on the longer sequence. Default = %default (CD-HIT parameter: -AS)", metavar = "integer"), make_option(opt_str = c("-w", "--max_unmatched_longer_p"), default = 1.0, help = "Maximum unmatched percentage on longer sequence (excludes leading tailing gap). Default = %default (CD-HIT parameter: -uL)", metavar = "double"), make_option(opt_str = c("-W", "--max_unmatched_shorter_p"), default = 1.0, help = "Maximum unmatched percentage on shorter sequence (excludes leading tailing gap). Default = %default (CD-HIT parameter: -uS)", metavar = "double"), - make_option(opt_str = c("-U", "--max_unmatched_both_n"), default = 99999999, help = "Maximum unmatched nucleotied on both sequences (excludes leading tailing gap). Default = %default (CD-HIT parameter)", metavar = "integer"), + make_option(opt_str = c("-U", "--max_unmatched_both_n"), default = 99999999, help = "Maximum unmatched nucleotide on both sequences (excludes leading tailing gap). Default = %default (CD-HIT parameter)", metavar = "integer"), make_option(opt_str = c("-g", "--fast_cluster"), default = 1, help = "Cluster sequence in first cluster that meets threshold = 0 (fast). Or cluster in best Cluster = 1 (accurate). Default = %default (CD-HIT parameter)", metavar = "integer"), make_option(opt_str = c("-r", "--strand"), default = 0, help = "Align +/+ & +/- (= 1). Or align only +/+ (= 0). Default = %default (CD-HIT parameter)", metavar = "integer"), make_option(opt_str = c("--match"), default = 2, help = "Matching score. Default = %default (CD-HIT parameter)", metavar = "integer"), @@ -43,8 +43,8 @@ opt <- parse_args(opt_parser) #' #' @param input Data.table or file in bed format (requires names in fourth column and sequences in last column). #' @param identity Identity threshold. Default = 0.8 (CD-HIT parameter) -#' @param coverage Minimal alignment length for both sequences in nucelotides. Default = 8 (CD-HIT parameter) -#' @param output Clustered bedfile. Adds cluster number in last column (numbering depend on sort_by_cluster_size parameter). Default = cluster.bed +#' @param coverage Minimal alignment length for both sequences in nucleotides. Default = 8 (CD-HIT parameter) +#' @param output Clustered bed-file. Adds cluster number in last column (numbering depend on sort_by_cluster_size parameter). Default = cluster.bed #' @param clean Clean up after run. Default = TRUE #' @param threads Number of threads to use (0 = all cores). Default = 1 (CD-HIT parameter) #' @param global Global sequence identity = 1, local = 0. Default = 0 (CD-HIT parameter) @@ -52,7 +52,7 @@ opt <- parse_args(opt_parser) #' @param memory Memory limit in MB. 0 for unlimited. Default = 800 (CD-HIT parameter) #' @param word_length Default = 3 (CD-HIT parameter) #' @param throw_away_sequences Maximum length of sequences thrown away. Default = %default (CD-HIT parameter) -#' @param length_dif_cutoff_shorter_p Shorter sequences length must be at least x percent of longer sequenecs length. Default = 0 (CD-HIT parameter) +#' @param length_dif_cutoff_shorter_p Shorter sequences length must be at least x percent of longer sequences length. Default = 0 (CD-HIT parameter) #' @param length_dif_cutoff_shorter_n Length difference between sequences can not be higher than x nucleotides. Default = 999999 (CD-HIT parameter) #' @param alignment_coverage_longer_p Alignment must cover x percent of longer sequence. Default = 0 (CD-HIT parameter) #' @param alignment_coverage_longer_n There can not be more than x unaligned nucleotides on the longer sequence. Default = 99999999 (CD-HIT parameter) @@ -60,16 +60,16 @@ opt <- parse_args(opt_parser) #' @param alignment_coverage_shorter_n There can not be more than x unaligned nucleotides on the longer sequence. Default = 99999999 (CD-HIT parameter) #' @param max_unmatched_longer_p Maximum unmatched percentage on longer sequence (excludes leading tailing gap). Default = 1 (CD-HIT parameter) #' @param max_unmatched_shorter_p Maximum unmatched percentage on shorter sequence (excludes leading tailing gap). Default = 1 (CD-HIT parameter) -#' @param max_unmatched_both_n Maximum unmatched nucleotied on both sequences (excludes leading tailing gap). Default = 99999999 (CD-HIT parameter) +#' @param max_unmatched_both_n Maximum unmatched nucleotide on both sequences (excludes leading tailing gap). Default = 99999999 (CD-HIT parameter) #' @param fast_cluster Cluster sequence in first cluster that meets threshold = 0 (fast). Or cluster in best Cluster = 1 (accurate). Default = 1 (CD-HIT parameter) #' @param strand Align +/+ & +/- (= 1). Or align only +/+ (= 0). Default = 0 (CD-HIT parameter) #' @param match Matching score. Default = 2 (CD-HIT parameter) #' @param mismatch Mismatch score. Default = -2 (CD-HIT parameter) #' @param gap Gap score. Default = -6 (CD-HIT parameter) -#' @param gat_ext Gap extension score. Default = -1 (CD-HIT parameter) +#' @param gap_ext Gap extension score. Default = -1 (CD-HIT parameter) #' @param sort_cluster_by_size Either sort cluster by decreasing length (= 0) or by decreasing size (= 1). Default = 1 (CD-HIT parameter) #' -#' @details If there is a header supplied other then the default data.table nameing scheme ('V1', 'V2', etc.) it will be kept and extended. +#' @details If there is a header supplied other then the default data.table naming scheme ('V1', 'V2', etc.) it will be kept and extended. #' cdhitest <- function(input, identity = 0.8, coverage = 8, output = "cluster.bed", clean = TRUE, threads = 1, global = 0, band_width = 20, memory = 800, word_length = 3, throw_away_sequences = 5, length_dif_cutoff_shorter_p = 0, length_dif_cutoff_shorter_n = 999999, alignment_coverage_longer_p = 0, alignment_coverage_longer_n = 99999999, alignment_coverage_shorter_p = 0, alignment_coverage_shorter_n = 99999999, max_unmatched_longer_p = 1, max_unmatched_shorter_p = 1, max_unmatched_both_n = 99999999, fast_cluster = 1, strand = 0, match = 2, mismatch = -2, gap = -6, gap_ext = -1, sort_cluster_by_size = 1) { if (system("which cd-hit-est", ignore.stdout = FALSE) != 0) { @@ -81,7 +81,7 @@ cdhitest <- function(input, identity = 0.8, coverage = 8, output = "cluster.bed" } message("Loading bed.") - # load bed if neccessary + # load bed if necessary if (!data.table::is.data.table(input)) { bed_table <- data.table::fread(input = input) } else { From 65076439ec5990c384b63fc9c9bc7781428bac86 Mon Sep 17 00:00:00 2001 From: Schultheis Date: Thu, 20 Dec 2018 13:53:09 +0100 Subject: [PATCH 12/14] spell check --- bin/reduce_sequence.R | 76 +++++++++++++++++++++---------------------- 1 file changed, 38 insertions(+), 38 deletions(-) diff --git a/bin/reduce_sequence.R b/bin/reduce_sequence.R index 6df5bdc..cf91187 100644 --- a/bin/reduce_sequence.R +++ b/bin/reduce_sequence.R @@ -3,15 +3,15 @@ library("optparse") option_list <- list( make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input bed-file. Last column must be sequences.", metavar = "character"), - make_option(opt_str = c("-k", "--kmer"), default = 10, help = "Kmer length. Default = %default", metavar = "integer"), + make_option(opt_str = c("-k", "--kmer"), default = 10, help = "k-mer length. Default = %default", metavar = "integer"), make_option(opt_str = c("-m", "--motif"), default = 10, help = "Estimated motif length. Default = %default", metavar = "integer"), make_option(opt_str = c("-o", "--output"), default = "reduced.bed", help = "Output file. Default = %default", metavar = "character"), make_option(opt_str = c("-t", "--threads"), default = 1, help = "Number of threads to use. Use 0 for all available cores. Default = %default", metavar = "integer"), make_option(opt_str = c("-c", "--clean"), default = TRUE, help = "Delete all temporary files. Default = %default", metavar = "logical"), - make_option(opt_str = c("-s", "--min_seq_length"), default = NULL, help = "Remove sequences below this length. Defaults to the maximum value of motif and kmer and can not be lower.", metavar = "integer", type = "integer"), - make_option(opt_str = c("-n", "--minoverlap_kmer"), default = NULL, help = "Minimum required overlap between kmer. Used to create reduced sequence ranges out of merged kmer. Can not be greater than kmer length. Default = kmer - 1", metavar = "integer", type = "integer"), - make_option(opt_str = c("-v", "--minoverlap_motif"), default = NULL, help = "Minimum required overlap between motif and kmer to consider kmer significant. Used for kmer cutoff calculation. Can not be greater than motif and kmer length. Default = ceiling(motif / 2)", metavar = "integer", type = "integer"), - make_option(opt_str = c("-f", "--motif_occurence"), default = 1, help = "Define how many motifs are expected per sequence. This value is used during kmer cutoff calculation. Default = %default meaning that there should be approximately one motif per sequence.", metavar = "double") + make_option(opt_str = c("-s", "--min_seq_length"), default = NULL, help = "Remove sequences below this length. Defaults to the maximum value of motif and k-mer and can not be lower.", metavar = "integer", type = "integer"), + make_option(opt_str = c("-n", "--minoverlap_kmer"), default = NULL, help = "Minimum required overlap between k-mer. Used to create reduced sequence ranges out of merged k-mer. Can not be greater than k-mer length. Default = kmer - 1", metavar = "integer", type = "integer"), + make_option(opt_str = c("-v", "--minoverlap_motif"), default = NULL, help = "Minimum required overlap between motif and k-mer to consider k-mer significant. Used for k-mer cutoff calculation. Can not be greater than motif and k-mer length. Default = ceiling(motif / 2)", metavar = "integer", type = "integer"), + make_option(opt_str = c("-f", "--motif_occurrence"), default = 1, help = "Define how many motifs are expected per sequence. This value is used during k-mer cutoff calculation. Default = %default meaning that there should be approximately one motif per sequence.", metavar = "double") ) opt_parser <- OptionParser(option_list = option_list, @@ -23,19 +23,19 @@ opt <- parse_args(opt_parser) #' Reduces each sequence to its most frequent region. #' #' @param input Input bed-file. Last column must be sequences. -#' @param kmer Kmer length. Default = 10 +#' @param kmer k-mer length. Default = 10 #' @param motif Estimated motif length. Default = 10 #' @param output Output file. Default = reduced.bed #' @param threads Number of threads to use. Default = 1. Use 0 for all cores. #' @param clean Delete all temporary files. -#' @param minoverlap_kmer Minimum required overlap between kmer. Used to create reduced sequence ranges out of merged kmer. Can not be greater than kmer length . Default = kmer - 1 -#' @param minoverlap_motif Minimum required overlap between motif and kmer to consider kmer significant. Used for kmer cutoff calculation. Can not be greater than motif and kmer length. Default = ceiling(motif / 2) -#' @param min_seq_length Remove sequences below this length. Defaults to the maximum value of motif and kmer and can not be lower. -#' @param motif_occurence Define how many motifs are expected per sequence. This value is used during kmer cutoff calculation. Default = 1 meaning that there should be approximately one motif per sequence. +#' @param minoverlap_kmer Minimum required overlap between k-mer. Used to create reduced sequence ranges out of merged k-mer. Can not be greater than k-mer length . Default = kmer - 1 +#' @param minoverlap_motif Minimum required overlap between motif and k-mer to consider k-mer significant. Used for k-mer cutoff calculation. Can not be greater than motif and k-mer length. Default = ceiling(motif / 2) +#' @param min_seq_length Remove sequences below this length. Defaults to the maximum value of motif and k-mer and can not be lower. +#' @param motif_occurrence Define how many motifs are expected per sequence. This value is used during k-mer cutoff calculation. Default = 1 meaning that there should be approximately one motif per sequence. #' -#' @details If there is a header supplied other then the default data.table nameing scheme ('V1', 'V2', etc.) it will be kept. +#' @details If there is a header supplied other then the default data.table naming scheme ('V1', 'V2', etc.) it will be kept. #' -reduce_sequence <- function(input, kmer = 10, motif = 10, output = "reduced.bed", threads = NULL, clean = TRUE, minoverlap_kmer = kmer - 1, minoverlap_motif = ceiling(motif / 2), min_seq_length = max(c(motif, kmer)), motif_occurence = 1) { +reduce_sequence <- function(input, kmer = 10, motif = 10, output = "reduced.bed", threads = NULL, clean = TRUE, minoverlap_kmer = kmer - 1, minoverlap_motif = ceiling(motif / 2), min_seq_length = max(c(motif, kmer)), motif_occurrence = 1) { if (system("which jellyfish", ignore.stdout = TRUE) != 0) { stop("Required program jellyfish not found! Please check whether it is installed.") } @@ -76,7 +76,7 @@ reduce_sequence <- function(input, kmer = 10, motif = 10, output = "reduced.bed" # remove sequences below minimum length if (min_seq_length < max(c(kmer, motif))) { - stop("Minimum sequence length must be greater or equal to ", max(c(motif, kmer)), " (maximum value of kmer and motif).") + stop("Minimum sequence length must be greater or equal to ", max(c(motif, kmer)), " (maximum value of k-mer and motif).") } total_rows <- nrow(bed_table) @@ -91,7 +91,7 @@ reduce_sequence <- function(input, kmer = 10, motif = 10, output = "reduced.bed" fasta_file <- paste0(basename(input), ".fasta") seqinr::write.fasta(sequences = as.list(bed_table[[ncol(bed_table)]]), names = bed_table[[4]], as.string = TRUE, file.out = fasta_file) - message("Counting kmer...") + message("Counting k-mer...") # count k-mer hashsize <- 4 ^ kmer count_output_binary <- "mer_count_binary.jf" @@ -105,20 +105,20 @@ reduce_sequence <- function(input, kmer = 10, motif = 10, output = "reduced.bed" system(command = jellyfish_dump_call, wait = TRUE) - message("Reduce kmer.") + message("Reduce k-mer.") # load mer table # columns: kmer, count kmer_counts <- data.table::fread(input = mer_count_table, header = FALSE) - # order kmer descending + # order k-mer descending data.table::setorder(kmer_counts, -V2) # compute number of hits to keep - keep_hits <- significant_kmer(bed_table, kmer = kmer, motif = motif, minoverlap = minoverlap_motif, motif_occurence = motif_occurence) + keep_hits <- significant_kmer(bed_table, kmer = kmer, motif = motif, minoverlap = minoverlap_motif, motif_occurrence = motif_occurrence) - # reduce kmer + # reduce k-mer reduced_kmer <- reduce_kmer(kmer = kmer_counts, significant = keep_hits) - message("Find kmer in sequences.") + message("Find k-mer in sequences.") # find k-mer in sequences # columns: name, start, end, width ranges_table <- find_kmer_regions(bed = bed_table, kmer_counts = reduced_kmer, minoverlap = minoverlap_kmer, threads = threads) @@ -155,21 +155,21 @@ reduce_sequence <- function(input, kmer = 10, motif = 10, output = "reduced.bed" data.table::fwrite(merged, file = output, sep = "\t", col.names = keep_col_names) } -#' Predict how many interesting kmer are possible for the given data. +#' Predict how many interesting k-mer are possible for the given data. #' #' @param bed Bed table with sequences in last column -#' @param kmer Length of kmer +#' @param kmer Length of k-mer #' @param motif Length of motif -#' @param minoverlap Minimum number of bases overlapping between kmer and motif. Must be <= motif & <= kmer. Defaults to ceiling(motif / 2). -#' @param motif_occurence Define how many motifs are expected per sequence. Default = 1 +#' @param minoverlap Minimum number of bases overlapping between k-mer and motif. Must be <= motif & <= kmer. Defaults to ceiling(motif / 2). +#' @param motif_occurrence Define how many motifs are expected per sequence. Default = 1 #' -#' @return Number of interesting kmer. -significant_kmer <- function(bed, kmer, motif, minoverlap = ceiling(motif / 2), motif_occurence = 1) { +#' @return Number of interesting k-mer. +significant_kmer <- function(bed, kmer, motif, minoverlap = ceiling(motif / 2), motif_occurrence = 1) { if (minoverlap > kmer || minoverlap > motif) { stop("Kmer & motif must be greater or equal to minoverlap!") } - if (motif_occurence <= 0) { - stop("Motif_occurence must be a numeric value above 0!") + if (motif_occurrence <= 0) { + stop("Motif_occurrence must be a numeric value above 0!") } # minimum sequence length to get all interesting overlaps @@ -180,15 +180,15 @@ significant_kmer <- function(bed, kmer, motif, minoverlap = ceiling(motif / 2), # reduce to max interesting length seq_lengths <- ifelse(seq_lengths > min_seq_length, min_seq_length, seq_lengths) - # calculate max possible kmer + # calculate max possible k-mer topx <- sum(seq_lengths - kmer + 1) - return(topx * motif_occurence) + return(topx * motif_occurrence) } -#' Orders kmer table after count descending and keeps all kmer with a cumulative sum below the given significance threshold. +#' Orders k-mer table after count descending and keeps all k-mer with a cumulative sum below the given significance threshold. #' -#' @param kmer Kmer data.table columns: kmer, count +#' @param kmer K-mer data.table columns: kmer, count #' @param significant Value from significant_kmer function. #' #' @return reduced data.table @@ -204,16 +204,16 @@ reduce_kmer <- function(kmer, significant) { #' create list of significant ranges (one for each bed entry) #' #' @param bed Data.table of bed with sequence in last column -#' @param kmer_counts Data.table of counted kmer. Column1 = kmer, column2 = count. -#' @param minoverlap Minimum overlapping nucleotides between kmers to be merged. Positive integer. Must be smaller than kmer length. +#' @param kmer_counts Data.table of counted k-mer. Column1 = kmer, column2 = count. +#' @param minoverlap Minimum overlapping nucleotides between k-mers to be merged. Positive integer. Must be smaller than k-mer length. #' @param threads Number of threads. #' #' @return Data.table with relative positions and width (start, end, width). #' -#' TODO Include number of motifs per sequence (aka motif_occurence). Attempt to keep best 2 regions for occurence = 2? Probably high impact on performance. +#' TODO Include number of motifs per sequence (aka motif_occurrence). Attempt to keep best 2 regions for occurrence = 2? Probably high impact on performance. find_kmer_regions <- function(bed, kmer_counts, minoverlap = 1 , threads = NULL) { if (nchar(kmer_counts[1, 1]) <= minoverlap) { - stop("Minoverlap must be smaller than kmer length!") + stop("Minoverlap must be smaller than k-mer length!") } names(kmer_counts)[1:2] <- c("kmer", "count") @@ -232,9 +232,9 @@ find_kmer_regions <- function(bed, kmer_counts, minoverlap = 1 , threads = NULL) return(data.table::data.table(start = NA, end = NA, width = NA, name = name)) } - # add kmer sequences + # add k-mer sequences ranges[, sub_seq := stringr::str_sub(seq, start, end)] - # add kmer count + # add k-mer count ranges[, count := kmer_counts[ranges[["sub_seq"]], "count", on = "kmer"]] #### reduce ranges @@ -253,7 +253,7 @@ find_kmer_regions <- function(bed, kmer_counts, minoverlap = 1 , threads = NULL) which(member == x) }) - # calculate component score (= sum of kmer count) + # calculate component score (= sum of k-mer count) score <- vapply(node_membership, FUN.VALUE = numeric(1), function(x) { sum(kmer_counts[x, "count"]) }) From d86f788766e24a2ee3c5144cd2273d8a491b9de2 Mon Sep 17 00:00:00 2001 From: Schultheis Date: Thu, 20 Dec 2018 13:57:34 +0100 Subject: [PATCH 13/14] fixed more typos --- bin/reduce_sequence.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/reduce_sequence.R b/bin/reduce_sequence.R index cf91187..53ac69f 100644 --- a/bin/reduce_sequence.R +++ b/bin/reduce_sequence.R @@ -3,7 +3,7 @@ library("optparse") option_list <- list( make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input bed-file. Last column must be sequences.", metavar = "character"), - make_option(opt_str = c("-k", "--kmer"), default = 10, help = "k-mer length. Default = %default", metavar = "integer"), + make_option(opt_str = c("-k", "--kmer"), default = 10, help = "K-mer length. Default = %default", metavar = "integer"), make_option(opt_str = c("-m", "--motif"), default = 10, help = "Estimated motif length. Default = %default", metavar = "integer"), make_option(opt_str = c("-o", "--output"), default = "reduced.bed", help = "Output file. Default = %default", metavar = "character"), make_option(opt_str = c("-t", "--threads"), default = 1, help = "Number of threads to use. Use 0 for all available cores. Default = %default", metavar = "integer"), @@ -28,7 +28,7 @@ opt <- parse_args(opt_parser) #' @param output Output file. Default = reduced.bed #' @param threads Number of threads to use. Default = 1. Use 0 for all cores. #' @param clean Delete all temporary files. -#' @param minoverlap_kmer Minimum required overlap between k-mer. Used to create reduced sequence ranges out of merged k-mer. Can not be greater than k-mer length . Default = kmer - 1 +#' @param minoverlap_kmer Minimum required overlap between k-mer. Used to create reduced sequence ranges out of merged k-mer. Can not be greater than k-mer length. Default = kmer - 1 #' @param minoverlap_motif Minimum required overlap between motif and k-mer to consider k-mer significant. Used for k-mer cutoff calculation. Can not be greater than motif and k-mer length. Default = ceiling(motif / 2) #' @param min_seq_length Remove sequences below this length. Defaults to the maximum value of motif and k-mer and can not be lower. #' @param motif_occurrence Define how many motifs are expected per sequence. This value is used during k-mer cutoff calculation. Default = 1 meaning that there should be approximately one motif per sequence. From 756e98f550ac6dde7df603fb506a83d203654182 Mon Sep 17 00:00:00 2001 From: Schultheis Date: Thu, 20 Dec 2018 14:01:22 +0100 Subject: [PATCH 14/14] process description for reduce_sequence and clustering --- pipeline.nf | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pipeline.nf b/pipeline.nf index db3bbdf..c5f16b6 100644 --- a/pipeline.nf +++ b/pipeline.nf @@ -267,6 +267,7 @@ process overlap_with_known_TFBS { /* +Reduce each sequence to its most conserved region. */ process reduce_sequence { conda "${path_env}" @@ -287,6 +288,7 @@ process reduce_sequence { /* +Cluster all sequences. Appends a column with cluster numbers to the bed-file. */ process clustering { conda "${path_env}"