-
Notifications
You must be signed in to change notification settings - Fork 0
Cluster #17
Cluster #17
Changes from 10 commits
98985d1
e0b9d38
1730868
88fa298
e17d1db
dcd185e
4c16f6f
5a7c84e
97464ca
cc532bf
d60faa7
6507643
d86f788
756e98f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -9,32 +9,41 @@ option_list <- list( | |
make_option(opt_str = c("-t", "--threads"), default = 1, help = "Number of threads to use. Use 0 for all available cores. Default = %default", metavar = "integer"), | ||
make_option(opt_str = c("-c", "--clean"), default = TRUE, help = "Delete all temporary files. Default = %default", metavar = "logical"), | ||
make_option(opt_str = c("-s", "--min_seq_length"), default = NULL, help = "Remove sequences below this length. Defaults to the maximum value of motif and kmer and can not be lower.", metavar = "integer", type = "integer"), | ||
make_option(opt_str = c("-n", "--minoverlap_kmer"), default = NULL, help = "Minimum required overlap between kmer to merge kmer. Used to create reduced sequence ranges. Can not be greater than kmer length. Default = kmer - 1", metavar = "integer", type = "integer"), | ||
make_option(opt_str = c("-n", "--minoverlap_kmer"), default = NULL, help = "Minimum required overlap between kmer. Used to create reduced sequence ranges out of merged kmer. Can not be greater than kmer length. Default = kmer - 1", metavar = "integer", type = "integer"), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Typo: k-mers. 2x |
||
make_option(opt_str = c("-v", "--minoverlap_motif"), default = NULL, help = "Minimum required overlap between motif and kmer to consider kmer significant. Used for kmer cutoff calculation. Can not be greater than motif and kmer length. Default = ceiling(motif / 2)", metavar = "integer", type = "integer"), | ||
make_option(opt_str = c("-f", "--motif_occurence"), default = 1, help = "Number of motifs per sequence any value above 0. Default = %default.", metavar = "double") | ||
make_option(opt_str = c("-f", "--motif_occurence"), default = 1, help = "Define how many motifs are expected per sequence. This value is used during kmer cutoff calculation. Default = %default meaning that there should be approximately one motif per sequence.", metavar = "double") | ||
) | ||
|
||
opt_parser <- OptionParser(option_list = option_list, | ||
description = "Reduce sequences to frequent regions.") | ||
description = "Reduces each sequence to its most frequent region.", | ||
epilogue = "Author: Hendrik Schultheis <Hendrik.Schultheis@mpi-bn.mpg.de>") | ||
|
||
opt <- parse_args(opt_parser) | ||
|
||
#' Reduce bed file to conserved regions | ||
#' Reduces each sequence to its most frequent region. | ||
#' | ||
#' @param input bed file | ||
#' @param kmer Length of kmer. | ||
#' @param motif Estimated motif length. | ||
#' @param output Output file | ||
#' @param threads Number of threads. Default = 1. 0 for all cores. | ||
#' @param input Input bed-file. Last column must be sequences. | ||
#' @param kmer Kmer length. Default = 10 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. kmer_len instead of kmer? |
||
#' @param motif Estimated motif length. Default = 10 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Parameter 'motif' does not contain a lot of information about its function. |
||
#' @param output Output file. Default = reduced.bed | ||
#' @param threads Number of threads to use. Default = 1. Use 0 for all cores. | ||
#' @param clean Delete all temporary files. | ||
#' @param minoverlap_kmer Minimum required overlap between kmer to merge kmer. Used to create reduced sequence ranges. Can not be greater than kmer length. Default = kmer - 1 | ||
#' @param minoverlap_kmer Minimum required overlap between kmer. Used to create reduced sequence ranges out of merged kmer. Can not be greater than kmer length . Default = kmer - 1 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Typo:
|
||
#' @param minoverlap_motif Minimum required overlap between motif and kmer to consider kmer significant. Used for kmer cutoff calculation. Can not be greater than motif and kmer length. Default = ceiling(motif / 2) | ||
#' @param min_seq_length Must be greater or equal to kmer and motif. Default = max(c(motif, kmer)). | ||
#' @param min_seq_length Remove sequences below this length. Defaults to the maximum value of motif and kmer and can not be lower. | ||
#' @param motif_occurence Define how many motifs are expected per sequence. This value is used during kmer cutoff calculation. Default = 1 meaning that there should be approximately one motif per sequence. | ||
#' | ||
#' @return reduced bed | ||
#' TODO check whether jellyfish is installed | ||
reduce_bed <- function(input, kmer = 10, motif = 10, output = "reduced.bed", threads = NULL, clean = TRUE, minoverlap_kmer = kmer - 1, minoverlap_motif = ceiling(motif / 2), min_seq_length = max(c(motif, kmer)), motif_occurence = 1) { | ||
#' @details If there is a header supplied other then the default data.table nameing scheme ('V1', 'V2', etc.) it will be kept. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Typo: naming scheme |
||
#' | ||
reduce_sequence <- function(input, kmer = 10, motif = 10, output = "reduced.bed", threads = NULL, clean = TRUE, minoverlap_kmer = kmer - 1, minoverlap_motif = ceiling(motif / 2), min_seq_length = max(c(motif, kmer)), motif_occurence = 1) { | ||
if (system("which jellyfish", ignore.stdout = TRUE) != 0) { | ||
stop("Required program jellyfish not found! Please check whether it is installed.") | ||
} | ||
|
||
if (missing(input)) { | ||
stop("No input specified! Please forward a valid bed-file.") | ||
} | ||
|
||
# get number of available cores | ||
if (threads == 0) { | ||
threads <- parallel::detectCores() | ||
|
@@ -43,7 +52,17 @@ reduce_bed <- function(input, kmer = 10, motif = 10, output = "reduced.bed", thr | |
message("Loading bed...") | ||
# load bed | ||
# columns: chr, start, end, name, ..., sequence | ||
bed_table <- data.table::fread(input = input, header = FALSE) | ||
bed_table <- data.table::fread(input = input) | ||
|
||
# check for header and save it if provided | ||
default_col_names <- grepl(pattern = "^V+\\d$", names(bed_table), perl = TRUE) | ||
if (!any(default_col_names)) { | ||
keep_col_names <- TRUE | ||
col_names <- names(bed_table) | ||
} else { | ||
keep_col_names <- FALSE | ||
} | ||
|
||
names(bed_table)[1:4] <- c("chr", "start", "end", "name") | ||
names(bed_table)[ncol(bed_table)] <- "sequence" | ||
# index | ||
|
@@ -128,7 +147,12 @@ reduce_bed <- function(input, kmer = 10, motif = 10, output = "reduced.bed", thr | |
file.remove(fasta_file, count_output_binary, mer_count_table) | ||
} | ||
|
||
data.table::fwrite(merged, file = output, sep = "\t", col.names = FALSE) | ||
# keep provided column names | ||
if (keep_col_names) { | ||
names(merged) <- col_names | ||
} | ||
|
||
data.table::fwrite(merged, file = output, sep = "\t", col.names = keep_col_names) | ||
} | ||
|
||
#' Predict how many interesting kmer are possible for the given data. | ||
|
@@ -171,6 +195,7 @@ significant_kmer <- function(bed, kmer, motif, minoverlap = ceiling(motif / 2), | |
reduce_kmer <- function(kmer, significant) { | ||
data.table::setorderv(kmer, cols = names(kmer)[2], order = -1) | ||
|
||
# TODO don't use 'V2' | ||
kmer[, cumsum := cumsum(V2)] | ||
|
||
return(kmer[cumsum <= significant]) | ||
|
@@ -255,5 +280,5 @@ if (!interactive()) { | |
pbo <- pbapply::pboptions(type = "timer") | ||
# remove last parameter (help param) | ||
params <- opt[-length(opt)] | ||
do.call(reduce_bed, args = params) | ||
do.call(reduce_sequence, args = params) | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -21,7 +21,7 @@ | |
params.max_size_fp=100 | ||
|
||
//clustering | ||
//reduce_bed | ||
//reduce_sequence | ||
params.kmer=10 | ||
params.aprox_motif_len=10 | ||
params.motif_occurence=1 | ||
|
@@ -268,7 +268,7 @@ process overlap_with_known_TFBS { | |
|
||
/* | ||
*/ | ||
process reduce_bed { | ||
process reduce_sequence { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please add a small description to the process reduce_sequence and clustering. |
||
conda "${path_env}" | ||
echo true | ||
publishDir "${params.out}/cluster/reduced_bed/", mode: 'copy' | ||
|
@@ -281,7 +281,7 @@ process reduce_bed { | |
|
||
script: | ||
""" | ||
Rscript ${path_bin}/reduce_bed.R -i ${bed} -k ${params.kmer} -m ${params.aprox_motif_len} -o ${name}_reduced.bed -t ${params.threads} -f ${params.motif_occurence} -s ${params.min_seq_length} | ||
Rscript ${path_bin}/reduce_sequence.R -i ${bed} -k ${params.kmer} -m ${params.aprox_motif_len} -o ${name}_reduced.bed -t ${params.threads} -f ${params.motif_occurence} -s ${params.min_seq_length} | ||
""" | ||
} | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Typo: naming scheme