Skip to content

Cluster #17

Merged
merged 14 commits into from
Dec 21, 2018
Merged
29 changes: 20 additions & 9 deletions bin/cdhit_wrapper.R
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,8 @@ option_list <- list(
)

opt_parser <- OptionParser(option_list = option_list,
description = "CD-HIT-EST Wrapper function. See https://github.com/weizhongli/cdhit/wiki for more information regarding CD-HITs parameters.")
description = "CD-HIT-EST Wrapper function. See https://github.com/weizhongli/cdhit/wiki for more information regarding CD-HITs parameters.",
epilogue = "Author: Hendrik Schultheis <Hendrik.Schultheis@mpi-bn.mpg.de>")

opt <- parse_args(opt_parser)

Expand Down Expand Up @@ -68,24 +69,33 @@ opt <- parse_args(opt_parser)
#' @param gat_ext Gap extension score. Default = -1 (CD-HIT parameter)
#' @param sort_cluster_by_size Either sort cluster by decreasing length (= 0) or by decreasing size (= 1). Default = 1 (CD-HIT parameter)
#'
#' TODO check whether cdhit is installed
#' @details If there is a header supplied other then the default data.table nameing scheme ('V1', 'V2', etc.) it will be kept and extended.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Typo: naming scheme

#'
cdhitest <- function(input, identity = 0.8, coverage = 8, output = "cluster.bed", clean = TRUE, threads = 1, global = 0, band_width = 20, memory = 800, word_length = 3, throw_away_sequences = 5, length_dif_cutoff_shorter_p = 0, length_dif_cutoff_shorter_n = 999999, alignment_coverage_longer_p = 0, alignment_coverage_longer_n = 99999999, alignment_coverage_shorter_p = 0, alignment_coverage_shorter_n = 99999999, max_unmatched_longer_p = 1, max_unmatched_shorter_p = 1, max_unmatched_both_n = 99999999, fast_cluster = 1, strand = 0, match = 2, mismatch = -2, gap = -6, gap_ext = -1, sort_cluster_by_size = 1) {
if (system("which cd-hit-est", ignore.stdout = FALSE) != 0) {
stop("Required program CD-HIT not found! Please check whether it is installed.")
}

if (missing(input)) {
stop("Input parameter missing!")
stop("No input specified! Please forward a valid bed-file.")
}

message("Loading bed.")
# load bed if neccessary
if (!data.table::is.data.table(input)) {
bed_table <- data.table::fread(input = input, header = FALSE)
bed_table <- data.table::fread(input = input)
} else {
bed_table <- input
}

# check if there are column names to keep them
default_col_names <- grepl(pattern = "^V+\\d$", names(bed_table), perl = TRUE)
keep_col_names <- ifelse(any(default_col_names), FALSE, TRUE)

# check for duplicated names
if (anyDuplicated(bed_table[, "V4"])) {
if (anyDuplicated(bed_table[, 4])) {
warning("Found duplicated names. Making names unique.")
bed_table[, V4 := make.unique(V4)]
bed_table[, 4 := make.unique(names(bed_table)[4])]
}

message("Convert to fasta.")
Expand Down Expand Up @@ -137,17 +147,18 @@ cdhitest <- function(input, identity = 0.8, coverage = 8, output = "cluster.bed"
system(command = cluster_call, wait = TRUE)

# load reformated file
cluster <- data.table::fread(cluster_file)
cluster <- data.table::fread(cluster_file)[, c("id", "clstr")]
names(cluster) <- c("id", "cluster")

### add cluster to bed_table
result <- merge(x = bed_table, y = cluster[, c("id", "clstr")], by.x = "V4", by.y = "id", sort = FALSE)[, union(names(bed_table), names(cluster)[2]), with = FALSE]
result <- merge(x = bed_table, y = cluster, by.x = names(bed_table)[4], by.y = "id", sort = FALSE)[, union(names(bed_table), names(cluster)[2]), with = FALSE]

# delete files
if (clean) {
file.remove(fasta_file, paste0(cdhit_output, ".clstr"), cdhit_output, cluster_file)
}

data.table::fwrite(x = result, file = output, sep = "\t", col.names = FALSE)
data.table::fwrite(x = result, file = output, sep = "\t", col.names = keep_col_names)
}

# call function with given parameter if not in interactive context (e.g. run from shell)
Expand Down
59 changes: 42 additions & 17 deletions bin/reduce_bed.R → bin/reduce_sequence.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,32 +9,41 @@ option_list <- list(
make_option(opt_str = c("-t", "--threads"), default = 1, help = "Number of threads to use. Use 0 for all available cores. Default = %default", metavar = "integer"),
make_option(opt_str = c("-c", "--clean"), default = TRUE, help = "Delete all temporary files. Default = %default", metavar = "logical"),
make_option(opt_str = c("-s", "--min_seq_length"), default = NULL, help = "Remove sequences below this length. Defaults to the maximum value of motif and kmer and can not be lower.", metavar = "integer", type = "integer"),
make_option(opt_str = c("-n", "--minoverlap_kmer"), default = NULL, help = "Minimum required overlap between kmer to merge kmer. Used to create reduced sequence ranges. Can not be greater than kmer length. Default = kmer - 1", metavar = "integer", type = "integer"),
make_option(opt_str = c("-n", "--minoverlap_kmer"), default = NULL, help = "Minimum required overlap between kmer. Used to create reduced sequence ranges out of merged kmer. Can not be greater than kmer length. Default = kmer - 1", metavar = "integer", type = "integer"),
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Typo: k-mers. 2x

make_option(opt_str = c("-v", "--minoverlap_motif"), default = NULL, help = "Minimum required overlap between motif and kmer to consider kmer significant. Used for kmer cutoff calculation. Can not be greater than motif and kmer length. Default = ceiling(motif / 2)", metavar = "integer", type = "integer"),
make_option(opt_str = c("-f", "--motif_occurence"), default = 1, help = "Number of motifs per sequence any value above 0. Default = %default.", metavar = "double")
make_option(opt_str = c("-f", "--motif_occurence"), default = 1, help = "Define how many motifs are expected per sequence. This value is used during kmer cutoff calculation. Default = %default meaning that there should be approximately one motif per sequence.", metavar = "double")
)

opt_parser <- OptionParser(option_list = option_list,
description = "Reduce sequences to frequent regions.")
description = "Reduces each sequence to its most frequent region.",
epilogue = "Author: Hendrik Schultheis <Hendrik.Schultheis@mpi-bn.mpg.de>")

opt <- parse_args(opt_parser)

#' Reduce bed file to conserved regions
#' Reduces each sequence to its most frequent region.
#'
#' @param input bed file
#' @param kmer Length of kmer.
#' @param motif Estimated motif length.
#' @param output Output file
#' @param threads Number of threads. Default = 1. 0 for all cores.
#' @param input Input bed-file. Last column must be sequences.
#' @param kmer Kmer length. Default = 10
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

kmer_len instead of kmer?

#' @param motif Estimated motif length. Default = 10
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Parameter 'motif' does not contain a lot of information about its function.
Maybe it should be named something like 'estimated_motif_len'?

#' @param output Output file. Default = reduced.bed
#' @param threads Number of threads to use. Default = 1. Use 0 for all cores.
#' @param clean Delete all temporary files.
#' @param minoverlap_kmer Minimum required overlap between kmer to merge kmer. Used to create reduced sequence ranges. Can not be greater than kmer length. Default = kmer - 1
#' @param minoverlap_kmer Minimum required overlap between kmer. Used to create reduced sequence ranges out of merged kmer. Can not be greater than kmer length . Default = kmer - 1
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Typo:

  • length.
  • k-mers

#' @param minoverlap_motif Minimum required overlap between motif and kmer to consider kmer significant. Used for kmer cutoff calculation. Can not be greater than motif and kmer length. Default = ceiling(motif / 2)
#' @param min_seq_length Must be greater or equal to kmer and motif. Default = max(c(motif, kmer)).
#' @param min_seq_length Remove sequences below this length. Defaults to the maximum value of motif and kmer and can not be lower.
#' @param motif_occurence Define how many motifs are expected per sequence. This value is used during kmer cutoff calculation. Default = 1 meaning that there should be approximately one motif per sequence.
#'
#' @return reduced bed
#' TODO check whether jellyfish is installed
reduce_bed <- function(input, kmer = 10, motif = 10, output = "reduced.bed", threads = NULL, clean = TRUE, minoverlap_kmer = kmer - 1, minoverlap_motif = ceiling(motif / 2), min_seq_length = max(c(motif, kmer)), motif_occurence = 1) {
#' @details If there is a header supplied other then the default data.table nameing scheme ('V1', 'V2', etc.) it will be kept.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Typo: naming scheme

#'
reduce_sequence <- function(input, kmer = 10, motif = 10, output = "reduced.bed", threads = NULL, clean = TRUE, minoverlap_kmer = kmer - 1, minoverlap_motif = ceiling(motif / 2), min_seq_length = max(c(motif, kmer)), motif_occurence = 1) {
if (system("which jellyfish", ignore.stdout = TRUE) != 0) {
stop("Required program jellyfish not found! Please check whether it is installed.")
}

if (missing(input)) {
stop("No input specified! Please forward a valid bed-file.")
}

# get number of available cores
if (threads == 0) {
threads <- parallel::detectCores()
Expand All @@ -43,7 +52,17 @@ reduce_bed <- function(input, kmer = 10, motif = 10, output = "reduced.bed", thr
message("Loading bed...")
# load bed
# columns: chr, start, end, name, ..., sequence
bed_table <- data.table::fread(input = input, header = FALSE)
bed_table <- data.table::fread(input = input)

# check for header and save it if provided
default_col_names <- grepl(pattern = "^V+\\d$", names(bed_table), perl = TRUE)
if (!any(default_col_names)) {
keep_col_names <- TRUE
col_names <- names(bed_table)
} else {
keep_col_names <- FALSE
}

names(bed_table)[1:4] <- c("chr", "start", "end", "name")
names(bed_table)[ncol(bed_table)] <- "sequence"
# index
Expand Down Expand Up @@ -128,7 +147,12 @@ reduce_bed <- function(input, kmer = 10, motif = 10, output = "reduced.bed", thr
file.remove(fasta_file, count_output_binary, mer_count_table)
}

data.table::fwrite(merged, file = output, sep = "\t", col.names = FALSE)
# keep provided column names
if (keep_col_names) {
names(merged) <- col_names
}

data.table::fwrite(merged, file = output, sep = "\t", col.names = keep_col_names)
}

#' Predict how many interesting kmer are possible for the given data.
Expand Down Expand Up @@ -171,6 +195,7 @@ significant_kmer <- function(bed, kmer, motif, minoverlap = ceiling(motif / 2),
reduce_kmer <- function(kmer, significant) {
data.table::setorderv(kmer, cols = names(kmer)[2], order = -1)

# TODO don't use 'V2'
kmer[, cumsum := cumsum(V2)]

return(kmer[cumsum <= significant])
Expand Down Expand Up @@ -255,5 +280,5 @@ if (!interactive()) {
pbo <- pbapply::pboptions(type = "timer")
# remove last parameter (help param)
params <- opt[-length(opt)]
do.call(reduce_bed, args = params)
do.call(reduce_sequence, args = params)
}
6 changes: 3 additions & 3 deletions pipeline.nf
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
params.max_size_fp=100

//clustering
//reduce_bed
//reduce_sequence
params.kmer=10
params.aprox_motif_len=10
params.motif_occurence=1
Expand Down Expand Up @@ -268,7 +268,7 @@ process overlap_with_known_TFBS {

/*
*/
process reduce_bed {
process reduce_sequence {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please add a small description to the process reduce_sequence and clustering.

conda "${path_env}"
echo true
publishDir "${params.out}/cluster/reduced_bed/", mode: 'copy'
Expand All @@ -281,7 +281,7 @@ process reduce_bed {

script:
"""
Rscript ${path_bin}/reduce_bed.R -i ${bed} -k ${params.kmer} -m ${params.aprox_motif_len} -o ${name}_reduced.bed -t ${params.threads} -f ${params.motif_occurence} -s ${params.min_seq_length}
Rscript ${path_bin}/reduce_sequence.R -i ${bed} -k ${params.kmer} -m ${params.aprox_motif_len} -o ${name}_reduced.bed -t ${params.threads} -f ${params.motif_occurence} -s ${params.min_seq_length}
"""
}

Expand Down