Skip to content

Cluster #17

Merged
merged 14 commits into from
Dec 21, 2018
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 20 additions & 9 deletions bin/cdhit_wrapper.R
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,8 @@ option_list <- list(
)

opt_parser <- OptionParser(option_list = option_list,
description = "CD-HIT-EST Wrapper function. See https://github.com/weizhongli/cdhit/wiki for more information regarding CD-HITs parameters.")
description = "CD-HIT-EST Wrapper function. See https://github.com/weizhongli/cdhit/wiki for more information regarding CD-HITs parameters.",
epilogue = "Author: Hendrik Schultheis <Hendrik.Schultheis@mpi-bn.mpg.de>")

opt <- parse_args(opt_parser)

Expand Down Expand Up @@ -68,24 +69,33 @@ opt <- parse_args(opt_parser)
#' @param gat_ext Gap extension score. Default = -1 (CD-HIT parameter)
#' @param sort_cluster_by_size Either sort cluster by decreasing length (= 0) or by decreasing size (= 1). Default = 1 (CD-HIT parameter)
#'
#' TODO check whether cdhit is installed
#' @details If there is a header supplied other then the default data.table nameing scheme ('V1', 'V2', etc.) it will be kept and extended.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Typo: naming scheme

#'
cdhitest <- function(input, identity = 0.8, coverage = 8, output = "cluster.bed", clean = TRUE, threads = 1, global = 0, band_width = 20, memory = 800, word_length = 3, throw_away_sequences = 5, length_dif_cutoff_shorter_p = 0, length_dif_cutoff_shorter_n = 999999, alignment_coverage_longer_p = 0, alignment_coverage_longer_n = 99999999, alignment_coverage_shorter_p = 0, alignment_coverage_shorter_n = 99999999, max_unmatched_longer_p = 1, max_unmatched_shorter_p = 1, max_unmatched_both_n = 99999999, fast_cluster = 1, strand = 0, match = 2, mismatch = -2, gap = -6, gap_ext = -1, sort_cluster_by_size = 1) {
if (system("which cd-hit-est", ignore.stdout = FALSE) != 0) {
stop("Required program CD-HIT not found! Please check whether it is installed.")
}

if (missing(input)) {
stop("Input parameter missing!")
stop("No input specified! Please forward a valid bed-file.")
}

message("Loading bed.")
# load bed if neccessary
if (!data.table::is.data.table(input)) {
bed_table <- data.table::fread(input = input, header = FALSE)
bed_table <- data.table::fread(input = input)
} else {
bed_table <- input
}

# check if there are column names to keep them
default_col_names <- grepl(pattern = "^V+\\d$", names(bed_table), perl = TRUE)
keep_col_names <- ifelse(any(default_col_names), FALSE, TRUE)

# check for duplicated names
if (anyDuplicated(bed_table[, "V4"])) {
if (anyDuplicated(bed_table[, 4])) {
warning("Found duplicated names. Making names unique.")
bed_table[, V4 := make.unique(V4)]
bed_table[, 4 := make.unique(names(bed_table)[4])]
}

message("Convert to fasta.")
Expand Down Expand Up @@ -137,17 +147,18 @@ cdhitest <- function(input, identity = 0.8, coverage = 8, output = "cluster.bed"
system(command = cluster_call, wait = TRUE)

# load reformated file
cluster <- data.table::fread(cluster_file)
cluster <- data.table::fread(cluster_file)[, c("id", "clstr")]
names(cluster) <- c("id", "cluster")

### add cluster to bed_table
result <- merge(x = bed_table, y = cluster[, c("id", "clstr")], by.x = "V4", by.y = "id", sort = FALSE)[, union(names(bed_table), names(cluster)[2]), with = FALSE]
result <- merge(x = bed_table, y = cluster, by.x = names(bed_table)[4], by.y = "id", sort = FALSE)[, union(names(bed_table), names(cluster)[2]), with = FALSE]

# delete files
if (clean) {
file.remove(fasta_file, paste0(cdhit_output, ".clstr"), cdhit_output, cluster_file)
}

data.table::fwrite(x = result, file = output, sep = "\t", col.names = FALSE)
data.table::fwrite(x = result, file = output, sep = "\t", col.names = keep_col_names)
}

# call function with given parameter if not in interactive context (e.g. run from shell)
Expand Down
59 changes: 42 additions & 17 deletions bin/reduce_bed.R → bin/reduce_sequence.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,32 +9,41 @@ option_list <- list(
make_option(opt_str = c("-t", "--threads"), default = 1, help = "Number of threads to use. Use 0 for all available cores. Default = %default", metavar = "integer"),
make_option(opt_str = c("-c", "--clean"), default = TRUE, help = "Delete all temporary files. Default = %default", metavar = "logical"),
make_option(opt_str = c("-s", "--min_seq_length"), default = NULL, help = "Remove sequences below this length. Defaults to the maximum value of motif and kmer and can not be lower.", metavar = "integer", type = "integer"),
make_option(opt_str = c("-n", "--minoverlap_kmer"), default = NULL, help = "Minimum required overlap between kmer to merge kmer. Used to create reduced sequence ranges. Can not be greater than kmer length. Default = kmer - 1", metavar = "integer", type = "integer"),
make_option(opt_str = c("-n", "--minoverlap_kmer"), default = NULL, help = "Minimum required overlap between kmer. Used to create reduced sequence ranges out of merged kmer. Can not be greater than kmer length. Default = kmer - 1", metavar = "integer", type = "integer"),
Copy link
Collaborator

@renewiegandt renewiegandt Dec 19, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Typo: k-mers. 2x

make_option(opt_str = c("-v", "--minoverlap_motif"), default = NULL, help = "Minimum required overlap between motif and kmer to consider kmer significant. Used for kmer cutoff calculation. Can not be greater than motif and kmer length. Default = ceiling(motif / 2)", metavar = "integer", type = "integer"),
make_option(opt_str = c("-f", "--motif_occurence"), default = 1, help = "Number of motifs per sequence any value above 0. Default = %default.", metavar = "double")
make_option(opt_str = c("-f", "--motif_occurence"), default = 1, help = "Define how many motifs are expected per sequence. This value is used during kmer cutoff calculation. Default = %default meaning that there should be approximately one motif per sequence.", metavar = "double")
)

opt_parser <- OptionParser(option_list = option_list,
description = "Reduce sequences to frequent regions.")
description = "Reduces each sequence to its most frequent region.",
epilogue = "Author: Hendrik Schultheis <Hendrik.Schultheis@mpi-bn.mpg.de>")

opt <- parse_args(opt_parser)

#' Reduce bed file to conserved regions
#' Reduces each sequence to its most frequent region.
#'
#' @param input bed file
#' @param kmer Length of kmer.
#' @param motif Estimated motif length.
#' @param output Output file
#' @param threads Number of threads. Default = 1. 0 for all cores.
#' @param input Input bed-file. Last column must be sequences.
#' @param kmer Kmer length. Default = 10
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

kmer_len instead of kmer?

#' @param motif Estimated motif length. Default = 10
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Parameter 'motif' does not contain a lot of information about its function.
Maybe it should be named something like 'estimated_motif_len'?

#' @param output Output file. Default = reduced.bed
#' @param threads Number of threads to use. Default = 1. Use 0 for all cores.
#' @param clean Delete all temporary files.
#' @param minoverlap_kmer Minimum required overlap between kmer to merge kmer. Used to create reduced sequence ranges. Can not be greater than kmer length. Default = kmer - 1
#' @param minoverlap_kmer Minimum required overlap between kmer. Used to create reduced sequence ranges out of merged kmer. Can not be greater than kmer length . Default = kmer - 1
Copy link
Collaborator

@renewiegandt renewiegandt Dec 19, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Typo:

  • length.
  • k-mers

#' @param minoverlap_motif Minimum required overlap between motif and kmer to consider kmer significant. Used for kmer cutoff calculation. Can not be greater than motif and kmer length. Default = ceiling(motif / 2)
#' @param min_seq_length Must be greater or equal to kmer and motif. Default = max(c(motif, kmer)).
#' @param min_seq_length Remove sequences below this length. Defaults to the maximum value of motif and kmer and can not be lower.
#' @param motif_occurence Define how many motifs are expected per sequence. This value is used during kmer cutoff calculation. Default = 1 meaning that there should be approximately one motif per sequence.
#'
#' @return reduced bed
#' TODO check whether jellyfish is installed
reduce_bed <- function(input, kmer = 10, motif = 10, output = "reduced.bed", threads = NULL, clean = TRUE, minoverlap_kmer = kmer - 1, minoverlap_motif = ceiling(motif / 2), min_seq_length = max(c(motif, kmer)), motif_occurence = 1) {
#' @details If there is a header supplied other then the default data.table nameing scheme ('V1', 'V2', etc.) it will be kept.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Typo: naming scheme

#'
reduce_sequence <- function(input, kmer = 10, motif = 10, output = "reduced.bed", threads = NULL, clean = TRUE, minoverlap_kmer = kmer - 1, minoverlap_motif = ceiling(motif / 2), min_seq_length = max(c(motif, kmer)), motif_occurence = 1) {
if (system("which jellyfish", ignore.stdout = TRUE) != 0) {
stop("Required program jellyfish not found! Please check whether it is installed.")
}

if (missing(input)) {
stop("No input specified! Please forward a valid bed-file.")
}

# get number of available cores
if (threads == 0) {
threads <- parallel::detectCores()
Expand All @@ -43,7 +52,17 @@ reduce_bed <- function(input, kmer = 10, motif = 10, output = "reduced.bed", thr
message("Loading bed...")
# load bed
# columns: chr, start, end, name, ..., sequence
bed_table <- data.table::fread(input = input, header = FALSE)
bed_table <- data.table::fread(input = input)

# check for header and save it if provided
default_col_names <- grepl(pattern = "^V+\\d$", names(bed_table), perl = TRUE)
if (!any(default_col_names)) {
keep_col_names <- TRUE
col_names <- names(bed_table)
} else {
keep_col_names <- FALSE
}

names(bed_table)[1:4] <- c("chr", "start", "end", "name")
names(bed_table)[ncol(bed_table)] <- "sequence"
# index
Expand Down Expand Up @@ -128,7 +147,12 @@ reduce_bed <- function(input, kmer = 10, motif = 10, output = "reduced.bed", thr
file.remove(fasta_file, count_output_binary, mer_count_table)
}

data.table::fwrite(merged, file = output, sep = "\t", col.names = FALSE)
# keep provided column names
if (keep_col_names) {
names(merged) <- col_names
}

data.table::fwrite(merged, file = output, sep = "\t", col.names = keep_col_names)
}

#' Predict how many interesting kmer are possible for the given data.
Expand Down Expand Up @@ -171,6 +195,7 @@ significant_kmer <- function(bed, kmer, motif, minoverlap = ceiling(motif / 2),
reduce_kmer <- function(kmer, significant) {
data.table::setorderv(kmer, cols = names(kmer)[2], order = -1)

# TODO don't use 'V2'
kmer[, cumsum := cumsum(V2)]

return(kmer[cumsum <= significant])
Expand Down Expand Up @@ -255,5 +280,5 @@ if (!interactive()) {
pbo <- pbapply::pboptions(type = "timer")
# remove last parameter (help param)
params <- opt[-length(opt)]
do.call(reduce_bed, args = params)
do.call(reduce_sequence, args = params)
}
6 changes: 3 additions & 3 deletions pipeline.nf
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
params.max_size_fp=100

//clustering
//reduce_bed
//reduce_sequence
params.kmer=10
params.aprox_motif_len=10
params.motif_occurence=1
Expand Down Expand Up @@ -268,7 +268,7 @@ process overlap_with_known_TFBS {

/*
*/
process reduce_bed {
process reduce_sequence {
Copy link
Collaborator

@renewiegandt renewiegandt Dec 19, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please add a small description to the process reduce_sequence and clustering.

conda "${path_env}"
echo true
publishDir "${params.out}/cluster/reduced_bed/", mode: 'copy'
Expand All @@ -281,7 +281,7 @@ process reduce_bed {

script:
"""
Rscript ${path_bin}/reduce_bed.R -i ${bed} -k ${params.kmer} -m ${params.aprox_motif_len} -o ${name}_reduced.bed -t ${params.threads} -f ${params.motif_occurence} -s ${params.min_seq_length}
Rscript ${path_bin}/reduce_sequence.R -i ${bed} -k ${params.kmer} -m ${params.aprox_motif_len} -o ${name}_reduced.bed -t ${params.threads} -f ${params.motif_occurence} -s ${params.min_seq_length}
"""
}

Expand Down