Skip to content

Commit

Permalink
check for header and forward it if provided
Browse files Browse the repository at this point in the history
  • Loading branch information
HendrikSchultheis committed Dec 19, 2018
1 parent dcd185e commit 4c16f6f
Showing 1 changed file with 13 additions and 6 deletions.
19 changes: 13 additions & 6 deletions bin/cdhit_wrapper.R
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,8 @@ opt <- parse_args(opt_parser)
#' @param gat_ext Gap extension score. Default = -1 (CD-HIT parameter)
#' @param sort_cluster_by_size Either sort cluster by decreasing length (= 0) or by decreasing size (= 1). Default = 1 (CD-HIT parameter)
#'
#' @details If there is a header supplied other then the default data.table nameing scheme ('V1', 'V2', etc.) it will be kept and extended.
#'
cdhitest <- function(input, identity = 0.8, coverage = 8, output = "cluster.bed", clean = TRUE, threads = 1, global = 0, band_width = 20, memory = 800, word_length = 3, throw_away_sequences = 5, length_dif_cutoff_shorter_p = 0, length_dif_cutoff_shorter_n = 999999, alignment_coverage_longer_p = 0, alignment_coverage_longer_n = 99999999, alignment_coverage_shorter_p = 0, alignment_coverage_shorter_n = 99999999, max_unmatched_longer_p = 1, max_unmatched_shorter_p = 1, max_unmatched_both_n = 99999999, fast_cluster = 1, strand = 0, match = 2, mismatch = -2, gap = -6, gap_ext = -1, sort_cluster_by_size = 1) {
if (system("which cd-hit-est", ignore.stdout = FALSE) != 0) {
stop("Required program CD-HIT not found! Please check whether it is installed.")
Expand All @@ -80,15 +82,19 @@ cdhitest <- function(input, identity = 0.8, coverage = 8, output = "cluster.bed"
message("Loading bed.")
# load bed if neccessary
if (!data.table::is.data.table(input)) {
bed_table <- data.table::fread(input = input, header = FALSE)
bed_table <- data.table::fread(input = input)
} else {
bed_table <- input
}

# check if there are column names to keep them
default_col_names <- grepl(pattern = "^V+\\d$", names(bed_table), perl = TRUE)
keep_col_names <- ifelse(any(default_col_names), FALSE, TRUE)

# check for duplicated names
if (anyDuplicated(bed_table[, "V4"])) {
if (anyDuplicated(bed_table[, 4])) {
warning("Found duplicated names. Making names unique.")
bed_table[, V4 := make.unique(V4)]
bed_table[, 4 := make.unique(names(bed_table)[4])]
}

message("Convert to fasta.")
Expand Down Expand Up @@ -140,17 +146,18 @@ cdhitest <- function(input, identity = 0.8, coverage = 8, output = "cluster.bed"
system(command = cluster_call, wait = TRUE)

# load reformated file
cluster <- data.table::fread(cluster_file)
cluster <- data.table::fread(cluster_file)[, c("id", "clstr")]
names(cluster) <- c("id", "cluster")

### add cluster to bed_table
result <- merge(x = bed_table, y = cluster[, c("id", "clstr")], by.x = "V4", by.y = "id", sort = FALSE)[, union(names(bed_table), names(cluster)[2]), with = FALSE]
result <- merge(x = bed_table, y = cluster, by.x = names(bed_table)[4], by.y = "id", sort = FALSE)[, union(names(bed_table), names(cluster)[2]), with = FALSE]

# delete files
if (clean) {
file.remove(fasta_file, paste0(cdhit_output, ".clstr"), cdhit_output, cluster_file)
}

data.table::fwrite(x = result, file = output, sep = "\t", col.names = FALSE)
data.table::fwrite(x = result, file = output, sep = "\t", col.names = keep_col_names)
}

# call function with given parameter if not in interactive context (e.g. run from shell)
Expand Down

0 comments on commit 4c16f6f

Please sign in to comment.