From cc2325031a60709be32b05f512209e3cd5e1f7ba Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?=
Date: Tue, 18 Dec 2018 08:00:39 -0500
Subject: [PATCH] Bugfix in bed_to_fasta.R: Get last and second last instead of fixed indices

---
Review notes: the post-image below was amended during review (argument
validation, seq_along() instead of 1:length(), header comment updated to
the last / second to last column logic, log message uses the same cluster
number as the output file). The post-image blob id on the index line still
refers to the commit as originally recorded.

 bin/bed_to_fasta.R | 86 +++++++++++++++++++++++++++++++---------------
 1 file changed, 58 insertions(+), 28 deletions(-)

diff --git a/bin/bed_to_fasta.R b/bin/bed_to_fasta.R
index 84910f3..8af8516 100644
--- a/bin/bed_to_fasta.R
+++ b/bin/bed_to_fasta.R
@@ -1,28 +1,58 @@
-#!/usr/bin/env Rscript
-
-# Splitting BED-files depending on their cluster.
-# The Sequences of each cluster are writen as an FASTA-file.
-# @parameter bedInput BED-file with sequences and cluster-id as columns: Sequence: Column 7; ID:Column 8
-# @parameter prefix prefix for filenames
-# @parameter min_seq min. number of sequences per cluster
-
-args = commandArgs(trailingOnly = TRUE)
-
-bedInput <- args[1]
-prefix <- args[2]
-min_seq <- args[3]
-
-bed <- data.table::fread(bedInput, header = FALSE, sep = "\t")
-
-clusters <- split(bed, bed$V11, sorted = TRUE, flatten = FALSE) # <---- Cluster column
-discard <- lapply(1:length(clusters), function(i){
-  clust <- as.data.frame(clusters[i])
-  print(nrow(clust))
-  if (nrow(clust) >= as.numeric(min_seq) ) {
-    sequences <- as.list(clust[[10]]) # <---- sequenze column
-    outfile <- paste0(prefix,"_cluster_",i,".FASTA")
-    seqinr::write.fasta(sequences = sequences, names = clust[[4]], file.out = outfile, as.string = TRUE) # <---- Name column
-  } else {
-    print(paste0("Cluster: ",i," is to small"))
-  }
-})
+#!/usr/bin/env Rscript
+
+#' Split a BED file by cluster and write one FASTA file per cluster.
+#'
+#' The cluster id is read from the last column of the BED file and the
+#' sequence from the second to last column; FASTA headers are taken from
+#' column 4. Clusters with fewer than min_seq sequences are skipped.
+#'
+#' @param bedInput tab-separated BED file without header; the second to
+#'   last column holds the sequence, the last column the cluster id
+#' @param prefix prefix for filenames
+#' @param min_seq min. number of sequences per cluster
+#'
+#' @author René Wiegandt
+#' @contact rene.wiegandt(at)mpi-bn.mpg.de
+
+args <- commandArgs(trailingOnly = TRUE)
+if (length(args) < 3) {
+  stop("Usage: bed_to_fasta.R bedInput prefix min_seq", call. = FALSE)
+}
+
+bedInput <- args[1]
+prefix <- args[2]
+# Convert the threshold once, up front, and fail early if it is not a number.
+min_seq <- as.numeric(args[3])
+if (is.na(min_seq)) {
+  stop("min_seq must be numeric, got: ", args[3], call. = FALSE)
+}
+
+bed <- data.table::fread(bedInput, header = FALSE, sep = "\t")
+if (ncol(bed) < 2) {
+  stop("BED input needs a sequence and a cluster column", call. = FALSE)
+}
+
+# Get last column of data.table, which refers to the cluster, as a vector.
+cluster_no <- as.vector(bed[[ncol(bed)]])
+
+clusters <- split(bed, cluster_no, sorted = TRUE, flatten = FALSE)
+
+# seq_along() yields no iterations for an empty list, whereas
+# 1:length(clusters) would run with the indices 1 and 0.
+for (i in seq_along(clusters)) {
+  clust <- clusters[[i]]
+  # Output files are numbered from 0; the log message uses the same number.
+  cluster_id <- i - 1
+  if (nrow(clust) >= min_seq) {
+    sequences <- as.list(clust[[ncol(clust) - 1]]) # sequence column
+    outfile <- paste0(prefix, "_cluster_", cluster_id, ".FASTA")
+    seqinr::write.fasta(
+      sequences = sequences,
+      names = clust[[4]], # name column
+      file.out = outfile,
+      as.string = TRUE
+    )
+  } else {
+    print(paste0("Cluster: ", cluster_id, " is to small"))
+  }
+}