From 62f6f3f1e95ad6038567619a3f846203f1e92751 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= Date: Tue, 18 Dec 2018 08:10:13 -0500 Subject: [PATCH] bed_to_fasta.R: Improved documentation --- bin/bed_to_fasta.R | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/bin/bed_to_fasta.R b/bin/bed_to_fasta.R index 8af8516..dce3839 100644 --- a/bin/bed_to_fasta.R +++ b/bin/bed_to_fasta.R @@ -2,9 +2,10 @@ #' Splitting BED-files depending on their cluster. #' The Sequences of each cluster are writen as an FASTA-file. -#' @parameter bedInput BED-file with sequences and cluster-id as columns: Sequence: Column 7; ID:Column 8 -#' @parameter prefix prefix for filenames -#' @parameter min_seq min. number of sequences per cluster +#' @param bedInput BED-file with sequences and cluster-id as last two columns: +#' Sequence: second last column; Cluster ID: last column +#' @param prefix prefix for filenames +#' @param min_seq min. number of sequences per cluster #' #' @author René Wiegandt #' @contact rene.wiegandt(at)mpi-bn.mpg.de @@ -20,14 +21,20 @@ bed <- data.table::fread(bedInput, header = FALSE, sep = "\t") # Get last column of data.table, which refers to the cluster, as a vector. 
cluster_no <- as.vector(bed[[ncol(bed)]]) -clusters <- split(bed, cluster_no, sorted = TRUE, flatten = FALSE) # <---- Cluster column +# Split data.table bed on its last column (cluster_no) into list of data.frames +clusters <- split(bed, cluster_no, sorted = TRUE, flatten = FALSE) +# For each data.frame(cluster) in list clusters: discard <- lapply(1:length(clusters), function(i){ clust <- as.data.frame(clusters[i]) + # Filter data.tables(clusters), which are too small if (nrow(clust) >= as.numeric(min_seq) ) { - sequences <- as.list(clust[[ncol(clust) - 1]]) # <---- sequenze column + # Get second last column, which contains the nucleotide sequences + sequences <- as.list(clust[[ncol(clust) - 1]]) + # Create filename outfile <- paste0(prefix,"_cluster_",i - 1,".FASTA") - seqinr::write.fasta(sequences = sequences, names = clust[[4]], file.out = outfile, as.string = TRUE) # <---- Name column + # Write fasta file + seqinr::write.fasta(sequences = sequences, names = clust[[4]], file.out = outfile, as.string = TRUE) } else { print(paste0("Cluster: ",i," is to small")) }