Skip to content

Commit

Permalink
bed_to_fasta.R: Improved documentation
Browse files Browse the repository at this point in the history
  • Loading branch information
renewiegandt committed Dec 18, 2018
1 parent cc23250 commit 62f6f3f
Showing 1 changed file with 13 additions and 6 deletions.
19 changes: 13 additions & 6 deletions bin/bed_to_fasta.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,10 @@

#' Splitting BED-files depending on their cluster.
#' The sequences of each cluster are written as a FASTA-file.
#' @parameter bedInput <string> BED-file with sequences and cluster-id as columns: Sequence: Column 7; ID:Column 8
#' @parameter prefix <string> prefix for filenames
#' @parameter min_seq <INT> min. number of sequences per cluster
#' @param bedInput <string> BED-file with sequences and cluster-id as last two columns:
#' Sequence: second last column; Cluster ID: last column
#' @param prefix <string> prefix for filenames
#' @param min_seq <INT> min. number of sequences per cluster
#'
#' @author René Wiegandt
#' @contact rene.wiegandt(at)mpi-bn.mpg.de
Expand All @@ -20,14 +21,20 @@ bed <- data.table::fread(bedInput, header = FALSE, sep = "\t")
# Get last column of data.table, which refers to the cluster, as a vector.
cluster_no <- as.vector(bed[[ncol(bed)]])

clusters <- split(bed, cluster_no, sorted = TRUE, flatten = FALSE) # <---- Cluster column
# Split data.table bed on its last column (cluster_no) into a list of data.frames
clusters <- split(bed, cluster_no, sorted = TRUE, flatten = FALSE)

# For each data.frame(cluster) in list clusters:
discard <- lapply(1:length(clusters), function(i){
clust <- as.data.frame(clusters[i])
# Filter out data.tables (clusters) which are too small
if (nrow(clust) >= as.numeric(min_seq) ) {
sequences <- as.list(clust[[ncol(clust) - 1]]) # <---- sequenze column
# Get second-to-last column, which contains the nucleotide sequences
sequences <- as.list(clust[[ncol(clust) - 1]])
# Create filename
outfile <- paste0(prefix,"_cluster_",i - 1,".FASTA")
seqinr::write.fasta(sequences = sequences, names = clust[[4]], file.out = outfile, as.string = TRUE) # <---- Name column
# Write fasta file
seqinr::write.fasta(sequences = sequences, names = clust[[4]], file.out = outfile, as.string = TRUE)
} else {
print(paste0("Cluster: ",i," is to small"))
}
Expand Down

0 comments on commit 62f6f3f

Please sign in to comment.