bin/bed_to_fasta.R

#!/usr/bin/env Rscript

# Splits a BED file by cluster.
# The sequences of each cluster are written to a FASTA file.
# @parameter bedInput <string> BED file with the sequence name (column 4), the sequence (column 7) and the cluster id (column 8)
# @parameter prefix <string> prefix for output filenames
# @parameter min_seq <INT> min. number of sequences per cluster

# Positional command-line arguments: <bedInput> <prefix> <min_seq>.
args <- commandArgs(trailingOnly = TRUE)

# Fail early with a usage message: a missing argument would otherwise be NA
# and only surface later as an unrelated-looking error.
if (length(args) < 3L) {
  stop(
    "Usage: bed_to_fasta.R <bedInput> <prefix> <min_seq>",
    call. = FALSE
  )
}

bedInput <- args[1]  # path to the tab-separated BED file
prefix <- args[2]    # prefix of every output file name
min_seq <- args[3]   # minimum cluster size; still a string at this point

# Read the BED table. Without a header row fread() names the columns
# V1, V2, ..., so the columns are addressed by position below.
bed <- data.table::fread(bedInput, header = FALSE, sep = "\t")

# Column layout this script relies on (1-based).
name_col <- 4L     # name used for the FASTA record
seq_col <- 7L      # sequence
cluster_col <- 8L  # cluster id

# Convert the threshold once instead of on every iteration.
min_count <- as.numeric(min_seq)
if (is.na(min_count)) {
  stop("min_seq must be a number, got: ", min_seq, call. = FALSE)
}

if (nrow(bed) == 0L) {
  # Nothing to split; an empty list turns the loop below into a no-op.
  clusters <- list()
} else {
  if (ncol(bed) < cluster_col) {
    stop(
      "Expected at least ", cluster_col, " tab-separated columns in ",
      bedInput, " but found ", ncol(bed),
      call. = FALSE
    )
  }
  # One table per cluster id. Splitting by an external vector orders the
  # groups by the sorted factor levels of that vector.
  clusters <- split(bed, bed[[cluster_col]])
}

# Write one FASTA file per sufficiently large cluster. The number in the file
# name is the position of the cluster in `clusters`, not the cluster id itself.
for (i in seq_along(clusters)) {
  clust <- clusters[[i]]
  print(nrow(clust))
  if (nrow(clust) >= min_count) {
    outfile <- paste0(prefix, "_cluster_", i, ".FASTA")
    seqinr::write.fasta(
      sequences = as.list(clust[[seq_col]]),
      names = clust[[name_col]],
      file.out = outfile,
      as.string = TRUE
    )
  } else {
    print(paste0("Cluster: ", i, " is to small"))
  }
}