diff --git a/bin/bed_to_fasta.R b/bin/bed_to_fasta.R
index 6767ab5..09e3f4f 100644
--- a/bin/bed_to_fasta.R
+++ b/bin/bed_to_fasta.R
@@ -4,18 +4,32 @@
 # The Sequences of each cluster are writen as an FASTA-file.
 # @parameter bedInput BED-file with sequences and cluster-id as column"TEs
 # @parameter prefix prefix for filenames
+# @parameter min_seq min. number of sequences per cluster
 
 args = commandArgs(trailingOnly = TRUE)
 
 bedInput <- args[1]
-prefix <- args[2] # "Fasta"
+prefix <- args[2]
+# Parse the threshold once and fail early with a clear message when it is
+# missing or not numeric, instead of erroring inside the per-cluster loop.
+min_seq <- suppressWarnings(as.numeric(args[3]))
+if (is.na(min_seq)) {
+  stop("min_seq (3rd argument) must be a number", call. = FALSE)
+}
 
 bed <- data.table::fread(bedInput, header = FALSE, sep = "\t")
 
-clusters <- split(bed, bed$V3, sorted = TRUE, flatten = FALSE) # <---- Spalte mit Cluster
-discard <- lapply(1:length(clusters), function(i){
-  sequences <- as.list(as.data.frame(clusters[i])[[2]]) # <---- Splate mit Sequenz
-  outfile <- paste0(prefix,"_cluster_",i,".FASTA")
-  seqinr::write.fasta(sequences = sequences, names = as.data.frame(clusters[i])[[1]]
-                      , file.out = outfile, as.string = TRUE) # <---- Spalte mit Name
+clusters <- split(bed, bed$V8, sorted = TRUE, flatten = FALSE) # <---- column with the cluster id
+# seq_along() runs zero times for an empty list (1:length() would count 1, 0).
+discard <- lapply(seq_along(clusters), function(i) {
+  clust <- clusters[[i]] # rows belonging to the i-th cluster
+  print(nrow(clust))
+  if (nrow(clust) >= min_seq) {
+    sequences <- as.list(clust[[7]]) # <---- column with the sequence
+    outfile <- paste0(prefix, "_cluster_", i, ".FASTA")
+    seqinr::write.fasta(sequences = sequences, names = clust[[4]],
+                        file.out = outfile, as.string = TRUE) # <---- column with the name
+  } else {
+    print(paste0("Cluster: ", i, " is to small"))
+  }
 })