From 14390450c3056a6db8b9c8fb11a6586679ee0d6e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= <rene.wiegandt@mpi-bn.mpg.de>
Date: Wed, 9 Jan 2019 09:39:58 -0500
Subject: [PATCH] Added log file for part 2.2_motif_estimation

---
 bin/2.2_motif_estimation/bed_to_fasta.R | 17 +++++++++++++++-
 pipeline.nf                             | 27 ++++++++++++++++++++++---
 2 files changed, 40 insertions(+), 4 deletions(-)

diff --git a/bin/2.2_motif_estimation/bed_to_fasta.R b/bin/2.2_motif_estimation/bed_to_fasta.R
index 7e716b7..7dbb36f 100644
--- a/bin/2.2_motif_estimation/bed_to_fasta.R
+++ b/bin/2.2_motif_estimation/bed_to_fasta.R
@@ -1,5 +1,5 @@
 #!/usr/bin/env Rscript
-if (!require(optparse)) install.packages("optparse"); library(optparse)
+if (!require(optparse, quietly = T)) install.packages("optparse"); library(optparse)
 
 option_list <- list(
   make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input bed-file. Second last column must be sequences and last column must be the cluster id.", metavar = "character"),
@@ -36,6 +36,9 @@ bed_to_fasta <- function(bedInput, prefix = "", min_seq = 100){
   # Split data.table bed on its last column (cluster_no) into list of data.frames
   clusters <- split(bed, cluster_no, sorted = TRUE, flatten = FALSE)
 
+  # Get number of all clusters
+  num_clusters <- length(clusters)
+
   # For each data.frame(cluster) in list clusters:
   discard <- lapply(seq_len(length(clusters)), function(i){
     clust <- as.data.frame(clusters[i])
@@ -47,10 +50,22 @@ bed_to_fasta <- function(bedInput, prefix = "", min_seq = 100){
       outfile <- paste0(prefix,"_cluster_",i - 1,".FASTA")
       # Write fasta file
       seqinr::write.fasta(sequences = sequences, names = clust[[4]], file.out = outfile, as.string = TRUE)
+      return(TRUE)
     } else {
       print(paste0("Cluster: ",i," is to small"))
+      return(FALSE)
     }
   })
+
+  # Get number of clusters which contain enought sequences
+  tmp <- unlist(discard)
+  count_clust <- length(tmp[tmp == FALSE])
+
+  # Write log-file
+  write(paste0( "------------------------------------------------------------\nfile: ",
+                bedInput,"\nNumber of all clusters: ", num_clusters, "\nRemoved small clusters ( < ",
+                min_seq," sequences ): " ,count_clust , "\nRemaining number of clusters: ", num_clusters - count_clust),
+        file = "bed_to_fasta.log", append = T)
 }
 
 # run function bed_to_fasta with given parameteres if not in interactive context (e.g. run from shell)
diff --git a/pipeline.nf b/pipeline.nf
index 4ebab19..48b1455 100644
--- a/pipeline.nf
+++ b/pipeline.nf
@@ -200,6 +200,7 @@ process footprint_extraction {
 
 	output:
 	set name, file ('*.bed') into bed_for_overlap_with_TFBS
+	file('*.log')
 
 	script:
 	"""
@@ -317,7 +318,7 @@ Converting BED-File to one FASTA-File per cluster
 process bed_to_clustered_fasta {
 
 	//conda "${path_env}"
-	publishDir "${params.out}/2.2_motif_estimation/fasta/", mode: 'copy'
+	publishDir "${params.out}/2.2_motif_estimation/fasta/", mode: 'copy', pattern: '*.FASTA'
     tag{name}
 
     input:
@@ -326,6 +327,7 @@ process bed_to_clustered_fasta {
     output:
     file ('*.FASTA') into fasta_for_glam2
     file ('*.FASTA') into fasta_for_motif_cluster
+	file ('*.log') into 22_log
 
     script:
     """
@@ -572,14 +574,33 @@ process tomtom {
 
 //Joining channels with meme and tsv files. Filter joined channel on line count.
 //Only meme-files which corresponding tsv files have linecount <= 1 are writen to next channel.
-for_filter2 = for_filter.join( tsv_for_filter )
+for_filter.join( tsv_for_filter ).into {for_filter2; for_log}
 for_filter2
     .filter { name, meme, tsv ->
         long count = tsv.readLines().size()
         count <= 1
         }
-    .into { meme_for_scan; check }
+    .into { meme_for_scan; check; num_cluster }
 
+count_cluster = num_cluster.count()
+count_cluster_before_filter = for_log.count()
+
+process write_log_for_motif_estimation {
+
+	publishDir "${params.out}/2.2_motif_estimation/log/", mode: 'copy'
+
+	input:
+	file log, val (after_filter), val (before_filter) from 22_log.combine(count_cluster).combine(count_cluster_before_filter = for_log.count())
+
+	output:
+	file ('*.log')
+
+	script:
+	removed = before_filter - after_filter
+	"""
+	printf "\nMotifs found in Database: ${removed}\nNumber of remaining unknown motifs/cluster${after_filter}" >> log
+	"""
+}
 
 //If channel 'check' is empty print errormessage
 process check_for_unknown_motifs {