From e38a469124e87c8d551cf5574cd7d7858ad6b0fb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= <rene.wiegandt@mpi-bn.mpg.de>
Date: Fri, 4 Jan 2019 11:17:30 -0500
Subject: [PATCH 1/6] Added check for optparse to bed_to_fasta.R

---
 bin/2.2_motif_estimation/bed_to_fasta.R | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/bin/2.2_motif_estimation/bed_to_fasta.R b/bin/2.2_motif_estimation/bed_to_fasta.R
index e0ade14..8006e64 100644
--- a/bin/2.2_motif_estimation/bed_to_fasta.R
+++ b/bin/2.2_motif_estimation/bed_to_fasta.R
@@ -1,5 +1,5 @@
 #!/usr/bin/env Rscript
-library("optparse")
+if (!require(optparse)) install.packages("optparse"); library(optparse)
 
 option_list <- list(
   make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input bed-file. Second last column must be sequences and last column must be the cluster_id.", metavar = "character"),
@@ -7,7 +7,7 @@ option_list <- list(
   make_option(opt_str = c("-m", "--min_seq"), default = 100, help = "Minimum amount of sequences in clusters. Default = %default", metavar = "integer")
 )
 
-opt_parser <- OptionParser(option_list = option_list, 
+opt_parser <- OptionParser(option_list = option_list,
                            description = "Convert BED-file to one FASTA-file per cluster")
 
 opt <- parse_args(opt_parser)
@@ -22,19 +22,19 @@ opt <- parse_args(opt_parser)
 #' @author RenĂŠ Wiegandt
 #' @contact rene.wiegandt(at)mpi-bn.mpg.de
 bed_to_fasta <- function(bedInput, prefix = "", min_seq = 100){
-  
+
   if (is.null(bedInput)) {
     stop("ERROR: Input parameter cannot be null! Please specify the input parameter.")
   }
 
   bed <- data.table::fread(bedInput, sep = "\t")
-  
+
   # Get last column of data.table, which refers to the cluster, as a vector.
   cluster_no <- as.vector(bed[[ncol(bed)]])
-  
+
   # Split data.table bed on its last column (cluster_no) into list of data.frames
   clusters <- split(bed, cluster_no, sorted = TRUE, flatten = FALSE)
-  
+
   # For each data.frame(cluster) in list clusters:
   discard <- lapply(1:length(clusters), function(i){
     clust <- as.data.frame(clusters[i])

From 8b281499f1519abe3f63911095a64121545058d7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= <rene.wiegandt@mpi-bn.mpg.de>
Date: Fri, 4 Jan 2019 15:47:05 -0500
Subject: [PATCH 2/6] improved motif clustering

---
 bin/2.2_motif_estimation/label_cluster.R | 39 ++++++++++
 pipeline.nf                              | 90 +++++++++++++++++++++---
 2 files changed, 119 insertions(+), 10 deletions(-)
 create mode 100644 bin/2.2_motif_estimation/label_cluster.R

diff --git a/bin/2.2_motif_estimation/label_cluster.R b/bin/2.2_motif_estimation/label_cluster.R
new file mode 100644
index 0000000..b00a97a
--- /dev/null
+++ b/bin/2.2_motif_estimation/label_cluster.R
@@ -0,0 +1,39 @@
+#!/usr/bin/env Rscript
+if (!require(optparse)) install.packages("optparse"); library(optparse)
+
+option_list <- list(
+  make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input TSV-file. Output from tomtom", metavar = "character"),
+  make_option(opt_str = c("-o", "--output"), default = NULL, help = "Output TSV-file.", metavar = "character")
+)
+
+opt_parser <- OptionParser(option_list = option_list, 
+                           description = "Adding Cluster ID to Query_ID Column")
+
+opt <- parse_args(opt_parser)
+
+#' Adding Cluster ID to Query_ID Column
+#' 
+#' @param input <string> TSV-file. Output from tomtom.
+#' @param input <string> Output name.
+#'
+#' @author René Wiegandt
+#' @contact rene.wiegandt(at)mpi-bn.mpg.de
+label_cluster <- function(input, output){
+  #Reading TSV-file
+  tsv <- data.table::fread(input, header = T, sep = "\t")
+  
+  #Getting cluster ID/number 
+  splitted_name <- unlist(strsplit(input, "\\_|\\."))
+  cluster_number <- as.numeric(splitted_name[length(splitted_name) - 1]) + 1
+  
+  #Adding cluster ID to first column
+  tsv$Query_ID <- paste(tsv$Query_ID, ".", cluster_number,sep = "")
+  
+  
+  data.table::fwrite(tsv, file = output, sep = "\t", col.names = FALSE)
+}
+
+# run function label_cluster with given parameteres if not in interactive context (e.g. run from shell)
+if (!interactive()) {
+  label_cluster(opt$input, opt$output)
+}
\ No newline at end of file
diff --git a/pipeline.nf b/pipeline.nf
index 1f2f379..b4b7537 100644
--- a/pipeline.nf
+++ b/pipeline.nf
@@ -315,6 +315,7 @@ process clustering {
 Converting BED-File to one FASTA-File per cluster
 */
 process bed_to_clustered_fasta {
+
 	conda "${path_env}"
 	publishDir "${params.out}/2.2_motif_estimation/fasta/", mode: 'copy'
     tag{name}
@@ -345,14 +346,16 @@ Generating Motifs through alignment and scoring best local matches.
 process glam2 {
 
     tag{name}
-	publishDir "${params.out}/2.2_motif_estimation/glam2/${name}/", mode: 'copy'
+	publishDir "${params.out}/2.2_motif_estimation/glam2/", mode: 'copy'
+	conda "${path_env}"
 
     input:
     set name, file (fasta) from fasta_for_glam2
 
-    output:
+	output:
     file("${name}/*.meme") into meme_to_merge
 	set name, file("${name}/*.meme") into meme_for_tomtom
+	set name, file("${name}/*.meme") into meme_to_search_in_merged
 	set name, file("${name}/*.meme") into meme_for_filter
 	file ('*')
 
@@ -364,11 +367,12 @@ process glam2 {
 
 /*
 Combining all MEME-files to one big MEME-file.
-The paths are sorted numerically depending on the cluster number.
+The paths are sorted numerically depending on the cluster ID.
 */
 process merge_meme {
 
 	publishDir "${params.out}/2.2_motif_estimation/cluster_motifs/merged_meme/", mode: 'copy'
+	conda "${path_env}"
 
 	input:
 	val (memelist) from meme_to_merge.toList()
@@ -380,48 +384,104 @@ process merge_meme {
 	params.cluster_motif == 1
 
 	script:
+	//sorting
 	memes = memelist.collect{it.toString().replaceAll(/\/glam2.meme/,"") }
 	meme_sorted = memes.sort(false) { it.toString().tokenize('_')[-1] as Integer }
 	meme_sorted_full = meme_sorted.collect {it.toString() + "/glam2.meme"}
+	//create list for bash
 	meme_list = meme_sorted_full.toString().replaceAll(/\,|\[|\]/,"")
 	"""
 	meme2meme ${meme_list} > merged_meme.meme
 	"""
 }
 
+to_find_similar_motifs = meme_to_search_in_merged.combine(merged_meme)
+
 /*
 Running Tomtom on merged meme-files.
 Output table has the information which clusters are similar to each other.
 */
 process find_similar_motifs {
 
+	tag{name}
 	publishDir "${params.out}/2.2_motif_estimation/cluster_motifs/cluster_similarity/", mode: 'copy'
+	conda "${path_env}"
+
+	input:
+	set name, file (meme) ,file (merged_meme) from to_find_similar_motifs
+
+	output:
+	set name, file ("${name}.tsv") into motif_similarity
+
+	when:
+	params.cluster_motif == 1
+
+	script:
+	"""
+	tomtom ${meme} ${merged_meme} -thresh ${params.motif_similarity_thresh} -text --norc | sed '/^#/ d' | sed '/^\$/d' > ${name}.tsv
+	"""
+}
+
+/*
+Label first column of TSV-file with Cluster ID
+*/
+process label_cluster {
+
+	tag{name}
+	conda "${path_env}"
+
 	input:
-	file (merged_meme) from merged_meme
+	set name, file (tsv) from motif_similarity
 
 	output:
-	file ('all_to_all.tsv') into motif_similarity
+	file ("${name}_labeled.tsv") into labeled_tsv
 
 	when:
 	params.cluster_motif == 1
 
 	script:
 	"""
-	tomtom ${merged_meme} ${merged_meme} -thresh ${params.motif_similarity_thresh} -text --norc | sed '/^#/ d' | sed '/^\$/d' > all_to_all.tsv
+	Rscript ${path_bin}/2.2_motif_estimation/label_cluster.R -i ${tsv} -o ${name}_labeled.tsv
 	"""
 }
 
+/*
+Merging tsv files_for_merge_fasta
+*/
+process merge_labeled_tsv {
+
+	publishDir "${params.out}/esimated_motifs/", mode: 'copy'
+
+	input:
+	val (tsvlist) from labeled_tsv.toSortedList { it.toString().tokenize('_')[-2] as Integer }
+
+	output:
+	file ('*.tsv') into merged_labeled_tsv
+
+	when:
+	params.cluster_motif == 1
+
+	script:
+	tsvs = tsvlist.toString().replaceAll(/\,|\[|\]/,"")
+	"""
+	echo -e 'Query_ID\tTarget_ID\tOptimal_offset\tp-value\tE-value\tq-value\tOverlap\tQuery_consensus\tTarget_consensus\tOrientation'> merged_labeled.tsv
+	for i in $tsvs; do
+		cat \$i >> merged_labeled.tsv
+	done
+	"""
+}
 
-files_for_merge_fasta = motif_similarity.combine(fasta_for_motif_cluster)
+files_for_merge_fasta = merged_labeled_tsv.combine(fasta_for_motif_cluster)
 
 
 /*
 Merging FASTA-files of similar clusters
 */
 process merge_fasta {
+
 	conda "${path_env}"
 	publishDir "${params.out}/2.2_motif_estimation/cluster_motifs/merged_fasta/", mode: 'copy'
-	echo true
+
 	input:
 	set file (motiv_sim), val (fasta_list) from files_for_merge_fasta
 
@@ -443,10 +503,14 @@ process merge_fasta {
 
 motif_clustered_fasta_flat = motif_clustered_fasta_list.flatten()
 
-
+/*
+Run GLAM2 on emrged FASTA-files
+*/
 process clustered_glam2 {
 
 	publishDir "${params.out}/2.2_motif_estimation/cluster_motifs/glam2/${name}/", mode: 'copy'
+	tag{name}
+	conda "${path_env}"
 
 	input:
 	file (fasta) from motif_clustered_fasta_flat
@@ -466,6 +530,10 @@ process clustered_glam2 {
 	"""
 }
 
+/*
+Forward files depending on set parameter
+If motif clustering is activated or not.
+*/
 if(params.cluster_motif == 1){
 	for_tomtom = clustered_meme_for_tomtom
 	for_filter = clustered_meme_for_filter
@@ -483,6 +551,7 @@ process tomtom {
 
     tag{name}
 	publishDir "${params.out}/2.2_motif_estimation/tomtom/", mode: 'copy'
+	conda "${path_env}"
 
     input:
     set name, file (meme) from for_tomtom
@@ -527,8 +596,9 @@ process check_for_unknown_motifs {
 
 //Get the best(first) Motif from each MEME-file
 process get_best_motif {
-	conda "${path_env}"
 
+	conda "${path_env}"
+	tag{name}
 	publishDir "${params.out}/2.2_motif_estimation/best_unknown_motifs/", mode: 'copy'
 
 	input:

From 2064e527e26bb5b4843e2decda5f3ff172e8463f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= <rene.wiegandt@mpi-bn.mpg.de>
Date: Sat, 5 Jan 2019 11:15:16 -0500
Subject: [PATCH 3/6] imporved structure of merge_similar_clusters.R

---
 .../merge_similar_clusters.R                  | 130 ++++++++++--------
 pipeline.nf                                   |   9 +-
 2 files changed, 79 insertions(+), 60 deletions(-)

diff --git a/bin/2.2_motif_estimation/merge_similar_clusters.R b/bin/2.2_motif_estimation/merge_similar_clusters.R
index 03b8cf1..0104d05 100644
--- a/bin/2.2_motif_estimation/merge_similar_clusters.R
+++ b/bin/2.2_motif_estimation/merge_similar_clusters.R
@@ -1,61 +1,77 @@
 #!/usr/bin/env Rscript
+if (!require(optparse)) install.packages("optparse"); library(optparse)
 
-# Merging FASTA-files, which motifs are similar.
-#
-# @parameter tsv_in <string> Path to TSV file generated by Tomtom.
-#                            The input for Tomtom is a from all clusters merged meme-file.
-# @parameter file_list <string> Numerically sorted whitespace separated list of absolute fasta-file paths
-# @parameter min_weight <INT> Minimum weight of edge allowed in graph clusters.
-
-
-args = commandArgs(trailingOnly = TRUE)
-
-tsv_in <- args[1]
-file_list <- args[2]
-min_weight <- args[3]
-
-files <- unlist(as.list(strsplit(file_list, ",")))
-
-# split the string on the character "." in the first to columns and safe the last value each, to get the number of the cluster.
-tsv <- data.table::fread(tsv_in, header = TRUE, sep = "\t",colClasses = 'character')
-query_cluster <- unlist(lapply(strsplit(tsv[["Query_ID"]],"\\."), function(l){
-  tail(l,n=1)
-}))
-target_cluster <- unlist(lapply(strsplit(tsv[["Target_ID"]],"\\."), function(l){
-  tail(l,n=1)
-}))
-
-# create data.table with only the cluster-numbers
-sim_not_unique <- data.table::data.table(query_cluster,target_cluster)
-# convert from character to numeric values
-sim_not_unique[, query_cluster := as.numeric(query_cluster)]
-sim_not_unique[, target_cluster := as.numeric(target_cluster)]
-
-# remove rows if column 1 is idential to column 2
-edgelist <- sim_not_unique[query_cluster != target_cluster]
-
-# create graph from data.frame
-g <- igraph::graph_from_edgelist(as.matrix(edgelist))
-# converting graph to adjacency matrix
-adj_matrix <- igraph::get.adjacency(g, names = T)
-# generating weighted graph from adjacency matrix
-g_adj <- igraph::graph_from_adjacency_matrix(adj_matrix, weighted = T)
-
-# get subgraphs from graph with edges of weight > min_weight
-s1 <- igraph::subgraph.edges(g_adj, igraph::E(g_adj)[igraph::E(g_adj)$weight>min_weight], del=F)
-png('motif_clusters.png')
-plot(s1)
-dev.off()
-clust <- igraph::clusters(s1)
-if (clust$no < 1){
-  b <- lapply(files, function(f){
-    system(paste("cat",f,">",basename(f)))
+option_list <- list(
+  make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input TSV-file. Output from merged tomtom results", metavar = "character"),
+  make_option(opt_str = c("-l", "--list"), default = NULL, help = "Numerically sorted whitespace separated list of absolute fasta-file paths", metavar = "character"),
+  make_option(opt_str = c("-w", "--min"), default = NULL, help = "Minimum weight of edge allowed in graph clusters.", metavar = "character")
+)
+
+opt_parser <- OptionParser(option_list = option_list,
+                           description = "Adding Cluster ID to Query_ID Column")
+
+opt <- parse_args(opt_parser)
+
+#' Merging FASTA-files, which motifs are similar.
+#'
+#' @parameter tsv_in <string> Path to TSV file generated by Tomtom.
+#'                            The input for Tomtom is a from all clusters merged meme-file.
+#' @parameter file_list <string> Numerically sorted whitespace separated list of absolute fasta-file paths
+#' @parameter min_weight <INT> Minimum weight of edge allowed in graph clusters.
+#'
+#' @author RenĂŠ Wiegandt
+#' @contact rene.wiegandt(at)mpi-bn.mpg.de
+merge_similar <- function(tsv_in, file_list, min_weight){
+
+  files <- unlist(as.list(strsplit(file_list, ",")))
+
+  # split the string on the character "." in the first to columns and safe the last value each, to get the number of the cluster.
+  tsv <- data.table::fread(tsv_in, header = TRUE, sep = "\t",colClasses = 'character')
+  query_cluster <- unlist(lapply(strsplit(tsv[["Query_ID"]],"\\."), function(l){
+    tail(l,n=1)
+  }))
+  target_cluster <- unlist(lapply(strsplit(tsv[["Target_ID"]],"\\."), function(l){
+    tail(l,n=1)
+  }))
+
+  # create data.table with only the cluster-numbers
+  sim_not_unique <- data.table::data.table(query_cluster,target_cluster)
+  # convert from character to numeric values
+  sim_not_unique[, query_cluster := as.numeric(query_cluster)]
+  sim_not_unique[, target_cluster := as.numeric(target_cluster)]
+
+  # remove rows if column 1 is idential to column 2
+  edgelist <- sim_not_unique[query_cluster != target_cluster]
+
+  # create graph from data.frame
+  g <- igraph::graph_from_edgelist(as.matrix(edgelist))
+  # converting graph to adjacency matrix
+  adj_matrix <- igraph::get.adjacency(g, names = T)
+  # generating weighted graph from adjacency matrix
+  g_adj <- igraph::graph_from_adjacency_matrix(adj_matrix, weighted = T)
+
+  # get subgraphs from graph with edges of weight > min_weight
+  s1 <- igraph::subgraph.edges(g_adj, igraph::E(g_adj)[igraph::E(g_adj)$weight>min_weight], del=F)
+  png('motif_clusters.png')
+  plot(s1)
+  dev.off()
+  clust <- igraph::clusters(s1)
+  if (clust$no < 1){
+    b <- lapply(files, function(f){
+      system(paste("cat",f,">",basename(f)))
+    })
+  }
+  # merge FASTA-files depending on the clustered graphs
+  a <- lapply(seq(from = 1, to = clust$no, by = 1), function(i){
+    cl <- as.vector(which(clust$membership %in% c(i)))
+    fasta_list <- paste(files[cl], collapse = " ")
+    name <- paste0("Cluster_",i,".fasta")
+    system(paste("cat",fasta_list,">",name))
   })
 }
-# merge FASTA-files depending on the clustered graphs
-a <- lapply(seq(from = 1, to = clust$no, by = 1), function(i){
-  cl <- as.vector(which(clust$membership %in% c(i)))
-  fasta_list <- paste(files[cl], collapse = " ")
-  name <- paste0("Cluster_",i,".fasta")
-  system(paste("cat",fasta_list,">",name))
-})
+
+
+# run function merge_similar with given parameteres if not in interactive context (e.g. run from shell)
+if (!interactive()) {
+  merge_similar(opt$input, opt$list, opt$min)
+}
diff --git a/pipeline.nf b/pipeline.nf
index b4b7537..8358a68 100644
--- a/pipeline.nf
+++ b/pipeline.nf
@@ -251,7 +251,7 @@ if(params.tfbs_path == "") {
 */
 process overlap_with_known_TFBS {
 	conda "${path_env}"
-
+	echo true
 	publishDir "${params.out}/1.2_filter_motifs/compareBed/", mode :'copy'
 
 	input:
@@ -263,7 +263,10 @@ process overlap_with_known_TFBS {
 	script:
 	motif_list = bed_motifs.toString().replaceAll(/\s|\[|\]/,"")
 	"""
-	${path_bin}/1.2_filter_motifs/compareBed.sh --data ${bed_footprints} --motifs ${motif_path} --fasta ${fasta} -o ${name}_unknown.bed -min ${params.min_size_fp} -max ${params.max_size_fp} -p ${path_bin}/1.2/filter_motifs
+	echo ${bed_footprints}
+	echo ${motif_path}
+	echo ${fasta}
+	${path_bin}/1.2_filter_motifs/compareBed.sh --data ${bed_footprints} --motifs ${motif_path} --fasta ${fasta} -o ${name}_unknown.bed -min ${params.min_size_fp} -max ${params.max_size_fp}
 	"""
 }
 
@@ -450,7 +453,7 @@ Merging tsv files_for_merge_fasta
 */
 process merge_labeled_tsv {
 
-	publishDir "${params.out}/esimated_motifs/", mode: 'copy'
+	publishDir "${params.out}/2.2_motif_estimation/cluster_motifs/", mode: 'copy'
 
 	input:
 	val (tsvlist) from labeled_tsv.toSortedList { it.toString().tokenize('_')[-2] as Integer }

From f800e77658e321bfc8e78e238cddc5e573cae464 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= <rene.wiegandt@mpi-bn.mpg.de>
Date: Sun, 6 Jan 2019 05:51:55 -0500
Subject: [PATCH 4/6] bed_to_fasta.R: improved syntax; fixed typos

---
 bin/2.2_motif_estimation/bed_to_fasta.R | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/bin/2.2_motif_estimation/bed_to_fasta.R b/bin/2.2_motif_estimation/bed_to_fasta.R
index 8006e64..7e716b7 100644
--- a/bin/2.2_motif_estimation/bed_to_fasta.R
+++ b/bin/2.2_motif_estimation/bed_to_fasta.R
@@ -2,18 +2,19 @@
 if (!require(optparse)) install.packages("optparse"); library(optparse)
 
 option_list <- list(
-  make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input bed-file. Second last column must be sequences and last column must be the cluster_id.", metavar = "character"),
+  make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input bed-file. Second last column must be sequences and last column must be the cluster id.", metavar = "character"),
   make_option(opt_str = c("-p", "--prefix"), default = "" , help = "Prefix for file names. Default = '%default'", metavar = "character"),
   make_option(opt_str = c("-m", "--min_seq"), default = 100, help = "Minimum amount of sequences in clusters. Default = %default", metavar = "integer")
 )
 
 opt_parser <- OptionParser(option_list = option_list,
-                           description = "Convert BED-file to one FASTA-file per cluster")
+                           description = "Convert BED-file to one FASTA-file per cluster.",
+                           epilogue = "Author: Rene Wiegandt <Rene.Wiegandt@mpi-bn.mpg.de>")
 
 opt <- parse_args(opt_parser)
 
 #' Splitting BED-files depending on their cluster.
-#' The Sequences of each cluster are written as an FASTA-file.
+#' The Sequences of each cluster are written as a FASTA-file.
 #' @param bedInput <string> BED-file with sequences and cluster-id as last two columns:
 #'                              Sequence: second last column; Cluster ID: last column
 #' @param prefix <string> prefix for filenames
@@ -30,13 +31,13 @@ bed_to_fasta <- function(bedInput, prefix = "", min_seq = 100){
   bed <- data.table::fread(bedInput, sep = "\t")
 
   # Get last column of data.table, which refers to the cluster, as a vector.
-  cluster_no <- as.vector(bed[[ncol(bed)]])
+  cluster_no <- bed[[ncol(bed)]]
 
   # Split data.table bed on its last column (cluster_no) into list of data.frames
   clusters <- split(bed, cluster_no, sorted = TRUE, flatten = FALSE)
 
   # For each data.frame(cluster) in list clusters:
-  discard <- lapply(1:length(clusters), function(i){
+  discard <- lapply(seq_len(length(clusters)), function(i){
     clust <- as.data.frame(clusters[i])
     # Filter data.tables(clusters), which are to small
     if (nrow(clust) >= as.numeric(min_seq) ) {

From 7ff588be8322561e249de1982cf25db2e800b8d2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= <rene.wiegandt@mpi-bn.mpg.de>
Date: Sun, 6 Jan 2019 06:04:06 -0500
Subject: [PATCH 5/6] label_cluster.R: improved syntax and documentation; fixed
 typos

---
 bin/2.2_motif_estimation/label_cluster.R | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/bin/2.2_motif_estimation/label_cluster.R b/bin/2.2_motif_estimation/label_cluster.R
index b00a97a..60f1ec8 100644
--- a/bin/2.2_motif_estimation/label_cluster.R
+++ b/bin/2.2_motif_estimation/label_cluster.R
@@ -7,7 +7,8 @@ option_list <- list(
 )
 
 opt_parser <- OptionParser(option_list = option_list, 
-                           description = "Adding Cluster ID to Query_ID Column")
+                           description = "Adding Cluster ID to Query_ID Column.",
+                           epilogue = "Author: Rene Wiegandt <Rene.Wiegandt@mpi-bn.mpg.de>")
 
 opt <- parse_args(opt_parser)
 
@@ -27,7 +28,7 @@ label_cluster <- function(input, output){
   cluster_number <- as.numeric(splitted_name[length(splitted_name) - 1]) + 1
   
   #Adding cluster ID to first column
-  tsv$Query_ID <- paste(tsv$Query_ID, ".", cluster_number,sep = "")
+  tsv$Query_ID <- paste0(tsv$Query_ID, ".", cluster_number)
   
   
   data.table::fwrite(tsv, file = output, sep = "\t", col.names = FALSE)
@@ -36,4 +37,4 @@ label_cluster <- function(input, output){
 # run function label_cluster with given parameteres if not in interactive context (e.g. run from shell)
 if (!interactive()) {
   label_cluster(opt$input, opt$output)
-}
\ No newline at end of file
+}

From 798dd77a5039d55cd5ce8afbca9f2068f7032a8a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= <rene.wiegandt@mpi-bn.mpg.de>
Date: Sun, 6 Jan 2019 06:04:41 -0500
Subject: [PATCH 6/6] merge_similar_clusters.R: improved syntax and
 documentation; fixed typos

---
 .../merge_similar_clusters.R                  | 156 +++++++++---------
 1 file changed, 79 insertions(+), 77 deletions(-)

diff --git a/bin/2.2_motif_estimation/merge_similar_clusters.R b/bin/2.2_motif_estimation/merge_similar_clusters.R
index 0104d05..d9b2373 100644
--- a/bin/2.2_motif_estimation/merge_similar_clusters.R
+++ b/bin/2.2_motif_estimation/merge_similar_clusters.R
@@ -1,77 +1,79 @@
-#!/usr/bin/env Rscript
-if (!require(optparse)) install.packages("optparse"); library(optparse)
-
-option_list <- list(
-  make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input TSV-file. Output from merged tomtom results", metavar = "character"),
-  make_option(opt_str = c("-l", "--list"), default = NULL, help = "Numerically sorted whitespace separated list of absolute fasta-file paths", metavar = "character"),
-  make_option(opt_str = c("-w", "--min"), default = NULL, help = "Minimum weight of edge allowed in graph clusters.", metavar = "character")
-)
-
-opt_parser <- OptionParser(option_list = option_list,
-                           description = "Adding Cluster ID to Query_ID Column")
-
-opt <- parse_args(opt_parser)
-
-#' Merging FASTA-files, which motifs are similar.
-#'
-#' @parameter tsv_in <string> Path to TSV file generated by Tomtom.
-#'                            The input for Tomtom is a from all clusters merged meme-file.
-#' @parameter file_list <string> Numerically sorted whitespace separated list of absolute fasta-file paths
-#' @parameter min_weight <INT> Minimum weight of edge allowed in graph clusters.
-#'
-#' @author RenĂŠ Wiegandt
-#' @contact rene.wiegandt(at)mpi-bn.mpg.de
-merge_similar <- function(tsv_in, file_list, min_weight){
-
-  files <- unlist(as.list(strsplit(file_list, ",")))
-
-  # split the string on the character "." in the first to columns and safe the last value each, to get the number of the cluster.
-  tsv <- data.table::fread(tsv_in, header = TRUE, sep = "\t",colClasses = 'character')
-  query_cluster <- unlist(lapply(strsplit(tsv[["Query_ID"]],"\\."), function(l){
-    tail(l,n=1)
-  }))
-  target_cluster <- unlist(lapply(strsplit(tsv[["Target_ID"]],"\\."), function(l){
-    tail(l,n=1)
-  }))
-
-  # create data.table with only the cluster-numbers
-  sim_not_unique <- data.table::data.table(query_cluster,target_cluster)
-  # convert from character to numeric values
-  sim_not_unique[, query_cluster := as.numeric(query_cluster)]
-  sim_not_unique[, target_cluster := as.numeric(target_cluster)]
-
-  # remove rows if column 1 is idential to column 2
-  edgelist <- sim_not_unique[query_cluster != target_cluster]
-
-  # create graph from data.frame
-  g <- igraph::graph_from_edgelist(as.matrix(edgelist))
-  # converting graph to adjacency matrix
-  adj_matrix <- igraph::get.adjacency(g, names = T)
-  # generating weighted graph from adjacency matrix
-  g_adj <- igraph::graph_from_adjacency_matrix(adj_matrix, weighted = T)
-
-  # get subgraphs from graph with edges of weight > min_weight
-  s1 <- igraph::subgraph.edges(g_adj, igraph::E(g_adj)[igraph::E(g_adj)$weight>min_weight], del=F)
-  png('motif_clusters.png')
-  plot(s1)
-  dev.off()
-  clust <- igraph::clusters(s1)
-  if (clust$no < 1){
-    b <- lapply(files, function(f){
-      system(paste("cat",f,">",basename(f)))
-    })
-  }
-  # merge FASTA-files depending on the clustered graphs
-  a <- lapply(seq(from = 1, to = clust$no, by = 1), function(i){
-    cl <- as.vector(which(clust$membership %in% c(i)))
-    fasta_list <- paste(files[cl], collapse = " ")
-    name <- paste0("Cluster_",i,".fasta")
-    system(paste("cat",fasta_list,">",name))
-  })
-}
-
-
-# run function merge_similar with given parameteres if not in interactive context (e.g. run from shell)
-if (!interactive()) {
-  merge_similar(opt$input, opt$list, opt$min)
-}
+#!/usr/bin/env Rscript
+if (!require(optparse)) install.packages("optparse"); library(optparse)
+
+option_list <- list(
+  make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input TSV-file. Output from merged tomtom results", metavar = "character"),
+  make_option(opt_str = c("-l", "--list"), default = NULL, help = "Numerically sorted whitespace separated list of absolute fasta-file paths", metavar = "character"),
+  make_option(opt_str = c("-w", "--min"), default = NULL, help = "Minimum weight of edge allowed in graph clusters.", metavar = "character")
+)
+
+opt_parser <- OptionParser(option_list = option_list,
+                           description = "Adding Cluster ID to Query_ID Column",
+                           epilogue = "Author: Rene Wiegandt <Rene.Wiegandt@mpi-bn.mpg.de>")
+
+opt <- parse_args(opt_parser)
+
+#' Merging FASTA-files, which motifs are similar.
+#'
+#' @parameter tsv_in <string> Path to TSV file generated by Tomtom.
+#'                            The input for Tomtom is a from all clusters merged meme-file.
+#' @parameter file_list <string> Numerically sorted comma separated list of absolute fasta-file paths
+#' @parameter min_weight <INT> Minimum weight of edge allowed in graph clusters.
+#'
+#' @author RenĂŠ Wiegandt
+#' @contact rene.wiegandt(at)mpi-bn.mpg.de
+merge_similar <- function(tsv_in, file_list, min_weight){
+
+  files <- unlist(strsplit(file_list, ","))
+
+  # split the string on the character "." in the first to columns and safe the last value each, to get the number of the cluster.
+  tsv <- data.table::fread(tsv_in, header = TRUE, sep = "\t",colClasses = 'character')
+  query_cluster <- vapply(strsplit(tsv[["Query_ID"]],"\\."), function(l){
+    tail(l, n = 1)
+  },"")
+  target_cluster <- vapply(strsplit(tsv[["Target_ID"]],"\\."), function(l){
+    tail(l, n = 1)
+  },"")
+
+  # create data.table with only the cluster-numbers
+  sim_not_unique <- data.table::data.table(as.numeric(query_cluster),as.numeric(target_cluster))
+
+  # remove rows if column 1 is identical to column 2
+  edgelist <- sim_not_unique[query_cluster != target_cluster]
+
+  # create graph from data.frame
+  g <- igraph::graph_from_edgelist(as.matrix(edgelist))
+  # converting graph to adjacency matrix
+  adj_matrix <- igraph::get.adjacency(g, names = T)
+  # generating weighted graph from adjacency matrix
+  g_adj <- igraph::graph_from_adjacency_matrix(adj_matrix, weighted = T)
+
+  # get subgraphs from graph with edges of weight > min_weight
+  s1 <- igraph::subgraph.edges(g_adj, igraph::E(g_adj)[igraph::E(g_adj)$weight > min_weight], del = F)
+  png('motif_clusters.png')
+  plot(s1)
+  dev.off()
+  clust <- igraph::clusters(s1)
+  if (clust$no < 1) {
+    b <- lapply(files, function(f){
+      system(paste("cat",f,">",basename(f)))
+    })
+  }
+  # merge FASTA-files depending on the clustered graphs
+  a <- lapply(seq(from = 1, to = clust$no, by = 1), function(i){
+    #get vector with cluster ids, which are clustered together in "motif cluster" i
+    cl <- as.vector(which(clust$membership %in% c(i)))
+    #create string, which represents a list, containing all FASTA-files to be merged
+    fasta_list <- paste(files[cl], collapse = " ")
+    #create name for merged FASTA-file
+    name <- paste0("Cluster_",i,".fasta")
+    #merge fasta files
+    system(paste("cat",fasta_list,">",name))
+  })
+}
+
+
+# run function merge_similar with given parameteres if not in interactive context (e.g. run from shell)
+if (!interactive()) {
+  merge_similar(opt$input, opt$list, opt$min)
+}