loosolab · renewiegandt · Dec 17, 2018 · Dec 15, 2018 · Dec 15, 2018 · Dec 15, 2018
diff --git a/README.md b/README.md
@@ -41,7 +41,7 @@ When the enviroment is created, set the variable 'path_env' in the configuration
 
 ## Quick Start
 ```console
-nextflow run pipeline.nf --bigwig [BigWig-file] --bed [BED-file] --genome_fasta [FASTA-file] --motif_db [MEME-file]
+nextflow run pipeline.nf --bigwig [BigWig-file] --bed [BED-file] --genome_fasta [FASTA-file] --motif_db [MEME-file] --config [UROPA-config-file]
 ```
 ## Parameters
 For a detailed overview for all parameters follow this [link](https://github.molgen.mpg.de/loosolab/masterJLU2018/wiki/Configuration).
@@ -54,47 +54,50 @@ Required arguments:
 	--config		 Path to UROPA configuration file
 	--create_known_tfbs_path Path to directory where output from tfbsscan (known motifs) are stored.
 				 Path can be set as tfbs_path in next run. (Default: './')
+	--out			 Output Directory (Default: './out/')
+
 Optional arguments:
-
-	--tfbs_path Path to directory with output from tfbsscan. If given tfbsscan will not be run.
 
+	--help [0|1]		1 to show this help message. (Default: 0)
+	--tfbs_path 		Path to directory with output from tfbsscan. If given tfbsscan will not be run.
+
 	Footprint extraction:
-	--window_length INT	(Default: 200)
-	--step INT		(Default: 100)
-	--percentage INT	(Default: 0)
-	
+	--window_length INT	This parameter sets the length of a sliding window. (Default: 200)
+	--step INT		This parameter sets the number of positions to slide the window forward. (Default: 100)
+	--percentage INT	Threshold in percent (Default: 0)
+
 	Filter unknown motifs:
-	--min_size_fp INT	(Default: 10)
-	--max_size_fp INT	(Default: 100)
-	
+	--min_size_fp INT	Minimum sequence length threshold. Smaller sequences are discarded. (Default: 10)
+	--max_size_fp INT	Maximum sequence length threshold. Discards all sequences longer than this value. (Default: 100)
+
 	Clustering:
 	Sequence preparation/ reduction:
 	--kmer INT		Kmer length (Default: 10)
 	--aprox_motif_len INT	Motif length (Default: 10)
 	--motif_occurence FLOAT	Percentage of motifs over all sequences. Use 1 (Default) to assume every sequence contains a motif.
 	--min_seq_length Interations	Remove all sequences below this value. (Default: 10)
-	
+
 	Clustering:
 	--global INT		Global (=1) or local (=0) alignment. (Default: 0)
 	--identity FLOAT	Identity threshold. (Default: 0.8)
 	--sequence_coverage INT	Minimum aligned nucleotides on both sequences. (Default: 8)
 	--memory INT		Memory limit in MB. 0 for unlimited. (Default: 800)
 	--throw_away_seq INT	Remove all sequences equal or below this length before clustering. (Default: 9)
 	--strand INT		Align +/+ & +/- (= 1). Or align only +/+ (= 0). (Default: 0)
-	
+
 	Motif estimation:
 	--min_seq INT 		Sets the minimum number of sequences required for the FASTA-files given to GLAM2. (Default: 100)
 	--motif_min_key INT	Minimum number of key positions (aligned columns) in the alignment done by GLAM2. (Default: 8)
 	--motif_max_key INT	Maximum number of key positions (aligned columns) in the alignment done by GLAM2.f (Default: 20)
 	--iteration INT		Number of iterations done by glam2. More Iterations: better results, higher runtime. (Default: 10000)
 	--tomtom_treshold float	Threshold for similarity score. (Default: 0.01)
 	--best_motif INT	Get the best X motifs per cluster. (Default: 3)
-	
+
 	Moitf clustering:
 	--cluster_motif	Boolean	If 1 pipeline clusters motifs. If its 0 it does not. (Defaul: 0)
 	--edge_weight INT	Minimum weight of edges in motif-cluster-graph (Default: 5)
 	--motif_similarity_thresh FLOAT	Threshold for motif similarity score (Default: 0.00001)
-	
+
 	Creating GTF:
 	--organism [hg38 | hg19 | mm9 | mm10]	Input organism
 	--tissues List/String 	List of one or more keywords for tissue-/category-activity, categories must be specified as in JSON

diff --git a/bin/bed_to_fasta.R b/bin/bed_to_fasta.R
@@ -14,14 +14,14 @@ min_seq <- args[3]
 
 bed <- data.table::fread(bedInput, header = FALSE, sep = "\t")
 
-clusters <- split(bed, bed$V11, sorted = TRUE, flatten = FALSE) # <---- Spalte mit Cluster
+clusters <- split(bed, bed$V11, sorted = TRUE, flatten = FALSE) # <---- Cluster column
 discard <- lapply(1:length(clusters), function(i){
   clust <- as.data.frame(clusters[i])
   print(nrow(clust))
   if (nrow(clust) >= as.numeric(min_seq) ) {
-    sequences <- as.list(clust[[10]])   # <---- Splate mit Sequenz
+    sequences <- as.list(clust[[10]])   # <---- Sequence column
     outfile <- paste0(prefix,"_cluster_",i,".FASTA")
-    seqinr::write.fasta(sequences = sequences, names = clust[[4]], file.out = outfile, as.string = TRUE) # <---- Spalte mit Name
+    seqinr::write.fasta(sequences = sequences, names = clust[[4]], file.out = outfile, as.string = TRUE) # <---- Name column
   } else {
     print(paste0("Cluster: ",i," is to small"))
   }

diff --git a/bin/merge_similar_clusters.R b/bin/merge_similar_clusters.R
@@ -1,7 +1,7 @@
 #!/usr/bin/env Rscript
 
 # Merging FASTA-files, which motifs are similar.
-# 
+#
 # @parameter tsv_in <string> Path to TSV file generated by Tomtom.
 #                            The input for Tomtom is a from all clusters merged meme-file.
 # @parameter file_list <string> Numerically sorted whitespace separated list of absolute fasta-file paths
@@ -10,7 +10,7 @@
 
 args = commandArgs(trailingOnly = TRUE)
 
-tsv_in <- args[1] 
+tsv_in <- args[1]
 file_list <- args[2]
 min_weight <- args[3]
 
@@ -38,7 +38,7 @@ edgelist <- sim_not_unique[query_cluster != target_cluster]
 g <- igraph::graph_from_edgelist(as.matrix(edgelist))
 # converting graph to adjacency matrix
 adj_matrix <- igraph::get.adjacency(g, names = T)
-# generating weighted graph from adjacency matrix 
+# generating weighted graph from adjacency matrix
 g_adj <- igraph::graph_from_adjacency_matrix(adj_matrix, weighted = T)
 
 # get subgraphs from graph with edges of weight > min_weight
@@ -47,7 +47,11 @@ png('motif_clusters.png')
 plot(s1)
 dev.off()
 clust <- igraph::clusters(s1)
-
+if (clust$no < 1){ # the graph yielded no clusters, so there is nothing to merge
+  b <- lapply(files, function(f){ # write each path in `files` to the working directory under its base name
+    system(paste("cat", shQuote(f), ">", shQuote(basename(f)))) # shQuote: the shell must treat the paths literally
+  })
+}
 # merge FASTA-files depending on the clustered graphs
 a <- lapply(seq(from = 1, to = clust$no, by = 1), function(i){
   cl <- as.vector(which(clust$membership %in% c(i)))