loosolab · renewiegandt · Dec 14, 2018 · Dec 14, 2018 · Dec 14, 2018 · Dec 14, 2018
diff --git a/README.md b/README.md
@@ -47,57 +47,59 @@ nextflow run pipeline.nf --bigwig [BigWig-file] --bed [BED-file] --genome_fasta
 For a detailed overview for all parameters follow this [link](https://github.molgen.mpg.de/loosolab/masterJLU2018/wiki/Configuration).
 ```
 Required arguments:
-	--bigwig Path to BigWig-file with scores on the peaks of interest
-	--bed Path to BED-file with peaks of interest corresponding to the BigWig file
-	--genome_fasta Path to genome in FASTA-format
-	--motif_db Path to motif-database in MEME-format
-
-
+	--bigwig		 Path to BigWig-file
+	--bed			 Path to BED-file
+	--genome_fasta		 Path to genome in FASTA-format
+	--motif_db		 Path to motif-database in MEME-format
+	--config		 Path to UROPA configuration file
+	--create_known_tfbs_path Path to directory where output from tfbsscan (known motifs) is stored.
+				 Path can be set as tfbs_path in next run. (Default: './')
 Optional arguments:
 
-  	--tfbs_path Path to directory with output BED-files from tfbsscan. If given tfbsscan will not be run.
-
+	--tfbs_path Path to directory with output from tfbsscan. If given tfbsscan will not be run.
+	
 	Footprint extraction:
-	--window_length INT (Default: 200) a length of a window
-	--step INT (Default: 100) an interval to slide the window
-	--percentage INT(Default: 0) a percentage to be added to background while searching for footprints
-
+	--window_length INT	(Default: 200)
+	--step INT		(Default: 100)
+	--percentage INT	(Default: 0)
+	
 	Filter unknown motifs:
-	--min_size_fp INT (Default: 10)
-	--max_size_fp INT (Default: 100)
-
-	Cluster:
+	--min_size_fp INT	(Default: 10)
+	--max_size_fp INT	(Default: 100)
+	
+	Clustering:
 	Sequence preparation/ reduction:
-	--kmer INT Kmer length (Default: 10)
-	--aprox_motif_len INT Motif length (Default: 10)
-	--motif_occurence FLOAT Percentage of motifs over all sequences. Use 1 (Default) to assume every sequence contains a motif.
-	--min_seq_length INT Remove all sequences below this value. (Default: 10)
-
+	--kmer INT		Kmer length (Default: 10)
+	--aprox_motif_len INT	Motif length (Default: 10)
+	--motif_occurence FLOAT	Percentage of motifs over all sequences. Use 1 (Default) to assume every sequence contains a motif.
+	--min_seq_length INT	Remove all sequences below this value. (Default: 10)
+	
 	Clustering:
-	--global INT Global (=1) or local (=0) alignment. (Default: 0)
-	--identity FLOAT Identity threshold. (Default: 0.8)
-	--sequence_coverage INT Minimum aligned nucleotides on both sequences. (Default: 8)
-	--memory INT Memory limit in MB. 0 for unlimited. (Default: 800)
-	--throw_away_seq INT Remove all sequences equal or below this length before clustering. (Default: 9)
-	--strand INT Align +/+ & +/- (= 1). Or align only +/+ (= 0). (Default: 0)
-
+	--global INT		Global (=1) or local (=0) alignment. (Default: 0)
+	--identity FLOAT	Identity threshold. (Default: 0.8)
+	--sequence_coverage INT	Minimum aligned nucleotides on both sequences. (Default: 8)
+	--memory INT		Memory limit in MB. 0 for unlimited. (Default: 800)
+	--throw_away_seq INT	Remove all sequences equal or below this length before clustering. (Default: 9)
+	--strand INT		Align +/+ & +/- (= 1). Or align only +/+ (= 0). (Default: 0)
+	
 	Motif estimation:
-	--min_seq INT Minimum number of sequences required in the FASTA-files for GLAM2 (Default: 100)
-	--motif_min_key INT	Maximum number of key positions (aligned columns) (Default: 8)
-	--motif_max_key INT	Maximum number of key positions (aligned columns) (Default: 20)
-	--iteration INT	Number of iterations done by glam2. More Iterations: better results, higher runtime. (Default: 10000)
+	--min_seq INT 		Sets the minimum number of sequences required for the FASTA-files given to GLAM2. (Default: 100)
+	--motif_min_key INT	Minimum number of key positions (aligned columns) in the alignment done by GLAM2. (Default: 8)
+	--motif_max_key INT	Maximum number of key positions (aligned columns) in the alignment done by GLAM2. (Default: 20)
+	--iteration INT		Number of iterations done by glam2. More Iterations: better results, higher runtime. (Default: 10000)
 	--tomtom_treshold float	Threshold for similarity score. (Default: 0.01)
-
-	Motif clustering:
-  	--cluster_motif BOOLEAN IF its 1 motifs will be clustered (Default: 0)
-	--edge_weight INT Minimum weight of edges in motif-cluster-graph (Default: 5)
-  	--motif_similarity_thresh FLOAT threshold for motif similarity score (Default: 0.00001)
-
-  	Creating GTF:
- 	--tissue STRING Filter for one or more tissue/category activity, categories as in JSON config (Default: None)
-  	--organism STRING Source organism: [ hg19 | hg38 or mm9 | mm10 ] (Default: hg38)
-
-All arguments can be set in the configuration files.
+	--best_motif INT	Get the best X motifs per cluster. (Default: 3)
+
+	Motif clustering:
+	--cluster_motif	BOOLEAN	If 1, the pipeline clusters motifs; if 0, it does not. (Default: 0)
+	--edge_weight INT	Minimum weight of edges in motif-cluster-graph (Default: 5)
+	--motif_similarity_thresh FLOAT	Threshold for motif similarity score (Default: 0.00001)
+
+	Creating GTF:
+	--organism [hg38 | hg19 | mm9 | mm10]	Input organism
+	--tissues List/String 	List of one or more keywords for tissue-/category-activity, categories must be specified as in JSON
+				config
+All arguments can be set in the configuration files.
  ```
 
 

diff --git a/bin/bed_to_fasta.R b/bin/bed_to_fasta.R
@@ -9,17 +9,17 @@
 args = commandArgs(trailingOnly = TRUE)
 
 bedInput <- args[1]
-prefix <- args[2] 
-min_seq <- args[3] 
+prefix <- args[2]
+min_seq <- args[3]
 
 bed <- data.table::fread(bedInput, header = FALSE, sep = "\t")
 
-clusters <- split(bed, bed$V8, sorted = TRUE, flatten = FALSE) # <---- Spalte mit Cluster
+clusters <- split(bed, bed$V11, sorted = TRUE, flatten = FALSE) # <---- Spalte mit Cluster
 discard <- lapply(1:length(clusters), function(i){
   clust <- as.data.frame(clusters[i])
   print(nrow(clust))
   if (nrow(clust) >= as.numeric(min_seq) ) {
-    sequences <- as.list(clust[[7]])   # <---- Splate mit Sequenz
+    sequences <- as.list(clust[[10]])   # <---- Splate mit Sequenz
     outfile <- paste0(prefix,"_cluster_",i,".FASTA")
     seqinr::write.fasta(sequences = sequences, names = clust[[4]], file.out = outfile, as.string = TRUE) # <---- Spalte mit Name
   } else {