loosolab · renewiegandt · Jan 23, 2019 · Jan 23, 2019 · Jan 23, 2019 · Jan 23, 2019
diff --git a/README.md b/README.md
@@ -6,104 +6,99 @@ For further information read the [documentation](https://github.molgen.mpg.de/lo
 
 ## Dependencies
 * [conda](https://conda.io/docs/user-guide/install/linux.html)
-* [Nextflow](https://www.nextflow.io/)
-* [MEME-Suite](http://meme-suite.org/doc/install.html?man_type=web)
 
 ## Installation
-1. Start with installing all dependencies listed above (Nextflow, conda, MEME-Suite) and downloading all files from the [GitHub repository](https://github.molgen.mpg.de/loosolab/masterJLU2018).
-2. It is required to set the [environment paths for meme-suite](http://meme-suite.org/doc/install.html?man_type=web#installingtar).
-this can be done with following commands:
-```
-export PATH=[meme-suite instalation path]/libexec/meme-[meme-suite version]:$PATH
-export PATH=[meme-suite instalation path]/bin:$PATH
-```
+1. Start with installing conda and downloading all files from the [GitHub repository](https://github.molgen.mpg.de/loosolab/masterJLU2018).
 
-3. Every other dependency will be automatically installed using conda. For that a conda environment has to be created from the yaml-file given in this repository.
+2. Every other dependency will be automatically installed using conda. For that a conda environment has to be created from the yaml-file given in this repository.
 It is required to create and activate the environment from the yaml-file beforehand.
 This can be done with following commands:
 ```condsole
 conda env create -f masterenv.yml
 conda activate masterenv
 ```
 
-4. Set the wd parameter in the nextflow.config file as path where the repository is saved. For example: '~/masterJLU2018/'.
+3. Set the wd parameter in the nextflow.config file as path where the repository is saved. For example: '~/masterJLU2018/'.
 
 
 **Important Notes:**
-1. For conda the channel bioconda needs to be set as highest priority! This is required due to two different packages with the same name in different channels. For the pipeline the package jellyfish from the channel bioconda is needed and **NOT** the jellyfish package from the channel conda-forge!
+1. For the pipeline the package jellyfish from the channel bioconda is needed and **NOT** the jellyfish package from the channel conda-forge! Please make sure that the right jellyfish package is installed.
 
 
 ## Quick Start
 ```console
-nextflow run pipeline.nf --bigwig [BigWig-file] --bed [BED-file] --genome_fasta [FASTA-file] --motif_db [MEME-file] --organism [mm10|mm9|hg19|hg38]
+nextflow run pipeline.nf --bigwig [BigWig-file] --bed [BED-file] --genome_fasta [FASTA-file] --motif_db [MEME-file] --organism [mm10|mm9|hg19|hg38] --gtf_annotation [GTF-file]
 ```
 
 ### Demo run
 There are files provided inside  ./demo/ for a demo run.
 Go to the main directory and run following command:
 ```
-nextflow run pipeline.nf --bigwig ./demo/buenrostro50k_chr1_fp.bw --bed ./demo/buenrostro50k_chr1_peaks.bed --genome_fasta ./demo/hg38_chr1.fa --motif_db ./demo/jaspar_vertebrates.meme --out ./demo/buenrostro50k_chr1_out/ --organism hg38
+nextflow run pipeline.nf --bigwig ./demo/buenrostro50k_chr1_fp.bw --bed ./demo/buenrostro50k_chr1_peaks.bed --genome_fasta ./demo/hg38_chr1.fa --motif_db ./demo/jaspar_vertebrates.meme --out ./demo/buenrostro50k_chr1_out/ --organism hg38 --gtf_annotation ./demo/homo_sapiens.94.mainChr.gtf
 ```
 
 ## Parameters
-For a detailed overview for all parameters follow this [link](https://github.molgen.mpg.de/loosolab/masterJLU2018/wiki/Configuration).
+For a detailed overview for all parameters follow this [link](https://github.molgen.mpg.de/loosolab/masterJLU2018/wiki/List-of-Parameters).
 ```
 Required arguments:
-	--bigwig		 Path to BigWig-file
-	--bed			 Path to BED-file
-	--genome_fasta		 Path to genome in FASTA-format
-	--motif_db		 Path to motif-database in MEME-format
-	--config		 Path to UROPA configuration file
- 	--organism 		 Input organism [hg38 | hg19 | mm9 | mm10]
-	--out			 Output Directory (Default: './out/')
+	--bigwig			Path to BigWig-file
+	--bed				Path to BED-file
+	--genome_fasta			Path to genome in FASTA-format
+	--motif_db			Path to motif-database in MEME-format
+	--config			Path to UROPA configuration file
+	--gtf_annotation		Path to gtf annotation file
+	--organism 			Input organism [hg38 | hg19 | mm9 | mm10]
+	--out				Output Directory (Default: './out/')
 
 Optional arguments:
 
-	--help [0|1]		1 to show this help message. (Default: 0)
-	--gtf_path		Path to gtf-file. If path is set the process which creates a gtf-file is skipped.
-	--tfbs_path 		Path to directory with output from tfbsscan. If given tfbsscan will be skipped.
+	--help [0|1]			1 to show this help message. (Default: 0)
+	--gtf_merged			Path to gtf-file. If path is set the process which creates a gtf-file is skipped.
+	--tfbs_path 			Path to directory with tfbsscan output. If given tfbsscan will be skipped.
 
 	Footprint extraction:
-	--window_length INT	This parameter sets the length of a sliding window. (Default: 200)
-	--step INT		This parameter sets the number of positions to slide the window forward. (Default: 100)
-	--percentage INT	Threshold in percent (Default: 0)
-	--min_gap INT		If footprints are less than X bases apart the footprints will be merged (Default: 6)
+	--window_length INT		This parameter sets the length of a sliding window. (Default: 200)
+	--step INT			This parameter sets the number of positions to slide the window forward. (Default: 100)
+	--percentage INT		Threshold in percent (Default: 0)
+	--min_gap INT			If footprints are less than X bases apart the footprints will be merged (Default: 6)
 
 	Filter motifs:
-	--min_size_fp INT	Minimum sequence length threshold. Smaller sequences are discarded. (Default: 10)
-	--max_size_fp INT	Maximum sequence length threshold. Discards all sequences longer than this value. (Default: 200)
-	--tfbsscan_method [moods|fimo] Method used by tfbsscan. (Default: moods)
+	--min_size_fp INT		Minimum sequence length threshold. Smaller sequences are discarded. (Default: 10)
+	--max_size_fp INT		Maximum sequence length threshold. Discards all sequences longer than this value. (Default: 200)
+	--tfbsscan_method [moods|fimo] 	Method used by tfbsscan. (Default: moods)
 
 	Cluster:
 	Sequence preparation/ reduction:
-	--kmer INT		K-mer length (Default: 10)
-	--aprox_motif_len INT	Motif length (Default: 10)
-	--motif_occurence FLOAT	Percentage of motifs over all sequences. Use 1 (Default) to assume every sequence contains a motif.
+	--kmer INT			K-mer length (Default: 10)
+	--aprox_motif_len INT		Motif length (Default: 10)
+	--motif_occurrence FLOAT	Percentage of motifs over all sequences. Use 1 (Default) to assume every sequence contains a motif.
 	--min_seq_length Interations	Remove all sequences below this value. (Default: 10)
 	Clustering:
-	--global INT		Global (=1) or local (=0) alignment. (Default: 0)
-	--identity FLOAT	Identity threshold. (Default: 0.8)
-	--sequence_coverage INT	Minimum aligned nucleotides on both sequences. (Default: 8)
-	--memory INT		Memory limit in MB. 0 for unlimited. (Default: 800)
-	--throw_away_seq INT	Remove all sequences equal or below this length before clustering. (Default: 9)
-	--strand INT		Align +/+ & +/- (= 1). Or align only +/+ (= 0). (Default: 0)
+	--global INT			Global (=1) or local (=0) alignment. (Default: 0)
+	--identity FLOAT		Identity threshold. (Default: 0.8)
+	--sequence_coverage INT		Minimum aligned nucleotides on both sequences. (Default: 8)
+	--memory INT			Memory limit in MB. 0 for unlimited. (Default: 800)
+	--throw_away_seq INT		Remove all sequences equal or below this length before clustering. (Default: 9)
+	--strand INT			Align +/+ & +/- (= 1). Or align only +/+ (= 0). (Default: 0)
 
 	Motif estimation:
-	--min_seq INT 		Sets the minimum number of sequences required for the FASTA-files given to GLAM2. (Default: 100)
-	--motif_min_key INT	Minimum number of key positions (aligned columns) in the alignment done by GLAM2. (Default: 8)
-	--motif_max_key INT	Maximum number of key positions (aligned columns) in the alignment done by GLAM2. (Default: 20)
-	--iteration INT		Number of iterations done by GLAM2. More Iterations: better results, higher runtime. (Default: 10000)
-	--tomtom_treshold FLOAT	Threshold for similarity score. (Default: 0.01)
-	--best_motif INT	Get the best X motifs per cluster. (Default: 3)
-	--gap_penalty INT	Set penalty for gaps in GLAM2 (Default: 1000)
+	--min_seq INT 			Sets the minimum number of sequences required for the FASTA-files given to GLAM2. (Default: 100)
+	--motif_min_key INT		Minimum number of key positions (aligned columns) in the alignment done by GLAM2. (Default: 8)
+	--motif_max_key INT		Maximum number of key positions (aligned columns) in the alignment done by GLAM2. (Default: 20)
+	--iteration INT			Number of iterations done by GLAM2. More Iterations: better results, higher runtime. (Default: 10000)
+	--tomtom_treshold FLOAT	T	hreshold for similarity score. (Default: 0.01)
+	--best_motif INT		Get the best X motifs per cluster. (Default: 3)
+	--gap_penalty INT		Set penalty for gaps in GLAM2 (Default: 1000)
+	--seed String			Set seed for GLAM2 (Default: 123456789)
 	Moitf clustering:
-	--cluster_motif	Boolean	If 1 pipeline clusters motifs. If its 0 it does not. (Default: 0)
-	--edge_weight INT	Minimum weight of edges in motif-cluster-graph (Default: 5)
+	--cluster_motif	Boolean		If 1 pipeline clusters motifs. If its 0 it does not. (Defaul: 0)
+	--edge_weight INT		Minimum weight of edges in motif-cluster-graph (Default: 5)
 	--motif_similarity_thresh FLOAT	Threshold for motif similarity score (Default: 0.00001)
 
 	Creating GTF:
-	--tissues List/String 	List of one or more keywords for tissue-/category-activity, categories must be specified as in JSON
-				config
+	--tissues List/String 		List of one or more keywords for tissue-/category-activity, categories must be specified as in JSON config
+	Evaluation:
+	--max_uropa_runs INT	 	Maximum number UROPA runs running parallelized (Default: 10)
 All arguments can be set in the configuration files
  ```
 

diff --git a/bin/1.2_filter_motifs/compareBed.sh b/bin/1.2_filter_motifs/compareBed.sh
@@ -5,10 +5,10 @@
 
 # desciption: This scripts uses bedtools to find new, non overlapping regions
 #             between input data in BED-format and a group of known motifs in
-#             BED-format. 
+#             BED-format.
 
 #             For parameter description, run the script without parameters or -h.
-#             The output is a file with the filtered footprints and the log file compareBed.stats
+#             The output is a file with the filtered footprints and the log file filter_motifs.log
 
 # One R scripts is used, compareBed_runinfo.R, stored in the same directory.
 # Parameter values (file/directory paths) must not contain the | symbol, because it is chosen as "sed"-field separator.
@@ -124,7 +124,7 @@ then
 	echo "              -o   --output       output path/file ; default dir current directory and filename is newMotifs.bed and newMotifs.bed.fasta"
 	echo "              -h --help           shows this help message"
 	echo ""
-	echo " Note: file paths must not contain the '|' pipe symbol" 
+	echo " Note: file paths must not contain the '|' pipe symbol"
 	exit 0
 fi
 
@@ -188,7 +188,7 @@ cat $data | sed 's/[ \t]*$//' > "$workdir"/filtered.bed
 # motiffiles either from a directory OR comma separated list
 if [ -d "$motifs" ]
 then
-	# creates an array of all files with bed in its name in the directory $motifs  
+	# creates an array of all files with bed in its name in the directory $motifs
 	declare -a motiffiles=(`ls $motifs | grep -i '.bed$' | sed "s|^|$motifs\/|g" | tr '\n' ' ' | sed "s|//|/|g"`)
 
 # the else case means, that the motiffiles were passed comma separated with no whitespace.
@@ -264,9 +264,9 @@ fi
 
 # remove short/long motivs, make unique ids (relevant for some splitted tfbs from subtract) and handle maxScorePosition
 # also creates a small output file with information about the comparison
-Rscript $path/compareBed_runinfo.R $min $max $data "$workdir"/filtered.bed "$workdir"/filtered_flagged.bed "$workdir"/compareBed.stats
+Rscript $path/compareBed_runinfo.R $min $max $data "$workdir"/filtered.bed "$workdir"/filtered_flagged.bed "$workdir"/filter_motifs.log
 # check if Rscript executed without errors
-if [ $? -gt 0 ] 
+if [ $? -gt 0 ]
 then
 	exit 1
 fi
@@ -279,15 +279,15 @@ then
 	# add initial number of footprints to the log file
 	fp_initial=`cat $data | wc -l`
 	fp_initial=`expr $fp_initial - 1`
-	echo $fp_initial | sed 's/^/initial number of footprints: /g' >> "$workdir"/compareBed.stats
+	echo $fp_initial | sed 's/^/initial number of footprints: /g' >> "$workdir"/filter_motifs.log
 else
 	# output will be overwritten if it exists
 	rm -f $output
 	# add initial number of footprints to the log file
-	cat $data | wc -l | sed 's/^/initial number of footprints: /g' >> "$workdir"/compareBed.stats
+	cat $data | wc -l | sed 's/^/initial number of footprints: /g' >> "$workdir"/filter_motifs.log
 fi
 # add number of footprints after filtering to the log file
-cat "$workdir"/filtered_flagged.bed | wc -l | sed 's/^/number of footprints after subtract: /g' >> "$workdir"/compareBed.stats
+cat "$workdir"/filtered_flagged.bed | wc -l | sed 's/^/number of footprints after subtract: /g' >> "$workdir"/filter_motifs.log
 
 # add fasta sequences to bed and create fasta file
 out_fasta=`echo $output | sed "s|.bed$|.fasta|g"`

diff --git a/bin/2.2_motif_estimation/get_motif_seq.R → bin/2.2_motif_estimation/get_motif_seq_id.R b/bin/2.2_motif_estimation/get_motif_seq.R → bin/2.2_motif_estimation/get_motif_seq_id.R
@@ -68,7 +68,7 @@ create_seq_json <- function(input, output, num, tmp_path, cluster_id) {
 
   # Create json file
   ## naming
-  names(datalist) <- paste0(c("Motif_", "Motif_", "Motif_"),seq(1,as.numeric(num),1) , " Cluster_", cluster_id)
+  names(datalist) <- paste0("Motif_", seq(1,as.numeric(num),1), " Cluster_", cluster_id)
   ## creating json object
   json <- RJSONIO::toJSON(datalist, pretty = T , .withNames = T)
   ## writing file

diff --git a/bin/2.2_motif_estimation/merge_similar_clusters.R b/bin/2.2_motif_estimation/merge_similar_clusters.R
@@ -1,5 +1,6 @@
 #!/usr/bin/env Rscript
 if (!require(optparse)) install.packages("optparse"); library(optparse)
+if (!require(igraph)) install.packages("igraph"); library(igraph)
 
 option_list <- list(
   make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input TSV-file. Output from merged tomtom results", metavar = "character"),
@@ -40,37 +41,39 @@ merge_similar <- function(tsv_in, file_list, min_weight){
 
   # remove rows if column 1 is identical to column 2
   edgelist <- sim_not_unique[query_cluster != target_cluster]
-
+  unique_edgelist <- unique(edgelist)
+
   # create graph from data.frame
-  g <- igraph::graph_from_edgelist(as.matrix(edgelist))
-  # converting graph to adjacency matrix
-  adj_matrix <- igraph::get.adjacency(g, names = T)
-  # generating weighted graph from adjacency matrix
-  g_adj <- igraph::graph_from_adjacency_matrix(adj_matrix, weighted = T)
+  g <- graph_from_edgelist(as.matrix(unique(edgelist)), directed = F)
+  g_tidy <- delete.vertices(simplify(g), degree(g) == 0)
+  g_tidy <- set_vertex_attr(g_tidy,"name",V(g_tidy), value = (sort(unique(c(unique_edgelist$V1,unique_edgelist$V2)))) - 1)
 
-  # get subgraphs from graph with edges of weight > min_weight
-  s1 <- igraph::subgraph.edges(g_adj, igraph::E(g_adj)[igraph::E(g_adj)$weight > min_weight], del = F)
+  # plotting
   png('motif_clusters.png')
-  plot(s1)
+  plot(g_tidy)
   dev.off()
-  clust <- igraph::clusters(s1)
+  clust <- clusters(g)
   if (clust$no < 1) {
     b <- lapply(files, function(f){
       system(paste("cat",f,">",basename(f)))
     })
   }
+  clust_out <- file("cluster.txt")
   # merge FASTA-files depending on the clustered graphs
-  a <- lapply(seq(from = 1, to = clust$no, by = 1), function(i){
+  a <- vapply(seq(from = 1, to = clust$no, by = 1), function(i){
     #get vector with cluster ids, which are clustered together in "motif cluster" i
     cl <- as.vector(which(clust$membership %in% c(i)))
     #create string, which represents a list, containing all FASTA-files to be merged
     fasta_list <- paste(files[cl], collapse = " ")
     #create name for merged FASTA-file
-    name <- paste0("Cluster_",i,".fasta")
+    name <- paste0("Cluster_",i - 1 ,".fasta")
     #merge fasta files
     system(paste("cat",fasta_list,">",name))
-  })
+    paste("Cluster",i - 1,"is generated by merging cluster:",paste(cl - 1,collapse = ","))
+  }, "")
+  writeLines(a[a != ""], con = clust_out)
 }
+
 
 
 # run function merge_similar with given parameteres if not in interactive context (e.g. run from shell)