fixes

loosolab · Dec 14, 2018 · 7aaf90c · 7aaf90c
1 parent 748fc0b
commit 7aaf90c
Showing 6 changed files with 66 additions and 36 deletions.
diff --git a/README.md b/README.md
@@ -86,6 +86,7 @@ Optional arguments:
 	--tomtom_treshold float	Threshold for similarity score. (Default: 0.01)
 
 	Motif clustering:
+  --cluster_motif BOOLEAN IF its 1 motifs will be clustered (Default: 0)
 	--edge_weight INT Minimum weight of edges in motif-cluster-graph (Default: 5)
   --motif_similarity_thresh FLOAT threshold for motif similarity score (Default: 0.00001)
 

diff --git a/bin/compareBed.sh b/bin/compareBed.sh
@@ -148,7 +148,7 @@ then
 	#	mkdir workdir1337
 	#fi
 	wo=true
-	workdir="."
+	workdir=$path
 fi
 
 if [ $ou == false ]
@@ -223,7 +223,7 @@ echo calc maxScore
 Rscript --vanilla $path/maxScore.R "$workdir"/pass1Fa.bed
 
 #3 subtract overlapping regions for additional motifs
-echo "also get subsequences with no overlap of overlapping sequences"
+#echo "also get subsequences with no overlap of overlapping sequences"
 help=true
 
 if [ -d $"motifs" ]
@@ -265,11 +265,13 @@ fi
 #4. remove short/long motivs, make unique ids (relevant for some splitted tfbs from subtract) and handle maxScorePosition
 # also creates a small output file with information about the comparison
 
-Rscript --vanilla $path/merge.R $min $max "$workdir" $data
-
+Rscript --vanilla $path/merge.R $min $max $workdir $data
 
+echo "------------------------------------------------------------"
+echo $workdir
+head -100 $workdir/merged.bed
 #5. add fasta sequences to bed and create fasta file
-bedtools getfasta -fi $fasta -bed "$workdir"/merged.bed -bedOut > "$output"
+bedtools getfasta -fi $fasta -bed $workdir/merged.bed -bedOut > $output
 bedtools getfasta -name -fi $fasta -bed "$output" -fo "$output".fasta
 
 #6 clean up

diff --git a/bin/merge.R b/bin/merge.R
@@ -22,7 +22,7 @@ splitted$length=splitted$stop - splitted$start
 splitted=cbind(splitted, containsMaxpos=0)
 splitted$containsMaxpos[intersect(which(splitted$start <= splitted$maxpos), which(splitted$stop > splitted$maxpos))] = 1
 splitted$maxpos = splitted$maxpos - splitted$start
-fwrite(splitted, paste(folder, "/merged.bed", sep=''), row.names=FALSE, col.names=FALSE, quote=FALSE, sep='\t')
+data.table::fwrite(splitted, paste(folder, "/merged.bed", sep=''), row.names=FALSE, col.names=FALSE, quote=FALSE, sep='\t')
 
 before = fread(args[4], header=FALSE)
 
@@ -34,4 +34,3 @@ lengthb = formatC(mean(before$V3-before$V2), digits=4)
 lengtha = formatC(mean(splitted$length), digits=4)
 stats=data.frame(sum_nt_input=sumb, sum_nt_filtered=suma, factor=difference, loss=loss, mean_length_input=lengthb, mean_length_filtered=lengtha)
 write.table(stats, "../FilterMotifs.stats", row.names=FALSE, quote=FALSE, sep='\t')
-
diff --git a/config/cluster.config b/config/cluster.config
@@ -9,7 +9,7 @@ params{
   global=0
   identity=0.8
   sequence_coverage=8
-  memory=800
+  memory=2000
   throw_away_seq=9
   strand=0
 }
diff --git a/config/motif_estimation.config b/config/motif_estimation.config
@@ -1,6 +1,6 @@
 params {
 	//bed_to_clustered_fasta
-	min_seq = 10
+	min_seq = 50
 
 	//glam2
 	motif_min_len = 8
@@ -11,7 +11,8 @@ params {
 	tomtom_treshold = 0.01
 
 	//motif Clustering
-	edge_weight = 5
+	cluster_motif = 0
+	edge_weight = 50
 	motif_similarity_thresh = 0.00001
 	best_motif = 3
 }
diff --git a/pipeline.nf b/pipeline.nf
@@ -34,7 +34,7 @@ params.tfbs_path=""
 
 //motif_estimation
 	//bed_to_clustered_fasta
-	params.min_seq = 10  // Minimum number of sequences in the fasta-files for glam2
+	params.min_seq = 100  // Minimum number of sequences in the fasta-files for glam2
 
 	//glam2
 	params.motif_min_key = 8 // Minimum number of key positions (aligned columns)
@@ -45,7 +45,8 @@ params.tfbs_path=""
 	params.tomtom_treshold = 0.01 // threshold for similarity score.
 
 	//cluster motifs
-	params.edge_weight = 5 // Minimum weight of edges in motif-cluster-graph
+	params.cluster_motif = 0 // Boolean if 1 motifs are clustered else they are not
+	params.edge_weight = 50 // Minimum weight of edges in motif-cluster-graph
 	motif_similarity_thresh = 0.00001 // threshold for motif similarity score
 
 	params.best_motif = 3 // Top n motifs per cluster
@@ -129,8 +130,8 @@ process footprint_extraction {
 	conda "${path_env}"
 
 	tag{name}
-	publishDir "${out}/footprint_extraction/", mode: 'copy', pattern: '*.bed'
-	publishDir "${out}/footprint_extraction/log", mode: 'copy', pattern: '*.log'
+	publishDir "${params.out}/footprint_extraction/", mode: 'copy', pattern: '*.bed'
+	publishDir "${params.out}/footprint_extraction/log", mode: 'copy', pattern: '*.log'
 
 	input:
 	set name, file (bigWig), file (bed) from footprint_in
@@ -152,7 +153,7 @@ process extract_known_TFBS {
 
 	conda "${path_env}"
 
-	publishDir "${out}/known_TFBS/", mode: 'copy', pattern: '*.bed'
+	publishDir "${params.out}/known_TFBS/", mode: 'copy', pattern: '*.bed'
 
 	input:
 	file (fasta) from fa_overlap
@@ -187,28 +188,30 @@ if(params.tfbs_path == "") {
 */
 process overlap_with_known_TFBS {
 	conda "${path_env}"
+	echo true
 
-	publishDir "${out}/unknown_overlap/", mode: 'copy', pattern: '*.bed'
+	publishDir "${params.out}/unknown_overlap/"
 
 	input:
 	set name, file (bed_footprints), val (bed_motifs), file (fasta) from for_overlap
 
 	output:
-	set name, file ("${name}.bed") into bed_for_reducing
-	set name, file ("${name}.bed") into testetst
+	set name, file ("*.bed") into bed_for_reducing
+	//set name, file ("${name}_un.bed") into testetst
 
 	script:
 	motif_list = bed_motifs.toString().replaceAll(/\s|\[|\]/,"")
 	"""
-	${path_bin}/compareBed.sh --data ${bed_footprints} --motifs ${motif_path} --fasta ${fasta} -o ${name}.bed -min ${params.min_size_fp} -max ${params.max_size_fp} -p ${path_bin}
+	${path_bin}/compareBed.sh --data ${bed_footprints} --motifs ${motif_path} --fasta ${fasta} -o ${name}_un.bed -min ${params.min_size_fp} -max ${params.max_size_fp} -p ${path_bin}
 	"""
 }
-testetst.println()
+//testetst.println()
 
 /*
 */
 process reduce_bed {
 	conda "${path_env}"
+	echo true
 
 	input:
 	set name, file (bed) from bed_for_reducing
@@ -227,8 +230,9 @@ process reduce_bed {
 */
 process clustering {
 	conda "${path_env}"
+	echo true
 
-	publishDir "${out}/cluster/", mode: 'copy', pattern: '*.bed'
+	publishDir "${params.out}/cluster/", mode: 'copy', pattern: '*.bed'
 
 	input:
 	set name, file (bed) from bed_for_clustering
@@ -254,9 +258,6 @@ process bed_to_clustered_fasta {
     input:
     set name, file (bed) from bed_for_motif_esitmation
 
-    when:
-    params.fasta == false
-
     output:
     file ('*.FASTA') into fasta_for_glam2
     file ('*.FASTA') into fasta_for_motif_cluster
@@ -280,16 +281,19 @@ Generating Motifs through alignment and scoring best local matches.
 process glam2 {
 
     tag{name}
+	publishDir "${params.out}/esimated_motifs/clustered_motifs/${name}/", mode: 'copy'
 
     input:
     set name, file (fasta) from fasta_for_glam2
 
     output:
     file("${name}/*.meme") into meme_to_merge
+	set name, file("${name}/*.meme") into meme_for_tomtom
+	set name, file("${name}/*.meme") into meme_for_filter
 
     script:
     """
-    glam2 n ${fasta} -O . -a ${params.motif_min_key} -b ${params.motif_max_key} -z 5 -n ${params.iteration}
+    glam2 n ${fasta} -O ./${name}/ -a ${params.motif_min_key} -b ${params.motif_max_key} -z 5 -n ${params.iteration}
     """
 }
 
@@ -299,12 +303,17 @@ The paths are sorted numerically depending on the cluster number.
 */
 process merge_meme {
 
+	publishDir "${params.out}/esimated_motifs/merged_meme/", mode: 'copy'
+
 	input:
 	val (memelist) from meme_to_merge.toList()
 
 	output:
 	file ('merged_meme.meme') into merged_meme
 
+	when:
+	params.cluster_motif == 1
+
 	script:
 	memes = memelist.collect{it.toString().replaceAll(/\/glam2.meme/,"") }
 	meme_sorted = memes.sort(false) { it.toString().tokenize('_')[-1] as Integer }
@@ -321,13 +330,16 @@ Output table has the information which clusters are similar to each other.
 */
 process find_similar_motifs {
 
-
+	publishDir "${params.out}/esimated_motifs/cluster_similarity/", mode: 'copy'
 	input:
 	file (merged_meme) from merged_meme
 
 	output:
 	file ('all_to_all.tsv') into motif_similarity
 
+	when:
+	params.cluster_motif == 1
+
 	script:
 	"""
 	tomtom ${merged_meme} ${merged_meme} -thresh ${params.motif_similarity_thresh} -text --norc | sed '/^#/ d' | sed '/^\$/d' > all_to_all.tsv
@@ -340,17 +352,21 @@ files_for_merge_fasta = motif_similarity.combine(fasta_for_motif_cluster)
 
 process merge_fasta {
 	conda "${path_env}"
+	publishDir "${params.out}/esimated_motifs/merged_fasta/", mode: 'copy'
 
 	input:
 	set file (motiv_sim), val (fasta_list) from files_for_merge_fasta
 
 	output:
 	file ('*.fasta') into motif_clustered_fasta_list
-  file('*.png')
+  	file('*.png')
+
+  	when:
+  	params.cluster_motif == 1
 
 	script:
-  fa_sorted = fasta_list.sort(false) { it.getBaseName().tokenize('_')[-1] as Integer }
-  fastalist = fa_sorted.toString().replaceAll(/\s|\[|\]/,"")
+  	fa_sorted = fasta_list.sort(false) { it.getBaseName().tokenize('_')[-1] as Integer }
+  	fastalist = fa_sorted.toString().replaceAll(/\s|\[|\]/,"")
 	"""
 	Rscript ${path_bin}/merge_similar_clusters.R ${motiv_sim} ${fastalist} ${params.edge_weight}
 	"""
@@ -362,7 +378,7 @@ motif_clustered_fasta_flat = motif_clustered_fasta_list.flatten()
 
 process clustered_glam2 {
 
-	publishDir "${out}/esimated_motifs/${name}/", mode: 'copy'
+	publishDir "${params.out}/final_esimated_motifs/${name}/", mode: 'copy'
 
 	input:
 	file (fasta) from motif_clustered_fasta_flat
@@ -372,13 +388,24 @@ process clustered_glam2 {
   	set name, file ('*.meme') into clustered_meme_for_filter
   	file('*')
 
+	when:
+	params.cluster_motif == 1
+
 	script:
-  name = fasta.getBaseName()
+  	name = fasta.getBaseName()
 	"""
 	glam2 n ${fasta} -O . -a ${params.motif_min_key} -b ${params.motif_max_key} -z 5 -n ${params.iteration}
 	"""
 }
 
+if(params.cluster_motif == 1){
+	for_tomtom = clustered_meme_for_tomtom
+	for_filter = clustered_meme_for_filter
+} else {
+	for_tomtom = meme_for_tomtom
+	for_filter = meme_for_filter
+}
+
 
 /*
 Running Tomtom on meme-files generated by GLAM2.
@@ -387,10 +414,10 @@ Tomtom searches motifs in databases.
 process tomtom {
 
     tag{name}
-	publishDir "${out}/esimated_motifs/tomtom/", mode: 'copy'
+	publishDir "${params.out}/final_esimated_motifs/tomtom/", mode: 'copy'
 
     input:
-    set name, file (meme) from clustered_meme_for_tomtom
+    set name, file (meme) from for_tomtom
 
     output:
     set name, file ('*.tsv') into tsv_for_filter
@@ -404,8 +431,8 @@ process tomtom {
 
 //Joining channels with meme and tsv files. Filter joined channel on line count.
 //Only meme-files which corresponding tsv files have linecount <= 1 are writen to next channel.
-for_filter = clustered_meme_for_filter.join( tsv_for_filter )
-for_filter
+for_filter2 = for_filter.join( tsv_for_filter )
+for_filter2
     .filter { name, meme, tsv ->
         long count = tsv.readLines().size()
         count <= 1
@@ -434,7 +461,7 @@ process check_for_unknown_motifs {
 process get_best_motif {
 	conda "${path_env}"
 
-	publishDir "${out}/esimated_motifs/unknown_motifs/", mode: 'copy'
+	publishDir "${params.out}/esimated_motifs/unknown_motifs/", mode: 'copy'
 
 	input:
 	set name, file(meme), file(tsv) from meme_for_scan