diff --git a/README.md b/README.md index e6b69e9..1e4c4be 100644 --- a/README.md +++ b/README.md @@ -86,6 +86,7 @@ Optional arguments: --tomtom_treshold float Threshold for similarity score. (Default: 0.01) Motif clustering: + --cluster_motif BOOLEAN IF its 1 motifs will be clustered (Default: 0) --edge_weight INT Minimum weight of edges in motif-cluster-graph (Default: 5) --motif_similarity_thresh FLOAT threshold for motif similarity score (Default: 0.00001) diff --git a/bin/compareBed.sh b/bin/compareBed.sh index f635283..426f3b4 100644 --- a/bin/compareBed.sh +++ b/bin/compareBed.sh @@ -148,7 +148,7 @@ then # mkdir workdir1337 #fi wo=true - workdir="." + workdir=$path fi if [ $ou == false ] @@ -223,7 +223,7 @@ echo calc maxScore Rscript --vanilla $path/maxScore.R "$workdir"/pass1Fa.bed #3 subtract overlapping regions for additional motifs -echo "also get subsequences with no overlap of overlapping sequences" +#echo "also get subsequences with no overlap of overlapping sequences" help=true if [ -d $"motifs" ] @@ -265,11 +265,13 @@ fi #4. remove short/long motivs, make unique ids (relevant for some splitted tfbs from subtract) and handle maxScorePosition # also creates a small output file with information about the comparison -Rscript --vanilla $path/merge.R $min $max "$workdir" $data - +Rscript --vanilla $path/merge.R $min $max $workdir $data +echo "------------------------------------------------------------" +echo $workdir +head -100 $workdir/merged.bed #5. add fasta sequences to bed and create fasta file -bedtools getfasta -fi $fasta -bed "$workdir"/merged.bed -bedOut > "$output" +bedtools getfasta -fi $fasta -bed $workdir/merged.bed -bedOut > $output bedtools getfasta -name -fi $fasta -bed "$output" -fo "$output".fasta #6 clean up diff --git a/bin/merge.R b/bin/merge.R index c2015aa..0d349b5 100644 --- a/bin/merge.R +++ b/bin/merge.R @@ -22,7 +22,7 @@ splitted$length=splitted$stop - splitted$start splitted=cbind(splitted, containsMaxpos=0) splitted$containsMaxpos[intersect(which(splitted$start <= splitted$maxpos), which(splitted$stop > splitted$maxpos))] = 1 splitted$maxpos = splitted$maxpos - splitted$start -fwrite(splitted, paste(folder, "/merged.bed", sep=''), row.names=FALSE, col.names=FALSE, quote=FALSE, sep='\t') +data.table::fwrite(splitted, paste(folder, "/merged.bed", sep=''), row.names=FALSE, col.names=FALSE, quote=FALSE, sep='\t') before = fread(args[4], header=FALSE) @@ -34,4 +34,3 @@ lengthb = formatC(mean(before$V3-before$V2), digits=4) lengtha = formatC(mean(splitted$length), digits=4) stats=data.frame(sum_nt_input=sumb, sum_nt_filtered=suma, factor=difference, loss=loss, mean_length_input=lengthb, mean_length_filtered=lengtha) write.table(stats, "../FilterMotifs.stats", row.names=FALSE, quote=FALSE, sep='\t') - diff --git a/config/cluster.config b/config/cluster.config index 1b444e2..1ba0b33 100644 --- a/config/cluster.config +++ b/config/cluster.config @@ -9,7 +9,7 @@ params{ global=0 identity=0.8 sequence_coverage=8 - memory=800 + memory=2000 throw_away_seq=9 strand=0 } diff --git a/config/motif_estimation.config b/config/motif_estimation.config index d12e13c..6d9712a 100644 --- a/config/motif_estimation.config +++ b/config/motif_estimation.config @@ -1,6 +1,6 @@ params { //bed_to_clustered_fasta - min_seq = 10 + min_seq = 50 //glam2 motif_min_len = 8 @@ -11,7 +11,8 @@ params { tomtom_treshold = 0.01 //motif Clustering - edge_weight = 5 + cluster_motif = 0 + edge_weight = 50 motif_similarity_thresh = 0.00001 best_motif = 3 } diff --git a/pipeline.nf b/pipeline.nf index 18b6745..4631803 100644 --- a/pipeline.nf +++ b/pipeline.nf @@ -34,7 +34,7 @@ params.tfbs_path="" //motif_estimation //bed_to_clustered_fasta - params.min_seq = 10 // Minimum number of sequences in the fasta-files for glam2 + params.min_seq = 100 // Minimum number of sequences in the fasta-files for glam2 //glam2 params.motif_min_key = 8 // Minimum number of key positions (aligned columns) @@ -45,7 +45,8 @@ params.tfbs_path="" params.tomtom_treshold = 0.01 // threshold for similarity score. //cluster motifs - params.edge_weight = 5 // Minimum weight of edges in motif-cluster-graph + params.cluster_motif = 0 // Boolean if 1 motifs are clustered else they are not + params.edge_weight = 50 // Minimum weight of edges in motif-cluster-graph motif_similarity_thresh = 0.00001 // threshold for motif similarity score params.best_motif = 3 // Top n motifs per cluster @@ -129,8 +130,8 @@ process footprint_extraction { conda "${path_env}" tag{name} - publishDir "${out}/footprint_extraction/", mode: 'copy', pattern: '*.bed' - publishDir "${out}/footprint_extraction/log", mode: 'copy', pattern: '*.log' + publishDir "${params.out}/footprint_extraction/", mode: 'copy', pattern: '*.bed' + publishDir "${params.out}/footprint_extraction/log", mode: 'copy', pattern: '*.log' input: set name, file (bigWig), file (bed) from footprint_in @@ -152,7 +153,7 @@ process extract_known_TFBS { conda "${path_env}" - publishDir "${out}/known_TFBS/", mode: 'copy', pattern: '*.bed' + publishDir "${params.out}/known_TFBS/", mode: 'copy', pattern: '*.bed' input: file (fasta) from fa_overlap @@ -187,28 +188,30 @@ if(params.tfbs_path == "") { */ process overlap_with_known_TFBS { conda "${path_env}" + echo true - publishDir "${out}/unknown_overlap/", mode: 'copy', pattern: '*.bed' + publishDir "${params.out}/unknown_overlap/" input: set name, file (bed_footprints), val (bed_motifs), file (fasta) from for_overlap output: - set name, file ("${name}.bed") into bed_for_reducing - set name, file ("${name}.bed") into testetst + set name, file ("*.bed") into bed_for_reducing + //set name, file ("${name}_un.bed") into testetst script: motif_list = bed_motifs.toString().replaceAll(/\s|\[|\]/,"") """ - ${path_bin}/compareBed.sh --data ${bed_footprints} --motifs ${motif_path} --fasta ${fasta} -o ${name}.bed -min ${params.min_size_fp} -max ${params.max_size_fp} -p ${path_bin} + ${path_bin}/compareBed.sh --data ${bed_footprints} --motifs ${motif_path} --fasta ${fasta} -o ${name}_un.bed -min ${params.min_size_fp} -max ${params.max_size_fp} -p ${path_bin} """ } -testetst.println() +//testetst.println() /* */ process reduce_bed { conda "${path_env}" + echo true input: set name, file (bed) from bed_for_reducing @@ -227,8 +230,9 @@ process reduce_bed { */ process clustering { conda "${path_env}" + echo true - publishDir "${out}/cluster/", mode: 'copy', pattern: '*.bed' + publishDir "${params.out}/cluster/", mode: 'copy', pattern: '*.bed' input: set name, file (bed) from bed_for_clustering @@ -254,9 +258,6 @@ process bed_to_clustered_fasta { input: set name, file (bed) from bed_for_motif_esitmation - when: - params.fasta == false - output: file ('*.FASTA') into fasta_for_glam2 file ('*.FASTA') into fasta_for_motif_cluster @@ -280,16 +281,19 @@ Generating Motifs through alignment and scoring best local matches. process glam2 { tag{name} + publishDir "${params.out}/esimated_motifs/clustered_motifs/${name}/", mode: 'copy' input: set name, file (fasta) from fasta_for_glam2 output: file("${name}/*.meme") into meme_to_merge + set name, file("${name}/*.meme") into meme_for_tomtom + set name, file("${name}/*.meme") into meme_for_filter script: """ - glam2 n ${fasta} -O . -a ${params.motif_min_key} -b ${params.motif_max_key} -z 5 -n ${params.iteration} + glam2 n ${fasta} -O ./${name}/ -a ${params.motif_min_key} -b ${params.motif_max_key} -z 5 -n ${params.iteration} """ } @@ -299,12 +303,17 @@ The paths are sorted numerically depending on the cluster number. */ process merge_meme { + publishDir "${params.out}/esimated_motifs/merged_meme/", mode: 'copy' + input: val (memelist) from meme_to_merge.toList() output: file ('merged_meme.meme') into merged_meme + when: + params.cluster_motif == 1 + script: memes = memelist.collect{it.toString().replaceAll(/\/glam2.meme/,"") } meme_sorted = memes.sort(false) { it.toString().tokenize('_')[-1] as Integer } @@ -321,13 +330,16 @@ Output table has the information which clusters are similar to each other. */ process find_similar_motifs { - + publishDir "${params.out}/esimated_motifs/cluster_similarity/", mode: 'copy' input: file (merged_meme) from merged_meme output: file ('all_to_all.tsv') into motif_similarity + when: + params.cluster_motif == 1 + script: """ tomtom ${merged_meme} ${merged_meme} -thresh ${params.motif_similarity_thresh} -text --norc | sed '/^#/ d' | sed '/^\$/d' > all_to_all.tsv @@ -340,17 +352,21 @@ files_for_merge_fasta = motif_similarity.combine(fasta_for_motif_cluster) process merge_fasta { conda "${path_env}" + publishDir "${params.out}/esimated_motifs/merged_fasta/", mode: 'copy' input: set file (motiv_sim), val (fasta_list) from files_for_merge_fasta output: file ('*.fasta') into motif_clustered_fasta_list - file('*.png') + file('*.png') + + when: + params.cluster_motif == 1 script: - fa_sorted = fasta_list.sort(false) { it.getBaseName().tokenize('_')[-1] as Integer } - fastalist = fa_sorted.toString().replaceAll(/\s|\[|\]/,"") + fa_sorted = fasta_list.sort(false) { it.getBaseName().tokenize('_')[-1] as Integer } + fastalist = fa_sorted.toString().replaceAll(/\s|\[|\]/,"") """ Rscript ${path_bin}/merge_similar_clusters.R ${motiv_sim} ${fastalist} ${params.edge_weight} """ @@ -362,7 +378,7 @@ motif_clustered_fasta_flat = motif_clustered_fasta_list.flatten() process clustered_glam2 { - publishDir "${out}/esimated_motifs/${name}/", mode: 'copy' + publishDir "${params.out}/final_esimated_motifs/${name}/", mode: 'copy' input: file (fasta) from motif_clustered_fasta_flat @@ -372,13 +388,24 @@ process clustered_glam2 { set name, file ('*.meme') into clustered_meme_for_filter file('*') + when: + params.cluster_motif == 1 + script: - name = fasta.getBaseName() + name = fasta.getBaseName() """ glam2 n ${fasta} -O . -a ${params.motif_min_key} -b ${params.motif_max_key} -z 5 -n ${params.iteration} """ } +if(params.cluster_motif == 1){ + for_tomtom = clustered_meme_for_tomtom + for_filter = clustered_meme_for_filter +} else { + for_tomtom = meme_for_tomtom + for_filter = meme_for_filter +} + /* Running Tomtom on meme-files generated by GLAM2. @@ -387,10 +414,10 @@ Tomtom searches motifs in databases. process tomtom { tag{name} - publishDir "${out}/esimated_motifs/tomtom/", mode: 'copy' + publishDir "${params.out}/final_esimated_motifs/tomtom/", mode: 'copy' input: - set name, file (meme) from clustered_meme_for_tomtom + set name, file (meme) from for_tomtom output: set name, file ('*.tsv') into tsv_for_filter @@ -404,8 +431,8 @@ process tomtom { //Joining channels with meme and tsv files. Filter joined channel on line count. //Only meme-files which corresponding tsv files have linecount <= 1 are writen to next channel. -for_filter = clustered_meme_for_filter.join( tsv_for_filter ) -for_filter +for_filter2 = for_filter.join( tsv_for_filter ) +for_filter2 .filter { name, meme, tsv -> long count = tsv.readLines().size() count <= 1 @@ -434,7 +461,7 @@ process check_for_unknown_motifs { process get_best_motif { conda "${path_env}" - publishDir "${out}/esimated_motifs/unknown_motifs/", mode: 'copy' + publishDir "${params.out}/esimated_motifs/unknown_motifs/", mode: 'copy' input: set name, file(meme), file(tsv) from meme_for_scan