Skip to content
Permalink
b7c80c8559
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
283 lines (215 sloc) 7.78 KB
// ---------------------------------------------------------------------------
// Pipeline parameters. Every params.<name> can be overridden on the command
// line with --<name> <value>.
// ---------------------------------------------------------------------------

// General
params.bed = ""       // input BED-file (read via Channel.fromPath)
params.out = ""       // root directory used by all publishDir directives
params.motif_db = ""  // motif database handed to tomtom
params.path_env = ""  // value of the 'conda' directive of the R/Python processes
params.help = 0       // 1 prints the usage text and exits
params.path_bin = "." // directory containing the helper scripts (bed_to_fasta.R, ...)

//bed_to_clustered_fasta
params.min_seq = 100 // Minimum number of sequences in the fasta-files for glam2

//glam2
params.motif_min_key = 8 // Minimum number of key positions (aligned columns)
params.motif_max_key = 20 // Maximum number of key positions (aligned columns)
params.iteration = 10000 // Number of iterations done by glam2. A higher number gives a more accurate result at the cost of a longer runtime.

//tomtom
params.tomtom_treshold = 0.01 // threshold for similarity score.
// Number of best motifs kept per cluster. It is read by get_best_motif and
// documented in the usage text with default 3, but had no default defined.
params.best_motif = 3

//cluster motifs
params.cluster_motif = 0 // Boolean if 1 motifs are clustered else they are not
params.edge_weight = 50 // Minimum weight of edges in motif-cluster-graph
// The processes read this value as params.motif_similarity_thresh, so it has
// to live in the params namespace (a bare script variable is neither visible
// there nor overridable from the command line).
params.motif_similarity_thresh = 0.00001 // threshold for motif similarity score
// Bare alias kept so that any code still using the old variable name keeps working.
motif_similarity_thresh = params.motif_similarity_thresh
// Print the usage text and abort if a mandatory parameter (--bed, --out,
// --motif_db, --path_env) is missing or if help was requested; otherwise open
// the input channel.
// NOTE: --path_env has to be compared against the empty default like the other
// mandatory parameters. Testing it for truthiness aborted the run exactly when
// a conda environment file WAS supplied.
if (params.bed == "" || params.out == "" || params.motif_db == "" || params.path_env == "" || "${params.help}" == "1") {
    log.info """
Usage: nextflow run motif_estimation.nf --bed [PATH] --out [PATH] --motif_db [PATH] --path_env YAML-file
--bed Path Input BED-file (Column with cluster: 11; column with sequenze: 10)!!
--out Path Output Path
--motif_db Path Path to MEME-Database
--path_env Path Path to YAML file for conda enviroment.
--path_bin Path Path to directory with subscripts
--help 0|1 Print help message if help == 1 (Default: 0)
Motif estimation:
--min_seq INT Sets the minimum number of sequences required for the FASTA-files given to GLAM2. (Default: 100)
--motif_min_key INT Minimum number of key positions (aligned columns) in the alignment done by GLAM2. (Default: 8)
--motif_max_key INT Maximum number of key positions (aligned columns) in the alignment done by GLAM2.f (Default: 20)
--iteration INT Number of iterations done by glam2. More Iterations: better results, higher runtime. (Default: 10000)
--tomtom_treshold float Threshold for similarity score. (Default: 0.01)
--best_motif INT Get the best X motifs per cluster. (Default: 3)
Moitf clustering:
--cluster_motif Boolean If 1 pipeline clusters motifs. If its 0 it does not. (Defaul: 0)
--edge_weight INT Minimum weight of edges in motif-cluster-graph (Default: 50)
--motif_similarity_thresh FLOAT Threshold for motif similarity score (Default: 0.00001)
"""
    System.exit(2)
} else {
    // Emit the BED-file as a [simple name, file] pair; the name is used as tag
    // and as argument of bed_to_fasta.R in the first process.
    Channel.fromPath(params.bed).map { it -> [it.simpleName, it] }.set { bed_for_motif_esitmation }
}
/*
Converts the input BED-file into FASTA-files (one per cluster).
bed_to_fasta.R is called with the BED-file, its simple name and params.min_seq.
*/
process bed_to_clustered_fasta {
    tag { name }
    conda "${params.path_env}"
    publishDir "${params.out}/esimated_motifs/clustered_motifs/clustered_fasta/", mode: 'copy'

    input:
    set name, file(bed) from bed_for_motif_esitmation

    output:
    // The same FASTA-files are emitted twice: once for the per-cluster glam2
    // run and once for the optional merging of similar clusters.
    file('*.FASTA') into fasta_for_glam2
    file('*.FASTA') into fasta_for_motif_cluster

    script:
    """
Rscript ${params.path_bin}/bed_to_fasta.R ${bed} ${name} ${params.min_seq}
"""
}
// Emit every FASTA-file on its own, paired with its simple name: [name, file]
fasta_for_glam2 = fasta_for_glam2
    .flatten()
    .map { fasta_file -> [fasta_file.simpleName, fasta_file] }

// Collect everything emitted on the cluster channel into one single list item
fasta_for_motif_cluster = fasta_for_motif_cluster.toList()
/*
Runs GLAM2 once per FASTA-file.
Motifs are generated through alignment and scoring of the best local matches.
The number of key positions is bounded by params.motif_min_key (-a) and
params.motif_max_key (-b); params.iteration (-n) sets the number of iterations.
*/
process glam2 {
    publishDir "${params.out}/esimated_motifs/clustered_motifs/${name}/", mode: 'copy'
    tag { name }

    input:
    set name, file(fasta) from fasta_for_glam2

    output:
    // The MEME output is needed three times: for merging, for the database
    // search with tomtom and for the filter step that follows tomtom.
    file("${name}/*.meme") into meme_to_merge
    set name, file("${name}/*.meme") into meme_for_tomtom
    set name, file("${name}/*.meme") into meme_for_filter
    // Everything else is declared only so that it gets published.
    file('*')

    script:
    """
glam2 n ${fasta} -O ./${name}/ -a ${params.motif_min_key} -b ${params.motif_max_key} -z 5 -n ${params.iteration}
"""
}
/*
Merges the MEME-files of all clusters into one MEME-file (merged_meme.meme).
The files are ordered numerically by the number that follows the last '_' of
their directory path. Only runs if params.cluster_motif == 1.
*/
process merge_meme {
    publishDir "${params.out}/esimated_motifs/merged_meme/", mode: 'copy'

    input:
    val(memelist) from meme_to_merge.toList()

    output:
    file('merged_meme.meme') into merged_meme

    when:
    params.cluster_motif == 1

    script:
    // Cut off the file name so that the cluster number ends each path ...
    def cluster_dirs = memelist.collect { meme_path -> meme_path.toString().replaceAll(/\/glam2.meme/,"") }
    // ... order a copy of the list by that number ...
    def dirs_in_order = cluster_dirs.sort(false) { dir -> dir.toString().tokenize('_').last() as Integer }
    // ... and re-append the file name.
    def memes_in_order = dirs_in_order.collect { dir -> dir.toString() + "/glam2.meme" }
    // "[a, b, c]" -> "a b c": argument list for meme2meme
    def meme_args = memes_in_order.toString().replaceAll(/\,|\[|\]/,"")
    """
meme2meme ${meme_args} > merged_meme.meme
"""
}
/*
Compares the merged MEME-file against itself with Tomtom.
The resulting table (all_to_all.tsv) tells which clusters are similar to each
other. Comment lines ('#') and empty lines are removed from the Tomtom output.
Only runs if params.cluster_motif == 1.
*/
process find_similar_motifs {
    publishDir "${params.out}/esimated_motifs/cluster_similarity/", mode: 'copy'

    input:
    file(merged_meme) from merged_meme

    output:
    file('all_to_all.tsv') into motif_similarity

    when:
    params.cluster_motif == 1

    script:
    """
tomtom ${merged_meme} ${merged_meme} -thresh ${params.motif_similarity_thresh} -text --norc | sed '/^#/ d' | sed '/^\$/d' > all_to_all.tsv
"""
}
// Pair the similarity table with the list of all cluster FASTA-files
files_for_merge_fasta = motif_similarity.combine(fasta_for_motif_cluster)

/*
Merges the FASTA-files of similar clusters.
merge_similar_clusters.R receives the similarity table, the comma-separated
FASTA paths (ordered by the number after the last '_' of the base name) and
params.edge_weight. Only runs if params.cluster_motif == 1.
*/
process merge_fasta {
    publishDir "${params.out}/esimated_motifs/merged_fasta/", mode: 'copy'
    conda "${params.path_env}"

    input:
    set file(motiv_sim), val(fasta_list) from files_for_merge_fasta

    output:
    file('*.fasta') into motif_clustered_fasta_list
    file('*.png')

    when:
    params.cluster_motif == 1

    script:
    // Sorted copy of the list, ordered numerically by cluster number
    def fasta_in_order = fasta_list.sort(false) { fa -> fa.baseName.tokenize('_').last() as Integer }
    // "[a, b, c]" -> "a,b,c"
    def fasta_arg = fasta_in_order.toString().replaceAll(/\s|\[|\]/,"")
    """
Rscript ${params.path_bin}/merge_similar_clusters.R ${motiv_sim} ${fasta_arg} ${params.edge_weight}
"""
}
// One channel item per merged FASTA-file
motif_clustered_fasta_flat = motif_clustered_fasta_list.flatten()

/*
Runs GLAM2 on every merged FASTA-file, using the same GLAM2 settings as the
per-cluster run. The base name of the FASTA-file serves as name of the result.
Only runs if params.cluster_motif == 1.
*/
process clustered_glam2 {
    publishDir "${params.out}/final_esimated_motifs/${name}/", mode: 'copy'

    input:
    file(fasta) from motif_clustered_fasta_flat

    output:
    // MEME output for the database search and for the filter after it
    set name, file('*.meme') into clustered_meme_for_tomtom
    set name, file('*.meme') into clustered_meme_for_filter
    // Everything else is declared only so that it gets published.
    file('*')

    when:
    params.cluster_motif == 1

    script:
    // 'name' is not an input here; it is derived from the file and used by
    // publishDir and the output declarations.
    name = fasta.baseName
    """
glam2 n ${fasta} -O . -a ${params.motif_min_key} -b ${params.motif_max_key} -z 5 -n ${params.iteration}
"""
}
// Pick the motif source for the downstream steps: the motifs of the merged
// clusters if clustering is switched on, otherwise the per-cluster motifs.
for_tomtom = (params.cluster_motif == 1) ? clustered_meme_for_tomtom : meme_for_tomtom
for_filter = (params.cluster_motif == 1) ? clustered_meme_for_filter : meme_for_filter
/*
Searches the motifs generated by GLAM2 in the motif database (params.motif_db)
with Tomtom, using params.tomtom_treshold as threshold.
Comment lines ('#') and empty lines are removed from the Tomtom output before
it is written to <name>_known_motif.tsv.
*/
process tomtom {
    publishDir "${params.out}/esimated_motifs/tomtom/", mode: 'copy'
    tag { name }

    input:
    set name, file(meme) from for_tomtom

    output:
    set name, file('*.tsv') into tsv_for_filter

    script:
    """
tomtom ${meme} ${params.motif_db} -thresh ${params.tomtom_treshold} -mi 1 -text | sed '/^#/ d' | sed '/^\$/d' > ${name}_known_motif.tsv
"""
}
// Join the MEME-files with the Tomtom tables of the same name.
// A pair is passed on only if its table has at most one line, i.e. presumably
// nothing but the header, meaning that Tomtom found no known motif.
// The surviving items are written to 'meme_for_scan' and to 'check'.
for_filter2 = for_filter.join(tsv_for_filter)
for_filter2
    .filter { name, meme, tsv -> tsv.readLines().size() <= 1 }
    .into { meme_for_scan; check }
// Prints a message if the channel 'check' is empty, i.e. no motif passed the
// filter. An empty channel is replaced by the single value 'EMPTY', which is
// the only value the process is executed for.
process check_for_unknown_motifs {
    echo true

    input:
    val x from check.ifEmpty('EMPTY')

    when:
    x == 'EMPTY'

    script:
    """
echo '>>> STOPPED: No unknown Motifs were found.'
"""
}
// Extracts the best motifs from each MEME-file that passed the filter.
// get_best_motif.py receives the MEME-file, the name of the output file
// (<name>_best.meme) and params.best_motif.
process get_best_motif {
    publishDir "${params.out}/esimated_motifs/unknown_motifs/", mode: 'copy'
    conda "${params.path_env}"

    input:
    // The tsv-file is part of the joined item but is not used by the script.
    set name, file(meme), file(tsv) from meme_for_scan

    output:
    set name, file('*_best.meme') into best_motif

    script:
    """
python ${params.path_bin}/get_best_motif.py ${meme} ${name}_best.meme ${params.best_motif}
"""
}