Skip to content
Permalink
2e6bafce02
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
777 lines (602 sloc) 21.5 KB
#!/usr/bin/env nextflow
//setting default values
params.bigwig=""
params.bed=""
params.genome_fasta=""
params.motif_db=""
params.config=""
params.tfbs_path=""
params.help = 0
params.gtf_path=""
params.out = "./out/"
//footprint_extraction
params.window_length = 200
params.step = 100
params.percentage = 0
params.min_gap = 6
//filter_unknown_motifs
params.min_size_fp=10
params.max_size_fp=200
params.tfbsscan_method = "moods"
//clustering
//reduce_sequence
params.kmer=10
params.aprox_motif_len=10
params.motif_occurence=1
params.min_seq_length=10
//cdhit_wrapper
params.global=0
params.identity=0.8
params.sequence_coverage=8
params.memory=800
params.throw_away_seq=9
params.strand=0
//motif_estimation
//bed_to_clustered_fasta
params.min_seq = 100 // Minimum number of sequences in the fasta-files for glam2
//glam2
params.motif_min_key = 8 // Minimum number of key positions (aligned columns)
params.motif_max_key = 20 // Maximum number of key positions (aligned columns)
params.iteration = 10000 // Number of Iterations done by glam2. A high iteration number equals a more accurate result but with an higher runtime.
params.gap_penalty = 1000
//tomtom
params.tomtom_treshold = 0.01 // threshold for similarity score.
//cluster motifs
params.cluster_motif = 0 // Boolean if 1 motifs are clustered else they are not
params.edge_weight = 50 // Minimum weight of edges in motif-cluster-graph
params.motif_similarity_thresh = 0.00001 // threshold for motif similarity score
params.best_motif = 3 // Top n motifs per cluster
//creating_gtf
params.organism=""
params.tissue=""
if (params.bigwig == "" || params.bed == "" || params.organism == "" || params.genome_fasta == "" || params.motif_db == "" || params.config == "" || "${params.help}" != "0" ){
log.info """
Usage: nextflow run pipeline.nf --bigwig [BigWig-file] --bed [BED-file] --genome_fasta [FASTA-file] --motif_db [MEME-file] --config [UROPA-config-file]
Required arguments:
--bigwig Path to BigWig-file
--bed Path to BED-file
--genome_fasta Path to genome in FASTA-format
--motif_db Path to motif-database in MEME-format
--config Path to UROPA configuration file
--organism Input organism [hg38 | hg19 | mm9 | mm10]
--out Output Directory (Default: './out/')
Optional arguments:
--help [0|1] 1 to show this help message. (Default: 0)
--gtf_path Path to gtf-file. If path is set the process which creates a gtf-file is skipped.
--tfbs_path Path to directory with output from tfbsscan. If given tfbsscan will be skipped.
Footprint extraction:
--window_length INT This parameter sets the length of a sliding window. (Default: 200)
--step INT This parameter sets the number of positions to slide the window forward. (Default: 100)
--percentage INT Threshold in percent (Default: 0)
--min_gap INT If footprints are less than X bases apart the footprints will be merged (Default: 6)
Filter motifs:
--min_size_fp INT Minimum sequence length threshold. Smaller sequences are discarded. (Default: 10)
--max_size_fp INT Maximum sequence length threshold. Discards all sequences longer than this value. (Default: 200)
--tfbsscan_method [moods|fimo] Method used by tfbsscan. (Default: moods)
Cluster:
Sequence preparation/ reduction:
--kmer INT K-mer length (Default: 10)
--aprox_motif_len INT Motif length (Default: 10)
--motif_occurence FLOAT Percentage of motifs over all sequences. Use 1 (Default) to assume every sequence contains a motif.
--min_seq_length Interations Remove all sequences below this value. (Default: 10)
Clustering:
--global INT Global (=1) or local (=0) alignment. (Default: 0)
--identity FLOAT Identity threshold. (Default: 0.8)
--sequence_coverage INT Minimum aligned nucleotides on both sequences. (Default: 8)
--memory INT Memory limit in MB. 0 for unlimited. (Default: 800)
--throw_away_seq INT Remove all sequences equal or below this length before clustering. (Default: 9)
--strand INT Align +/+ & +/- (= 1). Or align only +/+ (= 0). (Default: 0)
Motif estimation:
--min_seq INT Sets the minimum number of sequences required for the FASTA-files given to GLAM2. (Default: 100)
--motif_min_key INT Minimum number of key positions (aligned columns) in the alignment done by GLAM2. (Default: 8)
--motif_max_key INT Maximum number of key positions (aligned columns) in the alignment done by GLAM2. (Default: 20)
--iteration INT Number of iterations done by GLAM2. More Iterations: better results, higher runtime. (Default: 10000)
--tomtom_treshold FLOAT Threshold for similarity score. (Default: 0.01)
--best_motif INT Get the best X motifs per cluster. (Default: 3)
--gap_penalty INT Set penalty for gaps in GLAM2 (Default: 1000)
Moitf clustering:
--cluster_motif Boolean If 1 pipeline clusters motifs. If its 0 it does not. (Defaul: 0)
--edge_weight INT Minimum weight of edges in motif-cluster-graph (Default: 5)
--motif_similarity_thresh FLOAT Threshold for motif similarity score (Default: 0.00001)
Creating GTF:
--tissues List/String List of one or more keywords for tissue-/category-activity, categories must be specified as in JSON
config
All arguments can be set in the configuration files
```
"""
System.exit(2)
} else {
Channel.fromPath(params.bigwig).map {it -> [it.simpleName, it]}.set {bigwig_input}
Channel.fromPath(params.bed).into {bed_input; bed_for_tfbsscan}
Channel.fromPath(params.genome_fasta).into {fa_overlap; fa_scan; fa_overlap_2}
Channel.fromPath(params.motif_db).into {db_for_motivscan; db_for_tomtom}
Channel.fromPath(params.config).set {config}
if (params.tfbs_path != ""){
known_tfbs = file(params.tfbs_path).toAbsolutePath()
}
}
/*
Checking for parameter input!
*/
int_params = ["window_length", "step", "min_size_fp", "max_size_fp", "kmer",
"aprox_motif_len", "motif_occurence", "min_seq_length", "global",
"sequence_coverage", "memory", "throw_away_seq", "strand",
"min_seq", "motif_min_key", "motif_max_key", "iteration",
"edge_weight", "best_motif", "min_gap", "gap_penalty", "edge_weight"]
req_params = ["bigwig", "bed", "genome_fasta", "motif_db", "config"]
valid_organism = ["hg38", "hg19", "mm9", "mm10"]
valid_tfbsscan_methods = ["moods","fimo"]
params.each { key, value ->
if(int_params.contains(key)) {
if (!("${value}" ==~ /\d+/ )){
println("ERROR: $key needs to be an Integer")
System.exit(2)
}
}
if(req_params.contains(key)) {
if(!file(value).exists()) {
println("ERROR: $key not found. Please check the given path.")
System.exit(2)
}
}
}
if (!("${params.identity}" ==~ /^0\.[8-9][[0-9]*]?|^1(\.0)?/ )){
println("ERROR: --identity needs to be float in range 0.8 to 1.0")
System.exit(2)
}
if (!valid_tfbsscan_methods.contains(params.tfbsscan_method)) {
println("ERROR: Invalid Method for tfbsscan! Valid methods: " + valid_tfbsscan_methods)
System.exit(2)
}
if (!valid_organism.contains(params.organism)) {
println("ERROR: Invalid organism! Valid organisms: " + valid_organism)
System.exit(2)
}
if (!("${params.percentage}" ==~ /\d+/ ) || params.percentage < 0 || params.percentage > 100 ){
println("ERROR: --percentage needs to be an Integer between 0 and 100")
System.exit(2)
}
if (!("${params.tomtom_treshold}" ==~ /([0-9]*\.[0-9]+|[0-9]+)/)) {
println("ERROR: --tomtom_treshold needs to be an Integer or float, e.g. 0.01")
System.exit(2)
}
if (!("${params.motif_similarity_thresh}" ==~ /([0-9]*\.[0-9]+|[0-9]+)/)) {
println("ERROR: --motif_similarity_thresh needs to be an Integer or float, e.g. 0.01")
System.exit(2)
}
path_bin = path_bin?.endsWith('/') ? path_bin.substring( 0, path_bin.length() -1 ) : path_bin
bigwig_input.combine(bed_input).set{footprint_in}
/*
This process uses the uncontinuous score from a bigWig file to estimate footpints within peaks of interest
*/
process footprint_extraction {
//conda "${path_env}"
errorStrategy 'finish'
tag{name}
publishDir "${params.out}/1.1_footprint_extraction/", mode: 'copy', pattern: '*.bed'
publishDir "${params.out}/1.1_footprint_extraction/log", mode: 'copy', pattern: '*.log'
input:
set name, file (bigWig), file (bed) from footprint_in
output:
set name, file ('*.bed') into bed_for_overlap_with_TFBS
file('*.log')
script:
"""
python ${path_bin}/1.1_footprint_extraction/footprints_extraction.py --bigwig ${bigWig} --bed ${bed} --output_file ${name}_called_peaks.bed --window_length ${params.window_length} --step ${params.step} --percentage ${params.percentage} --min_gap ${params.min_gap}
"""
}
for_overlap = bed_for_overlap_with_TFBS.combine(fa_overlap).combine(db_for_motivscan).combine(bed_for_tfbsscan)
/*
Find postitions of known tfbs with tfbsscan and discard the overlaps with compareBed.sh
*/
process overlap_with_known_TFBS {
//conda "${path_env}"
publishDir "${params.out}/1.2_filter_motifs", mode :'copy'
tag{name}
errorStrategy 'finish'
input:
set name, file (bed_footprints), file (fasta), file (db), file (bed_peaks) from for_overlap
output:
set name, file ("${name}_unknown.bed") into bed_for_reducing
file ('./known_tfbs/*.bed') optional true
file ('*.stats')
script:
if(params.tfbs_path == ""){
"""
python ${path_bin}/1.2_filter_motifs/tfbsscan.py --use ${params.tfbsscan_method} --core ${params.threads} -m ${db} -g ${fasta} -o ./known_tfbs -b ${bed_peaks}
${path_bin}/1.2_filter_motifs/compareBed.sh --data ${bed_footprints} --motifs ./known_tfbs --fasta ${fasta} -o ${name}_unknown.bed -min ${params.min_size_fp} -max ${params.max_size_fp}
cp -r ./known_tfbs/ ${params.out}/1.2_filter_motifs/
"""
} else {
"""
${path_bin}/1.2_filter_motifs/compareBed.sh --data ${bed_footprints} --motifs ${known_tfbs} --fasta ${fasta} -o ${name}_unknown.bed -min ${params.min_size_fp} -max ${params.max_size_fp}
"""
}
}
/*
Reduce each sequence to its most conserved region.
*/
process reduce_sequence {
//conda "${path_env}"
errorStrategy 'finish'
tag{name}
publishDir "${params.out}/2.1_clustering/reduced_bed/", mode: 'copy', pattern: '*.bed'
publishDir "${params.out}/2.1_clustering/log", mode: 'copy', pattern: '*.log'
input:
set name, file (bed) from bed_for_reducing
output:
set name, file ('*.bed') into bed_for_clustering
file ('*.log')
script:
"""
Rscript ${path_bin}/2.1_clustering/reduce_sequence.R -i ${bed} -k ${params.kmer} -m ${params.aprox_motif_len} -o ${name}_reduced.bed -t ${params.threads} -f ${params.motif_occurence} -s ${params.min_seq_length} --summary reduce_sequence.log
"""
}
/*
Cluster all sequences. Appends a column with cluster numbers to the bed-file.
*/
process clustering {
//conda "${path_env}"
publishDir "${params.out}/2.1_clustering/", mode: 'copy', pattern: '*.bed'
publishDir "${params.out}/2.1_clustering/log", mode: 'copy', pattern: '*.log'
input:
set name, file (bed) from bed_for_clustering
output:
set name, file ('*.bed') into bed_for_motif_esitmation
file ('*.log')
script:
"""
Rscript ${path_bin}/2.1_clustering/cdhit_wrapper.R -i ${bed} -A ${params.sequence_coverage} -o ${name}_clusterd.bed -c ${params.identity} -G ${params.global} -M ${params.memory} -l ${params.throw_away_seq} -r ${params.strand} -T ${params.threads} --summary clustering.log
"""
}
/*
Converting BED-File to one FASTA-File per cluster
*/
process bed_to_clustered_fasta {
//conda "${path_env}"
publishDir "${params.out}/2.2_motif_estimation/fasta/", mode: 'copy', pattern: '*.FASTA'
tag{name}
input:
set name, file (bed) from bed_for_motif_esitmation
output:
file ('*.FASTA') into fasta_for_glam2
file ('*.FASTA') into fasta_for_motif_cluster
file ('*.log') into log22
script:
"""
Rscript ${path_bin}/2.2_motif_estimation/bed_to_fasta.R -i ${bed} -p ${name} -m ${params.min_seq}
"""
}
//flatten list and adding name of file to channel value
fasta_for_glam2 = fasta_for_glam2.flatten().map {it -> [it.simpleName, it]}
//combine fasta files in one list
fasta_for_motif_cluster = fasta_for_motif_cluster.toList()
/*
Running GLAM2 on FASTA-files.
Generating Motifs through alignment and scoring best local matches.
*/
process glam2 {
tag{name}
publishDir "${params.out}/2.2_motif_estimation/glam2/", mode: 'copy'
//conda "${path_env}"
input:
set name, file (fasta) from fasta_for_glam2
output:
file("${name}/*.meme") into meme_to_merge
set name, file("${name}/*.meme") into meme_for_tomtom
set name, file("${name}/*.meme") into meme_to_search_in_merged
set name, file("${name}/*.meme") into meme_for_filter
set name, file("${name}/*.txt") into glam_for_seq
file ('*')
script:
"""
glam2 n ${fasta} -O ./${name}/ -E ${params.gap_penalty} -J ${params.gap_penalty} -a ${params.motif_min_key} -b ${params.motif_max_key} -z 5 -n ${params.iteration}
"""
}
/*
Combining all MEME-files to one big MEME-file.
The paths are sorted numerically depending on the cluster ID.
*/
process merge_meme {
publishDir "${params.out}/2.2_motif_estimation/cluster_motifs/merged_meme/", mode: 'copy'
//conda "${path_env}"
input:
val (memelist) from meme_to_merge.toList()
output:
file ('merged_meme.meme') into merged_meme
when:
params.cluster_motif == 1
script:
//sorting
memes = memelist.collect{it.toString().replaceAll(/\/glam2.meme/,"") }
meme_sorted = memes.sort(false) { it.toString().tokenize('_')[-1] as Integer }
meme_sorted_full = meme_sorted.collect {it.toString() + "/glam2.meme"}
//create list for bash
meme_list = meme_sorted_full.toString().replaceAll(/\,|\[|\]/,"")
"""
meme2meme ${meme_list} > merged_meme.meme
"""
}
to_find_similar_motifs = meme_to_search_in_merged.combine(merged_meme)
/*
Running Tomtom on merged meme-files.
Output table has the information which clusters are similar to each other.
*/
process find_similar_motifs {
tag{name}
publishDir "${params.out}/2.2_motif_estimation/cluster_motifs/cluster_similarity/", mode: 'copy'
//conda "${path_env}"
input:
set name, file (meme) ,file (merged_meme) from to_find_similar_motifs
output:
set name, file ("${name}.tsv") into motif_similarity
when:
params.cluster_motif == 1
script:
"""
tomtom ${meme} ${merged_meme} -thresh ${params.motif_similarity_thresh} -text --norc | sed '/^#/ d' | sed '/^\$/d' > ${name}.tsv
"""
}
/*
Label first column of TSV-file with Cluster ID
*/
process label_cluster {
tag{name}
//conda "${path_env}"
input:
set name, file (tsv) from motif_similarity
output:
file ("${name}_labeled.tsv") into labeled_tsv
when:
params.cluster_motif == 1
script:
"""
Rscript ${path_bin}/2.2_motif_estimation/label_cluster.R -i ${tsv} -o ${name}_labeled.tsv
"""
}
/*
Merging tsv files_for_merge_fasta
*/
process merge_labeled_tsv {
publishDir "${params.out}/2.2_motif_estimation/cluster_motifs/", mode: 'copy'
input:
val (tsvlist) from labeled_tsv.toSortedList { it.toString().tokenize('_')[-2] as Integer }
output:
file ('*.tsv') into merged_labeled_tsv
when:
params.cluster_motif == 1
script:
tsvs = tsvlist.toString().replaceAll(/\,|\[|\]/,"")
"""
echo -e 'Query_ID\tTarget_ID\tOptimal_offset\tp-value\tE-value\tq-value\tOverlap\tQuery_consensus\tTarget_consensus\tOrientation'> merged_labeled.tsv
for i in $tsvs; do
cat \$i >> merged_labeled.tsv
done
"""
}
files_for_merge_fasta = merged_labeled_tsv.combine(fasta_for_motif_cluster)
/*
Merging FASTA-files of similar clusters
*/
process merge_fasta {
//conda "${path_env}"
publishDir "${params.out}/2.2_motif_estimation/cluster_motifs/merged_fasta/", mode: 'copy'
input:
set file (motiv_sim), val (fasta_list) from files_for_merge_fasta
output:
file ('*.fasta') into motif_clustered_fasta_list
file('*.png')
when:
params.cluster_motif == 1
script:
fa_sorted = fasta_list.sort(false) { it.getBaseName().tokenize('_')[-1] as Integer }
fastalist = fa_sorted.toString().replaceAll(/\s|\[|\]/,"")
"""
Rscript ${path_bin}/2.2_motif_estimation/merge_similar_clusters.R ${motiv_sim} ${fastalist} ${params.edge_weight}
"""
}
motif_clustered_fasta_flat = motif_clustered_fasta_list.flatten()
/*
Run GLAM2 on emrged FASTA-files
*/
process clustered_glam2 {
publishDir "${params.out}/2.2_motif_estimation/cluster_motifs/glam2/", mode: 'copy'
tag{name}
//conda "${path_env}"
input:
file (fasta) from motif_clustered_fasta_flat
output:
set name, file ("/${name}/*.meme") into clustered_meme_for_tomtom
set name, file ("/${name}/*.meme") into clustered_meme_for_filter
set name, file("/${name}/glam2.txt") into clust_glam_for_seq
file('*')
when:
params.cluster_motif == 1
script:
name = fasta.getBaseName()
"""
glam2 n ${fasta} -O ./${name}/ -E ${params.gap_penalty} -J ${params.gap_penalty} -a ${params.motif_min_key} -b ${params.motif_max_key} -z 5 -n ${params.iteration}
"""
}
/*
Forward files depending on set parameter
If motif clustering is activated or not.
*/
if(params.cluster_motif == 1){
for_tomtom = clustered_meme_for_tomtom
for_filter = clustered_meme_for_filter
seq_to_json = clust_glam_for_seq
} else {
for_tomtom = meme_for_tomtom
for_filter = meme_for_filter
seq_to_json = glam_for_seq
}
/*
Running Tomtom on meme-files generated by GLAM2.
Tomtom searches motifs in databases.
*/
process tomtom {
tag{name}
publishDir "${params.out}/2.2_motif_estimation/tomtom/", mode: 'copy'
//conda "${path_env}"
input:
set name, file (meme) from for_tomtom
output:
set name, file ('*.tsv') into tsv_for_filter
script:
"""
tomtom ${meme} ${params.motif_db} -thresh ${params.tomtom_treshold} -mi 1 -text | sed '/^#/ d' | sed '/^\$/d' > ${name}_known_motif.tsv
"""
}
//Joining channels with meme and tsv files. Filter joined channel on line count.
//Only meme-files which corresponding tsv files have linecount <= 1 are writen to next channel.
for_filter.join( tsv_for_filter ).into {for_filter2; for_log}
for_filter2
.filter { name, meme, tsv ->
long count = tsv.readLines().size()
count <= 1
}
.into { meme_for_scan; check; num_cluster }
count_cluster = num_cluster.count()
count_cluster_before_filter = for_log.count()
log22.combine(count_cluster).combine(count_cluster_before_filter ).set {log22_final}
//Writes log file for 2.2_motif_estimation
process write_log_for_motif_estimation {
publishDir "${params.out}/2.2_motif_estimation/log/", mode: 'copy'
input:
set file (logfile), after_filter, before_filter from log22_final
output:
file ('*.log')
script:
removed = before_filter - after_filter
"""
cat ${logfile} > motif_estimation.log
printf "\nMotifs found in Database: $removed\nNumber of remaining unknown motifs/cluster: $after_filter" >> motif_estimation.log
"""
}
//If channel 'check' is empty print errormessage
process check_for_unknown_motifs {
echo true
input:
val x from check.ifEmpty('EMPTY')
when:
x == 'EMPTY'
"""
echo '>>> STOPPED: No unknown Motifs were found.'
"""
}
//Get the best (first) X Motifs from each MEME-file
process get_best_motif {
//conda "${path_env}"
tag{name}
publishDir "${params.out}/2.2_motif_estimation/best_unknown_motifs/", mode: 'copy'
input:
set name, file(meme), file(tsv) from meme_for_scan
output:
set name, file('*_best.meme') into best_motif
script:
cluster_id = name.split('_')[-1]
"""
python ${path_bin}/2.2_motif_estimation/get_best_motif.py ${meme} ${name}_best.meme ${params.best_motif} ${cluster_id}
"""
}
/*
Get the sequence names used to create best X Motifs as a JSON-file
*/
process get_best_motif_seq {
//conda "${path_env}"
tag{name}
publishDir "${params.out}/2.2_motif_estimation/best_unknown_motifs/motif_sequences/", mode: 'copy'
input:
set name, file (txt) from seq_to_json
output:
file("${name}_seq.json")
script:
cluster_id = name.split('_')[-1]
"""
Rscript ${path_bin}/2.2_motif_estimation/get_motif_seq.R -i ${txt} -o ${name}_seq.json -n ${params.best_motif} -t ./motif_seq_tmp/cluster_${cluster_id} -c ${cluster_id}
"""
}
best_motif.combine(fa_scan).set {files_for_genome_scan}
/*
process genome_scan {
//conda "${path_env}"
input:
set name, file(meme), file(fasta) from files_for_genome_scan
output:
file ('.bed') into bed_for_uropa, bed_for_cluster_quality
script:
"""
"""
}
process cluster_quality {
input:
file (bed) from bed_for_cluster_quality
output:
file ('*.bed') into bed_for_final_filter
script:
"""
"""
} */
process create_GTF {
//conda "${path_env}"
errorStrategy 'finish'
publishDir "${params.out}/3.1_create_gtf/", mode: 'copy'
output:
file ('*.gtf') into gtf
when:
params.gtf_path == ""
script:
"""
python ${path_bin}/3.1_create_gtf/RegGTFExtractor.py ${params.organism} --tissue ${params.tissues} --wd ${path_bin}/3.1_create_gtf
"""
}
if (params.gtf_path == "") {
gtf_for_uropa = gtf
} else {
gtf_for_uropa = Channel.fromPath(params.gtf_path)
}
/*
bed_for_final_filter.combine(gtf_for_uropa).set {uropa_in}
// Create configuration file for UROPA.
// Takes template and replaces bed- and gtf-placeholders with actual paths.
process create_uropa_config {
publishDir '/mnt/agnerds/Rene.Wiegandt/10_Master/', mode: 'copy'
input:
set val(bed), val(gtf) from uropa_in.toList()
file (conf) from config
output:
file ('uropa.config') into uropa_config
script:
"""
sed -- 's/placeholder_gtf/${gtf}/g; s/placeholder_bed/${bed}/g' ${conf} > uropa.config.final
"""
}
process UROPA {
input:
file (config) from uropa_config
output:
set file ("*_allhits.txt"), file ("*_finalhits.txt") into uropa_for_filter
script:
"""
"""
}
process filter {
input:
output:
script:
"""
"""
} */
workflow.onComplete {
log.info"""
Pipeline execution summary
---------------------------
Completed at: ${workflow.complete}
Duration : ${workflow.duration}
Success : ${workflow.success}
workDir : ${workflow.workDir}
exit status : ${workflow.exitStatus}
Error report: ${workflow.errorReport ?: '-'}
"""
}