#!/usr/bin/env nextflow //setting default values params.bigwig="" params.bed="" params.genome_fasta="" params.motif_db="" params.config="" params.tfbs_path="" params.create_known_tfbs_path = "./" params.help = 0 params.gtf_path="" params.out = "./out/" //peak_calling params.window_length = 200 params.step = 100 params.percentage = 0 //filter_unknown_motifs params.min_size_fp=10 params.max_size_fp=200 params.tfbsscan_method = "moods" //clustering //reduce_sequence params.kmer=10 params.aprox_motif_len=10 params.motif_occurence=1 params.min_seq_length=10 //cdhit_wrapper params.global=0 params.identity=0.8 params.sequence_coverage=8 params.memory=800 params.throw_away_seq=9 params.strand=0 //motif_estimation //bed_to_clustered_fasta params.min_seq = 100 // Minimum number of sequences in the fasta-files for glam2 //glam2 params.motif_min_key = 8 // Minimum number of key positions (aligned columns) params.motif_max_key = 20 // Maximum number of key positions (aligned columns) params.iteration = 10000 // Number of Iterations done by glam2. A high iteration number equals a more accurate result but with an higher runtime. //tomtom params.tomtom_treshold = 0.01 // threshold for similarity score. //cluster motifs params.cluster_motif = 0 // Boolean if 1 motifs are clustered else they are not params.edge_weight = 50 // Minimum weight of edges in motif-cluster-graph motif_similarity_thresh = 0.00001 // threshold for motif similarity score params.best_motif = 3 // Top n motifs per cluster //creating_gtf params.organism="" params.tissue="" if (params.bigwig == "" || params.bed == "" || params.organism == "" || params.genome_fasta == "" || params.motif_db == "" || params.config == "" || "${params.help}" != "0"){ log.info """ Usage: nextflow run pipeline.nf --bigwig [BigWig-file] --bed [BED-file] --genome_fasta [FASTA-file] --motif_db [MEME-file] --config [UROPA-config-file] Required arguments: --bigwig Path to BigWig-file --bed Path to BED-file --genome_fasta Path to genome in FASTA-format --motif_db Path to motif-database in MEME-format --config Path to UROPA configuration file --organism Input organism [hg38 | hg19 | mm9 | mm10] --out Output Directory (Default: './out/') Optional arguments: --help [0|1] 1 to show this help message. (Default: 0) --tfbs_path Path to directory with output from tfbsscan. If given tfbsscan will not be run. --create_known_tfbs_path Path to directory where output from tfbsscan (known motifs) are stored. Path can be set as tfbs_path in next run. (Default: './') --gtf_path Path to gtf-file. If path is set the process which creats a gtf-file is skipped. Footprint extraction: --window_length INT This parameter sets the length of a sliding window. (Default: 200) --step INT This parameter sets the number of positions to slide the window forward. (Default: 100) --percentage INT Threshold in percent (Default: 0) Filter unknown motifs: --min_size_fp INT Minimum sequence length threshold. Smaller sequences are discarded. (Default: 10) --max_size_fp INT Maximum sequence length threshold. Discards all sequences longer than this value. (Default: 100) --tfbsscan_method [moods|fimo] Method used by tfbsscan. (Default: moods) Clustering: Sequence preparation/ reduction: --kmer INT Kmer length (Default: 10) --aprox_motif_len INT Motif length (Default: 10) --motif_occurence FLOAT Percentage of motifs over all sequences. Use 1 (Default) to assume every sequence contains a motif. --min_seq_length Interations Remove all sequences below this value. (Default: 10) Clustering: --global INT Global (=1) or local (=0) alignment. (Default: 0) --identity FLOAT Identity threshold. (Default: 0.8) --sequence_coverage INT Minimum aligned nucleotides on both sequences. (Default: 8) --memory INT Memory limit in MB. 0 for unlimited. (Default: 800) --throw_away_seq INT Remove all sequences equal or below this length before clustering. (Default: 9) --strand INT Align +/+ & +/- (= 1). Or align only +/+ (= 0). (Default: 0) Motif estimation: --min_seq INT Sets the minimum number of sequences required for the FASTA-files given to GLAM2. (Default: 100) --motif_min_key INT Minimum number of key positions (aligned columns) in the alignment done by GLAM2. (Default: 8) --motif_max_key INT Maximum number of key positions (aligned columns) in the alignment done by GLAM2.f (Default: 20) --iteration INT Number of iterations done by glam2. More Iterations: better results, higher runtime. (Default: 10000) --tomtom_treshold float Threshold for similarity score. (Default: 0.01) --best_motif INT Get the best X motifs per cluster. (Default: 3) Moitf clustering: --cluster_motif Boolean If 1 pipeline clusters motifs. If its 0 it does not. (Defaul: 0) --edge_weight INT Minimum weight of edges in motif-cluster-graph (Default: 5) --motif_similarity_thresh FLOAT Threshold for motif similarity score (Default: 0.00001) Creating GTF: --tissues List/String List of one or more keywords for tissue-/category-activity, categories must be specified as in JSON config All arguments can be set in the configuration files ``` """ System.exit(2) } else { Channel.fromPath(params.bigwig).map {it -> [it.simpleName, it]}.set {bigwig_input} Channel.fromPath(params.bed).into {bed_input; bed_for_tfbsscan} Channel.fromPath(params.genome_fasta).into {fa_overlap; fa_scan; fa_overlap_2} Channel.fromPath(params.motif_db).into {db_for_motivscan; db_for_tomtom} Channel.fromPath(params.config).set {config} } /* Checking for parameter input! */ int_params = ["window_length", "step", "min_size_fp", "max_size_fp", "kmer", "aprox_motif_len", "motif_occurence", "min_seq_length", "global", "sequence_coverage", "memory", "throw_away_seq", "strand", "min_seq", "motif_min_key", "motif_max_key", "iteration", "edge_weight", "best_motif"] req_params = ["bigwig", "bed", "genome_fasta", "motif_db", "config"] valid_organism = ["hg38", "hg19", "mm9", "mm10"] valid_tfbsscan_methods = ["moods","fimo"] params.each { key, value -> if(int_params.contains(key)) { if (!("${value}" ==~ /\d+/ )){ println("ERROR: $key needs to be an Integer") System.exit(2) } } if(req_params.contains(key)) { if(!file(value).exists()) { println("ERROR: $key not found. Please check the given path.") System.exit(2) } } } if (!("${params.identity}" ==~ /^0\.[8-9][[0-9]*]?|^1(\.0)?/ )){ println("ERROR: --identity needs to be float in range 0.8 to 1.0") System.exit(2) } if (!valid_tfbsscan_methods.contains(params.tfbsscan_method)) { println("ERROR: Invalid Method for tfbsscan! Valid methods: " + valid_tfbsscan_methods) System.exit(2) } if (!valid_organism.contains(params.organism)) { println("ERROR: Invalid organism! Valid organisms: " + valid_organism) System.exit(2) } if (!("${params.percentage}" ==~ /\d+/ ) || params.percentage < 0 || params.percentage > 100 ){ println("ERROR: --percentage needs to be an Integer between 0 and 100") System.exit(2) } if (!("${params.tomtom_treshold}" ==~ /([0-9]*\.[0-9]+|[0-9]+)/)) { println("ERROR: --tomtom_treshold needs to be an Integer or float, e.g. 0.01") System.exit(2) } if (!("${params.motif_similarity_thresh}" ==~ /([0-9]*\.[0-9]+|[0-9]+)/)) { println("ERROR: --motif_similarity_thresh needs to be an Integer or float, e.g. 0.01") System.exit(2) } path_bin = path_bin?.endsWith('/') ? path_bin.substring( 0, path_bin.length() -1 ) : path_bin bigwig_input.combine(bed_input).set{footprint_in} /* This process uses the uncontinuous score from a bigWig file to estimate footpints within peaks of interest */ process footprint_extraction { //conda "${path_env}" tag{name} publishDir "${params.out}/1.1_footprint_extraction/", mode: 'copy', pattern: '*.bed' publishDir "${params.out}/1.1_footprint_extraction/log", mode: 'copy', pattern: '*.log' input: set name, file (bigWig), file (bed) from footprint_in output: set name, file ('*.bed') into bed_for_overlap_with_TFBS file('*.log') script: """ python ${path_bin}/1.1_footprint_extraction/footprints_extraction.py --bigwig ${bigWig} --bed ${bed} --output_file ${name}_called_peaks.bed --window_length ${params.window_length} --step ${params.step} --percentage ${params.percentage} """ } for_tfbs = fa_overlap.combine(db_for_motivscan).combine(bed_for_tfbsscan) /* */ process extract_known_TFBS { //conda "${path_env}" publishDir "${params.out}/1.2_filter_motifs/TFBSscan/", mode: 'copy', pattern: '*.bed' input: set file (fasta), file (db), file (bed) from for_tfbs output: val ('done') into known_TFBS_for_overlap when: params.tfbs_path == "" script: """ python ${path_bin}/1.2_filter_motifs/tfbsscan.py --use ${params.tfbsscan_method} --core ${params.threads} -m ${db} -g ${fasta} -o ${params.create_known_tfbs_path} -b ${bed} """ } /* Sets path to known tfbs BED-files. If tfbs_path is set the process extract_known_TFBS is skipped, so the paths are differnt. */ if(params.tfbs_path == "") { bed_for_overlap_with_TFBS.combine(known_TFBS_for_overlap.toList()).combine(fa_overlap_2).set {for_overlap} motif_path = params.create_known_tfbs_path } else { Channel.from('skipped').set {workaround} bed_for_overlap_with_TFBS.combine(workaround).combine(fa_overlap_2).set {for_overlap} tfbs = params.tfbs_path?.endsWith('/') ? params.tfbs_path: params.tfbs_path.substring( 0, params.tfbs_path.length() -1 ) motif_path = tfbs } /* */ process overlap_with_known_TFBS { //conda "${path_env}" publishDir "${params.out}/1.2_filter_motifs/compareBed/", mode :'copy' input: set name, file (bed_footprints), val (bed_motifs), file (fasta) from for_overlap output: set name, file ("${name}_unknown.bed") into bed_for_reducing file ('*.stats') script: motif_list = bed_motifs.toString().replaceAll(/\s|\[|\]/,"") """ ${path_bin}/1.2_filter_motifs/compareBed.sh --data ${bed_footprints} --motifs ${motif_path} --fasta ${fasta} -o ${name}_unknown.bed -min ${params.min_size_fp} -max ${params.max_size_fp} """ } /* Reduce each sequence to its most conserved region. */ process reduce_sequence { //conda "${path_env}" publishDir "${params.out}/2.1_clustering/reduced_bed/", mode: 'copy', pattern: '*.bed' publishDir "${params.out}/2.1_clustering/log", mode: 'copy', pattern: '*.log' input: set name, file (bed) from bed_for_reducing output: set name, file ('*.bed') into bed_for_clustering file ('*.log') script: """ Rscript ${path_bin}/2.1_clustering/reduce_sequence.R -i ${bed} -k ${params.kmer} -m ${params.aprox_motif_len} -o ${name}_reduced.bed -t ${params.threads} -f ${params.motif_occurence} -s ${params.min_seq_length} --summary reduce_sequence.log """ } /* Cluster all sequences. Appends a column with cluster numbers to the bed-file. */ process clustering { //conda "${path_env}" publishDir "${params.out}/2.1_clustering/", mode: 'copy', pattern: '*.bed' publishDir "${params.out}/2.1_clustering/log", mode: 'copy', pattern: '*.log' input: set name, file (bed) from bed_for_clustering output: set name, file ('*.bed') into bed_for_motif_esitmation file ('*.log') script: """ Rscript ${path_bin}/2.1_clustering/cdhit_wrapper.R -i ${bed} -A ${params.sequence_coverage} -o ${name}_clusterd.bed -c ${params.identity} -G ${params.global} -M ${params.memory} -l ${params.throw_away_seq} -r ${params.strand} -T ${params.threads} --summary clustering.log """ } /* Converting BED-File to one FASTA-File per cluster */ process bed_to_clustered_fasta { //conda "${path_env}" publishDir "${params.out}/2.2_motif_estimation/fasta/", mode: 'copy', pattern: '*.FASTA' tag{name} input: set name, file (bed) from bed_for_motif_esitmation output: file ('*.FASTA') into fasta_for_glam2 file ('*.FASTA') into fasta_for_motif_cluster file ('*.log') into log22 script: """ Rscript ${path_bin}/2.2_motif_estimation/bed_to_fasta.R -i ${bed} -p ${name} -m ${params.min_seq} """ } //flatten list and adding name of file to channel value fasta_for_glam2 = fasta_for_glam2.flatten().map {it -> [it.simpleName, it]} //combine fasta files in one list fasta_for_motif_cluster = fasta_for_motif_cluster.toList() /* Running GLAM2 on FASTA-files. Generating Motifs through alignment and scoring best local matches. */ process glam2 { tag{name} publishDir "${params.out}/2.2_motif_estimation/glam2/", mode: 'copy' //conda "${path_env}" input: set name, file (fasta) from fasta_for_glam2 output: file("${name}/*.meme") into meme_to_merge set name, file("${name}/*.meme") into meme_for_tomtom set name, file("${name}/*.meme") into meme_to_search_in_merged set name, file("${name}/*.meme") into meme_for_filter set name, file("${name}/*.txt") into glam_for_seq file ('*') script: """ glam2 n ${fasta} -O ./${name}/ -a ${params.motif_min_key} -b ${params.motif_max_key} -z 5 -n ${params.iteration} """ } /* Combining all MEME-files to one big MEME-file. The paths are sorted numerically depending on the cluster ID. */ process merge_meme { publishDir "${params.out}/2.2_motif_estimation/cluster_motifs/merged_meme/", mode: 'copy' //conda "${path_env}" input: val (memelist) from meme_to_merge.toList() output: file ('merged_meme.meme') into merged_meme when: params.cluster_motif == 1 script: //sorting memes = memelist.collect{it.toString().replaceAll(/\/glam2.meme/,"") } meme_sorted = memes.sort(false) { it.toString().tokenize('_')[-1] as Integer } meme_sorted_full = meme_sorted.collect {it.toString() + "/glam2.meme"} //create list for bash meme_list = meme_sorted_full.toString().replaceAll(/\,|\[|\]/,"") """ meme2meme ${meme_list} > merged_meme.meme """ } to_find_similar_motifs = meme_to_search_in_merged.combine(merged_meme) /* Running Tomtom on merged meme-files. Output table has the information which clusters are similar to each other. */ process find_similar_motifs { tag{name} publishDir "${params.out}/2.2_motif_estimation/cluster_motifs/cluster_similarity/", mode: 'copy' //conda "${path_env}" input: set name, file (meme) ,file (merged_meme) from to_find_similar_motifs output: set name, file ("${name}.tsv") into motif_similarity when: params.cluster_motif == 1 script: """ tomtom ${meme} ${merged_meme} -thresh ${params.motif_similarity_thresh} -text --norc | sed '/^#/ d' | sed '/^\$/d' > ${name}.tsv """ } /* Label first column of TSV-file with Cluster ID */ process label_cluster { tag{name} //conda "${path_env}" input: set name, file (tsv) from motif_similarity output: file ("${name}_labeled.tsv") into labeled_tsv when: params.cluster_motif == 1 script: """ Rscript ${path_bin}/2.2_motif_estimation/label_cluster.R -i ${tsv} -o ${name}_labeled.tsv """ } /* Merging tsv files_for_merge_fasta */ process merge_labeled_tsv { publishDir "${params.out}/2.2_motif_estimation/cluster_motifs/", mode: 'copy' input: val (tsvlist) from labeled_tsv.toSortedList { it.toString().tokenize('_')[-2] as Integer } output: file ('*.tsv') into merged_labeled_tsv when: params.cluster_motif == 1 script: tsvs = tsvlist.toString().replaceAll(/\,|\[|\]/,"") """ echo -e 'Query_ID\tTarget_ID\tOptimal_offset\tp-value\tE-value\tq-value\tOverlap\tQuery_consensus\tTarget_consensus\tOrientation'> merged_labeled.tsv for i in $tsvs; do cat \$i >> merged_labeled.tsv done """ } files_for_merge_fasta = merged_labeled_tsv.combine(fasta_for_motif_cluster) /* Merging FASTA-files of similar clusters */ process merge_fasta { //conda "${path_env}" publishDir "${params.out}/2.2_motif_estimation/cluster_motifs/merged_fasta/", mode: 'copy' input: set file (motiv_sim), val (fasta_list) from files_for_merge_fasta output: file ('*.fasta') into motif_clustered_fasta_list file('*.png') when: params.cluster_motif == 1 script: fa_sorted = fasta_list.sort(false) { it.getBaseName().tokenize('_')[-1] as Integer } fastalist = fa_sorted.toString().replaceAll(/\s|\[|\]/,"") """ Rscript ${path_bin}/2.2_motif_estimation/merge_similar_clusters.R ${motiv_sim} ${fastalist} ${params.edge_weight} """ } motif_clustered_fasta_flat = motif_clustered_fasta_list.flatten() /* Run GLAM2 on emrged FASTA-files */ process clustered_glam2 { publishDir "${params.out}/2.2_motif_estimation/cluster_motifs/glam2/", mode: 'copy' tag{name} //conda "${path_env}" input: file (fasta) from motif_clustered_fasta_flat output: set name, file ("/${name}/*.meme") into clustered_meme_for_tomtom set name, file ("/${name}/*.meme") into clustered_meme_for_filter set name, file("/${name}/glam2.txt") into clust_glam_for_seq file('*') when: params.cluster_motif == 1 script: name = fasta.getBaseName() """ glam2 n ${fasta} -O ./${name}/ -a ${params.motif_min_key} -b ${params.motif_max_key} -z 5 -n ${params.iteration} """ } /* Forward files depending on set parameter If motif clustering is activated or not. */ if(params.cluster_motif == 1){ for_tomtom = clustered_meme_for_tomtom for_filter = clustered_meme_for_filter seq_to_json = clust_glam_for_seq } else { for_tomtom = meme_for_tomtom for_filter = meme_for_filter seq_to_json = glam_for_seq } /* Running Tomtom on meme-files generated by GLAM2. Tomtom searches motifs in databases. */ process tomtom { tag{name} publishDir "${params.out}/2.2_motif_estimation/tomtom/", mode: 'copy' //conda "${path_env}" input: set name, file (meme) from for_tomtom output: set name, file ('*.tsv') into tsv_for_filter script: """ tomtom ${meme} ${params.motif_db} -thresh ${params.tomtom_treshold} -mi 1 -text | sed '/^#/ d' | sed '/^\$/d' > ${name}_known_motif.tsv """ } //Joining channels with meme and tsv files. Filter joined channel on line count. //Only meme-files which corresponding tsv files have linecount <= 1 are writen to next channel. for_filter.join( tsv_for_filter ).into {for_filter2; for_log} for_filter2 .filter { name, meme, tsv -> long count = tsv.readLines().size() count <= 1 } .into { meme_for_scan; check; num_cluster } count_cluster = num_cluster.count() count_cluster_before_filter = for_log.count() log22.combine(count_cluster).combine(count_cluster_before_filter ).set {log22_final} process write_log_for_motif_estimation { publishDir "${params.out}/2.2_motif_estimation/log/", mode: 'copy' input: set file (logfile), after_filter, before_filter from log22_final output: file ('*.log') script: removed = before_filter - after_filter """ cat ${logfile} > motif_estimation.log printf "\nMotifs found in Database: $removed\nNumber of remaining unknown motifs/cluster: $after_filter" >> motif_estimation.log """ } //If channel 'check' is empty print errormessage process check_for_unknown_motifs { echo true input: val x from check.ifEmpty('EMPTY') when: x == 'EMPTY' """ echo '>>> STOPPED: No unknown Motifs were found.' """ } //Get the best (first) X Motifs from each MEME-file process get_best_motif { //conda "${path_env}" tag{name} publishDir "${params.out}/2.2_motif_estimation/best_unknown_motifs/", mode: 'copy' input: set name, file(meme), file(tsv) from meme_for_scan output: set name, file('*_best.meme') into best_motif script: cluster_id = name.split('_')[-1] """ python ${path_bin}/2.2_motif_estimation/get_best_motif.py ${meme} ${name}_best.meme ${params.best_motif} ${cluster_id} """ } process get_best_motif_seq { //conda "${path_env}" tag{name} publishDir "${params.out}/2.2_motif_estimation/best_unknown_motifs/motif_sequences/", mode: 'copy' input: set name, file (txt) from seq_to_json output: file("${name}_seq.json") script: cluster_id = name.split('_')[-1] """ Rscript ${path_bin}/2.2_motif_estimation/get_motif_seq.R -i ${txt} -o ${name}_seq.json -n ${params.best_motif} -t ${path_bin}/2.2_motif_estimation/tmp/cluster_${cluster_id} -c ${cluster_id} """ } best_motif.combine(fa_scan).set {files_for_genome_scan} /* process genome_scan { //conda "${path_env}" input: set name, file(meme), file(fasta) from files_for_genome_scan output: file ('.bed') into bed_for_uropa, bed_for_cluster_quality script: """ """ } process cluster_quality { input: file (bed) from bed_for_cluster_quality output: file ('*.bed') into bed_for_final_filter script: """ """ } */ process create_GTF { //conda "${path_env}" publishDir "${params.out}/3.1_create_gtf/", mode: 'copy' output: file ('*.gtf') into gtf when: params.gtf_path == "" script: """ python ${path_bin}/3.1_create_gtf/RegGTFExtractor.py ${params.organism} --tissue ${params.tissues} --wd ${path_bin}/3.1_create_gtf """ } if (params.gtf_path == "") { gtf_for_uropa = gtf } else { gtf_for_uropa = Channel.fromPath(params.gtf_path) } /* bed_for_final_filter.combine(gtf_for_uropa).set {uropa_in} // Create configuration file for UROPA. // Takes template and replaces bed- and gtf-placeholders with actual paths. process create_uropa_config { publishDir '/mnt/agnerds/Rene.Wiegandt/10_Master/', mode: 'copy' input: set val(bed), val(gtf) from uropa_in.toList() file (conf) from config output: file ('uropa.config') into uropa_config script: """ sed -- 's/placeholder_gtf/${gtf}/g; s/placeholder_bed/${bed}/g' ${conf} > uropa.config.final """ } process UROPA { input: file (config) from uropa_config output: set file ("*_allhits.txt"), file ("*_finalhits.txt") into uropa_for_filter script: """ """ } process filter { input: output: script: """ """ } */ workflow.onComplete { log.info""" Pipeline execution summary --------------------------- Completed at: ${workflow.complete} Duration : ${workflow.duration} Success : ${workflow.success} workDir : ${workflow.workDir} exit status : ${workflow.exitStatus} Error report: ${workflow.errorReport ?: '-'} """ }