diff --git a/README.md b/README.md
index a884632..3bf19c3 100644
--- a/README.md
+++ b/README.md
@@ -47,57 +47,59 @@ nextflow run pipeline.nf --bigwig [BigWig-file] --bed [BED-file] --genome_fasta
 For a detailed overview for all parameters follow this [link](https://github.molgen.mpg.de/loosolab/masterJLU2018/wiki/Configuration).
 ```
 Required arguments:
-    --bigwig                  Path to BigWig-file with scores on the peaks of interest
-    --bed                     Path to BED-file with peaks of interest corresponding to the BigWig file
-    --genome_fasta            Path to genome in FASTA-format
-    --motif_db                Path to motif-database in MEME-format
-
-
+    --bigwig                  Path to BigWig-file
+    --bed                     Path to BED-file
+    --genome_fasta            Path to genome in FASTA-format
+    --motif_db                Path to motif-database in MEME-format
+    --config                  Path to UROPA configuration file
+    --create_known_tfbs_path  Path to directory where the output from tfbsscan (known motifs) is stored.
+                              Path can be set as tfbs_path in the next run. (Default: './')
 Optional arguments:
-    --tfbs_path               Path to directory with output BED-files from tfbsscan. If given tfbsscan will not be run.
-
+    --tfbs_path               Path to directory with output from tfbsscan. If given, tfbsscan will not be run.
+
 Footprint extraction:
-    --window_length INT       (Default: 200) a length of a window
-    --step INT                (Default: 100) an interval to slide the window
-    --percentage INT(Default: 0) a percentage to be added to background while searching for footprints
-
+    --window_length INT       (Default: 200)
+    --step INT                (Default: 100)
+    --percentage INT          (Default: 0)
+
 Filter unknown motifs:
-    --min_size_fp INT         (Default: 10)
-    --max_size_fp INT         (Default: 100)
-
-    Cluster:
+    --min_size_fp INT         (Default: 10)
+    --max_size_fp INT         (Default: 100)
+
+    Clustering:
 Sequence preparation/ reduction:
-    --kmer INT                Kmer length (Default: 10)
-    --aprox_motif_len INT     Motif length (Default: 10)
-    --motif_occurence FLOAT   Percentage of motifs over all sequences. Use 1 (Default) to assume every sequence contains a motif.
-    --min_seq_length INT      Remove all sequences below this value. (Default: 10)
-
+    --kmer INT                Kmer length (Default: 10)
+    --aprox_motif_len INT     Motif length (Default: 10)
+    --motif_occurence FLOAT   Percentage of motifs over all sequences. Use 1 (Default) to assume every sequence contains a motif.
+    --min_seq_length INT      Remove all sequences below this value. (Default: 10)
+
 Clustering:
-    --global INT              Global (=1) or local (=0) alignment. (Default: 0)
-    --identity FLOAT          Identity threshold. (Default: 0.8)
-    --sequence_coverage INT   Minimum aligned nucleotides on both sequences. (Default: 8)
-    --memory INT              Memory limit in MB. 0 for unlimited. (Default: 800)
-    --throw_away_seq INT      Remove all sequences equal or below this length before clustering. (Default: 9)
-    --strand INT              Align +/+ & +/- (= 1). Or align only +/+ (= 0). (Default: 0)
-
+    --global INT              Global (=1) or local (=0) alignment. (Default: 0)
+    --identity FLOAT          Identity threshold. (Default: 0.8)
+    --sequence_coverage INT   Minimum aligned nucleotides on both sequences. (Default: 8)
+    --memory INT              Memory limit in MB. 0 for unlimited. (Default: 800)
+    --throw_away_seq INT      Remove all sequences equal or below this length before clustering. (Default: 9)
+    --strand INT              Align +/+ & +/- (= 1). Or align only +/+ (= 0). (Default: 0)
+
 Motif estimation:
-    --min_seq INT             Minimum number of sequences required in the FASTA-files for GLAM2 (Default: 100)
-    --motif_min_key INT       Maximum number of key positions (aligned columns) (Default: 8)
-    --motif_max_key INT       Maximum number of key positions (aligned columns) (Default: 20)
-    --iteration INT           Number of iterations done by glam2. More Iterations: better results, higher runtime. (Default: 10000)
+    --min_seq INT             Minimum number of sequences required in the FASTA-files given to GLAM2. (Default: 100)
+    --motif_min_key INT       Minimum number of key positions (aligned columns) in the alignment done by GLAM2. (Default: 8)
+    --motif_max_key INT       Maximum number of key positions (aligned columns) in the alignment done by GLAM2. (Default: 20)
+    --iteration INT           Number of iterations done by glam2. More iterations: better results, higher runtime. (Default: 10000)
     --tomtom_treshold float   Threshold for similarity score. (Default: 0.01)
-
-    Motif clustering:
-    --cluster_motif BOOLEAN   IF its 1 motifs will be clustered (Default: 0)
-    --edge_weight INT         Minimum weight of edges in motif-cluster-graph (Default: 5)
-    --motif_similarity_thresh FLOAT    threshold for motif similarity score (Default: 0.00001)
-
-    Creating GTF:
-    --tissue STRING           Filter for one or more tissue/category activity, categories as in JSON config (Default: None)
-    --organism STRING         Source organism: [ hg19 | hg38 or mm9 | mm10 ] (Default: hg38)
-
-All arguments can be set in the configuration files.
+    --best_motif INT          Get the best X motifs per cluster. (Default: 3)
+
+    Motif clustering:
+    --cluster_motif Boolean   If 1, the pipeline clusters motifs; if 0, it does not. (Default: 0)
+    --edge_weight INT         Minimum weight of edges in motif-cluster-graph (Default: 5)
+    --motif_similarity_thresh FLOAT    Threshold for motif similarity score (Default: 0.00001)
+
+    Creating GTF:
+    --organism [hg38 | hg19 | mm9 | mm10]    Input organism
+    --tissues List/String     List of one or more keywords for tissue-/category activity; categories must be
+                              specified as in the JSON config
+All arguments can be set in the configuration files.
 ```
diff --git a/bin/bed_to_fasta.R b/bin/bed_to_fasta.R
index 13bd026..6cefe43 100644
--- a/bin/bed_to_fasta.R
+++ b/bin/bed_to_fasta.R
@@ -9,17 +9,17 @@ args = commandArgs(trailingOnly = TRUE)
 bedInput <- args[1]
-prefix <- args[2]
-min_seq <- args[3]
+prefix <- args[2]
+min_seq <- args[3]
 
 bed <- data.table::fread(bedInput, header = FALSE, sep = "\t")
 
-clusters <- split(bed, bed$V8, sorted = TRUE, flatten = FALSE) # <---- Spalte mit Cluster
+clusters <- split(bed, bed$V11, sorted = TRUE, flatten = FALSE) # <---- column containing the cluster ID
 
 discard <- lapply(1:length(clusters), function(i){
   clust <- as.data.frame(clusters[i])
   print(nrow(clust))
   if (nrow(clust) >= as.numeric(min_seq) ) {
-    sequences <- as.list(clust[[7]]) # <---- Splate mit Sequenz
+    sequences <- as.list(clust[[10]]) # <---- column containing the sequence
     outfile <- paste0(prefix,"_cluster_",i,".FASTA")
     seqinr::write.fasta(sequences = sequences, names = clust[[4]], file.out = outfile, as.string = TRUE) # <---- Spalte mit Name
   } else {
diff --git a/pipeline.nf b/pipeline.nf
index c2f3511..d8d5ad5 100644
--- a/pipeline.nf
+++ b/pipeline.nf
@@ -1,625 +1,630 @@
-#!/usr/bin/env nextflow
-
-//setting default values
-params.bigwig=""
-params.bed=""
-params.genome_fasta=""
-params.motif_db=""
-params.config=""
-params.tfbs_path=""
-
-//peak_calling
-    params.window_length = 200
-    params.step = 100
-    params.percentage = 0
-
-//filter_unknown_motifs
-    params.min_size_fp=10
-    
params.max_size_fp=100 - -//clustering - //reduce_bed - params.kmer=10 - params.aprox_motif_len=10 - params.motif_occurence=1 - params.min_seq_length=10 - - //cdhit_wrapper - params.global=0 - params.identity=0.8 - params.sequence_coverage=8 - params.memory=800 - params.throw_away_seq=9 - params.strand=0 - -//motif_estimation - //bed_to_clustered_fasta - params.min_seq = 100 // Minimum number of sequences in the fasta-files for glam2 - - //glam2 - params.motif_min_key = 8 // Minimum number of key positions (aligned columns) - params.motif_max_key = 20 // Maximum number of key positions (aligned columns) - params.iteration = 10000 // Number of Iterations done by glam2. A high iteration number equals a more accurate result but with an higher runtime. - - //tomtom - params.tomtom_treshold = 0.01 // threshold for similarity score. - - //cluster motifs - params.cluster_motif = 0 // Boolean if 1 motifs are clustered else they are not - params.edge_weight = 50 // Minimum weight of edges in motif-cluster-graph - motif_similarity_thresh = 0.00001 // threshold for motif similarity score - - params.best_motif = 3 // Top n motifs per cluster - -//creating_gtf - params.organism="hg38" - params.tissue="" - - params.tmpdir = "./" - -if (params.bigwig == "" || params.bed == "" || params.genome_fasta == "" || params.motif_db == "" || params.config == ""){ -log.info """ -Usage: nextflow run pipeline.nf --bigwig [BigWig-file] --bed [BED-file] --genome_fasta [FASTA-file] --motif_db [MEME-file] - -Required arguments: - --bigwig Path to BigWig-file - --bed Path to BED-file - --genome_fasta Path to genome in FASTA-format - --jaspar_db Path to motif-database in MEME-format - - -Optional arguments: - - --tfbs_path Path to directory with output from tfbsscan. If given tfbsscan will not be run. - - Footprint extraction: - --window_length INT (Default: 200) - --step INT (Default: 100) - --percentage INT (Default: 0) - - Filter unknown motifs: - --min_size_fp INT (Default: 10) - --max_size_fp INT (Default: 100) - - Clustering: - Sequence preparation/ reduction: - --kmer INT Kmer length (Default: 10) - --aprox_motif_len INT Motif length (Default: 10) - --motif_occurence FLOAT Percentage of motifs over all sequences. Use 1 (Default) to assume every sequence contains a motif. - --min_seq_length Interations Remove all sequences below this value. (Default: 10) - - Clustering: - --global INT Global (=1) or local (=0) alignment. (Default: 0) - --identity FLOAT Identity threshold. (Default: 0.8) - --sequence_coverage INT Minimum aligned nucleotides on both sequences. (Default: 8) - --memory INT Memory limit in MB. 0 for unlimited. (Default: 800) - --throw_away_seq INT Remove all sequences equal or below this length before clustering. (Default: 9) - --strand INT Align +/+ & +/- (= 1). Or align only +/+ (= 0). (Default: 0) - - Motif estimation: - --motif_min_len INT Minimum length of Motif (Default: 8) - --motif_max_len INT Maximum length of Motif (Default: 20) - --iteration INT Number of iterations done by glam2. More Iterations: better results, higher runtime. (Default: 10000) - --tomtom_treshold float Threshold for similarity score. (Default: 0.01) - - Moitf clustering: - --edge_weight INT Minimum weight of edges in motif-cluster-graph (Default: 5) - --motif_similarity_thresh FLOAT Threshold for motif similarity score (Default: 0.00001) - - Creating GTF: - --organism [homo_sapiens | mus_musculus] - --tissues - -All arguments can be set in the configuration files. 
-""" -} else { - Channel.fromPath(params.bigwig).map {it -> [it.simpleName, it]}.set {bigwig_input} - Channel.fromPath(params.bed).set {bed_input} - Channel.fromPath(params.genome_fasta).into {fa_overlap; fa_scan; fa_overlap_2} - Channel.fromPath(params.motif_db).into {db_for_motivscan; db_for_tomtom} - Channel.fromPath(params.config).set {config} -} - -/* -Checking for parameter input! -*/ -int_params = ["window_length", "step", "min_size_fp", "max_size_fp", "kmer", - "aprox_motif_len", "motif_occurence", "min_seq_length", "global", - "sequence_coverage", "memory", "throw_away_seq", "strand", - "min_seq", "motif_min_key", "motif_max_key", "iteration", - "edge_weight", "best_motif"] -req_params = ["bigwig", "bed", "genome_fasta", "motif_db", "config"] - -valid_organism = ["hg38", "hg19", "mm9", "mm10"] - -params.each { key, value -> - if(int_params.contains(key)) { - if (!("${value}" ==~ /\d+/ )){ - println("ERROR: $key needs to be an Integer") - System.exit(2) - } - } - if(req_params.contains(key)) { - if(!file(value).exists()) { - println("ERROR: $key not found. Please check the given path.") - System.exit(2) - } - } -} -if (!("${params.identity}" ==~ /^0\.[8-9][[0-9]*]?|^1(\.0)?/ )){ - println("ERROR: --identity needs to be float in range 0.8 to 1.0") - System.exit(2) -} - -if (!valid_organism.contains(params.organism)) { - println("ERROR: Invalid organism! Valid organisms: " + valid_organism) - System.exit(2) -} -if (!("${params.percentage}" ==~ /\d+/ ) || params.percentage < 0 || params.percentage > 100 ){ - println("ERROR: --percentage needs to be an Integer between 0 and 100") - System.exit(2) -} - -if (!("${params.tomtom_treshold}" ==~ /([0-9]*\.[0-9]+|[0-9]+)/)) { - println("ERROR: --tomtom_treshold needs to be an Integer or float, e.g. 0.01") - System.exit(2) -} -if (!("${params.motif_similarity_thresh}" ==~ /([0-9]*\.[0-9]+|[0-9]+)/)) { - println("ERROR: --motif_similarity_thresh needs to be an Integer or float, e.g. 0.01") - System.exit(2) -} - - -path_bin = path_bin?.endsWith('/') ? path_bin.substring( 0, path_bin.length() -1 ) : path_bin - -bigwig_input.combine(bed_input).set{footprint_in} -/* -This process uses the uncontinuous score from a bigWig file to estimate footpints within peaks of interest -*/ -process footprint_extraction { - conda "${path_env}" - - tag{name} - publishDir "${params.out}/footprint_extraction/", mode: 'copy', pattern: '*.bed' - publishDir "${params.out}/footprint_extraction/log", mode: 'copy', pattern: '*.log' - - input: - set name, file (bigWig), file (bed) from footprint_in - - output: - set name, file ('*.bed') into bed_for_overlap_with_TFBS - - script: - """ - python ${path_bin}/call_peaks.py --bigwig ${bigWig} --bed ${bed} --output_file ${name}_called_peaks.bed --window_length ${params.window_length} --step ${params.step} --percentage ${params.percentage} - """ -} - - -/* - -*/ -process extract_known_TFBS { - - conda "${path_env}" - - publishDir "${params.out}/known_TFBS/", mode: 'copy', pattern: '*.bed' - - input: - file (fasta) from fa_overlap - file (db) from db_for_motivscan - - output: - val ('done') into known_TFBS_for_overlap - - when: - params.tfbs_path == "" - - script: - """ - python ${path_bin}/tfbsscan.py --use moods --core ${params.threads} -m ${db} -g ${fasta} -o ${params.tmpdir} - """ -} - -/* -Sets path to known tfbs BED-files. If tfbs_path is set the process extract_known_TFBS is skipped, so the paths are differnt. 
-*/ -if(params.tfbs_path == "") { - bed_for_overlap_with_TFBS.combine(known_TFBS_for_overlap.toList()).combine(fa_overlap_2).set {for_overlap} - motif_path = params.tmpdir -} else { - Channel.from('skipped').set {workaround} - bed_for_overlap_with_TFBS.combine(workaround).combine(fa_overlap_2).set {for_overlap} - tfbs = params.tfbs_path?.endsWith('/') ? params.tfbs_path: params.tfbs_path.substring( 0, params.tfbs_path.length() -1 ) - motif_path = tfbs -} - - -/* -*/ -process overlap_with_known_TFBS { - conda "${path_env}" - - publishDir "${params.out}/unknown_overlap/" - - input: - set name, file (bed_footprints), val (bed_motifs), file (fasta) from for_overlap - - output: - set name, file ("${name}_unknown.bed") into bed_for_reducing - - script: - motif_list = bed_motifs.toString().replaceAll(/\s|\[|\]/,"") - """ - ${path_bin}/compareBed.sh --data ${bed_footprints} --motifs ${motif_path} --fasta ${fasta} -o ${name}_unknown.bed -min ${params.min_size_fp} -max ${params.max_size_fp} -p ${path_bin} - """ -} - - -/* -*/ -process reduce_bed { - conda "${path_env}" - echo true - - input: - set name, file (bed) from bed_for_reducing - - output: - set name, file ('*.bed') into bed_for_clustering - - script: - """ - Rscript ${path_bin}/reduce_bed.R -i ${bed} -k ${params.kmer} -m ${params.aprox_motif_len} -o ${name}_reduced.bed -t ${params.threads} -f ${params.motif_occurence} -s ${params.min_seq_length} - """ -} - - -/* -*/ -process clustering { - conda "${path_env}" - echo true - - publishDir "${params.out}/cluster/", mode: 'copy', pattern: '*.bed' - - input: - set name, file (bed) from bed_for_clustering - - output: - set name, file ('*.bed') into bed_for_motif_esitmation - - script: - """ - Rscript ${path_bin}/cdhit_wrapper.R -i ${bed} -A ${params.sequence_coverage} -o ${name}_clusterd.bed -c ${params.identity} -G ${params.global} -M ${params.memory} -l ${params.throw_away_seq} -r ${params.strand} -T ${params.threads} - """ -} - - -/* -Converting BED-File to one FASTA-File per cluster -*/ -process bed_to_clustered_fasta { - conda "${path_env}" - publishDir "${params.out}/esimated_motifs/clustered_motifs/clustered_fasta/", mode: 'copy' - tag{name} - - input: - set name, file (bed) from bed_for_motif_esitmation - - output: - file ('*.FASTA') into fasta_for_glam2 - file ('*.FASTA') into fasta_for_motif_cluster - - script: - """ - Rscript ${path_bin}/bed_to_fasta.R ${bed} ${name} ${params.min_seq} - """ -} - - -//flatten list and adding name of file to channel value -fasta_for_glam2 = fasta_for_glam2.flatten().map {it -> [it.simpleName, it]} -//combine fasta files in one list -fasta_for_motif_cluster = fasta_for_motif_cluster.toList() - -/* -Running GLAM2 on FASTA-files. -Generating Motifs through alignment and scoring best local matches. -*/ -process glam2 { - - tag{name} - publishDir "${params.out}/esimated_motifs/clustered_motifs/${name}/", mode: 'copy' - - input: - set name, file (fasta) from fasta_for_glam2 - - output: - file("${name}/*.meme") into meme_to_merge - set name, file("${name}/*.meme") into meme_for_tomtom - set name, file("${name}/*.meme") into meme_for_filter - - script: - """ - glam2 n ${fasta} -O ./${name}/ -a ${params.motif_min_key} -b ${params.motif_max_key} -z 5 -n ${params.iteration} - """ -} - -/* -Combining all MEME-files to one big MEME-file. -The paths are sorted numerically depending on the cluster number. 
-*/ -process merge_meme { - - publishDir "${params.out}/esimated_motifs/merged_meme/", mode: 'copy' - - input: - val (memelist) from meme_to_merge.toList() - - output: - file ('merged_meme.meme') into merged_meme - - when: - params.cluster_motif == 1 - - script: - memes = memelist.collect{it.toString().replaceAll(/\/glam2.meme/,"") } - meme_sorted = memes.sort(false) { it.toString().tokenize('_')[-1] as Integer } - meme_sorted_full = meme_sorted.collect {it.toString() + "/glam2.meme"} - meme_list = meme_sorted_full.toString().replaceAll(/\,|\[|\]/,"") - """ - meme2meme ${meme_list} > merged_meme.meme - """ -} - -/* -Running Tomtom on merged meme-files. -Output table has the information which clusters are similar to each other. -*/ -process find_similar_motifs { - - publishDir "${params.out}/esimated_motifs/cluster_similarity/", mode: 'copy' - input: - file (merged_meme) from merged_meme - - output: - file ('all_to_all.tsv') into motif_similarity - - when: - params.cluster_motif == 1 - - script: - """ - tomtom ${merged_meme} ${merged_meme} -thresh ${params.motif_similarity_thresh} -text --norc | sed '/^#/ d' | sed '/^\$/d' > all_to_all.tsv - """ -} - - -files_for_merge_fasta = motif_similarity.combine(fasta_for_motif_cluster) - - - -/* -Merging FASTA-files of similar clusters -*/ -process merge_fasta { - conda "${path_env}" - publishDir "${params.out}/esimated_motifs/merged_fasta/", mode: 'copy' - - input: - set file (motiv_sim), val (fasta_list) from files_for_merge_fasta - - output: - file ('*.fasta') into motif_clustered_fasta_list - file('*.png') - - when: - params.cluster_motif == 1 - - script: - fa_sorted = fasta_list.sort(false) { it.getBaseName().tokenize('_')[-1] as Integer } - fastalist = fa_sorted.toString().replaceAll(/\s|\[|\]/,"") - """ - Rscript ${path_bin}/merge_similar_clusters.R ${motiv_sim} ${fastalist} ${params.edge_weight} - """ -} - - -motif_clustered_fasta_flat = motif_clustered_fasta_list.flatten() - - -process clustered_glam2 { - - publishDir "${params.out}/final_esimated_motifs/${name}/", mode: 'copy' - - input: - file (fasta) from motif_clustered_fasta_flat - - output: - set name, file ('*.meme') into clustered_meme_for_tomtom - set name, file ('*.meme') into clustered_meme_for_filter - file('*') - - when: - params.cluster_motif == 1 - - script: - name = fasta.getBaseName() - """ - glam2 n ${fasta} -O . -a ${params.motif_min_key} -b ${params.motif_max_key} -z 5 -n ${params.iteration} - """ -} - -if(params.cluster_motif == 1){ - for_tomtom = clustered_meme_for_tomtom - for_filter = clustered_meme_for_filter -} else { - for_tomtom = meme_for_tomtom - for_filter = meme_for_filter -} - - -/* -Running Tomtom on meme-files generated by GLAM2. -Tomtom searches motifs in databases. -*/ -process tomtom { - - tag{name} - publishDir "${params.out}/final_esimated_motifs/tomtom/", mode: 'copy' - - input: - set name, file (meme) from for_tomtom - - output: - set name, file ('*.tsv') into tsv_for_filter - - script: - """ - tomtom ${meme} ${params.motif_db} -thresh ${params.thresh} -mi 1 -text | sed '/^#/ d' | sed '/^\$/d' > ${name}_known_motif.tsv - """ -} - - -//Joining channels with meme and tsv files. Filter joined channel on line count. -//Only meme-files which corresponding tsv files have linecount <= 1 are writen to next channel. 
-for_filter2 = for_filter.join( tsv_for_filter ) -for_filter2 - .filter { name, meme, tsv -> - long count = tsv.readLines().size() - count <= 1 - } - .into { meme_for_scan; check } - - -//If channel 'check' is empty print errormessage -process check_for_unknown_motifs { - echo true - - input: - val x from check.ifEmpty('EMPTY') - - when: - x == 'EMPTY' - - """ - echo '>>> STOPPED: No unknown Motifs were found.' - """ - -} - - -//Get the best(first) Motif from each MEME-file -process get_best_motif { - conda "${path_env}" - - publishDir "${params.out}/esimated_motifs/unknown_motifs/", mode: 'copy' - - input: - set name, file(meme), file(tsv) from meme_for_scan - - output: - set name, file('*_best.meme') into best_motif - - script: - """ - python ${path_bin}/get_best_motif.py ${meme} ${name}_best.meme ${params.best_motif} - """ -} - - -best_motif.combine(fa_scan).set {files_for_genome_scan} - -/* -process genome_scan { - conda "${path_env}" - - input: - set name, file(meme), file(fasta) from files_for_genome_scan - - output: - file ('.bed') into bed_for_uropa, bed_for_cluster_quality - - script: - """ - """ -} - - -process cluster_quality { - - input: - file (bed) from bed_for_cluster_quality - - output: - file ('*.bed') into bed_for_final_filter - - script: - """ - """ -} */ - - -process create_GTF { - conda "${path_env}" - - publishDir 'Path', mode:'copy' - - output: - file ('*.gtf') into gtf_for_uropa - - script: - """ - python ${path_bin}/RegGTFExtractor.py ${params.organism} --tissue ${params.tissues} --wd ${path_bin} - """ -} - -/* -bed_for_final_filter.combine(gtf_for_uropa).set {uropa_in} - - -// Create configuration file for UROPA. -// Takes template and replaces bed- and gtf-placeholders with actual paths. -process create_uropa_config { - - publishDir '/mnt/agnerds/Rene.Wiegandt/10_Master/', mode: 'copy' - - input: - set val(bed), val(gtf) from uropa_in.toList() - file (conf) from config - - output: - file ('uropa.config') into uropa_config - - script: - """ - sed -- 's/placeholder_gtf/${gtf}/g; s/placeholder_bed/${bed}/g' ${conf} > uropa.config.final - """ -} - - -process UROPA { - - input: - file (config) from uropa_config - - output: - set file ("*_allhits.txt"), file ("*_finalhits.txt") into uropa_for_filter - - script: - """ - """ -} - - -process filter { - - input: - - output: - - script: - """ - """ -} */ +#!/usr/bin/env nextflow + +//setting default values + params.bigwig="" + params.bed="" + params.genome_fasta="" + params.motif_db="" + params.config="" + params.tfbs_path="" + params.create_known_tfbs_path = "./" + +//peak_calling + params.window_length = 200 + params.step = 100 + params.percentage = 0 + +//filter_unknown_motifs + params.min_size_fp=10 + params.max_size_fp=100 + +//clustering + //reduce_bed + params.kmer=10 + params.aprox_motif_len=10 + params.motif_occurence=1 + params.min_seq_length=10 + + //cdhit_wrapper + params.global=0 + params.identity=0.8 + params.sequence_coverage=8 + params.memory=800 + params.throw_away_seq=9 + params.strand=0 + +//motif_estimation + //bed_to_clustered_fasta + params.min_seq = 100 // Minimum number of sequences in the fasta-files for glam2 + + //glam2 + params.motif_min_key = 8 // Minimum number of key positions (aligned columns) + params.motif_max_key = 20 // Maximum number of key positions (aligned columns) + params.iteration = 10000 // Number of Iterations done by glam2. A high iteration number equals a more accurate result but with an higher runtime. 
+
+    //tomtom
+    params.tomtom_treshold = 0.01 // threshold for similarity score.
+
+    //cluster motifs
+    params.cluster_motif = 0 // Boolean: if 1, motifs are clustered; otherwise they are not
+    params.edge_weight = 50 // Minimum weight of edges in motif-cluster-graph
+    params.motif_similarity_thresh = 0.00001 // threshold for motif similarity score
+
+    params.best_motif = 3 // Top n motifs per cluster
+
+//creating_gtf
+    params.organism="hg38"
+    params.tissues=""
+
+if (params.bigwig == "" || params.bed == "" || params.genome_fasta == "" || params.motif_db == "" || params.config == ""){
+log.info """
+Usage: nextflow run pipeline.nf --bigwig [BigWig-file] --bed [BED-file] --genome_fasta [FASTA-file] --motif_db [MEME-file]
+
+Required arguments:
+    --bigwig                  Path to BigWig-file
+    --bed                     Path to BED-file
+    --genome_fasta            Path to genome in FASTA-format
+    --motif_db                Path to motif-database in MEME-format
+    --config                  Path to UROPA configuration file
+    --create_known_tfbs_path  Path to directory where the output from tfbsscan (known motifs) is stored.
+                              Path can be set as tfbs_path in the next run. (Default: './')
+Optional arguments:
+
+    --tfbs_path               Path to directory with output from tfbsscan. If given, tfbsscan will not be run.
+
+    Footprint extraction:
+    --window_length INT       (Default: 200)
+    --step INT                (Default: 100)
+    --percentage INT          (Default: 0)
+
+    Filter unknown motifs:
+    --min_size_fp INT         (Default: 10)
+    --max_size_fp INT         (Default: 100)
+
+    Clustering:
+    Sequence preparation/ reduction:
+    --kmer INT                Kmer length (Default: 10)
+    --aprox_motif_len INT     Motif length (Default: 10)
+    --motif_occurence FLOAT   Percentage of motifs over all sequences. Use 1 (Default) to assume every sequence contains a motif.
+    --min_seq_length INT      Remove all sequences below this value. (Default: 10)
+
+    Clustering:
+    --global INT              Global (=1) or local (=0) alignment. (Default: 0)
+    --identity FLOAT          Identity threshold. (Default: 0.8)
+    --sequence_coverage INT   Minimum aligned nucleotides on both sequences. (Default: 8)
+    --memory INT              Memory limit in MB. 0 for unlimited. (Default: 800)
+    --throw_away_seq INT      Remove all sequences equal or below this length before clustering. (Default: 9)
+    --strand INT              Align +/+ & +/- (= 1). Or align only +/+ (= 0). (Default: 0)
+
+    Motif estimation:
+    --min_seq INT             Minimum number of sequences required in the FASTA-files given to GLAM2. (Default: 100)
+    --motif_min_key INT       Minimum number of key positions (aligned columns) in the alignment done by GLAM2. (Default: 8)
+    --motif_max_key INT       Maximum number of key positions (aligned columns) in the alignment done by GLAM2. (Default: 20)
+    --iteration INT           Number of iterations done by glam2. More iterations: better results, higher runtime. (Default: 10000)
+    --tomtom_treshold float   Threshold for similarity score. (Default: 0.01)
+    --best_motif INT          Get the best X motifs per cluster. (Default: 3)
+
+    Motif clustering:
+    --cluster_motif Boolean   If 1, the pipeline clusters motifs; if 0, it does not. (Default: 0)
+    --edge_weight INT         Minimum weight of edges in motif-cluster-graph (Default: 5)
+    --motif_similarity_thresh FLOAT    Threshold for motif similarity score (Default: 0.00001)
+
+    Creating GTF:
+    --organism [hg38 | hg19 | mm9 | mm10]    Input organism
+    --tissues List/String     List of one or more keywords for tissue-/category activity; categories must be
+                              specified as in the JSON config
+
+All arguments can be set in the configuration files.
+"""
+} else {
+    Channel.fromPath(params.bigwig).map {it -> [it.simpleName, it]}.set {bigwig_input}
+    Channel.fromPath(params.bed).set {bed_input}
+    Channel.fromPath(params.genome_fasta).into {fa_overlap; fa_scan; fa_overlap_2}
+    Channel.fromPath(params.motif_db).into {db_for_motivscan; db_for_tomtom}
+    Channel.fromPath(params.config).set {config}
+}
+
+/*
+Checking for parameter input!
+*/
+int_params = ["window_length", "step", "min_size_fp", "max_size_fp", "kmer",
+              "aprox_motif_len", "motif_occurence", "min_seq_length", "global",
+              "sequence_coverage", "memory", "throw_away_seq", "strand",
+              "min_seq", "motif_min_key", "motif_max_key", "iteration",
+              "edge_weight", "best_motif"]
+req_params = ["bigwig", "bed", "genome_fasta", "motif_db", "config"]
+
+valid_organism = ["hg38", "hg19", "mm9", "mm10"]
+
+params.each { key, value ->
+    if(int_params.contains(key)) {
+        if (!("${value}" ==~ /\d+/ )){
+            println("ERROR: $key needs to be an Integer")
+            System.exit(2)
+        }
+    }
+    if(req_params.contains(key)) {
+        if(!file(value).exists()) {
+            println("ERROR: $key not found. Please check the given path.")
+            System.exit(2)
+        }
+    }
+}
+if (!("${params.identity}" ==~ /^0\.[8-9][[0-9]*]?|^1(\.0)?/ )){
+    println("ERROR: --identity needs to be float in range 0.8 to 1.0")
+    System.exit(2)
+}
+
+if (!valid_organism.contains(params.organism)) {
+    println("ERROR: Invalid organism! Valid organisms: " + valid_organism)
+    System.exit(2)
+}
+if (!("${params.percentage}" ==~ /\d+/ ) || params.percentage < 0 || params.percentage > 100 ){
+    println("ERROR: --percentage needs to be an Integer between 0 and 100")
+    System.exit(2)
+}
+
+if (!("${params.tomtom_treshold}" ==~ /([0-9]*\.[0-9]+|[0-9]+)/)) {
+    println("ERROR: --tomtom_treshold needs to be an Integer or float, e.g. 0.01")
+    System.exit(2)
+}
+if (!("${params.motif_similarity_thresh}" ==~ /([0-9]*\.[0-9]+|[0-9]+)/)) {
+    println("ERROR: --motif_similarity_thresh needs to be an Integer or float, e.g. 0.01")
+    System.exit(2)
+}
+
+
+path_bin = path_bin?.endsWith('/') ? 
path_bin.substring( 0, path_bin.length() -1 ) : path_bin + +bigwig_input.combine(bed_input).set{footprint_in} +/* +This process uses the uncontinuous score from a bigWig file to estimate footpints within peaks of interest +*/ +process footprint_extraction { + conda "${path_env}" + + tag{name} + publishDir "${params.out}/footprint_extraction/", mode: 'copy', pattern: '*.bed' + publishDir "${params.out}/footprint_extraction/log", mode: 'copy', pattern: '*.log' + + input: + set name, file (bigWig), file (bed) from footprint_in + + output: + set name, file ('*.bed') into bed_for_overlap_with_TFBS + + script: + """ + python ${path_bin}/call_peaks.py --bigwig ${bigWig} --bed ${bed} --output_file ${name}_called_peaks.bed --window_length ${params.window_length} --step ${params.step} --percentage ${params.percentage} + """ +} + + +/* + +*/ +process extract_known_TFBS { + + conda "${path_env}" + + publishDir "${params.out}/known_TFBS/", mode: 'copy', pattern: '*.bed' + + input: + file (fasta) from fa_overlap + file (db) from db_for_motivscan + + output: + val ('done') into known_TFBS_for_overlap + + when: + params.tfbs_path == "" + + script: + """ + python ${path_bin}/tfbsscan.py --use moods --core ${params.threads} -m ${db} -g ${fasta} -o ${params.create_known_tfbs_path} + """ +} + +/* +Sets path to known tfbs BED-files. If tfbs_path is set the process extract_known_TFBS is skipped, so the paths are differnt. +*/ +if(params.tfbs_path == "") { + bed_for_overlap_with_TFBS.combine(known_TFBS_for_overlap.toList()).combine(fa_overlap_2).set {for_overlap} + motif_path = params.create_known_tfbs_path +} else { + Channel.from('skipped').set {workaround} + bed_for_overlap_with_TFBS.combine(workaround).combine(fa_overlap_2).set {for_overlap} + tfbs = params.tfbs_path?.endsWith('/') ? 
params.tfbs_path: params.tfbs_path.substring( 0, params.tfbs_path.length() -1 ) + motif_path = tfbs +} + + +/* +*/ +process overlap_with_known_TFBS { + conda "${path_env}" + + publishDir "${params.out}/unknown_overlap/", mode :'copy' + + input: + set name, file (bed_footprints), val (bed_motifs), file (fasta) from for_overlap + + output: + set name, file ("${name}_unknown.bed") into bed_for_reducing + + script: + motif_list = bed_motifs.toString().replaceAll(/\s|\[|\]/,"") + """ + ${path_bin}/compareBed.sh --data ${bed_footprints} --motifs ${motif_path} --fasta ${fasta} -o ${name}_unknown.bed -min ${params.min_size_fp} -max ${params.max_size_fp} -p ${path_bin} + """ +} + + +/* +*/ +process reduce_bed { + conda "${path_env}" + echo true + + input: + set name, file (bed) from bed_for_reducing + + output: + set name, file ('*.bed') into bed_for_clustering + + script: + """ + Rscript ${path_bin}/reduce_bed.R -i ${bed} -k ${params.kmer} -m ${params.aprox_motif_len} -o ${name}_reduced.bed -t ${params.threads} -f ${params.motif_occurence} -s ${params.min_seq_length} + """ +} + + +/* +*/ +process clustering { + conda "${path_env}" + echo true + + publishDir "${params.out}/cluster/", mode: 'copy', pattern: '*.bed' + + input: + set name, file (bed) from bed_for_clustering + + output: + set name, file ('*.bed') into bed_for_motif_esitmation + + script: + """ + Rscript ${path_bin}/cdhit_wrapper.R -i ${bed} -A ${params.sequence_coverage} -o ${name}_clusterd.bed -c ${params.identity} -G ${params.global} -M ${params.memory} -l ${params.throw_away_seq} -r ${params.strand} -T ${params.threads} + """ +} + + +/* +Converting BED-File to one FASTA-File per cluster +*/ +process bed_to_clustered_fasta { + conda "${path_env}" + publishDir "${params.out}/esimated_motifs/clustered_motifs/clustered_fasta/", mode: 'copy' + tag{name} + + input: + set name, file (bed) from bed_for_motif_esitmation + + output: + file ('*.FASTA') into fasta_for_glam2 + file ('*.FASTA') into fasta_for_motif_cluster + + script: + """ + Rscript ${path_bin}/bed_to_fasta.R ${bed} ${name} ${params.min_seq} + """ +} + + +//flatten list and adding name of file to channel value +fasta_for_glam2 = fasta_for_glam2.flatten().map {it -> [it.simpleName, it]} +//combine fasta files in one list +fasta_for_motif_cluster = fasta_for_motif_cluster.toList() + +/* +Running GLAM2 on FASTA-files. +Generating Motifs through alignment and scoring best local matches. +*/ +process glam2 { + + tag{name} + publishDir "${params.out}/esimated_motifs/clustered_motifs/${name}/", mode: 'copy' + + input: + set name, file (fasta) from fasta_for_glam2 + + output: + file("${name}/*.meme") into meme_to_merge + set name, file("${name}/*.meme") into meme_for_tomtom + set name, file("${name}/*.meme") into meme_for_filter + file ('*') + + script: + """ + glam2 n ${fasta} -O ./${name}/ -a ${params.motif_min_key} -b ${params.motif_max_key} -z 5 -n ${params.iteration} + """ +} + +/* +Combining all MEME-files to one big MEME-file. +The paths are sorted numerically depending on the cluster number. 
+*/ +process merge_meme { + + publishDir "${params.out}/esimated_motifs/merged_meme/", mode: 'copy' + + input: + val (memelist) from meme_to_merge.toList() + + output: + file ('merged_meme.meme') into merged_meme + + when: + params.cluster_motif == 1 + + script: + memes = memelist.collect{it.toString().replaceAll(/\/glam2.meme/,"") } + meme_sorted = memes.sort(false) { it.toString().tokenize('_')[-1] as Integer } + meme_sorted_full = meme_sorted.collect {it.toString() + "/glam2.meme"} + meme_list = meme_sorted_full.toString().replaceAll(/\,|\[|\]/,"") + """ + meme2meme ${meme_list} > merged_meme.meme + """ +} + +/* +Running Tomtom on merged meme-files. +Output table has the information which clusters are similar to each other. +*/ +process find_similar_motifs { + + publishDir "${params.out}/esimated_motifs/cluster_similarity/", mode: 'copy' + input: + file (merged_meme) from merged_meme + + output: + file ('all_to_all.tsv') into motif_similarity + + when: + params.cluster_motif == 1 + + script: + """ + tomtom ${merged_meme} ${merged_meme} -thresh ${params.motif_similarity_thresh} -text --norc | sed '/^#/ d' | sed '/^\$/d' > all_to_all.tsv + """ +} + + +files_for_merge_fasta = motif_similarity.combine(fasta_for_motif_cluster) + + + +/* +Merging FASTA-files of similar clusters +*/ +process merge_fasta { + conda "${path_env}" + publishDir "${params.out}/esimated_motifs/merged_fasta/", mode: 'copy' + + input: + set file (motiv_sim), val (fasta_list) from files_for_merge_fasta + + output: + file ('*.fasta') into motif_clustered_fasta_list + file('*.png') + + when: + params.cluster_motif == 1 + + script: + fa_sorted = fasta_list.sort(false) { it.getBaseName().tokenize('_')[-1] as Integer } + fastalist = fa_sorted.toString().replaceAll(/\s|\[|\]/,"") + """ + Rscript ${path_bin}/merge_similar_clusters.R ${motiv_sim} ${fastalist} ${params.edge_weight} + """ +} + + +motif_clustered_fasta_flat = motif_clustered_fasta_list.flatten() + + +process clustered_glam2 { + + publishDir "${params.out}/final_esimated_motifs/${name}/", mode: 'copy' + + input: + file (fasta) from motif_clustered_fasta_flat + + output: + set name, file ('*.meme') into clustered_meme_for_tomtom + set name, file ('*.meme') into clustered_meme_for_filter + file('*') + + when: + params.cluster_motif == 1 + + script: + name = fasta.getBaseName() + """ + glam2 n ${fasta} -O . -a ${params.motif_min_key} -b ${params.motif_max_key} -z 5 -n ${params.iteration} + """ +} + +if(params.cluster_motif == 1){ + for_tomtom = clustered_meme_for_tomtom + for_filter = clustered_meme_for_filter +} else { + for_tomtom = meme_for_tomtom + for_filter = meme_for_filter +} + + +/* +Running Tomtom on meme-files generated by GLAM2. +Tomtom searches motifs in databases. +*/ +process tomtom { + + tag{name} + publishDir "${params.out}/final_esimated_motifs/tomtom/", mode: 'copy' + + input: + set name, file (meme) from for_tomtom + + output: + set name, file ('*.tsv') into tsv_for_filter + + script: + """ + tomtom ${meme} ${params.motif_db} -thresh ${params.tomtom_treshold} -mi 1 -text | sed '/^#/ d' | sed '/^\$/d' > ${name}_known_motif.tsv + """ +} + + +//Joining channels with meme and tsv files. Filter joined channel on line count. +//Only meme-files which corresponding tsv files have linecount <= 1 are writen to next channel. 
+for_filter2 = for_filter.join( tsv_for_filter ) +for_filter2 + .filter { name, meme, tsv -> + long count = tsv.readLines().size() + count <= 1 + } + .into { meme_for_scan; check } + + +//If channel 'check' is empty print errormessage +process check_for_unknown_motifs { + echo true + + input: + val x from check.ifEmpty('EMPTY') + + when: + x == 'EMPTY' + + """ + echo '>>> STOPPED: No unknown Motifs were found.' + """ + +} + + +//Get the best(first) Motif from each MEME-file +process get_best_motif { + conda "${path_env}" + + publishDir "${params.out}/esimated_motifs/unknown_motifs/", mode: 'copy' + + input: + set name, file(meme), file(tsv) from meme_for_scan + + output: + set name, file('*_best.meme') into best_motif + + script: + """ + python ${path_bin}/get_best_motif.py ${meme} ${name}_best.meme ${params.best_motif} + """ +} + + +best_motif.combine(fa_scan).set {files_for_genome_scan} + +/* +process genome_scan { + conda "${path_env}" + + input: + set name, file(meme), file(fasta) from files_for_genome_scan + + output: + file ('.bed') into bed_for_uropa, bed_for_cluster_quality + + script: + """ + """ +} + + +process cluster_quality { + + input: + file (bed) from bed_for_cluster_quality + + output: + file ('*.bed') into bed_for_final_filter + + script: + """ + """ +} */ + + +process create_GTF { + conda "${path_env}" + + publishDir 'Path', mode:'copy' + + output: + file ('*.gtf') into gtf_for_uropa + + script: + """ + python ${path_bin}/RegGTFExtractor.py ${params.organism} --tissue ${params.tissues} --wd ${path_bin} + """ +} + +/* +bed_for_final_filter.combine(gtf_for_uropa).set {uropa_in} + + +// Create configuration file for UROPA. +// Takes template and replaces bed- and gtf-placeholders with actual paths. +process create_uropa_config { + + publishDir '/mnt/agnerds/Rene.Wiegandt/10_Master/', mode: 'copy' + + input: + set val(bed), val(gtf) from uropa_in.toList() + file (conf) from config + + output: + file ('uropa.config') into uropa_config + + script: + """ + sed -- 's/placeholder_gtf/${gtf}/g; s/placeholder_bed/${bed}/g' ${conf} > uropa.config.final + """ +} + + +process UROPA { + + input: + file (config) from uropa_config + + output: + set file ("*_allhits.txt"), file ("*_finalhits.txt") into uropa_for_filter + + script: + """ + """ +} + + +process filter { + + input: + + output: + + script: + """ + """ +} */
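
As a quick reference for the usage text above, a minimal invocation could look like the sketch below. All input file names are placeholders; only the required arguments plus the known-TFBS options documented in the README are shown.

```
# First run: all required arguments; tfbsscan output is written to ./known_tfbs/
# so it can be reused later (file names are placeholders).
nextflow run pipeline.nf --bigwig footprints.bw --bed peaks.bed \
    --genome_fasta genome.fa --motif_db motifs.meme --config uropa_template.config \
    --create_known_tfbs_path ./known_tfbs/ --organism hg38

# Subsequent run: point --tfbs_path at the directory from the first run,
# which skips the tfbsscan step.
nextflow run pipeline.nf --bigwig footprints.bw --bed peaks.bed \
    --genome_fasta genome.fa --motif_db motifs.meme --config uropa_template.config \
    --tfbs_path ./known_tfbs/ --organism hg38
```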