diff --git a/README.md b/README.md index 3bf19c3..1aa01e6 100644 --- a/README.md +++ b/README.md @@ -41,7 +41,7 @@ When the enviroment is created, set the variable 'path_env' in the configuration ## Quick Start ```console -nextflow run pipeline.nf --bigwig [BigWig-file] --bed [BED-file] --genome_fasta [FASTA-file] --motif_db [MEME-file] +nextflow run pipeline.nf --bigwig [BigWig-file] --bed [BED-file] --genome_fasta [FASTA-file] --motif_db [MEME-file] --config [UROPA-config-file] ``` ## Parameters For a detailed overview for all parameters follow this [link](https://github.molgen.mpg.de/loosolab/masterJLU2018/wiki/Configuration). @@ -54,26 +54,29 @@ Required arguments: --config Path to UROPA configuration file --create_known_tfbs_path Path to directory where output from tfbsscan (known motifs) are stored. Path can be set as tfbs_path in next run. (Default: './') + --out Output Directory (Default: './out/') + Optional arguments: - - --tfbs_path Path to directory with output from tfbsscan. If given tfbsscan will not be run. + --help [0|1] 1 to show this help message. (Default: 0) + --tfbs_path Path to directory with output from tfbsscan. If given tfbsscan will not be run. + Footprint extraction: - --window_length INT (Default: 200) - --step INT (Default: 100) - --percentage INT (Default: 0) - + --window_length INT This parameter sets the length of a sliding window. (Default: 200) + --step INT This parameter sets the number of positions to slide the window forward. (Default: 100) + --percentage INT Threshold in percent (Default: 0) + Filter unknown motifs: - --min_size_fp INT (Default: 10) - --max_size_fp INT (Default: 100) - + --min_size_fp INT Minimum sequence length threshold. Smaller sequences are discarded. (Default: 10) + --max_size_fp INT Maximum sequence length threshold. Discards all sequences longer than this value. 
(Default: 100) + Clustering: Sequence preparation/ reduction: --kmer INT Kmer length (Default: 10) --aprox_motif_len INT Motif length (Default: 10) --motif_occurence FLOAT Percentage of motifs over all sequences. Use 1 (Default) to assume every sequence contains a motif. --min_seq_length Interations Remove all sequences below this value. (Default: 10) - + Clustering: --global INT Global (=1) or local (=0) alignment. (Default: 0) --identity FLOAT Identity threshold. (Default: 0.8) @@ -81,7 +84,7 @@ Optional arguments: --memory INT Memory limit in MB. 0 for unlimited. (Default: 800) --throw_away_seq INT Remove all sequences equal or below this length before clustering. (Default: 9) --strand INT Align +/+ & +/- (= 1). Or align only +/+ (= 0). (Default: 0) - + Motif estimation: --min_seq INT Sets the minimum number of sequences required for the FASTA-files given to GLAM2. (Default: 100) --motif_min_key INT Minimum number of key positions (aligned columns) in the alignment done by GLAM2. (Default: 8) @@ -89,12 +92,12 @@ Optional arguments: --iteration INT Number of iterations done by glam2. More Iterations: better results, higher runtime. (Default: 10000) --tomtom_treshold float Threshold for similarity score. (Default: 0.01) --best_motif INT Get the best X motifs per cluster. (Default: 3) - + Moitf clustering: --cluster_motif Boolean If 1 pipeline clusters motifs. If its 0 it does not. 
(Defaul: 0) --edge_weight INT Minimum weight of edges in motif-cluster-graph (Default: 5) --motif_similarity_thresh FLOAT Threshold for motif similarity score (Default: 0.00001) - + Creating GTF: --organism [hg38 | hg19 | mm9 | mm10] Input organism --tissues List/String List of one or more keywords for tissue-/category-activity, categories must be specified as in JSON diff --git a/bin/bed_to_fasta.R b/bin/bed_to_fasta.R index 6cefe43..84910f3 100644 --- a/bin/bed_to_fasta.R +++ b/bin/bed_to_fasta.R @@ -14,14 +14,14 @@ min_seq <- args[3] bed <- data.table::fread(bedInput, header = FALSE, sep = "\t") -clusters <- split(bed, bed$V11, sorted = TRUE, flatten = FALSE) # <---- Spalte mit Cluster +clusters <- split(bed, bed$V11, sorted = TRUE, flatten = FALSE) # <---- Cluster column discard <- lapply(1:length(clusters), function(i){ clust <- as.data.frame(clusters[i]) print(nrow(clust)) if (nrow(clust) >= as.numeric(min_seq) ) { - sequences <- as.list(clust[[10]]) # <---- Splate mit Sequenz + sequences <- as.list(clust[[10]]) # <---- sequenze column outfile <- paste0(prefix,"_cluster_",i,".FASTA") - seqinr::write.fasta(sequences = sequences, names = clust[[4]], file.out = outfile, as.string = TRUE) # <---- Spalte mit Name + seqinr::write.fasta(sequences = sequences, names = clust[[4]], file.out = outfile, as.string = TRUE) # <---- Name column } else { print(paste0("Cluster: ",i," is to small")) } diff --git a/bin/merge_similar_clusters.R b/bin/merge_similar_clusters.R index e72fa03..03b8cf1 100644 --- a/bin/merge_similar_clusters.R +++ b/bin/merge_similar_clusters.R @@ -1,7 +1,7 @@ #!/usr/bin/env Rscript # Merging FASTA-files, which motifs are similar. -# +# # @parameter tsv_in Path to TSV file generated by Tomtom. # The input for Tomtom is a from all clusters merged meme-file. 
# @parameter file_list Numerically sorted whitespace separated list of absolute fasta-file paths @@ -10,7 +10,7 @@ args = commandArgs(trailingOnly = TRUE) -tsv_in <- args[1] +tsv_in <- args[1] file_list <- args[2] min_weight <- args[3] @@ -38,7 +38,7 @@ edgelist <- sim_not_unique[query_cluster != target_cluster] g <- igraph::graph_from_edgelist(as.matrix(edgelist)) # converting graph to adjacency matrix adj_matrix <- igraph::get.adjacency(g, names = T) -# generating weighted graph from adjacency matrix +# generating weighted graph from adjacency matrix g_adj <- igraph::graph_from_adjacency_matrix(adj_matrix, weighted = T) # get subgraphs from graph with edges of weight > min_weight @@ -47,7 +47,11 @@ png('motif_clusters.png') plot(s1) dev.off() clust <- igraph::clusters(s1) - +if (clust$no < 1){ + b <- lapply(files, function(f){ + system(paste("cat",f,">",basename(f))) + }) +} # merge FASTA-files depending on the clustered graphs a <- lapply(seq(from = 1, to = clust$no, by = 1), function(i){ cl <- as.vector(which(clust$membership %in% c(i))) diff --git a/bin/motif_estimation.nf b/bin/motif_estimation.nf new file mode 100644 index 0000000..430040e --- /dev/null +++ b/bin/motif_estimation.nf @@ -0,0 +1,283 @@ + +params.bed = "" +params.out = "" +params.motif_db = "" +params.path_env = "" +params.help = 0 +params.path_bin = "." + +//bed_to_clustered_fasta +params.min_seq = 100 // Minimum number of sequences in the fasta-files for glam2 + +//glam2 +params.motif_min_key = 8 // Minimum number of key positions (aligned columns) +params.motif_max_key = 20 // Maximum number of key positions (aligned columns) +params.iteration = 10000 // Number of Iterations done by glam2. A high iteration number equals a more accurate result but with an higher runtime. + +//tomtom +params.tomtom_treshold = 0.01 // threshold for similarity score. 
+ +//cluster motifs +params.cluster_motif = 0 // Boolean if 1 motifs are clustered else they are not +params.edge_weight = 50 // Minimum weight of edges in motif-cluster-graph +params.motif_similarity_thresh = 0.00001 // threshold for motif similarity score + +if (params.bed == "" || params.out == "" || params.motif_db == "" || params.path_env == "" || "${params.help}" == "1") { +log.info """ +Usage: nextflow run motif_estimation.nf --bed [PATH] --out [PATH] --motif_db [PATH] --path_env YAML-file +--bed Path Input BED-file (Column with cluster: 11; column with sequence: 10)!! +--out Path Output Path +--motif_db Path Path to MEME-Database +--path_env Path Path to YAML file for conda environment. +--path_bin Path Path to directory with subscripts +--help 0|1 Print help message if help == 1 (Default: 0) + +Motif estimation: +--min_seq INT Sets the minimum number of sequences required for the FASTA-files given to GLAM2. (Default: 100) +--motif_min_key INT Minimum number of key positions (aligned columns) in the alignment done by GLAM2. (Default: 8) +--motif_max_key INT Maximum number of key positions (aligned columns) in the alignment done by GLAM2. (Default: 20) +--iteration INT Number of iterations done by glam2. More Iterations: better results, higher runtime. (Default: 10000) +--tomtom_treshold float Threshold for similarity score. (Default: 0.01) +--best_motif INT Get the best X motifs per cluster. (Default: 3) + +Motif clustering: +--cluster_motif Boolean If 1 pipeline clusters motifs. If its 0 it does not. 
(Defaul: 0) +--edge_weight INT Minimum weight of edges in motif-cluster-graph (Default: 5) +--motif_similarity_thresh FLOAT Threshold for motif similarity score (Default: 0.00001) +""" +System.exit(2) +} else { + Channel.fromPath(params.bed).map {it -> [it.simpleName, it]}.set {bed_for_motif_esitmation} +} + +/* +Converting BED-File to one FASTA-File per cluster +*/ +process bed_to_clustered_fasta { + conda "${params.path_env}" + publishDir "${params.out}/esimated_motifs/clustered_motifs/clustered_fasta/", mode: 'copy' + tag{name} + + input: + set name, file (bed) from bed_for_motif_esitmation + + output: + file ('*.FASTA') into fasta_for_glam2 + file ('*.FASTA') into fasta_for_motif_cluster + + script: + """ + Rscript ${params.path_bin}/bed_to_fasta.R ${bed} ${name} ${params.min_seq} + """ +} + + +//flatten list and adding name of file to channel value +fasta_for_glam2 = fasta_for_glam2.flatten().map {it -> [it.simpleName, it]} +//combine fasta files in one list +fasta_for_motif_cluster = fasta_for_motif_cluster.toList() + +/* +Running GLAM2 on FASTA-files. +Generating Motifs through alignment and scoring best local matches. +*/ +process glam2 { + + tag{name} + publishDir "${params.out}/esimated_motifs/clustered_motifs/${name}/", mode: 'copy' + + input: + set name, file (fasta) from fasta_for_glam2 + + output: + file("${name}/*.meme") into meme_to_merge + set name, file("${name}/*.meme") into meme_for_tomtom + set name, file("${name}/*.meme") into meme_for_filter + file ('*') + + script: + """ + glam2 n ${fasta} -O ./${name}/ -a ${params.motif_min_key} -b ${params.motif_max_key} -z 5 -n ${params.iteration} + """ +} + +/* +Combining all MEME-files to one big MEME-file. +The paths are sorted numerically depending on the cluster number. 
+*/ +process merge_meme { + + publishDir "${params.out}/esimated_motifs/merged_meme/", mode: 'copy' + + input: + val (memelist) from meme_to_merge.toList() + + output: + file ('merged_meme.meme') into merged_meme + + when: + params.cluster_motif == 1 + + script: + memes = memelist.collect{it.toString().replaceAll(/\/glam2.meme/,"") } + meme_sorted = memes.sort(false) { it.toString().tokenize('_')[-1] as Integer } + meme_sorted_full = meme_sorted.collect {it.toString() + "/glam2.meme"} + meme_list = meme_sorted_full.toString().replaceAll(/\,|\[|\]/,"") + """ + meme2meme ${meme_list} > merged_meme.meme + """ +} + +/* +Running Tomtom on merged meme-files. +Output table has the information which clusters are similar to each other. +*/ +process find_similar_motifs { + + publishDir "${params.out}/esimated_motifs/cluster_similarity/", mode: 'copy' + input: + file (merged_meme) from merged_meme + + output: + file ('all_to_all.tsv') into motif_similarity + + when: + params.cluster_motif == 1 + + script: + """ + tomtom ${merged_meme} ${merged_meme} -thresh ${params.motif_similarity_thresh} -text --norc | sed '/^#/ d' | sed '/^\$/d' > all_to_all.tsv + """ +} + + +files_for_merge_fasta = motif_similarity.combine(fasta_for_motif_cluster) + + +/* +Merging FASTA-files of similar clusters +*/ +process merge_fasta { + conda "${params.path_env}" + publishDir "${params.out}/esimated_motifs/merged_fasta/", mode: 'copy' + + input: + set file (motiv_sim), val (fasta_list) from files_for_merge_fasta + + output: + file ('*.fasta') into motif_clustered_fasta_list + file('*.png') + + when: + params.cluster_motif == 1 + + script: + fa_sorted = fasta_list.sort(false) { it.getBaseName().tokenize('_')[-1] as Integer } + fastalist = fa_sorted.toString().replaceAll(/\s|\[|\]/,"") + """ + Rscript ${params.path_bin}/merge_similar_clusters.R ${motiv_sim} ${fastalist} ${params.edge_weight} + """ +} + + +motif_clustered_fasta_flat = motif_clustered_fasta_list.flatten() + + +process clustered_glam2 { + 
+ publishDir "${params.out}/final_esimated_motifs/${name}/", mode: 'copy' + + input: + file (fasta) from motif_clustered_fasta_flat + + output: + set name, file ('*.meme') into clustered_meme_for_tomtom + set name, file ('*.meme') into clustered_meme_for_filter + file('*') + + when: + params.cluster_motif == 1 + + script: + name = fasta.getBaseName() + """ + glam2 n ${fasta} -O . -a ${params.motif_min_key} -b ${params.motif_max_key} -z 5 -n ${params.iteration} + """ +} + +if(params.cluster_motif == 1){ + for_tomtom = clustered_meme_for_tomtom + for_filter = clustered_meme_for_filter +} else { + for_tomtom = meme_for_tomtom + for_filter = meme_for_filter +} + + +/* +Running Tomtom on meme-files generated by GLAM2. +Tomtom searches motifs in databases. +*/ +process tomtom { + + tag{name} + publishDir "${params.out}/esimated_motifs/tomtom/", mode: 'copy' + + input: + set name, file (meme) from for_tomtom + + output: + set name, file ('*.tsv') into tsv_for_filter + + script: + """ + tomtom ${meme} ${params.motif_db} -thresh ${params.tomtom_treshold} -mi 1 -text | sed '/^#/ d' | sed '/^\$/d' > ${name}_known_motif.tsv + """ +} + + +//Joining channels with meme and tsv files. Filter joined channel on line count. +//Only meme-files which corresponding tsv files have linecount <= 1 are writen to next channel. +for_filter2 = for_filter.join( tsv_for_filter ) +for_filter2 + .filter { name, meme, tsv -> + long count = tsv.readLines().size() + count <= 1 + } + .into { meme_for_scan; check } + + +//If channel 'check' is empty print errormessage +process check_for_unknown_motifs { + echo true + + input: + val x from check.ifEmpty('EMPTY') + + when: + x == 'EMPTY' + + """ + echo '>>> STOPPED: No unknown Motifs were found.' 
+ """ + +} + + +//Get the best(first) Motif from each MEME-file +process get_best_motif { + conda "${params.path_env}" + + publishDir "${params.out}/esimated_motifs/unknown_motifs/", mode: 'copy' + + input: + set name, file(meme), file(tsv) from meme_for_scan + + output: + set name, file('*_best.meme') into best_motif + + script: + """ + python ${params.path_bin}/get_best_motif.py ${meme} ${name}_best.meme ${params.best_motif} + """ +} diff --git a/pipeline.nf b/pipeline.nf index e1977d2..a39616a 100644 --- a/pipeline.nf +++ b/pipeline.nf @@ -8,6 +8,8 @@ params.config="" params.tfbs_path="" params.create_known_tfbs_path = "./" + params.help = 0 + params.out = "./out/" //peak_calling params.window_length = 200 @@ -56,9 +58,9 @@ params.organism="hg38" params.tissue="" -if (params.bigwig == "" || params.bed == "" || params.genome_fasta == "" || params.motif_db == "" || params.config == ""){ +if (params.bigwig == "" || params.bed == "" || params.genome_fasta == "" || params.motif_db == "" || params.config == "" || "${params.help}" != "0"){ log.info """ -Usage: nextflow run pipeline.nf --bigwig [BigWig-file] --bed [BED-file] --genome_fasta [FASTA-file] --motif_db [MEME-file] +Usage: nextflow run pipeline.nf --bigwig [BigWig-file] --bed [BED-file] --genome_fasta [FASTA-file] --motif_db [MEME-file] --config [UROPA-config-file] Required arguments: --bigwig Path to BigWig-file @@ -68,18 +70,21 @@ Required arguments: --config Path to UROPA configuration file --create_known_tfbs_path Path to directory where output from tfbsscan (known motifs) are stored. Path can be set as tfbs_path in next run. (Default: './') + --out Output Directory (Default: './out/') + Optional arguments: - --tfbs_path Path to directory with output from tfbsscan. If given tfbsscan will not be run. + --help [0|1] 1 to show this help message. (Default: 0) + --tfbs_path Path to directory with output from tfbsscan. If given tfbsscan will not be run. 
Footprint extraction: - --window_length INT (Default: 200) - --step INT (Default: 100) - --percentage INT (Default: 0) + --window_length INT This parameter sets the length of a sliding window. (Default: 200) + --step INT This parameter sets the number of positions to slide the window forward. (Default: 100) + --percentage INT Threshold in percent (Default: 0) Filter unknown motifs: - --min_size_fp INT (Default: 10) - --max_size_fp INT (Default: 100) + --min_size_fp INT Minimum sequence length threshold. Smaller sequences are discarded. (Default: 10) + --max_size_fp INT Maximum sequence length threshold. Discards all sequences longer than this value. (Default: 100) Clustering: Sequence preparation/ reduction: @@ -116,9 +121,10 @@ Optional arguments: All arguments can be set in the configuration files ``` """ +System.exit(2) } else { Channel.fromPath(params.bigwig).map {it -> [it.simpleName, it]}.set {bigwig_input} - Channel.fromPath(params.bed).set {bed_input} + Channel.fromPath(params.bed).into {bed_input; bed_for_tfbsscan} Channel.fromPath(params.genome_fasta).into {fa_overlap; fa_scan; fa_overlap_2} Channel.fromPath(params.motif_db).into {db_for_motivscan; db_for_tomtom} Channel.fromPath(params.config).set {config} @@ -199,6 +205,7 @@ process footprint_extraction { """ } +for_tfbs = fa_overlap.combine(db_for_motivscan).combine(bed_for_tfbsscan) /* @@ -210,8 +217,7 @@ process extract_known_TFBS { publishDir "${params.out}/known_TFBS/", mode: 'copy', pattern: '*.bed' input: - file (fasta) from fa_overlap - file (db) from db_for_motivscan + set file (fasta), file (db), file (bed) from for_tfbs output: val ('done') into known_TFBS_for_overlap @@ -221,7 +227,7 @@ process extract_known_TFBS { script: """ - python ${path_bin}/tfbsscan.py --use moods --core ${params.threads} -m ${db} -g ${fasta} -o ${params.create_known_tfbs_path} + python ${path_bin}/tfbsscan.py --use moods --core ${params.threads} -m ${db} -g ${fasta} -o ${params.create_known_tfbs_path} -b ${bed} """ } 
@@ -411,7 +417,7 @@ Merging FASTA-files of similar clusters process merge_fasta { conda "${path_env}" publishDir "${params.out}/esimated_motifs/merged_fasta/", mode: 'copy' - + echo true input: set file (motiv_sim), val (fasta_list) from files_for_merge_fasta @@ -569,7 +575,7 @@ process cluster_quality { process create_GTF { conda "${path_env}" - publishDir 'Path', mode:'copy' + publishDir "${params.out}/gtf/", mode: 'copy' output: file ('*.gtf') into gtf_for_uropa @@ -628,3 +634,16 @@ process filter { """ """ } */ + +workflow.onComplete { +log.info""" +Pipeline execution summary +--------------------------- +Completed at: ${workflow.complete} +Duration : ${workflow.duration} +Success : ${workflow.success} +workDir : ${workflow.workDir} +exit status : ${workflow.exitStatus} +Error report: ${workflow.errorReport ?: '-'} +""" +}