Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
master_project_JLU2018/pipeline.nf
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
401 lines (286 sloc)
8.71 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
//!/usr/bin/env nextflow | |
/*
 Pipeline set-up: parameter defaults, usage check and input channels.

 Order matters here:
   1. Defaults are assigned first. Values given on the command line or in a
      configuration file take precedence over these script-level defaults.
   2. The required parameters are validated; if one is missing the usage text
      is printed and the run is aborted.
   3. Only then are the input channels created, so Channel.fromPath never
      receives an unset/empty path.
*/

//setting default values
params.input=""
params.bed=""
params.genome_fasta=""
params.jaspar_db=""
params.config=""

//peak_calling
params.window_length = 200
params.step = 100
params.percentage = 0

//filter_unknown_motifs
params.min_size_fp=10
params.max_size_fp=100

//clustering
params.sequence_coverage=8
params.kmer=10
params.aprox_motif_len=10

//motif_estimation
//bed_to_clustered_fasta
params.min_seq = 10 // Minimum number of sequences in the fasta-files for glam2

//glam2
params.motif_min_len = 8 // Minimum length of Motifs
params.motif_max_len = 20 // Maximum length of Motifs
params.interation = 10000 // Number of Iterations done by glam2. A high iteration number equals a more accurate result but with an higher runtime.

//tomtom
params.tomtom_treshold = 0.01 // threshold for similarity score.

//creating_gtf
params.organism="homo_sapiens"
params.tissue=""                 // legacy spelling, kept for backward compatibility
params.tissues=params.tissue     // spelling used by the usage text and by create_GTF

if (params.input == "" || params.bed == "" || params.genome_fasta == "" || params.jaspar_db == "" || params.config == ""){
    log.info """
Usage: nextflow run pipeline.nf --input [BigWig-file] --bed [BED-file] --genome_fasta [FASTA-file] --jaspar_db [MEME-file] --config [UROPA-config-template]
Required arguments:
--input Path to BigWig-file
--bed Path to BED-file
--genome_fasta Path to genome in FASTA-format
--jaspar_db Path to motif-database in MEME-format
--config Path to UROPA configuration template
Optional arguments:
Footprint extraction:
--window_length INT (Default: 200)
--step INT (Default: 100)
--percentage INT(Default: 0)
Filter unknown motifs:
--min_size_fp INT (Default: 10)
--max_size_fp INT (Default: 100)
Clustering:
--sequence_coverage INT (Default: 8)
--kmer INT (Default: 10)
--aprox_motif_len INT (Default: 10)
Motif estimation:
--motif_min_len INT Minimum length of Motif (Default: 8)
--motif_max_len INT Maximum length of Motif (Default: 20)
--interation INT Number of iterations done by glam2. More Interations: better results, higher runtime. (Default: 10000)
--tomtom_treshold float Threshold for similarity score. (Default: 0.01)
Creating GTF:
--organism [homo_sapiens | mus_musculus]
--tissues
All arguments can be set in the configuration files.
"""
    // Without the required inputs no channel can be built; stop instead of
    // failing later with an unrelated error.
    exit 1
}

// Input channels. The BigWig channel emits [simpleName, file] pairs so that the
// file's base name can be carried through the pipeline as an identifier.
Channel.fromPath(params.input).map {it -> [it.simpleName, it]}.set {bigwig_input}
Channel.fromPath(params.bed).set {bed_input}
Channel.fromPath(params.genome_fasta).into {fa_overlap; fa_scan; fa_overlap_2}
Channel.fromPath(params.jaspar_db).into {db_for_motivscan; db_for_tomtom}
Channel.fromPath(params.config).set {config}

// Every BigWig entry is paired with every BED file: [name, bigwig, bed]
bigwig_input.combine(bed_input).set {footprint_in}
/*
 Footprint extraction.
 Runs call_peaks.py on each [name, BigWig, BED] combination with the configured
 window length, step size and percentage. The result is written to
 <name>_called_peaks.bed. BED files are published to ${out}, log files to
 ${out}/log.
*/
process footprint_extraction {
    conda "${path_env}"
    tag{name}
    publishDir "${out}", mode: 'copy', pattern: '*.bed'
    publishDir "${out}/log", mode: 'copy', pattern: '*.log'

    input:
    set name, file (bigWig), file (bed) from footprint_in

    output:
    set name, file ('*.bed') into bed_for_overlap_with_TFBS

    script:
    // NOTE: the former debug print of the BigWig file name was removed; the
    // task is already identified by its tag.
    """
    python ${path_bin}/call_peaks.py --bigwig ${bigWig} --bed ${bed} --output_file ${name}_called_peaks.bed --window_length ${params.window_length} --step ${params.step} --percentage ${params.percentage}
    """
}
// TODO: check whether this step actually has to be executed.
/*
 Scans the genome FASTA with the motifs of the given database by calling
 tfbsscan.py (using moods, with params.threads cores). The BED output written
 to the task directory is passed on for the overlap step.
*/
process extract_known_TFBS {
    conda "${path_env}"

    input:
    file(genome) from fa_overlap
    file(motif_db) from db_for_motivscan

    output:
    file('*.bed') into known_TFBS_for_overlap

    script:
    """
    python ${path_bin}/tfbsscan.py --use moods --core ${params.threads} -m ${motif_db} -g ${genome} -o ./
    """
}
// Attach the known-TFBS BED output and the genome FASTA to every footprint BED.
bed_for_overlap_with_TFBS
    .combine(known_TFBS_for_overlap)
    .combine(fa_overlap_2)
    .set { for_overlap }

/*
 Calls compareBed.sh with the footprint BED, the known motif BED file(s) and the
 genome FASTA. Footprint size limits are taken from params.min_size_fp and
 params.max_size_fp. The result is written to <name>.bed.
*/
process overlap_with_known_TFBS {
    conda "${path_env}"

    input:
    set val(name), file(bed_footprints), val(bed_motifs), file(fasta) from for_overlap

    output:
    set val(name), file('*.bed') into bed_for_reducing

    script:
    // Strip whitespace and square brackets from the string form of the motif
    // value, so a printed list like "[a, b]" becomes "a,b".
    motifs_arg = bed_motifs.toString().replaceAll(/\s|\[|\]/,"")
    """
    ${path_bin}/compareBed.sh --data ${bed_footprints} --motifs ${motifs_arg} --fasta ${fasta} -o ${name}.bed -min ${params.min_size_fp} -max ${params.max_size_fp}
    """
}
/*
 Runs reduce_bed.R on the BED file from the overlap step, passing the k-mer
 size (params.kmer), the approximate motif length (params.aprox_motif_len) and
 the thread count (params.threads). Output: <name>_reduced.bed.
*/
process reduce_bed {
    conda "${path_env}"

    input:
    set val(name), file(overlap_bed) from bed_for_reducing

    output:
    set val(name), file('*.bed') into bed_for_clustering

    script:
    """
    Rscript ${path_bin}/reduce_bed.R -i ${overlap_bed} -k ${params.kmer} -m ${params.aprox_motif_len} -o ${name}_reduced.bed -t ${params.threads}
    """
}
/*
 Runs cdhit_wrapper.R on the reduced BED file; params.sequence_coverage is
 handed over as the -A option. Output: <name>_clusterd.bed.
*/
process clustering {
    conda "${path_env}"

    input:
    set val(name), file(reduced_bed) from bed_for_clustering

    output:
    set val(name), file('*.bed') into bed_for_motif_esitmation

    script:
    """
    Rscript ${path_bin}/cdhit_wrapper.R -i ${reduced_bed} -A ${params.sequence_coverage} -o ${name}_clusterd.bed
    """
}
/*
 Converting BED-File to one FASTA-File per cluster.
 bed_to_fasta.R receives the clustered BED file, the run name and the minimum
 number of sequences (params.min_seq); all resulting *.FASTA files are emitted
 together as one channel item.
*/
process bed_to_clustered_fasta {
    conda "${path_env}"
    tag { name }
    // NOTE(review): absolute, user-specific publish directory - consider making it configurable.
    publishDir '/mnt/agnerds/Rene.Wiegandt/10_Master/tmp/', mode: 'copy'

    input:
    set val(name), file(clustered_bed) from bed_for_motif_esitmation

    output:
    file('*.FASTA') into fasta_for_glam2

    script:
    """
    Rscript ${path_bin}/bed_to_fasta.R ${clustered_bed} ${name} ${params.min_seq}
    """
}
// Emit every FASTA file as its own item, paired with its base name:
// [simpleName, file]
fasta_for_glam2 = fasta_for_glam2
    .flatten()
    .map { cluster_fasta -> [cluster_fasta.simpleName, cluster_fasta] }

/*
 Running GLAM2 on FASTA-files.
 Generating Motifs through alignment and scoring best local matches.
 Motif length bounds and the iteration count come from params.motif_min_len,
 params.motif_max_len and params.interation. Output is written to the task
 directory; the *.meme result is sent to both the tomtom and the filter branch.
*/
process glam2 {
    conda "${path_env}"
    tag { name }

    input:
    set val(name), file(cluster_fasta) from fasta_for_glam2

    output:
    set val(name), file('*.meme') into meme_for_tomtom, meme_for_filter

    script:
    """
    glam2 n ${cluster_fasta} -O . -a ${params.motif_min_len} -b ${params.motif_max_len} -z 5 -n ${params.interation}
    """
}
/*
 Running Tomtom on meme-files generated by GLAM2.
 Tomtom searches motifs in databases.
 Every MEME file is combined with the motif database. Tomtom's text output is
 stripped of lines starting with '#' and of empty lines before it is written to
 <name>_known_motif.tsv.
*/
process tomtom {
    conda "${path_env}"
    tag { name }
    // NOTE(review): absolute, user-specific publish directory - consider making it configurable.
    publishDir '/mnt/agnerds/Rene.Wiegandt/10_Master/tmp/', mode: 'copy'

    input:
    set val(name), file(query_meme), file(motif_db) from meme_for_tomtom.combine(db_for_tomtom)

    output:
    set val(name), file('*.tsv') into tsv_for_filter

    script:
    """
    tomtom ${query_meme} ${motif_db} -thresh ${params.tomtom_treshold} -text --norc | sed '/^#/ d' | sed '/^\$/d' > ${name}_known_motif.tsv
    """
}
// Join the MEME and TSV channels on their shared name: [name, meme, tsv].
// Keep only entries whose TSV file has at most one line; these are forwarded
// to both 'meme_for_scan' and 'check'.
for_filter = meme_for_filter.join(tsv_for_filter)

for_filter
    .filter { cluster_name, meme_file, tsv_file -> tsv_file.readLines().size() <= 1 }
    .into { meme_for_scan; check }
// Prints a message if channel 'check' is empty.
// ifEmpty injects the marker value 'EMPTY' when nothing passed the filter; the
// 'when' guard lets the task run only for that marker.
process check_for_unknown_motifs {
    echo true

    input:
    val marker from check.ifEmpty('EMPTY')

    when:
    marker == 'EMPTY'

    script:
    """
    echo '>>> STOPPED: No unknown Motifs were found.'
    """
}
// Get the best (first) motif from each MEME-file.
// get_best_motif.py is called with the MEME file and the output name
// <name>_best.meme; the TSV file is part of the input tuple but not used in
// the command.
process get_best_motif {
    conda "${path_env}"

    input:
    set val(name), file(motif_meme), file(tomtom_tsv) from meme_for_scan

    output:
    set val(name), file('*_best.meme') into best_motif

    script:
    """
    python ${path_bin}/get_best_motif.py ${motif_meme} ${name}_best.meme
    """
}
// Pair every best motif with the genome FASTA: [name, meme, fasta].
files_for_genome_scan = best_motif.combine(fa_scan)
/* | |
process genome_scan { | |
conda "${path_env}" | |
input: | |
set name, file(meme), file(fasta) from files_for_genome_scan | |
output: | |
file ('.bed') into bed_for_uropa, bed_for_cluster_quality | |
script: | |
""" | |
""" | |
} | |
process cluster_quality { | |
input: | |
file (bed) from bed_for_cluster_quality | |
output: | |
file ('*.bed') into bed_for_final_filter | |
script: | |
""" | |
""" | |
} */ | |
/*
 Creates the regulatory GTF by calling RegGTFExtractor.py for the configured
 organism (params.organism), using ${path_bin} as working directory (--wd).
 The --tissue option is only passed when params.tissues is set, so an unset
 parameter is not rendered as the literal text "null" on the command line.
 NOTE(review): assumes RegGTFExtractor.py accepts a call without --tissue - confirm.
*/
process create_GTF {
    conda "${path_env}"
    publishDir 'Path', mode:'copy'

    output:
    file ('*.gtf') into gtf_for_uropa

    script:
    // Build the optional tissue argument; empty string when no tissue was given.
    def tissue_arg = params.tissues ? "--tissue ${params.tissues}" : ""
    """
    python ${path_bin}/RegGTFExtractor.py ${params.organism} ${tissue_arg} --wd ${path_bin}
    """
}
/* | |
bed_for_final_filter.combine(gtf_for_uropa).set {uropa_in} | |
// Create configuration file for UROPA. | |
// Takes template and replaces bed- and gtf-placeholders with actual paths. | |
process create_uropa_config { | |
publishDir '/mnt/agnerds/Rene.Wiegandt/10_Master/', mode: 'copy' | |
input: | |
set val(bed), val(gtf) from uropa_in.toList() | |
file (conf) from config | |
output: | |
file ('uropa.config') into uropa_config | |
script: | |
""" | |
sed -- 's/placeholder_gtf/${gtf}/g; s/placeholder_bed/${bed}/g' ${conf} > uropa.config.final | |
""" | |
} | |
process UROPA { | |
input: | |
file (config) from uropa_config | |
output: | |
set file ("*_allhits.txt"), file ("*_finalhits.txt") into uropa_for_filter | |
script: | |
""" | |
""" | |
} | |
process filter { | |
input: | |
output: | |
script: | |
""" | |
""" | |
} */ |