Skip to content
Navigation Menu
Toggle navigation
Sign in
In this repository
All GitHub Enterprise
↵
Jump to
↵
No suggested jump to results
In this repository
All GitHub Enterprise
↵
Jump to
↵
In this organization
All GitHub Enterprise
↵
Jump to
↵
In this repository
All GitHub Enterprise
↵
Jump to
↵
Sign in
Resetting focus
You signed in with another tab or window.
Reload
to refresh your session.
You signed out in another tab or window.
Reload
to refresh your session.
You switched accounts on another tab or window.
Reload
to refresh your session.
Dismiss alert
{{ message }}
loosolab
/
master_project_JLU2018
Public
Notifications
You must be signed in to change notification settings
Fork
0
Star
0
Code
Issues
7
Pull requests
1
Actions
Projects
0
Wiki
Security
Insights
Additional navigation options
Code
Issues
Pull requests
Actions
Projects
Wiki
Security
Insights
Files
a8377ce
bin
config
README.md
masterenv.yml
nextflow.config
pipeline.nf
Breadcrumbs
master_project_JLU2018
/
pipeline.nf
Blame
Blame
Latest commit
History
History
419 lines (303 loc) · 9.77 KB
Breadcrumbs
master_project_JLU2018
/
pipeline.nf
Top
File metadata and controls
Code
Blame
419 lines (303 loc) · 9.77 KB
Raw
//!/usr/bin/env nextflow

/*
 * Pipeline head: parameter defaults, argument validation, input channels and
 * the processes from footprint extraction up to the Tomtom database search.
 *
 * Order matters here:
 *   1. defaults are declared first, so every params.* read below is defined;
 *   2. the required arguments are validated and the run is aborted with the
 *      usage text if one is missing;
 *   3. only then are the input channels created from the validated paths.
 *
 * NOTE(review): path_env, path_bin, out and params.threads are used below but
 * are not declared in this script - presumably they come from nextflow.config
 * or the files in config/; confirm.
 */

//setting default values
// Required inputs; an empty string means "not provided".
params.input = ""
params.bed = ""
params.genome_fasta = ""
params.jaspar_db = ""
params.config = ""

//peak_calling
params.window_length = 200
params.step = 100
params.percentage = 0

//filter_unknown_motifs
params.min_size_fp = 10
params.max_size_fp = 100

//clustering
//reduce_bed
params.kmer = 10
params.aprox_motif_len = 10
params.motif_occurence = 1
params.min_seq_length = 10

//cdhit_wrapper
params.global = 0
params.identity = 0.8
params.sequence_coverage = 8
params.memory = 800
params.throw_away_seq = 9
params.strand = 0

//motif_estimation
//bed_to_clustered_fasta
params.min_seq = 10 // Minimum number of sequences in the fasta-files for glam2

//glam2
params.motif_min_len = 8 // Minimum length of Motifs
params.motif_max_len = 20 // Maximum length of Motifs
params.interation = 10000 // Number of iterations done by glam2. A high iteration number equals a more accurate result but with a higher runtime.

//tomtom
params.tomtom_treshold = 0.01 // threshold for similarity score.

//creating_gtf
params.organism = "homo_sapiens"
// Declared as 'tissues' because that is the name the help text documents and
// the name the create_GTF process reads.
params.tissues = ""

// Abort with the usage text when a required argument is missing.
if (params.input == "" || params.bed == "" || params.genome_fasta == "" || params.jaspar_db == "" || params.config == "") {
    log.info """
Usage: nextflow run pipeline.nf --input [BigWig-file] --bed [BED-file] --genome_fasta [FASTA-file] --jaspar_db [MEME-file] --config [UROPA-config-file]

Required arguments:
    --input                 Path to BigWig-file
    --bed                   Path to BED-file
    --genome_fasta          Path to genome in FASTA-format
    --jaspar_db             Path to motif-database in MEME-format
    --config                Path to UROPA configuration template

Optional arguments:

    Footprint extraction:
    --window_length INT     (Default: 200)
    --step INT              (Default: 100)
    --percentage INT        (Default: 0)

    Filter unknown motifs:
    --min_size_fp INT       (Default: 10)
    --max_size_fp INT       (Default: 100)

    Clustering:
    Sequence preparation/ reduction:
    --kmer INT              Kmer length (Default: 10)
    --aprox_motif_len INT   Motif length (Default: 10)
    --motif_occurence FLOAT Percentage of motifs over all sequences. Use 1 (Default) to assume every sequence contains a motif.
    --min_seq_length INT    Remove all sequences below this value. (Default: 10)

    Clustering:
    --global INT            Global (=1) or local (=0) alignment. (Default: 0)
    --identity FLOAT        Identity threshold. (Default: 0.8)
    --sequence_coverage INT Minimum aligned nucleotides on both sequences. (Default: 8)
    --memory INT            Memory limit in MB. 0 for unlimited. (Default: 800)
    --throw_away_seq INT    Remove all sequences equal or below this length before clustering. (Default: 9)
    --strand INT            Align +/+ & +/- (= 1). Or align only +/+ (= 0). (Default: 0)

    Motif estimation:
    --min_seq INT           Minimum number of sequences in the FASTA-files for glam2 (Default: 10)
    --motif_min_len INT     Minimum length of Motif (Default: 8)
    --motif_max_len INT     Maximum length of Motif (Default: 20)
    --interation INT        Number of iterations done by glam2. More Interations: better results, higher runtime. (Default: 10000)
    --tomtom_treshold float Threshold for similarity score. (Default: 0.01)

    Creating GTF:
    --organism [homo_sapiens | mus_musculus]
    --tissues

All arguments can be set in the configuration files.
    """
    // Without this the pipeline would go on and try to run with empty paths.
    exit 1
}

// Input channels, created only after the arguments have been validated.
// The BigWig file is paired with its simple name, which is used as sample name.
Channel.fromPath(params.input).map {it -> [it.simpleName, it]}.set {bigwig_input}
Channel.fromPath(params.bed).set {bed_input}
Channel.fromPath(params.genome_fasta).into {fa_overlap; fa_scan; fa_overlap_2}
Channel.fromPath(params.jaspar_db).into {db_for_motivscan; db_for_tomtom}
Channel.fromPath(params.config).set {config}

// Pair every BigWig entry with the BED file.
bigwig_input.combine(bed_input).set {footprint_in}


/*
Runs call_peaks.py on the BigWig/BED pair and emits the resulting BED file
together with the sample name. BED and log files are published below ${out}.
*/
process footprint_extraction {
    conda "${path_env}"

    tag{name}
    publishDir "${out}", mode: 'copy', pattern: '*.bed'
    publishDir "${out}/log", mode: 'copy', pattern: '*.log'

    input:
    set name, file (bigWig), file (bed) from footprint_in

    output:
    set name, file ('*.bed') into bed_for_overlap_with_TFBS

    script:
    """
    python ${path_bin}/call_peaks.py --bigwig ${bigWig} --bed ${bed} --output_file ${name}_called_peaks.bed --window_length ${params.window_length} --step ${params.step} --percentage ${params.percentage}
    """
}


// TODO: check whether this step needs to be executed at all.
/*
Runs tfbsscan.py (with --use moods) on the genome FASTA and the motif database
and emits the BED files it writes to the working directory.
*/
process extract_known_TFBS {
    conda "${path_env}"

    input:
    file (fasta) from fa_overlap
    file (db) from db_for_motivscan

    output:
    file ('*.bed') into known_TFBS_for_overlap

    script:
    """
    python ${path_bin}/tfbsscan.py --use moods --core ${params.threads} -m ${db} -g ${fasta} -o ./
    """
}


bed_for_overlap_with_TFBS.combine(known_TFBS_for_overlap).combine(fa_overlap_2).set {for_overlap}


/*
Calls compareBed.sh with the footprint BED, the list of motif BED files and the
genome FASTA, passing the min/max footprint size parameters.
*/
process overlap_with_known_TFBS {
    conda "${path_env}"

    input:
    set name, file (bed_footprints), val (bed_motifs), file (fasta) from for_overlap

    output:
    set name, file ('*.bed') into bed_for_reducing

    script:
    // Strip whitespace and list brackets from the string form of bed_motifs,
    // leaving a comma-separated list for the --motifs option.
    motif_list = bed_motifs.toString().replaceAll(/\s|\[|\]/,"")
    """
    ${path_bin}/compareBed.sh --data ${bed_footprints} --motifs ${motif_list} --fasta ${fasta} -o ${name}.bed -min ${params.min_size_fp} -max ${params.max_size_fp}
    """
}


/*
Runs reduce_bed.R on the BED file and writes <name>_reduced.bed.
*/
process reduce_bed {
    conda "${path_env}"

    input:
    set name, file (bed) from bed_for_reducing

    output:
    set name, file ('*.bed') into bed_for_clustering

    script:
    """
    Rscript ${path_bin}/reduce_bed.R -i ${bed} -k ${params.kmer} -m ${params.aprox_motif_len} -o ${name}_reduced.bed -t ${params.threads} -f ${params.motif_occurence} -s ${params.min_seq_length}
    """
}


/*
Runs cdhit_wrapper.R on the reduced BED file and writes <name>_clusterd.bed.
*/
process clustering {
    conda "${path_env}"

    input:
    set name, file (bed) from bed_for_clustering

    output:
    set name, file ('*.bed') into bed_for_motif_esitmation

    script:
    """
    Rscript ${path_bin}/cdhit_wrapper.R -i ${bed} -A ${params.sequence_coverage} -o ${name}_clusterd.bed -c ${params.identity} -G ${params.global} -M ${params.memory} -l ${params.throw_away_seq} -r ${params.strand} -T ${params.threads}
    """
}


/*
Converting BED-File to one FASTA-File per cluster
*/
process bed_to_clustered_fasta {
    conda "${path_env}"

    tag{name}
    // NOTE(review): hard-coded, user-specific publish directory - consider
    // deriving it from ${out} or a parameter.
    publishDir '/mnt/agnerds/Rene.Wiegandt/10_Master/tmp/', mode: 'copy'

    input:
    set name, file (bed) from bed_for_motif_esitmation

    output:
    file ('*.FASTA') into fasta_for_glam2

    script:
    """
    Rscript ${path_bin}/bed_to_fasta.R ${bed} ${name} ${params.min_seq}
    """
}


//flatten list and adding name of file to channel value
fasta_for_glam2 = fasta_for_glam2.flatten().map {it -> [it.simpleName, it]}


/*
Running GLAM2 on FASTA-files.
Generating Motifs through alignment and scoring best local matches.
*/
process glam2 {
    conda "${path_env}"

    tag{name}

    input:
    set name, file (fasta) from fasta_for_glam2

    output:
    set name, file('*.meme') into meme_for_tomtom, meme_for_filter

    script:
    """
    glam2 n ${fasta} -O . -a ${params.motif_min_len} -b ${params.motif_max_len} -z 5 -n ${params.interation}
    """
}


/*
Running Tomtom on meme-files generated by GLAM2.
Tomtom searches motifs in databases.
*/
process tomtom {
    conda "${path_env}"

    tag{name}
    // NOTE(review): hard-coded, user-specific publish directory - consider
    // deriving it from ${out} or a parameter.
    publishDir '/mnt/agnerds/Rene.Wiegandt/10_Master/tmp/', mode: 'copy'

    input:
    set name, file (meme), file (jaspar_db) from meme_for_tomtom.combine(db_for_tomtom)

    output:
    set name, file ('*.tsv') into tsv_for_filter

    script:
    // The sed calls drop comment lines (starting with '#') and empty lines
    // from the Tomtom text output before it is written to the tsv file.
    """
    tomtom ${meme} ${jaspar_db} -thresh ${params.tomtom_treshold} -text --norc | sed '/^#/ d' | sed '/^\$/d' > ${name}_known_motif.tsv
    """
}


//Joining channels with meme and tsv files. Filter joined channel on line count.
//Only meme-files whose corresponding tsv files have linecount <= 1 are written to next channel.
// Pair each GLAM2 meme file with the Tomtom tsv of the same name.
for_filter = meme_for_filter.join( tsv_for_filter )

// Keep only entries whose tsv file has at most one line and hand them to
// both 'meme_for_scan' and 'check'.
for_filter
    .filter { name, meme, tsv -> tsv.readLines().size() <= 1 }
    .into { meme_for_scan; check }


// Prints a message when the 'check' channel turned out to be empty:
// ifEmpty() injects the marker value 'EMPTY', and the process only runs for it.
process check_for_unknown_motifs {
    echo true

    input:
    val x from check.ifEmpty('EMPTY')

    when:
    x == 'EMPTY'

    """
    echo '>>> STOPPED: No unknown Motifs were found.'
    """
}


// Runs get_best_motif.py on each MEME file and writes <name>_best.meme.
process get_best_motif {
    conda "${path_env}"

    input:
    set name, file(meme), file(tsv) from meme_for_scan

    output:
    set name, file('*_best.meme') into best_motif

    script:
    """
    python ${path_bin}/get_best_motif.py ${meme} ${name}_best.meme
    """
}


// Pair every best motif with the genome FASTA for the (disabled) genome scan.
best_motif.combine(fa_scan).set {files_for_genome_scan}


/*
Disabled stages: genome scan and cluster quality (script bodies still empty).

process genome_scan {
    conda "${path_env}"

    input:
    set name, file(meme), file(fasta) from files_for_genome_scan

    output:
    file ('.bed') into bed_for_uropa, bed_for_cluster_quality

    script:
    """
    """
}


process cluster_quality {

    input:
    file (bed) from bed_for_cluster_quality

    output:
    file ('*.bed') into bed_for_final_filter

    script:
    """
    """
}
*/


// Runs RegGTFExtractor.py for the selected organism/tissue and emits the
// resulting GTF file(s).
// NOTE(review): publishDir 'Path' looks like a placeholder - confirm the
// intended target directory.
process create_GTF {
    conda "${path_env}"

    publishDir 'Path', mode:'copy'

    output:
    file ('*.gtf') into gtf_for_uropa

    script:
    """
    python ${path_bin}/RegGTFExtractor.py ${params.organism} --tissue ${params.tissues} --wd ${path_bin}
    """
}


/*
Disabled stages: UROPA configuration, UROPA run and final filter.
NOTE(review): create_uropa_config declares 'uropa.config' as output but its
script writes 'uropa.config.final' - align the names before enabling it.

bed_for_final_filter.combine(gtf_for_uropa).set {uropa_in}

// Create configuration file for UROPA.
// Takes template and replaces bed- and gtf-placeholders with actual paths.
process create_uropa_config {

    publishDir '/mnt/agnerds/Rene.Wiegandt/10_Master/', mode: 'copy'

    input:
    set val(bed), val(gtf) from uropa_in.toList()
    file (conf) from config

    output:
    file ('uropa.config') into uropa_config

    script:
    """
    sed -- 's/placeholder_gtf/${gtf}/g; s/placeholder_bed/${bed}/g' ${conf} > uropa.config.final
    """
}


process UROPA {

    input:
    file (config) from uropa_config

    output:
    set file ("*_allhits.txt"), file ("*_finalhits.txt") into uropa_for_filter

    script:
    """
    """
}


process filter {

    input:

    output:

    script:
    """
    """
}
*/
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
You can’t perform that action at this time.