diff --git a/README.md b/README.md index 7eb9268..203d55b 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ export PATH=[meme-suite instalation path]/bin:$PATH Download all files from the [GitHub repository](https://github.molgen.mpg.de/loosolab/masterJLU2018). The Nextflow-script needs a conda enviroment to run. Nextflow can create the needed enviroment from the given yaml-file. -On some systems Nrxtflow exits the run with following error: +On some systems Nextflow exits the run with the following error: ``` Caused by: Failed to create Conda environment @@ -42,10 +42,11 @@ When the enviroment is created, set the variable 'path_env' in the configuration nextflow run pipeline.nf --input [BigWig-file] --bed [BED-file] --genome_fasta [FASTA-file] --jaspar_db [MEME-file] ``` ## Parameters +For a detailed overview of all parameters, follow this [link](https://github.molgen.mpg.de/loosolab/masterJLU2018/wiki/Configuration). ``` Required arguments: - --input Path to BigWig-file - --bed Path to BED-file + --input Path to BigWig-file with scores on the peaks of interest + --bed Path to BED-file with peaks of interest corresponding to the BigWig file --genome_fasta Path to genome in FASTA-format --jaspar_db Path to motif-database in MEME-format --organism STRING Source organism: [ hg19 | hg 38 or mm9 | mm10 ] @@ -53,9 +54,9 @@ Required arguments: Optional arguments: Footprint extraction: - --window_length INT (Default: 200) - --step INT (Default: 100) - --percentage INT(Default: 0) + --window_length INT (Default: 200) Length of the sliding window + --step INT (Default: 100) Interval by which the window slides + --percentage INT (Default: 0) Percentage to be added to the background while searching for footprints Filter unknown motifs: --min_size_fp INT (Default: 10) @@ -82,7 +83,7 @@ Optional arguments: --interation INT Number of iterations done by glam2. More Interations: better results, higher runtime. (Default: 10000) --tomtom_treshold float Threshold for similarity score. 
(Default: 0.01) - Moitf clustering: + Motif clustering: --edge_weight INT Minimum weight of edges in motif-cluster-graph (Default: 5) --motif_similarity_thresh FLOAT threshold for motif similarity score (Default: 0.00001) diff --git a/masterenv.yml b/masterenv.yml index e0708db..a94a8cf 100644 --- a/masterenv.yml +++ b/masterenv.yml @@ -4,7 +4,6 @@ channels: - bioconda - conda-forge dependencies: - - bedtools - r-seqinr - numpy - pybigWig @@ -25,4 +24,4 @@ dependencies: - pybedtools - matplotlib - seaborn - - crossmap \ No newline at end of file + - crossmap diff --git a/pipeline.nf b/pipeline.nf index b1084df..516b7da 100644 --- a/pipeline.nf +++ b/pipeline.nf @@ -1,10 +1,4 @@ -//!/usr/bin/env nextflow - -Channel.fromPath(params.input).map {it -> [it.simpleName, it]}.set {bigwig_input} -Channel.fromPath(params.bed).set {bed_input} -Channel.fromPath(params.genome_fasta).into {fa_overlap; fa_scan; fa_overlap_2} -Channel.fromPath(params.jaspar_db).into {db_for_motivscan; db_for_tomtom} -Channel.fromPath(params.config).set {config} +#!/usr/bin/env nextflow //setting default values params.input="" @@ -65,36 +59,36 @@ log.info """ Usage: nextflow run pipeline.nf --input [BigWig-file] --bed [BED-file] --genome_fasta [FASTA-file] --jaspar_db [MEME-file] Required arguments: - --input Path to BigWig-file - --bed Path to BED-file - --genome_fasta Path to genome in FASTA-format - --jaspar_db Path to motif-database in MEME-format + --input Path to BigWig-file + --bed Path to BED-file + --genome_fasta Path to genome in FASTA-format + --jaspar_db Path to motif-database in MEME-format Optional arguments: Footprint extraction: - --window_length INT (Default: 200) - --step INT (Default: 100) - --percentage INT(Default: 0) + --window_length INT (Default: 200) + --step INT (Default: 100) + --percentage INT (Default: 0) Filter unknown motifs: - --min_size_fp INT (Default: 10) - --max_size_fp INT (Default: 100) + --min_size_fp INT (Default: 10) + --max_size_fp INT (Default: 100) + + 
+ Clustering: + Sequence preparation/ reduction: + --kmer INT Kmer length (Default: 10) + --aprox_motif_len INT Motif length (Default: 10) + --motif_occurence FLOAT Percentage of motifs over all sequences. Use 1 (Default) to assume every sequence contains a motif. + --min_seq_length INT Remove all sequences below this value. (Default: 10) Clustering: - Sequence preparation/ reduction: - --kmer INT Kmer length (Default: 10) - --aprox_motif_len INT Motif length (Default: 10) - --motif_occurence FLOAT Percentage of motifs over all sequences. Use 1 (Default) to assume every sequence contains a motif. - --min_seq_length INT Remove all sequences below this value. (Default: 10) - - Clustering: - --global INT Global (=1) or local (=0) alignment. (Default: 0) - --identity FLOAT Identity threshold. (Default: 0.8) - --sequence_coverage INT Minimum aligned nucleotides on both sequences. (Default: 8) - --memory INT Memory limit in MB. 0 for unlimited. (Default: 800) - --throw_away_seq INT Remove all sequences equal or below this length before clustering. (Default: 9) - --strand INT Align +/+ & +/- (= 1). Or align only +/+ (= 0). (Default: 0) + --global INT Global (=1) or local (=0) alignment. (Default: 0) + --identity FLOAT Identity threshold. (Default: 0.8) + --sequence_coverage INT Minimum aligned nucleotides on both sequences. (Default: 8) + --memory INT Memory limit in MB. 0 for unlimited. (Default: 800) + --throw_away_seq INT Remove all sequences equal or below this length before clustering. (Default: 9) + --strand INT Align +/+ & +/- (= 1). Or align only +/+ (= 0). (Default: 0) Motif estimation: --motif_min_len INT Minimum length of Motif (Default: 8) @@ -103,21 +97,27 @@ Optional arguments: --tomtom_treshold float Threshold for similarity score. 
(Default: 0.01) Moitf clustering: - --edge_weight INT Minimum weight of edges in motif-cluster-graph (Default: 5) - --motif_similarity_thresh FLOAT threshold for motif similarity score (Default: 0.00001) + --edge_weight INT Minimum weight of edges in motif-cluster-graph (Default: 5) + --motif_similarity_thresh FLOAT Threshold for motif similarity score (Default: 0.00001) Creating GTF: - --organism [homo_sapiens | mus_musculus] - --tissues + --organism [homo_sapiens | mus_musculus] + --tissues All arguments can be set in the configuration files. """ +} else { + Channel.fromPath(params.input).map {it -> [it.simpleName, it]}.set {bigwig_input} + Channel.fromPath(params.bed).set {bed_input} + Channel.fromPath(params.genome_fasta).into {fa_overlap; fa_scan; fa_overlap_2} + Channel.fromPath(params.jaspar_db).into {db_for_motivscan; db_for_tomtom} + Channel.fromPath(params.config).set {config} } -bigwig_input.combine(bed_input).into {footprint_in} +bigwig_input.combine(bed_input).set{footprint_in} /* - +This process uses the discontinuous score from a bigWig file to estimate footprints within peaks of interest */ process footprint_extraction { conda "${path_env}" @@ -154,7 +154,7 @@ process extract_known_TFBS { script: """ - python ${path_bin}/tfbsscan.py --use moods --core ${params.threads} -m ${db} -g ${fasta} -o ./ + python ${path_bin}/tfbsscan.py --use moods --core ${params.threads} -m ${db} -g ${fasta} -o . """ } @@ -225,7 +225,7 @@ process bed_to_clustered_fasta { tag{name} input: - set name, file (bed) from clustered_bed + set name, file (bed) from bed_for_motif_esitmation when: params.fasta == false @@ -349,7 +349,6 @@ process clustered_glam2 { """ } -*/ /* Running Tomtom on meme-files generated by GLAM2.