diff --git a/README.md b/README.md index 3c6e740..2f63706 100644 --- a/README.md +++ b/README.md @@ -54,13 +54,14 @@ Required arguments: --genome_fasta Path to genome in FASTA-format --motif_db Path to motif-database in MEME-format --config Path to UROPA configuration file + --gtf_annotation Path to gtf annotation file --organism Input organism [hg38 | hg19 | mm9 | mm10] --out Output Directory (Default: './out/') Optional arguments: --help [0|1] 1 to show this help message. (Default: 0) - --gtf_path Path to gtf-file. If path is set the process which creates a gtf-file is skipped. + --gtf_merged Path to gtf-file. If path is set the process which creates a gtf-file is skipped. --tfbs_path Path to directory with tfbsscan output. If given tfbsscan will be skipped. Footprint extraction: diff --git a/bin/1.2_filter_motifs/compareBed.sh b/bin/1.2_filter_motifs/compareBed.sh index 3d25631..5097004 100644 --- a/bin/1.2_filter_motifs/compareBed.sh +++ b/bin/1.2_filter_motifs/compareBed.sh @@ -8,7 +8,7 @@ # BED-format. # For parameter description, run the script without parameters or -h. -# The output is a file with the filtered footprints and the log file compareBed.log +# The output is a file with the filtered footprints and the log file filter_motifs.log # One R scripts is used, compareBed_runinfo.R, stored in the same directory. # Parameter values (file/directory paths) must not contain the | symbol, because it is chosen as "sed"-field separator. @@ -264,7 +264,7 @@ fi # remove short/long motivs, make unique ids (relevant for some splitted tfbs from subtract) and handle maxScorePosition # also creates a small output file with information about the comparison -Rscript $path/compareBed_runinfo.R $min $max $data "$workdir"/filtered.bed "$workdir"/filtered_flagged.bed "$workdir"/compareBed.log +Rscript $path/compareBed_runinfo.R $min $max $data "$workdir"/filtered.bed "$workdir"/filtered_flagged.bed "$workdir"/filter_motifs.log # check if Rscript executed without errors if [ $? -gt 0 ] then @@ -279,15 +279,15 @@ then # add initial number of footprints to the log file fp_initial=`cat $data | wc -l` fp_initial=`expr $fp_initial - 1` - echo $fp_initial | sed 's/^/initial number of footprints: /g' >> "$workdir"/compareBed.log + echo $fp_initial | sed 's/^/initial number of footprints: /g' >> "$workdir"/filter_motifs.log else # output will be overwritten if it exists rm -f $output # add initial number of footprints to the log file - cat $data | wc -l | sed 's/^/initial number of footprints: /g' >> "$workdir"/compareBed.log + cat $data | wc -l | sed 's/^/initial number of footprints: /g' >> "$workdir"/filter_motifs.log fi # add number of footprints after filtering to the log file -cat "$workdir"/filtered_flagged.bed | wc -l | sed 's/^/number of footprints after subtract: /g' >> "$workdir"/compareBed.log +cat "$workdir"/filtered_flagged.bed | wc -l | sed 's/^/number of footprints after subtract: /g' >> "$workdir"/filter_motifs.log # add fasta sequences to bed and create fasta file out_fasta=`echo $output | sed "s|.bed$|.fasta|g"` diff --git a/pipeline.nf b/pipeline.nf index afd3399..44104e7 100644 --- a/pipeline.nf +++ b/pipeline.nf @@ -10,8 +10,8 @@ disable_mo_clu = 1 params.config="" params.tfbs_path="" params.help = 0 - params.gtf_path="" - params.gtf2="" + params.gtf_merged="" + params.gtf_annotation="" params.out = "./out/" //footprint_extraction @@ -68,7 +68,7 @@ disable_mo_clu = 1 //evaluation params.max_uropa_runs = 10 -if (params.bigwig == "" || params.bed == "" || params.organism == "" || params.genome_fasta == "" || params.motif_db == "" || params.config == "" || params.gtf2 == "" || "${params.help}" != "0" ) { +if (params.bigwig == "" || params.bed == "" || params.organism == "" || params.genome_fasta == "" || params.motif_db == "" || params.config == "" || (params.gtf_annotation == "" && params.gtf_merged == "" ) || "${params.help}" != "0" ) { log.info """ Usage: nextflow run pipeline.nf --bigwig [BigWig-file] --bed [BED-file] --genome_fasta [FASTA-file] --motif_db [MEME-file] --config [UROPA-config-file] @@ -78,13 +78,14 @@ if (params.bigwig == "" || params.bed == "" || params.organism == "" || params.g --genome_fasta Path to genome in FASTA-format --motif_db Path to motif-database in MEME-format --config Path to UROPA configuration file + --gtf_annotation Path to gtf annotation file --organism Input organism [hg38 | hg19 | mm9 | mm10] --out Output Directory (Default: './out/') Optional arguments: --help [0|1] 1 to show this help message. (Default: 0) - --gtf_path Path to gtf-file. If path is set the process which creates a gtf-file is skipped. + --gtf_merged Path to gtf-file. If path is set the process which creates a gtf-file is skipped. --tfbs_path Path to directory with tfbsscan output. If given tfbsscan will be skipped. Footprint extraction: @@ -141,7 +142,9 @@ if (params.bigwig == "" || params.bed == "" || params.organism == "" || params.g Channel.fromPath(params.genome_fasta).into {fa_overlap; fa_scan; fa_overlap_2} Channel.fromPath(params.motif_db).into {db_for_motivscan; db_for_tomtom} Channel.fromPath(params.config).set {config} - Channel.fromPath(params.gtf2).set {gtf_to_merge} + if (params.gtf_annotation != ""){ + Channel.fromPath(params.gtf_annotation).set {gtf_to_merge} + } if (params.tfbs_path != ""){ known_tfbs = file(params.tfbs_path).toAbsolutePath() } @@ -161,7 +164,7 @@ int_params = ["window_length", "step", "min_size_fp", "max_size_fp", "kmer", req_params = ["bigwig", "bed", "genome_fasta", "motif_db", "config"] all_params = int_params + req_params + ["organism" , "identity", "tfbsscan_method", "percentage", "tomtom_treshold", "motif_similarity_thresh", "out", - "tissues", "gtf_path", "cluster_motif", "tfbs_path", "help", "gtf2", "seed"] + "tissues", "gtf_merged", "cluster_motif", "tfbs_path", "help", "gtf_annotation", "seed"] valid_organism = ["hg38", "hg19", "mm9", "mm10"] valid_tfbsscan_methods = ["moods","fimo"] @@ -176,7 +179,7 @@ params.each { key, value -> System.exit(2) } } - if(req_params.contains(key) || (key == "gtf_path" && value != "") ) { + if(req_params.contains(key) || (key == "gtf_merged" && value != "") ) { if(!file(value).exists()) { println("ERROR: $key not found. Please check the given path.") System.exit(2) @@ -247,7 +250,7 @@ Find postitions of known tfbs with tfbsscan and discard the overlaps with compar */ process overlap_with_known_TFBS { //conda "${path_env}" - publishDir "${params.out}/1.2_filter_motifs", mode :'copy', pattern: '*_unknown.bed' + publishDir "${params.out}/1.2_filter_motifs", mode :'copy', pattern: "${name}_unknown.bed" publishDir "${params.out}/log", mode: 'copy', pattern: '*.log' tag{name} errorStrategy 'finish' @@ -257,7 +260,6 @@ process overlap_with_known_TFBS { output: set name, file ("${name}_unknown.bed") into bed_for_reducing - file ('./known_tfbs/*.bed') optional true file ('*.log') script: @@ -749,7 +751,7 @@ process create_GTF { file ('*.gtf') into gtf when: - params.gtf_path == "" + params.gtf_merged == "" script: """ @@ -757,9 +759,11 @@ process create_GTF { """ } - -gtf.combine(gtf_to_merge).set {gtf_list} - +//TODO put operators in process mwerge_gtf +gtf_list = Channel.empty() +if (params.gtf_merged == "") { + gtf.combine(gtf_to_merge).set {gtf_list} +} process merge_gtf { @@ -773,19 +777,20 @@ process merge_gtf { file ('merged.gtf') into merged_gtf when: - params.gtf_path == "" + params.gtf_merged == "" script: """ cat ${gtf_genes} > merged.gtf - cat ${gtf_reg} >> merged.gtf + cat ${gtf_reg} >> merged.gtf | sort -k 1,1 -k 4,4n -k 5,5n """ } -if (params.gtf_path == "") { + +if (params.gtf_merged == "") { gtf_for_uropa = merged_gtf } else { - gtf_for_uropa = Channel.fromPath(params.gtf_path) + Channel.fromPath( params.gtf_merged ).set {gtf_for_uropa} } re_scan_for_uropa.flatten().combine(gtf_for_uropa).combine(config).set {uropa_in} @@ -795,7 +800,6 @@ re_scan_for_uropa.flatten().combine(gtf_for_uropa).combine(config).set {uropa_in process create_uropa_config { tag{bed} - echo true publishDir "${params.out}/3.2_evaluation/uropa/config", mode: 'copy' input: