Skip to content

Estimation motifs #93

Merged
merged 3 commits into from
Mar 28, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -54,13 +54,14 @@ Required arguments:
--genome_fasta Path to genome in FASTA-format
--motif_db Path to motif-database in MEME-format
--config Path to UROPA configuration file
--gtf_annotation Path to gtf annotation file
--organism Input organism [hg38 | hg19 | mm9 | mm10]
--out Output Directory (Default: './out/')

Optional arguments:

--help [0|1] 1 to show this help message. (Default: 0)
--gtf_path Path to gtf-file. If path is set the process which creates a gtf-file is skipped.
--gtf_merged Path to gtf-file. If path is set the process which creates a gtf-file is skipped.
--tfbs_path Path to directory with tfbsscan output. If given tfbsscan will be skipped.

Footprint extraction:
Expand Down
10 changes: 5 additions & 5 deletions bin/1.2_filter_motifs/compareBed.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
# BED-format.

# For parameter description, run the script without parameters or -h.
# The output is a file with the filtered footprints and the log file compareBed.log
# The output is a file with the filtered footprints and the log file filter_motifs.log

# One R scripts is used, compareBed_runinfo.R, stored in the same directory.
# Parameter values (file/directory paths) must not contain the | symbol, because it is chosen as "sed"-field separator.
Expand Down Expand Up @@ -264,7 +264,7 @@ fi

# remove short/long motivs, make unique ids (relevant for some splitted tfbs from subtract) and handle maxScorePosition
# also creates a small output file with information about the comparison
Rscript $path/compareBed_runinfo.R $min $max $data "$workdir"/filtered.bed "$workdir"/filtered_flagged.bed "$workdir"/compareBed.log
Rscript $path/compareBed_runinfo.R $min $max $data "$workdir"/filtered.bed "$workdir"/filtered_flagged.bed "$workdir"/filter_motifs.log
# check if Rscript executed without errors
if [ $? -gt 0 ]
then
Expand All @@ -279,15 +279,15 @@ then
# add initial number of footprints to the log file
fp_initial=`cat $data | wc -l`
fp_initial=`expr $fp_initial - 1`
echo $fp_initial | sed 's/^/initial number of footprints: /g' >> "$workdir"/compareBed.log
echo $fp_initial | sed 's/^/initial number of footprints: /g' >> "$workdir"/filter_motifs.log
else
# output will be overwritten if it exists
rm -f $output
# add initial number of footprints to the log file
cat $data | wc -l | sed 's/^/initial number of footprints: /g' >> "$workdir"/compareBed.log
cat $data | wc -l | sed 's/^/initial number of footprints: /g' >> "$workdir"/filter_motifs.log
fi
# add number of footprints after filtering to the log file
cat "$workdir"/filtered_flagged.bed | wc -l | sed 's/^/number of footprints after subtract: /g' >> "$workdir"/compareBed.log
cat "$workdir"/filtered_flagged.bed | wc -l | sed 's/^/number of footprints after subtract: /g' >> "$workdir"/filter_motifs.log

# add fasta sequences to bed and create fasta file
out_fasta=`echo $output | sed "s|.bed$|.fasta|g"`
Expand Down
40 changes: 22 additions & 18 deletions pipeline.nf
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@ disable_mo_clu = 1
params.config=""
params.tfbs_path=""
params.help = 0
params.gtf_path=""
params.gtf2=""
params.gtf_merged=""
params.gtf_annotation=""
params.out = "./out/"

//footprint_extraction
Expand Down Expand Up @@ -68,7 +68,7 @@ disable_mo_clu = 1
//evaluation
params.max_uropa_runs = 10

if (params.bigwig == "" || params.bed == "" || params.organism == "" || params.genome_fasta == "" || params.motif_db == "" || params.config == "" || params.gtf2 == "" || "${params.help}" != "0" ) {
if (params.bigwig == "" || params.bed == "" || params.organism == "" || params.genome_fasta == "" || params.motif_db == "" || params.config == "" || (params.gtf_annotation == "" && params.gtf_merged == "" ) || "${params.help}" != "0" ) {
log.info """
Usage: nextflow run pipeline.nf --bigwig [BigWig-file] --bed [BED-file] --genome_fasta [FASTA-file] --motif_db [MEME-file] --config [UROPA-config-file]

Expand All @@ -78,13 +78,14 @@ if (params.bigwig == "" || params.bed == "" || params.organism == "" || params.g
--genome_fasta Path to genome in FASTA-format
--motif_db Path to motif-database in MEME-format
--config Path to UROPA configuration file
--gtf_annotation Path to gtf annotation file
--organism Input organism [hg38 | hg19 | mm9 | mm10]
--out Output Directory (Default: './out/')

Optional arguments:

--help [0|1] 1 to show this help message. (Default: 0)
--gtf_path Path to gtf-file. If path is set the process which creates a gtf-file is skipped.
--gtf_merged Path to gtf-file. If path is set the process which creates a gtf-file is skipped.
--tfbs_path Path to directory with tfbsscan output. If given tfbsscan will be skipped.

Footprint extraction:
Expand Down Expand Up @@ -141,7 +142,9 @@ if (params.bigwig == "" || params.bed == "" || params.organism == "" || params.g
Channel.fromPath(params.genome_fasta).into {fa_overlap; fa_scan; fa_overlap_2}
Channel.fromPath(params.motif_db).into {db_for_motivscan; db_for_tomtom}
Channel.fromPath(params.config).set {config}
Channel.fromPath(params.gtf2).set {gtf_to_merge}
if (params.gtf_annotation != ""){
Channel.fromPath(params.gtf_annotation).set {gtf_to_merge}
}
if (params.tfbs_path != ""){
known_tfbs = file(params.tfbs_path).toAbsolutePath()
}
Expand All @@ -161,7 +164,7 @@ int_params = ["window_length", "step", "min_size_fp", "max_size_fp", "kmer",
req_params = ["bigwig", "bed", "genome_fasta", "motif_db", "config"]
all_params = int_params + req_params + ["organism" , "identity", "tfbsscan_method",
"percentage", "tomtom_treshold", "motif_similarity_thresh", "out",
"tissues", "gtf_path", "cluster_motif", "tfbs_path", "help", "gtf2", "seed"]
"tissues", "gtf_merged", "cluster_motif", "tfbs_path", "help", "gtf_annotation", "seed"]

valid_organism = ["hg38", "hg19", "mm9", "mm10"]
valid_tfbsscan_methods = ["moods","fimo"]
Expand All @@ -176,7 +179,7 @@ params.each { key, value ->
System.exit(2)
}
}
if(req_params.contains(key) || (key == "gtf_path" && value != "") ) {
if(req_params.contains(key) || (key == "gtf_merged" && value != "") ) {
if(!file(value).exists()) {
println("ERROR: $key not found. Please check the given path.")
System.exit(2)
Expand Down Expand Up @@ -247,7 +250,7 @@ Find postitions of known tfbs with tfbsscan and discard the overlaps with compar
*/
process overlap_with_known_TFBS {
//conda "${path_env}"
publishDir "${params.out}/1.2_filter_motifs", mode :'copy', pattern: '*_unknown.bed'
publishDir "${params.out}/1.2_filter_motifs", mode :'copy', pattern: "${name}_unknown.bed"
publishDir "${params.out}/log", mode: 'copy', pattern: '*.log'
tag{name}
errorStrategy 'finish'
Expand All @@ -257,7 +260,6 @@ process overlap_with_known_TFBS {

output:
set name, file ("${name}_unknown.bed") into bed_for_reducing
file ('./known_tfbs/*.bed') optional true
file ('*.log')

script:
Expand Down Expand Up @@ -749,17 +751,19 @@ process create_GTF {
file ('*.gtf') into gtf

when:
params.gtf_path == ""
params.gtf_merged == ""

script:
"""
python ${path_bin}/3.1_create_gtf/RegGTFExtractor.py ${params.organism} --tissue ${params.tissues} --wd ${path_bin}/3.1_create_gtf
"""
}


gtf.combine(gtf_to_merge).set {gtf_list}

//TODO put operators in process mwerge_gtf
gtf_list = Channel.empty()
if (params.gtf_merged == "") {
gtf.combine(gtf_to_merge).set {gtf_list}
}

process merge_gtf {

Expand All @@ -773,19 +777,20 @@ process merge_gtf {
file ('merged.gtf') into merged_gtf

when:
params.gtf_path == ""
params.gtf_merged == ""

script:
"""
cat ${gtf_genes} > merged.gtf
cat ${gtf_reg} >> merged.gtf
cat ${gtf_reg} >> merged.gtf | sort -k 1,1 -k 4,4n -k 5,5n
"""
}

if (params.gtf_path == "") {

if (params.gtf_merged == "") {
gtf_for_uropa = merged_gtf
} else {
gtf_for_uropa = Channel.fromPath(params.gtf_path)
Channel.fromPath( params.gtf_merged ).set {gtf_for_uropa}
}

re_scan_for_uropa.flatten().combine(gtf_for_uropa).combine(config).set {uropa_in}
Expand All @@ -795,7 +800,6 @@ re_scan_for_uropa.flatten().combine(gtf_for_uropa).combine(config).set {uropa_in
process create_uropa_config {

tag{bed}
echo true
publishDir "${params.out}/3.2_evaluation/uropa/config", mode: 'copy'

input:
Expand Down