loosolab · renewiegandt · Mar 28, 2019 · Mar 28, 2019 · Mar 28, 2019 · Mar 28, 2019
diff --git a/README.md b/README.md
@@ -54,13 +54,14 @@ Required arguments:
 	--genome_fasta		 Path to genome in FASTA-format
 	--motif_db		 Path to motif-database in MEME-format
 	--config		 Path to UROPA configuration file
+	--gtf_annotation	Path to gtf annotation file
 	--organism 		 Input organism [hg38 | hg19 | mm9 | mm10]
 	--out			 Output Directory (Default: './out/')
 
 Optional arguments:
 
 	--help [0|1]		1 to show this help message. (Default: 0)
-	--gtf_path		Path to gtf-file. If path is set the process which creates a gtf-file is skipped.
+	--gtf_merged		Path to gtf-file. If path is set the process which creates a gtf-file is skipped.
 	--tfbs_path 		Path to directory with tfbsscan output. If given tfbsscan will be skipped.
 
 	Footprint extraction:

diff --git a/bin/1.2_filter_motifs/compareBed.sh b/bin/1.2_filter_motifs/compareBed.sh
@@ -8,7 +8,7 @@
 #             BED-format.
 
 #             For parameter description, run the script without parameters or -h.
-#             The output is a file with the filtered footprints and the log file compareBed.log
+#             The output is a file with the filtered footprints and the log file filter_motifs.log
 
 # One R scripts is used, compareBed_runinfo.R, stored in the same directory.
 # Parameter values (file/directory paths) must not contain the | symbol, because it is chosen as "sed"-field separator.
@@ -264,7 +264,7 @@ fi
 
 # remove short/long motivs, make unique ids (relevant for some splitted tfbs from subtract) and handle maxScorePosition
 # also creates a small output file with information about the comparison
-Rscript $path/compareBed_runinfo.R $min $max $data "$workdir"/filtered.bed "$workdir"/filtered_flagged.bed "$workdir"/compareBed.log
+Rscript $path/compareBed_runinfo.R $min $max $data "$workdir"/filtered.bed "$workdir"/filtered_flagged.bed "$workdir"/filter_motifs.log
 # check if Rscript executed without errors
 if [ $? -gt 0 ]
 then
@@ -279,15 +279,15 @@ then
 	# add initial number of footprints to the log file
 	fp_initial=`cat $data | wc -l`
 	fp_initial=`expr $fp_initial - 1`
-	echo $fp_initial | sed 's/^/initial number of footprints: /g' >> "$workdir"/compareBed.log
+	echo $fp_initial | sed 's/^/initial number of footprints: /g' >> "$workdir"/filter_motifs.log
 else
 	# output will be overwritten if it exists
 	rm -f $output
 	# add initial number of footprints to the log file
-	cat $data | wc -l | sed 's/^/initial number of footprints: /g' >> "$workdir"/compareBed.log
+	cat $data | wc -l | sed 's/^/initial number of footprints: /g' >> "$workdir"/filter_motifs.log
 fi
 # add number of footprints after filtering to the log file
-cat "$workdir"/filtered_flagged.bed | wc -l | sed 's/^/number of footprints after subtract: /g' >> "$workdir"/compareBed.log
+cat "$workdir"/filtered_flagged.bed | wc -l | sed 's/^/number of footprints after subtract: /g' >> "$workdir"/filter_motifs.log
 
 # add fasta sequences to bed and create fasta file
 out_fasta=`echo $output | sed "s|.bed$|.fasta|g"`

diff --git a/pipeline.nf b/pipeline.nf
@@ -10,8 +10,8 @@ disable_mo_clu = 1
 	params.config=""
 	params.tfbs_path=""
 	params.help = 0
-	params.gtf_path=""
-	params.gtf2=""
+	params.gtf_merged=""
+	params.gtf_annotation=""
 	params.out = "./out/"
 
 //footprint_extraction
@@ -68,7 +68,7 @@ disable_mo_clu = 1
 //evaluation
 	params.max_uropa_runs = 10
 
-if (params.bigwig == "" || params.bed == "" || params.organism == "" || params.genome_fasta == "" || params.motif_db == "" || params.config == "" || params.gtf2 == "" || "${params.help}" != "0" ) {
+if (params.bigwig == "" || params.bed == "" || params.organism == "" || params.genome_fasta == "" || params.motif_db == "" || params.config == "" || (params.gtf_annotation == "" && params.gtf_merged == "" ) || "${params.help}" != "0" ) {
 	log.info """
 	Usage: nextflow run pipeline.nf --bigwig [BigWig-file] --bed [BED-file] --genome_fasta [FASTA-file] --motif_db [MEME-file] --config [UROPA-config-file]
 
@@ -78,13 +78,14 @@ if (params.bigwig == "" || params.bed == "" || params.organism == "" || params.g
 		--genome_fasta		 Path to genome in FASTA-format
 		--motif_db		 Path to motif-database in MEME-format
 		--config		 Path to UROPA configuration file
+		--gtf_annotation	Path to gtf annotation file
 	 	--organism 		 Input organism [hg38 | hg19 | mm9 | mm10]
 		--out			 Output Directory (Default: './out/')
 
 	Optional arguments:
 
 		--help [0|1]		1 to show this help message. (Default: 0)
-		--gtf_path		Path to gtf-file. If path is set the process which creates a gtf-file is skipped.
+		--gtf_merged		Path to gtf-file. If path is set the process which creates a gtf-file is skipped.
 		--tfbs_path 		Path to directory with tfbsscan output. If given tfbsscan will be skipped.
 
 		Footprint extraction:
@@ -141,7 +142,9 @@ if (params.bigwig == "" || params.bed == "" || params.organism == "" || params.g
 	Channel.fromPath(params.genome_fasta).into {fa_overlap; fa_scan; fa_overlap_2}
 	Channel.fromPath(params.motif_db).into {db_for_motivscan; db_for_tomtom}
 	Channel.fromPath(params.config).set {config}
-	Channel.fromPath(params.gtf2).set {gtf_to_merge}
+	if (params.gtf_annotation != ""){
+		Channel.fromPath(params.gtf_annotation).set {gtf_to_merge}
+	}
 	if (params.tfbs_path != ""){
 		known_tfbs = file(params.tfbs_path).toAbsolutePath()
 	}
@@ -161,7 +164,7 @@ int_params = ["window_length", "step", "min_size_fp", "max_size_fp", "kmer",
 req_params = ["bigwig", "bed", "genome_fasta", "motif_db", "config"]
 all_params = int_params + req_params + ["organism" , "identity", "tfbsscan_method",
 							"percentage", "tomtom_treshold", "motif_similarity_thresh", "out",
-							"tissues", "gtf_path", "cluster_motif", "tfbs_path", "help", "gtf2", "seed"]
+							"tissues", "gtf_merged", "cluster_motif", "tfbs_path", "help", "gtf_annotation", "seed"]
 
 valid_organism = ["hg38", "hg19", "mm9", "mm10"]
 valid_tfbsscan_methods = ["moods","fimo"]
@@ -176,7 +179,7 @@ params.each { key, value ->
       System.exit(2)
     }
   }
-  if(req_params.contains(key) || (key == "gtf_path" && value != "") ) {
+  if(req_params.contains(key) || (key == "gtf_merged" && value != "") ) {
     if(!file(value).exists()) {
       println("ERROR: $key not found. Please check the given path.")
       System.exit(2)
@@ -247,7 +250,7 @@ Find postitions of known tfbs with tfbsscan and discard the overlaps with compar
 */
 process overlap_with_known_TFBS {
 	//conda "${path_env}"
-	publishDir "${params.out}/1.2_filter_motifs", mode :'copy', pattern: '*_unknown.bed'
+	publishDir "${params.out}/1.2_filter_motifs", mode :'copy', pattern: "${name}_unknown.bed"
 	publishDir "${params.out}/log", mode: 'copy', pattern: '*.log'
 	tag{name}
 	errorStrategy 'finish'
@@ -257,7 +260,6 @@ process overlap_with_known_TFBS {
 
 	output:
 	set name, file ("${name}_unknown.bed") into bed_for_reducing
-	file ('./known_tfbs/*.bed') optional true
 	file ('*.log')
 
 	script:
@@ -749,17 +751,19 @@ process create_GTF {
 	file ('*.gtf') into gtf
 
 	when:
-	params.gtf_path == ""
+	params.gtf_merged == ""
 
 	script:
 	"""
 	python ${path_bin}/3.1_create_gtf/RegGTFExtractor.py ${params.organism} --tissue ${params.tissues} --wd ${path_bin}/3.1_create_gtf
 	"""
 }
 
-
-gtf.combine(gtf_to_merge).set {gtf_list}
-
+//TODO put operators in process mwerge_gtf
+gtf_list = Channel.empty()
+if (params.gtf_merged == "") {
+	gtf.combine(gtf_to_merge).set {gtf_list}
+}
 
 process merge_gtf {
 
@@ -773,19 +777,20 @@ process merge_gtf {
 	file ('merged.gtf') into merged_gtf
 
 	when:
-	params.gtf_path == ""
+	params.gtf_merged == ""
 
 	script:
 	"""
 	cat ${gtf_genes} > merged.gtf
-	cat ${gtf_reg} >> merged.gtf
+	cat ${gtf_reg} >> merged.gtf | sort -k 1,1 -k 4,4n -k 5,5n
 	"""
 }
 
-if (params.gtf_path == "") {
+
+if (params.gtf_merged == "") {
 	gtf_for_uropa = merged_gtf
 } else {
-	gtf_for_uropa = Channel.fromPath(params.gtf_path)
+	Channel.fromPath( params.gtf_merged ).set {gtf_for_uropa}
 }
 
 re_scan_for_uropa.flatten().combine(gtf_for_uropa).combine(config).set {uropa_in}
@@ -795,7 +800,6 @@ re_scan_for_uropa.flatten().combine(gtf_for_uropa).combine(config).set {uropa_in
 process create_uropa_config {
 
 	tag{bed}
-	echo true
 	publishDir "${params.out}/3.2_evaluation/uropa/config", mode: 'copy'
 
 	input: