From a05dbae15152d45ecdff648669e50d9866847a6c Mon Sep 17 00:00:00 2001
From: afust <annika.fust@mpi-bn.mpg.de>
Date: Wed, 9 Jan 2019 16:35:29 +0100
Subject: [PATCH] final hits in bed format

---
 docs_rst/output.rst |  5 ++++-
 uropa/uropa.py      | 15 ++++++++++++---
 2 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/docs_rst/output.rst b/docs_rst/output.rst
index 46058d5..fef59bf 100644
--- a/docs_rst/output.rst
+++ b/docs_rst/output.rst
@@ -8,7 +8,10 @@ File overview
 
 - **allhits.txt**: Basic output table, reports for each peak all valid annotations and additionally NA rows for peaks without valid annotation. 
 
-- **finalhits.txt**: Filtered output table, it reports the best (closest) feature according to the config criteria for each peak. If multiple queries are given, it reports the best annotation taking multiple queries into account.                                              
+- **finalhits.txt**: Filtered output table, it reports the best (closest) feature according to the config criteria for each peak. If multiple queries are given, it reports the best annotation taking multiple queries into account.
+
+- **finalhits.bed**: Similar to finalhits.txt, but in BED format. This means there is no header; the column order is as follows: peak_chr	peak_start	peak_end	peak_id	peak_strand	peak_score	feature	feat_start	feat_end	feat_strand	feat_anchor	distance	genomic_location, followed by all attributes that are given in the config file.
+
 
 - **besthits.txt**: This table is only produced if more than one query is given. It reports the best annotation per query for each peak.
 
diff --git a/uropa/uropa.py b/uropa/uropa.py
index 32c7d00..4c35a32 100644
--- a/uropa/uropa.py
+++ b/uropa/uropa.py
@@ -173,8 +173,6 @@ def main():
 		logger.warning("File %s is not a proper GTF file!", annot_gtf)
 
 	gtf_feat = cfg.column_from_file(annot_gtf, 3, logger)
-	
-	print("gtf_feat = ", gtf_feat)
 
 	if len(gtf_feat) < 1:
 		logger.error("No features found in file {} for annotation.".format(annot_gtf))
@@ -342,6 +340,17 @@ def main():
 
 	ovls.finalize_file(allhits_outfile, allhits_partials, header, comments, log=logger)
 
+
+	# finalhits in bed format
+	besthits_outfile_bed = outdir + "finalhits.bed"
+	# colnames: peak_chr	peak_start	peak_end	peak_id	peak_strand	peak_score	feature	feat_start	feat_end	feat_strand	feat_anchor	distance 	genomic_location
+	os.system("awk 'BEGIN { OFS = \"\t\" } ; { print $2,$3,$5,$1,\".\",\".\",$6,$7,$8,$9,$10,$11,$12 }' " + outdir + "finalhits.txt" + " | sed -e 1d |  sed -e 's/\t\t/\t/g' > " + besthits_outfile_bed + ".tmp")	
+	# append shown attributes
+	attributes = os.popen("head -1 "+ outdir + "finalhits.txt" + " | awk '{print NF}'").read() 
+	attributes = int(attributes) - 1
+	os.system("cut -f15-" + str(attributes) + " " + outdir + "finalhits.txt" + " | sed -e 1d | paste -d'\t' "+ besthits_outfile_bed + ".tmp - > " + besthits_outfile_bed)
+	os.system("sort -o " + besthits_outfile_bed + " -k1,1 -k2,2n " + besthits_outfile_bed)
+	os.system("rm "+ besthits_outfile_bed + ".tmp")
 	#
 	# Reformat output
 	#
@@ -420,7 +429,7 @@ def main():
 			os.remove(outdir+"summary_config.json")
 		os.remove(gtf_index)  # .gz
 		os.remove(gtf_index + ".tbi")
-		if len(gtf_feat) > 1:
+		if len(gtf_feat) >= 1:
 			os.remove(gtf_cut_file)
 			os.remove(gtf_cut_file + ".sorted")