From a05dbae15152d45ecdff648669e50d9866847a6c Mon Sep 17 00:00:00 2001 From: afust Date: Wed, 9 Jan 2019 16:35:29 +0100 Subject: [PATCH] final hits in bed format --- docs_rst/output.rst | 5 ++++- uropa/uropa.py | 15 ++++++++++++--- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/docs_rst/output.rst b/docs_rst/output.rst index 46058d5..fef59bf 100644 --- a/docs_rst/output.rst +++ b/docs_rst/output.rst @@ -8,7 +8,10 @@ File overview - **allhits.txt**: Basic output table, reports for each peak all valid annotations and additionally NA rows for peaks without valid annotation. -- **finalhits.txt**: Filtered output table, it reports the best (closest) feature according to the config criteria for each peak. If multiple queries are given, it reports the best annotation taking multiple queries into account. +- **finalhits.txt**: Filtered output table, it reports the best (closest) feature according to the config criteria for each peak. If multiple queries are given, it reports the best annotation taking multiple queries into account. + +- **finalhits.bed**: Similar to finalhits.txt in bed format. This means there is no header, the column order is as follows: peak_chr peak_start peak_end peak_id peak_strand peak_score feature feat_start feat_end feat_strand feat_anchor distance genomic_location + all attributes that are given in the config file + - **besthits.txt**: This table is only produced if more than one query is given. It reports the best annotation per query for each peak. 
diff --git a/uropa/uropa.py b/uropa/uropa.py index 32c7d00..4c35a32 100644 --- a/uropa/uropa.py +++ b/uropa/uropa.py @@ -173,8 +173,6 @@ def main(): logger.warning("File %s is not a proper GTF file!", annot_gtf) gtf_feat = cfg.column_from_file(annot_gtf, 3, logger) - - print("gtf_feat = ", gtf_feat) if len(gtf_feat) < 1: logger.error("No features found in file {} for annotation.".format(annot_gtf)) @@ -342,6 +340,17 @@ def main(): ovls.finalize_file(allhits_outfile, allhits_partials, header, comments, log=logger) + + # finalhits in bed format + besthits_outfile_bed = outdir + "finalhits.bed" + # colnames: peak_chr peak_start peak_end peak_id peak_strand peak_score feature feat_start feat_end feat_strand feat_anchor distance genomic_location + os.system("awk 'BEGIN { OFS = \"\t\" } ; { print $2,$3,$5,$1,\".\",\".\",$6,$7,$8,$9,$10,$11,$12 }' " + outdir + "finalhits.txt" + " | sed -e 1d | sed -e 's/\t\t/\t/g' > " + besthits_outfile_bed + ".tmp") + # append shown attributes + attributes = os.popen("head -1 "+ outdir + "finalhits.txt" + " | awk '{print NF}'").read() + attributes = int(attributes) - 1 + os.system("cut -f15-" + str(attributes) + " " + outdir + "finalhits.txt" + " | sed -e 1d | paste -d'\t' "+ besthits_outfile_bed + ".tmp - > " + besthits_outfile_bed) + os.system("sort -o " + besthits_outfile_bed + " -k1,1 -k2,2n " + besthits_outfile_bed) + os.system("rm "+ besthits_outfile_bed + ".tmp") # # Reformat output # @@ -420,7 +429,7 @@ def main(): os.remove(outdir+"summary_config.json") os.remove(gtf_index) # .gz os.remove(gtf_index + ".tbi") - if len(gtf_feat) > 1: + if len(gtf_feat) >= 1: os.remove(gtf_cut_file) os.remove(gtf_cut_file + ".sorted")