From eead1aa4b5487272a5a2154fcf25a104219a7c35 Mon Sep 17 00:00:00 2001 From: Annika Fust Date: Wed, 8 Nov 2017 10:29:39 +0100 Subject: [PATCH] update UROPA to python 3 --- README.md | 6 +- setup.py | 6 +- uropa/annotation.py | 55 ++-- uropa/config.py | 43 +-- uropa/overlaps.py | 30 +- uropa/uropa.py | 785 ++++++++++++++++++++++---------------------- 6 files changed, 463 insertions(+), 462 deletions(-) diff --git a/README.md b/README.md index 4520147..571ddb4 100644 --- a/README.md +++ b/README.md @@ -40,10 +40,10 @@ Installation and Command-line usage We recommend to install UROPA using the conda package manager. Make sure to have `conda` installed, e.g. via - [Miniconda](https://conda.io/miniconda.html) - - download the Miniconda installer for **Python 2.7** - - run ```bash Miniconda2-latest-Linux-x86_64.sh``` to install Miniconda + - download the Miniconda installer for **Python 3** + - run ```bash Miniconda3-latest-Linux-x86_64.sh``` to install Miniconda - Answer the question "Do you wish the installer to prepend the Miniconda install location to PATH in your /home/.../.bashrc ?" with yes - OR do ```PATH=dir/to/miniconda2:$PATH``` after installation process + OR do ```PATH=dir/to/miniconda3:$PATH``` after installation process The UROPA installation is now as easy as ```conda install -c bioconda uropa```. diff --git a/setup.py b/setup.py index 6208b1b..1c358ba 100644 --- a/setup.py +++ b/setup.py @@ -1,11 +1,11 @@ from setuptools import setup def readme(): - with open('README.md') as f: + with open('README.md', encoding='utf-8') as f: return f.read() setup(name='uropa', - version='1.2.1', + version='2.0.0-alpha', description='UROPA is a command line based tool, intended for genomic region annotation', long_description=readme(), url='https://github.molgen.mpg.de/loosolab/UROPA', @@ -25,7 +25,7 @@ def readme(): 'License :: OSI Approved :: MIT License', 'Intended Audience :: Science/Research', 'Topic :: Scientific/Engineering :: Bio-Informatics', - 'Programming Language :: Python :: 2.7' + 'Programming Language :: Python :: 3.4' ], zip_safe=False, include_package_data=True) diff --git a/uropa/annotation.py b/uropa/annotation.py index a810d55..76b71f7 100644 --- a/uropa/annotation.py +++ b/uropa/annotation.py @@ -3,7 +3,7 @@ import operator import numpy as np import pysam -import overlaps as ovls +from . import overlaps as ovls def annotation_process(input_args, peak_file, log=None): @@ -113,7 +113,7 @@ def annotation_process(input_args, peak_file, log=None): # Find hits with valid values for the queries (Search all # queries ,Not only PRIORITY) - v_fsa = map(lambda q: ovls.valid_fsa(h, hit, q, peak['strand']), queries) + v_fsa = [ovls.valid_fsa(h, hit, q, peak['strand']) for q in queries] # Pair the Valid query values(fsb) with valid_distance and # valid strand for each query @@ -138,7 +138,7 @@ def annotation_process(input_args, peak_file, log=None): NAsList_q = list(np.repeat("NA", nas_len)) # Search hits with only Prior or Secondary Queries - for hitj, vq in dict_vqp.items(): + for hitj, vq in list(dict_vqp.items()): hit = hitj.split("\t") hit_len = abs(int(hit[4]) - int(hit[3])) strand = hit[6] @@ -252,7 +252,7 @@ def annotation_process(input_args, peak_file, log=None): # > Add to Best Hits the Best Internal features(when internals=True, D>config."distance"), # after having filled-in the All Hits for the peak. 
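The annotation.py hunks above apply the two mechanical conversions that recur throughout this patch: `map(lambda ...)` becomes a list comprehension, because `map()` returns a one-shot iterator in Python 3 and the result is indexed and re-used later, and `dict.items()` is wrapped in `list()` so the dictionary can be modified while the loop runs. A minimal sketch of both patterns, using made-up `hits` and `queries` stand-ins rather than the real UROPA structures:

```python
# Illustrative sketch only -- 'hits' and 'queries' are hypothetical stand-ins,
# not the data structures actually built in annotation.py.
hits = {"chr1:100-200": "gene\t100\t200", "chr1:300-400": "exon\t300\t400"}
queries = [{"distance": [1000]}, {"distance": [5000, 2000]}]

# Python 2's map() returned a list; in Python 3 it returns a lazy iterator,
# so code that indexes or iterates the result twice needs a real list.
valid = [len(q["distance"]) < 3 for q in queries]   # instead of map(lambda q: ..., queries)
assert valid == [True, True]

# Python 3 raises "dictionary changed size during iteration" if the dict grows
# or shrinks while a view is being iterated; snapshot the items first.
for key, value in list(hits.items()):
    if value.startswith("exon"):
        hits[key + "_flagged"] = value               # safe: the loop runs over a copy
```

The `from . import overlaps as ovls` change follows from the same migration: Python 3 drops implicit relative imports, so modules inside the `uropa` package must be imported relative to the package (or by their fully qualified name) instead of by bare module name.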
- for hitj, vq in dict_vqp.items(): + for hitj, vq in list(dict_vqp.items()): for j in vq: if Best_hits_tab[j][peak['id']] != "" and All_hits_tab[j][peak['id']] != "": if (Best_hits_tab[j][peak['id']].split("\t")[10] == "NA" and @@ -261,10 +261,9 @@ def annotation_process(input_args, peak_file, log=None): hit_line = All_hits_tab[j][peak['id']].split("\n") hit_line = [h for h in hit_line if h != ''] - internal = map( - lambda h: h.split("\t")[11], hit_line) - hit_dist = map(lambda h: int( - h.split("\t")[10]), hit_line) + internal = [h.split("\t")[11] for h in hit_line] + hit_dist = [int( + h.split("\t")[10]) for h in hit_line] mv_pos, min_val = min( enumerate(hit_dist), key=operator.itemgetter(1)) @@ -343,8 +342,8 @@ def annotation_process(input_args, peak_file, log=None): if all(isNA): #logg.debug("\nNo query has any Hit.No replacement of Priority Query possible-> NAs will be filled in the Output.") - TabInList_p = map(lambda l: All_hits_tab[l][ - peak['id']], range(len(queries))) + TabInList_p = [All_hits_tab[l][ + peak['id']] for l in range(len(queries))] ##log.debug("Hit lines for all queries are : {}".format(TabInList_p )) # If all_hits_tab doesn't have all queries will have "" TabInList_p = [Tab for Tab in TabInList_p if Tab != ""] @@ -365,14 +364,14 @@ def annotation_process(input_args, peak_file, log=None): All_combo = OrderedDict() # Output peaks with same order as All_hits_tab mydict = All_hits_tab - for k in mydict[0].iterkeys(): - All_combo[k] = [pid[k] for q, pid in mydict.items()] + for k in mydict[0].keys(): + All_combo[k] = [pid[k] for q, pid in list(mydict.items())] # Best hits # Best_combo = OrderedDict() mybestD = Best_hits_tab - for k in mybestD[0].iterkeys(): - Best_combo[k] = [pid[k] for q, pid in mybestD.items()] + for k in mybestD[0].keys(): + Best_combo[k] = [pid[k] for q, pid in list(mybestD.items())] def all_same(items): """Returns true if all items of a list are the same.""" @@ -383,34 +382,32 @@ def all_same(items): BestBest_hits = OrderedDict() for k in Best_combo: # [ [] , [] , [] ]-> can be of same query, same distance - records = map(lambda s: s.split("\n"), Best_combo[k]) + records = [s.split("\n") for s in Best_combo[k]] # split also internally each query's string to see if it # contains more >1 hits(when same distance, more >1 are conc. 
# to same query) - spl_rec = map(lambda r: map( - lambda t: t if t != '' else None, r), records) + spl_rec = [[t if t != '' else None for t in r] for r in records] recs = [x + "\n" for s in spl_rec for x in s if x != None] if len(recs) > 1: # Only when more than two hits, and no NAs here - splitted_hits = map( - lambda h: recs[h].split("\t"), range(len(recs))) + splitted_hits = [recs[h].split("\t") for h in range(len(recs))] # s= each hit line in string splitted_hits = [s for s in splitted_hits if s != [""]] - featOfHit = map(lambda s: splitted_hits[s][ - 5], range(len(splitted_hits))) - startPos = map(lambda s: splitted_hits[s][ - 6], range(len(splitted_hits))) - endPos = map(lambda s: splitted_hits[s][ - 7], range(len(splitted_hits))) - Dist_hits = map(lambda s: float(splitted_hits[s][10]) if splitted_hits[s][ - 10] != "NA" else float("Inf"), range(len(splitted_hits))) # 9th col= Distance + featOfHit = [splitted_hits[s][ + 5] for s in range(len(splitted_hits))] + startPos = [splitted_hits[s][ + 6] for s in range(len(splitted_hits))] + endPos = [splitted_hits[s][ + 7] for s in range(len(splitted_hits))] + Dist_hits = [float(splitted_hits[s][10]) if splitted_hits[s][ + 10] != "NA" else float("Inf") for s in range(len(splitted_hits))] # 9th col= Distance # Dist_hit = [Dist_hits[0] if all_same(Dist_hits) else # Dist_hits] #Either same dist or All NAs # When All "NAs" for all queries-> keep only line from 1st # query - NA_hits = map(lambda s: "NA" if splitted_hits[s][ - 10] == "NA" else None, range(len(splitted_hits))) + NA_hits = ["NA" if splitted_hits[s][ + 10] == "NA" else None for s in range(len(splitted_hits))] # If None means ALL queries have a feature all_have_feat = all(x is None for x in NA_hits) diff --git a/uropa/config.py b/uropa/config.py index 867e15d..0fa490c 100644 --- a/uropa/config.py +++ b/uropa/config.py @@ -60,7 +60,7 @@ def parse_json(infile): assert isinstance(infile, str), 'Argument {0} of wrong type ({1}), should be {2}!'.format( 'column', type(infile), 'str') - with open(infile, 'r') as f: + with open(infile, 'r', encoding='utf-8') as f: return ast.literal_eval(json.dumps(json.load(f))) @@ -73,8 +73,9 @@ def column_from_file(file, column, log=None): cmd = 'cut -f' + str(column) + ' ' + str(file) + \ ' | sort | uniq | grep -v "^#"' + try: - vals = subprocess.check_output(cmd, shell=True) + vals = str(subprocess.check_output(cmd, stderr=subprocess.STDOUT, shell=True), encoding='utf-8') except subprocess.CalledProcessError: if log is not None: log.warning("File {} might be empty or has not enough columns".format(file)) @@ -86,16 +87,16 @@ def parse_parameters(config, log=None): """Fills the configuration with default values. 
Writes a warning to logs, if unknown keys are detected.""" defaults = {"priority": "False", "bed": "no_peaks.bed", "gtf": "no_annotation.gtf"} # , "bigwig": "none.bw" - keys = defaults.keys() - values = map(lambda x: config[x] if x in config else defaults[x], keys) + keys = list(defaults.keys()) + values = [config[x] if x in config else defaults[x] for x in keys] if log is not None: - unknown = [k for k in config.keys() if k not in keys and k != + unknown = [k for k in list(config.keys()) if k not in keys and k != "queries"] if any(unknown): log.warning( "Unknown keys detected in configuration: {}".format(unknown)) - parameters = dict(zip(keys, values)) + parameters = dict(list(zip(keys, values))) return parameters @@ -104,7 +105,7 @@ def parse_queries(config, gtf_feat, log=None): defaults = {"feature": gtf_feat, "strand": "ignore", "show.attributes": "None", "filter.attribute": "None", "attribute.value": "None", "distance": 100000, "feature.anchor": ["start", "center", "end"], "direction": "any_direction", "internals": "False", "priority": "False"} - keys = defaults.keys() + keys = list(defaults.keys()) try: query_list = config["queries"] @@ -122,49 +123,49 @@ def parse_queries(config, gtf_feat, log=None): query_list = [query_list] def give_val(q): - return map(lambda x: q[x] if x in q and q[x] != "" else defaults[x], keys) + return [q[x] if x in q and q[x] != "" else defaults[x] for x in keys] def make_list(l): - return map(lambda v: [v] if not isinstance(v, list) else v, l) + return [[v] if not isinstance(v, list) else v for v in l] - vals = map(lambda l: give_val(l), query_list) - values = map(lambda l: make_list(l), vals) + vals = [give_val(l) for l in query_list] + values = [make_list(l) for l in vals] if not log is None: - unknown = [k for k in config.keys() if k not in list(set(['gtf', 'bed', 'queries']).union(keys))] + unknown = [k for k in list(config.keys()) if k not in list(set(['gtf', 'bed', 'queries']).union(keys))] if any(unknown): log.warning("Unknown keys detected in configuration: {}".format(unknown)) - queries = map(lambda x: dict(zip(keys, x)), values) + queries = [dict(list(zip(keys, x))) for x in values] return queries def remove_invalid_queries(queries, log=None): """Removes queries that have multiple attribute.filter or filter.value values and more than two distance constraints.""" # Validate distance constraints - has_valid_distance = map(lambda q: len(q["distance"]) < 3, queries) + has_valid_distance = [len(q["distance"]) < 3 for q in queries] if not all(has_valid_distance) and log is not None: log.warning("Queries with invalid distances present! Affected queries: {}".format( [i for i, x in enumerate(has_valid_distance) if not x])) # Validate query attributes - has_valid_attributes = map(lambda q: False if (q["filter.attribute"] != ['None'] and q["attribute.value"] == [ - 'None']) or (q["filter.attribute"] == ['None'] and q["attribute.value"] != ['None']) else True, queries) + has_valid_attributes = [False if (q["filter.attribute"] != ['None'] and q["attribute.value"] == [ + 'None']) or (q["filter.attribute"] == ['None'] and q["attribute.value"] != ['None']) else True for q in queries] if not all(has_valid_attributes) and log is not None: log.warning("Queries with invalid filter.attribute and attribute.value pairings present! 
Affected queries: {}".format( [i for i, x in enumerate(has_valid_attributes) if not x])) # Validate strand attribute - has_valid_strand = map(lambda q: True if q["strand"] in [["ignore"],["same"],["opposite"]] else False, queries) + has_valid_strand = [True if q["strand"] in [["ignore"], ["same"], ["opposite"]] else False for q in queries] if not all(has_valid_strand) and log is not None: log.warning("Queries with invalid strand values present! Affected queries: {}".format( [i for i, x in enumerate(has_valid_strand) if not x])) # Validate query attribute lengths - has_valid_attribute_lengths = map(lambda q: False if len( - q["filter.attribute"]) > 1 or len(q["attribute.value"]) > 1 else True, queries) + has_valid_attribute_lengths = [False if len( + q["filter.attribute"]) > 1 or len(q["attribute.value"]) > 1 else True for q in queries] if not all(has_valid_attribute_lengths) and log is not None: log.warning("Queries with more than one value for either filter.attribute or attribute.value present! Affected queries: {}".format( [i for i, x in enumerate(has_valid_attribute_lengths) if not x])) @@ -176,7 +177,7 @@ def remove_invalid_queries(queries, log=None): def parse_first_gtf_line(gtf): """Removes comment lines, reads first line to check for prefix chr. Returns if chr was found and the number of columns.""" - f = open(gtf, "r") + f = open(gtf, "r", encoding='utf-8') is_comment = True while is_comment: line = f.readline() @@ -192,7 +193,7 @@ def cut_gtf_perFeat(gtf, features, prefix): gtf_per_feat = prefix + os.path.basename(gtf).split(".gtf")[0] + "_cut_per_feat.gtf" feat2cut = np.unique(features) - f = open(gtf, "r") + f = open(gtf, "r", encoding='utf-8') is_comment = True while is_comment: line = f.readline() diff --git a/uropa/overlaps.py b/uropa/overlaps.py index 57c83f2..61648fe 100644 --- a/uropa/overlaps.py +++ b/uropa/overlaps.py @@ -1,6 +1,6 @@ #!/usr/bin/env python """Contains functions for UROPA overlap evaluation.""" -from __future__ import division + import os import re import sys @@ -134,11 +134,11 @@ def parse_peak(peakstr, extend=0, delim='\t'): defaults = {'id': p['chr'] + ":" + p['start'] + "-" + p['end'], 'name': p['chr'] + ":" + p['start'] + "-" + p['end'], 'score': 0, 'strand': ".", 'center': int(np.around(np.mean([int(p['start']), int(p['end'])]))), 'estart': max(1, int(p['start']) - extend), 'eend': int(p['end']) + extend, 'length': abs(int(p['end']) - int(p['start']))} - if 'name' in p.keys(): + if 'name' in list(p.keys()): p['id'] = p['chr'] + ":" + p['start'] + "-" + p['end'] + "_" + p['name'] - values = [p[k] if k in p else defaults[k] for k in defaults.keys()] - peak = dict(zip(p.keys() + defaults.keys(), p.values() + values)) + values = [p[k] if k in p else defaults[k] for k in list(defaults.keys())] + peak = dict(list(zip(list(p.keys()) + list(defaults.keys()), list(p.values()) + values))) return peak @@ -170,7 +170,7 @@ def distance_to_peak_center(p_center, hit3, hit4, strand, feat_pos): hit_center = np.mean([feat_start, feat_end]) hit_pos = {"start": feat_start, "center": hit_center, "end": feat_end} - d_pos = map(lambda i: abs(hit_pos[i] - p_center), feat_pos) + d_pos = [abs(hit_pos[i] - p_center) for i in feat_pos] pos, dmin = min(enumerate(d_pos), key=operator.itemgetter(1)) min_pos = feat_pos[pos] @@ -233,7 +233,7 @@ def get_distance_by_dir(inputD, genom_loc, intern_loc, Dhit, internals_allowed=[ if internals_allowed == ["True"]: return(any(intern_loc)) else: - best_dist = map(lambda l: Dhit <= D_upstr if l == "upstream" else Dhit <= D_downstr, 
intern_loc) + best_dist = [Dhit <= D_upstr if l == "upstream" else Dhit <= D_downstr for l in intern_loc] return(any(best_dist)) @@ -261,8 +261,8 @@ def round_up(val): # import "division" allows decimals def calculate_overlap(pstart, pend, peakL, featL, hit_start, hit_end): """[Local] Returns value of overlap btw {0,1} representing percentage of length covered by the peak and that covered by the feature """ - p_range = range(pstart, pend) - feat_range = range(hit_start, hit_end) + p_range = list(range(pstart, pend)) + feat_range = list(range(hit_start, hit_end)) pset = set(p_range) ovl_range = pset.intersection(feat_range) @@ -315,10 +315,10 @@ def get_hit_attribute(hit, attribute): """Splits attributes from hit lines""" pos_match = [i if re.match( attribute, hit_a) else None for i, hit_a in enumerate(hit[8].split("; "))] - pos_match = filter(lambda x: x is not None, pos_match) + pos_match = [x for x in pos_match if x is not None] if len(pos_match) > 0: - attr_val = map(lambda m: hit[8].split("; ")[m].split(" ")[ - 1].strip('"\'').rstrip('\";'), pos_match) + attr_val = [hit[8].split("; ")[m].split(" ")[ + 1].strip('"\'').rstrip('\";') for m in pos_match] val = [av for av in attr_val if av is not None][0] val = val.replace("\t", "").replace("\r", "").replace("\n", "") return val @@ -334,7 +334,7 @@ def get_besthit(q, len_q_dist, p_nm, hit, attrib_keys, Dhit): if attrib_keys != ["None"]: #attr_val = map(lambda k: re.split(k+" ", hit[8])[1].split(';')[0].strip('"\'').rstrip('\"'), attrib_keys) # list of all values if multiple keys - attr_val = map(lambda a: get_hit_attribute(hit, a), attrib_keys) + attr_val = [get_hit_attribute(hit, a) for a in attrib_keys] ret = [Dhit, [hit[2], hit[3], hit[4], hit[6], attr_val]] elif attrib_keys == ["None"]: @@ -367,7 +367,7 @@ def create_table(peak_id, chrom, pstart, pend, p_center, min_dist_hit, attrib_ke def write_hit_to_All(All_hits_tab, p_name, attrib_k, Dhit, hit, peak_id, chrom, pstart, pend, p_center, min_pos, genomic_location, ovl_pk, ovl_feat, j): """Writes an output table.""" if attrib_k != ["None"]: - attr_val = map(lambda a: get_hit_attribute(hit, a), attrib_k) + attr_val = [get_hit_attribute(hit, a) for a in attrib_k] hit2add = [Dhit, [hit[2], hit[3], hit[4], hit[6], attr_val]] the_res = create_table(peak_id, chrom, pstart, pend, p_center, hit2add, attrib_k, min_pos, genomic_location, ovl_pk, ovl_feat, j) @@ -422,8 +422,8 @@ def write_partial_file(outfile, Table): def merge_queries(Best_combo_k): """Merges queries.""" - merged_q = map(lambda l: Best_combo_k[l].split( - "\t")[-1].strip("\n"), range(len(Best_combo_k))) + merged_q = [Best_combo_k[l].split( + "\t")[-1].strip("\n") for l in range(len(Best_combo_k))] # Read if combo_k is "" and give it the q.numb, so I have it ready for # merging when one query ="" merged_q[-1] = merged_q[-1] + "\n" diff --git a/uropa/uropa.py b/uropa/uropa.py index 416855e..0949b50 100644 --- a/uropa/uropa.py +++ b/uropa/uropa.py @@ -3,12 +3,12 @@ @authors: Maria Kondili, Jens Preussner and Annika Fust @license: MIT -@version: 1.2.1 +@version: 2.0.0-alpha @maintainer: Mario Looso @email: mario.looso@mpi-bn.mpg.de """ -from __future__ import division + import os import sys import json @@ -25,399 +25,402 @@ import numpy as np -import config as cfg -import overlaps as ovls -import annotation as ant +from . import config as cfg +from . import overlaps as ovls +from . 
import annotation as ant #if __name__ == "__main__": def main(): - parser = argparse.ArgumentParser( - prog="uropa", - description='UROPA - Universal RObust Peak Annotator.', - epilog=cfg.howtoconfig(), - formatter_class=argparse.RawDescriptionHelpFormatter) - parser.add_argument( - "-i", - "--input", - help="filename of configuration file", - required=True, - action="store", - metavar="config.json") - parser.add_argument( - "-p", - "--prefix", - dest="prefix", - help="prefix for result file names (defaults to basename of input file)", - required=False, - action="store", - metavar="prefix") - parser.add_argument( - "-r", - "--reformat", - help="create an additional compact and line-reduced table as result file", - required=False, - action="store_true") - parser.add_argument( - "-s", - "--summary", - help="filename of additional visualisation of results in graphical format", - required=False, - action="store_true") - parser.add_argument( - "-t", - "--threads", - help="multiprocessed run: n = number of threads to run annotation process", - type=int, - required=False, - action="store", - metavar="n", - default=1) - parser.add_argument( - "--add-comments", - help="add comment lines to output files", - required=False, - action="store_true") - parser.add_argument( - "-l", - "--log", - help="log file name for messages and warnings", - action="store", - metavar="uropa.log") - parser.add_argument( - "-d", - "--debug", - help="print verbose messages (for debugging) to stdout and log", - required=False, - action="store_true") - parser.add_argument( - "-v", - "--version", - help="prints the version and exits", - action="version", - version="%(prog)s 1.2.1") - args = parser.parse_args() - - config = args.input - - # Configure logging - logger = logging.getLogger(__name__) - logger.setLevel(logging.DEBUG) - - streamHandle = logging.StreamHandler() - loggerFormat = logging.Formatter('[%(levelname)s] - %(message)s') - streamHandle.setFormatter(loggerFormat) - - if args.debug: - streamHandle.setLevel(logging.DEBUG) - else: - streamHandle.setLevel(logging.WARNING) - logger.addHandler(streamHandle) - - if args.log is not None: - logpath = os.path.dirname(args.log) - if not os.path.exists(logpath) and logpath != '': - try: - os.makedirs(logpath) - except OSError: - logger.error("Could not create directory for log file {}".format(logpath)) - try: - fileHandle = logging.FileHandler(args.log, mode='w') - fileHandle.setLevel(logging.DEBUG) - fileHandle.setFormatter(loggerFormat) - logger.addHandler(fileHandle) - except IOError: - logger.error("Could not create log file {}".format(args.log)) - - if args.prefix is not None: - prefixpath = os.path.dirname(args.prefix) - if not os.path.exists(prefixpath): - if not prefixpath == '': - try: - os.makedirs(prefixpath) - except IOError: - logger.error("Could not create directory {} for output".format(prefixpath)) - else: - prefixpath = '.' - outdir = prefixpath + '/' + os.path.basename(args.prefix) + '_' - else: - outdir = "./" + os.path.splitext(os.path.basename(args.input))[0] + '_' - - logger.debug("Directory for output files is {}".format(outdir)) - logger.info("Start time: %s", datetime.datetime.now().strftime("%d.%m.%Y %H:%M")) - - try: - cfg_dict = cfg.parse_json(config) - except IOError: - logger.error("File %s does not exists or is not readable.", config) - sys.exit() - except ValueError as e: - logger.error("File %s contains malformed JSON. 
%s", config, e) - sys.exit() - - parameters = cfg.parse_parameters(cfg_dict, logger) - priority = parameters["priority"] - annot_gtf = parameters["gtf"] - peaks_bed = parameters["bed"] - - try: - gtf_has_chr, gtf_cols = cfg.parse_first_gtf_line(annot_gtf) - except IOError: - logger.error("File %s does not exist or is not readable.", annot_gtf) - sys.exit() - - # > Check GTF format - if gtf_cols == 9: - logger.info("GTF file format validated. Extracting all features from GTF now...") - else: - logger.warning("File %s is not a proper GTF file!", annot_gtf) - - gtf_feat = cfg.column_from_file(annot_gtf, 3, logger) - if len(gtf_feat) < 1: - logger.error("No features found in file {} for annotation.".format(annot_gtf)) - sys.exit() - - if not os.path.exists(peaks_bed): - logger.error("File %s does not exists or is not readable.", peaks_bed) - sys.exit() - - # Sanity check for other parameters - accepted_val = ['T', 'True', 'TRUE', 'Yes', 'YES', 'Y', 'yes', 'F', 'False', 'FALSE', 'No', 'NO', 'N', 'no'] - if priority not in accepted_val: - logger.error("Priority value should be one of the values from %s", accepted_val) - sys.exit() - - # All parameters of Queries, filled-in with default values where necessary. - all_queries = cfg.parse_queries(cfg_dict, gtf_feat, logger) - logger.info("Number of all queries for parametric peak annotation: %s", len(all_queries)) - queries = cfg.remove_invalid_queries(all_queries, logger) - logger.info("Number of valid queries for parametric peak annotation: %s", len(queries)) - - # Validate query features - query_features = list(chain.from_iterable([q["feature"] for q in queries])) - feat_not_valid = [f for f in query_features if f not in gtf_feat] - feat_valid = [f for f in query_features if f not in feat_not_valid] - - if feat_not_valid: - logger.warning("Invalid features present (found in a query but not in the GTF): %s", ','.join(feat_not_valid)) - - # > Cut gtf according to features of config, and index the cut.gtf for Faster Annotation - # ! Attention : In the UCSC transformed gtf files,the feature is put by default "tfbs" - if len(gtf_feat) > 1: - gtf_cut_file = cfg.cut_gtf_perFeat(annot_gtf, feat_valid, outdir) - mygtf = gtf_cut_file - else: - mygtf = annot_gtf - - # Create a new JSON for SUMMARY, with all keys completed : - summ_dict = dict() - summ_dict["queries"] = queries - summ_dict["priority"] = priority - summ_dict["gtf"] = annot_gtf - summ_dict["bed"] = peaks_bed - - with open(outdir+"summary_config.json", "w") as fj: - json.dump(summ_dict, fj, indent=4) - - distances = reduce(list.__add__, map(lambda q: q["distance"], queries)) - max_distance = int(max(distances)) - - logger.info("Distance for peak enlargment: %s", max_distance) - - gtf_index = mygtf + '.sorted.gz' - - if os.path.exists(gtf_index): - logger.warning( - "GTF-Index already existed before indexing. Will overwrite old index.") - try: - ovls.tabix_index(mygtf) # Sort, zip, index the gtf - except IOError: - logger.error("Unable to create index for file %s. 
Make sure tabix, bgzip and sort are available and UROPA has rights to (over)write files.", mygtf) - sys.exit() - - # Validate priority parameter - pr = True if priority in ['T', 'True', 'TRUE', 'Yes', 'YES', 'Y', 'yes'] else False - - # Create list of Attributes from all queries together,to be shown in final - # Table-columns for All queries - query_attributes_none = list([q["show.attributes"] for q in queries]) - query_attributes = [a for a in np.unique(list(chain.from_iterable(query_attributes_none))) if a is not None and a != 'None'] - - logger.info("Additional attributes included in output: %s", ','.join(query_attributes)) - - # query included, when there is no hit from tabix - nas_len = len(query_attributes) + 10 - NAsList = list(np.repeat("NA", nas_len)) - NAsList[nas_len - 1] = "- " - - header_base = [ - "peak_id", - "peak_chr", - "peak_start", - "peak_center", - "peak_end", - "feature", - "feat_start", - "feat_end", - "feat_strand", - "feat_anchor", - "distance", - "genomic_location", - "feat_ovl_peak", - "peak_ovl_feat"] - - header = "\t".join(header_base + query_attributes + ["query"]) - - # - # Preparation of multiprocessing - # - spl_dir = outdir + "split_peaks/" - if args.threads > 1: - logger.info("Multiprocessing: Peak file will be split in %s smaller files.", args.threads) - if not os.path.exists(spl_dir): - os.makedirs(spl_dir) - cmd = ['split', - '-n l/' + str(args.threads), - peaks_bed, - spl_dir + 'spl_peak_'] - - try: - sp.check_call(cmd, stderr=open(os.devnull, 'w')) - except sp.CalledProcessError: - args.threads = 1 - logger.warning( - "Unable to split peak input into smaller files. Falling back to one thread.") - logger.info("Check if split command is installed in version 8.22 or higher.") - except OSError as e: - args.threads = 1 - logger.warning( - "Split command not available. 
Falling back to one thread.") + parser = argparse.ArgumentParser( + prog="uropa", + description='UROPA - Universal RObust Peak Annotator.', + epilog=cfg.howtoconfig(), + formatter_class=argparse.RawDescriptionHelpFormatter) + parser.add_argument( + "-i", + "--input", + help="filename of configuration file", + required=True, + action="store", + metavar="config.json") + parser.add_argument( + "-p", + "--prefix", + dest="prefix", + help="prefix for result file names (defaults to basename of input file)", + required=False, + action="store", + metavar="prefix") + parser.add_argument( + "-r", + "--reformat", + help="create an additional compact and line-reduced table as result file", + required=False, + action="store_true") + parser.add_argument( + "-s", + "--summary", + help="filename of additional visualisation of results in graphical format", + required=False, + action="store_true") + parser.add_argument( + "-t", + "--threads", + help="multiprocessed run: n = number of threads to run annotation process", + type=int, + required=False, + action="store", + metavar="n", + default=1) + parser.add_argument( + "--add-comments", + help="add comment lines to output files", + required=False, + action="store_true") + parser.add_argument( + "-l", + "--log", + help="log file name for messages and warnings", + action="store", + metavar="uropa.log") + parser.add_argument( + "-d", + "--debug", + help="print verbose messages (for debugging) to stdout and log", + required=False, + action="store_true") + parser.add_argument( + "-v", + "--version", + help="prints the version and exits", + action="version", + version="%(prog)s 1.2.1") + args = parser.parse_args() + + config = args.input + + # Configure logging + logger = logging.getLogger(__name__) + logger.setLevel(logging.DEBUG) + + streamHandle = logging.StreamHandler() + loggerFormat = logging.Formatter('[%(levelname)s] - %(message)s') + streamHandle.setFormatter(loggerFormat) + + if args.debug: + streamHandle.setLevel(logging.DEBUG) + else: + streamHandle.setLevel(logging.WARNING) + logger.addHandler(streamHandle) + + if args.log is not None: + logpath = os.path.dirname(args.log) + if not os.path.exists(logpath) and logpath != '': + try: + os.makedirs(logpath) + except OSError: + logger.error("Could not create directory for log file {}".format(logpath)) + try: + fileHandle = logging.FileHandler(args.log, mode='w') + fileHandle.setLevel(logging.DEBUG) + fileHandle.setFormatter(loggerFormat) + logger.addHandler(fileHandle) + except IOError: + logger.error("Could not create log file {}".format(args.log)) + + if args.prefix is not None: + prefixpath = os.path.dirname(args.prefix) + if not os.path.exists(prefixpath): + if not prefixpath == '': + try: + os.makedirs(prefixpath) + except IOError: + logger.error("Could not create directory {} for output".format(prefixpath)) + else: + prefixpath = '.' + outdir = prefixpath + '/' + os.path.basename(args.prefix) + '_' + else: + outdir = "./" + os.path.splitext(os.path.basename(args.input))[0] + '_' + + logger.debug("Directory for output files is {}".format(outdir)) + logger.info("Start time: %s", datetime.datetime.now().strftime("%d.%m.%Y %H:%M")) + + try: + cfg_dict = cfg.parse_json(config) + except IOError: + logger.error("File %s does not exists or is not readable.", config) + sys.exit() + except ValueError as e: + logger.error("File %s contains malformed JSON. 
%s", config, e) + sys.exit() + + parameters = cfg.parse_parameters(cfg_dict, logger) + priority = parameters["priority"] + annot_gtf = parameters["gtf"] + peaks_bed = parameters["bed"] + + try: + gtf_has_chr, gtf_cols = cfg.parse_first_gtf_line(annot_gtf) + except IOError: + logger.error("File %s does not exist or is not readable.", annot_gtf) + sys.exit() + + # > Check GTF format + if gtf_cols == 9: + logger.info("GTF file format validated. Extracting all features from GTF now...") + else: + logger.warning("File %s is not a proper GTF file!", annot_gtf) + + gtf_feat = cfg.column_from_file(annot_gtf, 3, logger) + + print("gtf_feat = ", gtf_feat) + + if len(gtf_feat) < 1: + logger.error("No features found in file {} for annotation.".format(annot_gtf)) + sys.exit() + + if not os.path.exists(peaks_bed): + logger.error("File %s does not exists or is not readable.", peaks_bed) + sys.exit() + + # Sanity check for other parameters + accepted_val = ['T', 'True', 'TRUE', 'Yes', 'YES', 'Y', 'yes', 'F', 'False', 'FALSE', 'No', 'NO', 'N', 'no'] + if priority not in accepted_val: + logger.error("Priority value should be one of the values from %s", accepted_val) + sys.exit() + + # All parameters of Queries, filled-in with default values where necessary. + all_queries = cfg.parse_queries(cfg_dict, gtf_feat, logger) + logger.info("Number of all queries for parametric peak annotation: %s", len(all_queries)) + queries = cfg.remove_invalid_queries(all_queries, logger) + logger.info("Number of valid queries for parametric peak annotation: %s", len(queries)) + + # Validate query features + query_features = list(chain.from_iterable([q["feature"] for q in queries])) + feat_not_valid = [f for f in query_features if f not in gtf_feat] + feat_valid = [f for f in query_features if f not in feat_not_valid] + + if feat_not_valid: + logger.warning("Invalid features present (found in a query but not in the GTF): %s", ','.join(feat_not_valid)) + + # > Cut gtf according to features of config, and index the cut.gtf for Faster Annotation + # ! Attention : In the UCSC transformed gtf files,the feature is put by default "tfbs" + if len(gtf_feat) > 1: + gtf_cut_file = cfg.cut_gtf_perFeat(annot_gtf, feat_valid, outdir) + mygtf = gtf_cut_file + else: + mygtf = annot_gtf + + # Create a new JSON for SUMMARY, with all keys completed : + summ_dict = dict() + summ_dict["queries"] = queries + summ_dict["priority"] = priority + summ_dict["gtf"] = annot_gtf + summ_dict["bed"] = peaks_bed + + with open(outdir+"summary_config.json", "w") as fj: + json.dump(summ_dict, fj, indent=4) + + distances = reduce(list.__add__, [q["distance"] for q in queries]) + max_distance = int(max(distances)) + + logger.info("Distance for peak enlargment: %s", max_distance) + + gtf_index = mygtf + '.sorted.gz' + + if os.path.exists(gtf_index): + logger.warning( + "GTF-Index already existed before indexing. Will overwrite old index.") + try: + ovls.tabix_index(mygtf) # Sort, zip, index the gtf + except IOError: + logger.error("Unable to create index for file %s. 
Make sure tabix, bgzip and sort are available and UROPA has rights to (over)write files.", mygtf) + sys.exit() + + # Validate priority parameter + pr = True if priority in ['T', 'True', 'TRUE', 'Yes', 'YES', 'Y', 'yes'] else False + + # Create list of Attributes from all queries together,to be shown in final + # Table-columns for All queries + query_attributes_none = list([q["show.attributes"] for q in queries]) + query_attributes = [a for a in np.unique(list(chain.from_iterable(query_attributes_none))) if a is not None and a != 'None'] + + logger.info("Additional attributes included in output: %s", ','.join(query_attributes)) + + # query included, when there is no hit from tabix + nas_len = len(query_attributes) + 10 + NAsList = list(np.repeat("NA", nas_len)) + NAsList[nas_len - 1] = "- " + + header_base = [ + "peak_id", + "peak_chr", + "peak_start", + "peak_center", + "peak_end", + "feature", + "feat_start", + "feat_end", + "feat_strand", + "feat_anchor", + "distance", + "genomic_location", + "feat_ovl_peak", + "peak_ovl_feat"] + + header = "\t".join(header_base + query_attributes + ["query"]) + + # + # Preparation of multiprocessing + # + spl_dir = outdir + "split_peaks/" + if args.threads > 1: + logger.info("Multiprocessing: Peak file will be split in %s smaller files.", args.threads) + if not os.path.exists(spl_dir): + os.makedirs(spl_dir) + cmd = ['split', + '-n l/' + str(args.threads), + peaks_bed, + spl_dir + 'spl_peak_'] + + try: + sp.check_call(cmd, stderr=open(os.devnull, 'w')) + except sp.CalledProcessError: + args.threads = 1 + logger.warning( + "Unable to split peak input into smaller files. Falling back to one thread.") + logger.info("Check if split command is installed in version 8.22 or higher.") + except OSError as e: + args.threads = 1 + logger.warning( + "Split command not available. Falling back to one thread.") # # Processing peaks # - input_args = [outdir, gtf_index, query_attributes, queries, max_distance, pr, gtf_has_chr] - - # > Write output according to Thread option - if args.threads > 1: - pool = mp.Pool(args.threads) - partial_func = partial(ant.annotation_process, input_args) - pool.map(partial_func, glob.glob(spl_dir + "spl_peak_*")) - pool.close() - pool.join() - else: - ant.annotation_process(input_args, peaks_bed, logger) - # Files created after annot.process: - - allhits_partials = glob.glob(outdir + "allhits_part_*") - finalhits_partials = glob.glob(outdir + "finalhits_part_*") - - logger.info("Writing output files to {}".format(outdir)) - - if args.add_comments: - comments = ovls.concat_comments(queries, str(pr), annot_gtf, peaks_bed) - else: - comments = None - - # - # Merging output files - # - allhits_outfile = outdir + "allhits.txt" - - if len(queries) > 1 and not pr: - besthits_outfile = outdir + "besthits.txt" - merged_outfile = outdir + "finalhits.txt" - - besthits_partials = glob.glob(outdir + "besthits_part_*") - ovls.finalize_file(merged_outfile, finalhits_partials, header, comments, log=logger) - ovls.finalize_file(besthits_outfile, besthits_partials, header, comments, log=logger) - else: - besthits_outfile = outdir + "finalhits.txt" - ovls.finalize_file(besthits_outfile, finalhits_partials, header, comments, log=logger) - - logger.debug("Filenames for output files are: {}, {}". 
format(allhits_outfile, besthits_outfile)) - - ovls.finalize_file(allhits_outfile, allhits_partials, header, comments, log=logger) - - # - # Reformat output - # - if args.reformat and len(queries) > 1 and not pr: - logger.info("Reformatting output...") - R_reform_Best = [ - 'uropa_reformat_output.R', - '-i', - besthits_outfile, - '-k', - 'peak_id', - '-c', - '1:5', - '-d', - ',', - '-t', - str(args.threads)] - try: - # creates output of Best and gives name "Reformatted_" - logger.debug('Reformat output call is {}'.format(R_reform_Best)) - pr_R = sp.check_output(R_reform_Best) - except sp.CalledProcessError: - logger.warning("Reformatted output could not be created with call %s", ' '.join(R_reform_Best)) - except OSError: - logger.warning("Rscript command not available for summary output.") - - # - # Visual summary - # - if args.summary: - logger.info("Creating the Summary graphs of the results...") - summary_script = "uropa_summary.R" - summary_output = outdir + "summary.pdf" - - if len(queries) > 1 and not pr and os.path.exists(merged_outfile): - call = [ - summary_script, - "-f", - merged_outfile, - "-c", - outdir + "summary_config.json", - "-o", - summary_output, - "-b", - besthits_outfile] - else: - call = [ - summary_script, - "-f", - besthits_outfile, - "-c", - outdir + "summary_config.json", - "-o", - summary_output] - try: - logger.debug('Summary output call is {}'.format(call)) - sum_pr = sp.check_output(call) - except sp.CalledProcessError: - logger.warning("Visualized summary output could not be created from %s.", ' '.join(call)) - except OSError: - logger.warning("Rscript command not available for summary output.") - - # - # Clean up of temporary files - # - outputs_ready = os.path.exists( - allhits_outfile) and os.path.exists(besthits_outfile) - - if outputs_ready: - #logger.debug("Attempting to clean {}".format(outdir)) - ovls.cleanup(outdir, logger) - - if os.path.exists(spl_dir): - shutil.rmtree(spl_dir) - if os.path.exists(outdir+"summary_config.json"): - os.remove(outdir+"summary_config.json") - os.remove(gtf_index) # .gz - os.remove(gtf_index + ".tbi") - if len(gtf_feat) > 1: - os.remove(gtf_cut_file) - os.remove(gtf_cut_file + ".sorted") - - logger.info("End time: %s", datetime.datetime.now().strftime("%d.%m.%Y %H:%M")) + input_args = [outdir, gtf_index, query_attributes, queries, max_distance, pr, gtf_has_chr] + + # > Write output according to Thread option + if args.threads > 1: + pool = mp.Pool(args.threads) + partial_func = partial(ant.annotation_process, input_args) + pool.map(partial_func, glob.glob(spl_dir + "spl_peak_*")) + pool.close() + pool.join() + else: + ant.annotation_process(input_args, peaks_bed, logger) + # Files created after annot.process: + + allhits_partials = glob.glob(outdir + "allhits_part_*") + finalhits_partials = glob.glob(outdir + "finalhits_part_*") + + logger.info("Writing output files to {}".format(outdir)) + + if args.add_comments: + comments = ovls.concat_comments(queries, str(pr), annot_gtf, peaks_bed) + else: + comments = None + + # + # Merging output files + # + allhits_outfile = outdir + "allhits.txt" + + if len(queries) > 1 and not pr: + besthits_outfile = outdir + "besthits.txt" + merged_outfile = outdir + "finalhits.txt" + + besthits_partials = glob.glob(outdir + "besthits_part_*") + ovls.finalize_file(merged_outfile, finalhits_partials, header, comments, log=logger) + ovls.finalize_file(besthits_outfile, besthits_partials, header, comments, log=logger) + else: + besthits_outfile = outdir + "finalhits.txt" + 
ovls.finalize_file(besthits_outfile, finalhits_partials, header, comments, log=logger) + + logger.debug("Filenames for output files are: {}, {}". format(allhits_outfile, besthits_outfile)) + + ovls.finalize_file(allhits_outfile, allhits_partials, header, comments, log=logger) + + # + # Reformat output + # + if args.reformat and len(queries) > 1 and not pr: + logger.info("Reformatting output...") + R_reform_Best = [ + 'uropa_reformat_output.R', + '-i', + besthits_outfile, + '-k', + 'peak_id', + '-c', + '1:5', + '-d', + ',', + '-t', + str(args.threads)] + try: + # creates output of Best and gives name "Reformatted_" + logger.debug('Reformat output call is {}'.format(R_reform_Best)) + pr_R = sp.check_output(R_reform_Best) + except sp.CalledProcessError: + logger.warning("Reformatted output could not be created with call %s", ' '.join(R_reform_Best)) + except OSError: + logger.warning("Rscript command not available for summary output.") + + # + # Visual summary + # + if args.summary: + logger.info("Creating the Summary graphs of the results...") + summary_script = "uropa_summary.R" + summary_output = outdir + "summary.pdf" + + if len(queries) > 1 and not pr and os.path.exists(merged_outfile): + call = [ + summary_script, + "-f", + merged_outfile, + "-c", + outdir + "summary_config.json", + "-o", + summary_output, + "-b", + besthits_outfile] + else: + call = [ + summary_script, + "-f", + besthits_outfile, + "-c", + outdir + "summary_config.json", + "-o", + summary_output] + try: + logger.debug('Summary output call is {}'.format(call)) + sum_pr = sp.check_output(call) + except sp.CalledProcessError: + logger.warning("Visualized summary output could not be created from %s.", ' '.join(call)) + except OSError: + logger.warning("Rscript command not available for summary output.") + + # + # Clean up of temporary files + # + outputs_ready = os.path.exists( + allhits_outfile) and os.path.exists(besthits_outfile) + + if outputs_ready: + #logger.debug("Attempting to clean {}".format(outdir)) + ovls.cleanup(outdir, logger) + + if os.path.exists(spl_dir): + shutil.rmtree(spl_dir) + if os.path.exists(outdir+"summary_config.json"): + os.remove(outdir+"summary_config.json") + os.remove(gtf_index) # .gz + os.remove(gtf_index + ".tbi") + if len(gtf_feat) > 1: + os.remove(gtf_cut_file) + os.remove(gtf_cut_file + ".sorted") + + logger.info("End time: %s", datetime.datetime.now().strftime("%d.%m.%Y %H:%M"))
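Beyond the syntactic rewrites, the most consequential behavioural change in this port is in `config.column_from_file`: on Python 3, `subprocess.check_output` returns `bytes`, so the output has to be decoded before it can be treated as text. A simplified sketch of that pattern, assuming a tab-separated GTF-like input; the function name, file name and return handling are illustrative, not copied from the patch:

```python
import subprocess

def unique_column_values(path, column):
    """Return the unique, non-comment values of one column of a tab-separated file."""
    cmd = "cut -f{0} {1} | sort | uniq | grep -v '^#'".format(column, path)
    try:
        # check_output() yields bytes on Python 3; decode it (or pass text=True
        # on Python 3.7+) before calling str methods such as strip()/split().
        raw = subprocess.check_output(cmd, shell=True, stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError:
        return []
    return str(raw, encoding="utf-8").strip().split("\n")

# Hypothetical usage: list the feature types (column 3) of an annotation file.
# features = unique_column_values("example.gtf", 3)
```

The `list()` wrappers added around `zip`, `map`, `filter`, `range` and the dict views in config.py and overlaps.py follow the same reasoning: all of these return lazy iterators or views in Python 3, so they are materialised wherever the old code relied on getting a list back.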