Commit eead1aa

update UROPA to python 3
afust committed Nov 8, 2017
1 parent 767ed42 commit eead1aa
Showing 6 changed files with 463 additions and 462 deletions.
6 changes: 3 additions & 3 deletions README.md
@@ -40,10 +40,10 @@ Installation and Command-line usage
 We recommend to install UROPA using the conda package manager. Make sure to have `conda` installed, e.g. via

 - [Miniconda](https://conda.io/miniconda.html)
-  - download the Miniconda installer for **Python 2.7**
-  - run ```bash Miniconda2-latest-Linux-x86_64.sh``` to install Miniconda
+  - download the Miniconda installer for **Python 3**
+  - run ```bash Miniconda3-latest-Linux-x86_64.sh``` to install Miniconda
 - Answer the question "Do you wish the installer to prepend the Miniconda install location to PATH in your /home/.../.bashrc ?" with yes
-  OR do ```PATH=dir/to/miniconda2:$PATH``` after installation process
+  OR do ```PATH=dir/to/miniconda3:$PATH``` after installation process

 The UROPA installation is now as easy as ```conda install -c bioconda uropa```.

6 changes: 3 additions & 3 deletions setup.py
@@ -1,11 +1,11 @@
 from setuptools import setup

 def readme():
-    with open('README.md') as f:
+    with open('README.md', encoding='utf-8') as f:
         return f.read()

 setup(name='uropa',
-      version='1.2.1',
+      version='2.0.0-alpha',
       description='UROPA is a command line based tool, intended for genomic region annotation',
       long_description=readme(),
       url='https://github.molgen.mpg.de/loosolab/UROPA',
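A note on the `encoding='utf-8'` addition: Python 3's `open()` defaults to the locale's preferred encoding, so a README containing non-ASCII characters can raise `UnicodeDecodeError` on some hosts. A minimal sketch of the difference (file name illustrative):

```python
import locale

# Python 3: open() without an explicit encoding uses the locale default,
# e.g. cp1252 on many Windows machines.
print(locale.getpreferredencoding(False))

# Pinning the encoding makes the read deterministic across platforms:
with open('README.md', encoding='utf-8') as f:
    long_description = f.read()
```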
@@ -25,7 +25,7 @@ def readme():
       'License :: OSI Approved :: MIT License',
       'Intended Audience :: Science/Research',
       'Topic :: Scientific/Engineering :: Bio-Informatics',
-      'Programming Language :: Python :: 2.7'
+      'Programming Language :: Python :: 3.4'
       ],
       zip_safe=False,
       include_package_data=True)
55 changes: 26 additions & 29 deletions uropa/annotation.py
@@ -3,7 +3,7 @@
 import operator
 import numpy as np
 import pysam
-import overlaps as ovls
+from . import overlaps as ovls


 def annotation_process(input_args, peak_file, log=None):
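This import change is required, not stylistic: Python 3 removed implicit relative imports (PEP 328), so a bare `import overlaps` inside the `uropa` package no longer resolves. A sketch of the spellings that still work:

```python
# Inside uropa/annotation.py, Python 2 resolved "import overlaps" against
# the containing package; Python 3 searches sys.path only, so it raises
# ImportError (ModuleNotFoundError on 3.6+). Working forms:
from . import overlaps as ovls      # explicit relative import (PEP 328)
# import uropa.overlaps as ovls     # absolute form, same effect
```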
@@ -113,7 +113,7 @@ def annotation_process(input_args, peak_file, log=None):

 # Find hits with valid values for the queries (Search all
 # queries ,Not only PRIORITY)
-v_fsa = map(lambda q: ovls.valid_fsa(h, hit, q, peak['strand']), queries)
+v_fsa = [ovls.valid_fsa(h, hit, q, peak['strand']) for q in queries]

 # Pair the Valid query values(fsb) with valid_distance and
 # valid strand for each query
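This pattern repeats throughout the commit: Python 2's `map()` returned a list, while Python 3's returns a one-shot iterator, so any later `len()`, indexing, or second pass would fail. A small sketch with illustrative data:

```python
hits = ["geneA", "geneB", "geneC"]

lazy = map(len, hits)             # Python 3: an iterator, consumed once
eager = [len(h) for h in hits]    # eager list, as the diff now uses

assert eager == [5, 5, 5]
assert list(lazy) == eager        # a second list(lazy) would yield []
```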
@@ -138,7 +138,7 @@ def annotation_process(input_args, peak_file, log=None):
 NAsList_q = list(np.repeat("NA", nas_len))

 # Search hits with only Prior or Secondary Queries
-for hitj, vq in dict_vqp.items():
+for hitj, vq in list(dict_vqp.items()):
     hit = hitj.split("\t")
     hit_len = abs(int(hit[4]) - int(hit[3]))
     strand = hit[6]
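The `list(dict_vqp.items())` wrapper is the mechanical 2to3 translation: `items()` returns a live view in Python 3, and a snapshot is only strictly necessary if the dict is mutated while looping. A sketch of the failure mode the wrapper guards against (illustrative data):

```python
d = {"hit1": ["q0"], "hit2": []}

for hitj, vq in list(d.items()):   # snapshot: safe to mutate d below
    if not vq:
        del d[hitj]

# Iterating d.items() directly while deleting would raise
# "RuntimeError: dictionary changed size during iteration".
assert d == {"hit1": ["q0"]}
```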
@@ -252,7 +252,7 @@ def annotation_process(input_args, peak_file, log=None):

 # > Add to Best Hits the Best Internal features(when internals=True, D>config."distance"),
 # after having filled-in the All Hits for the peak.
-for hitj, vq in dict_vqp.items():
+for hitj, vq in list(dict_vqp.items()):
     for j in vq:
         if Best_hits_tab[j][peak['id']] != "" and All_hits_tab[j][peak['id']] != "":
             if (Best_hits_tab[j][peak['id']].split("\t")[10] == "NA" and
@@ -261,10 +261,9 @@

 hit_line = All_hits_tab[j][peak['id']].split("\n")
 hit_line = [h for h in hit_line if h != '']
-internal = map(
-    lambda h: h.split("\t")[11], hit_line)
-hit_dist = map(lambda h: int(
-    h.split("\t")[10]), hit_line)
+internal = [h.split("\t")[11] for h in hit_line]
+hit_dist = [int(
+    h.split("\t")[10]) for h in hit_line]
 mv_pos, min_val = min(
     enumerate(hit_dist), key=operator.itemgetter(1))
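The list forms matter here because the results are indexed and reused below, which an exhausted iterator would not allow. The index-of-minimum idiom itself is unchanged; a sketch with illustrative distances:

```python
import operator

hit_dist = [1200, 45, 300]   # hypothetical distances per hit line
mv_pos, min_val = min(enumerate(hit_dist), key=operator.itemgetter(1))
assert (mv_pos, min_val) == (1, 45)   # position and value of closest hit
```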

@@ -343,8 +342,8 @@ def annotation_process(input_args, peak_file, log=None):

 if all(isNA):
     #logg.debug("\nNo query has any Hit.No replacement of Priority Query possible-> NAs will be filled in the Output.")
-    TabInList_p = map(lambda l: All_hits_tab[l][
-        peak['id']], range(len(queries)))
+    TabInList_p = [All_hits_tab[l][
+        peak['id']] for l in range(len(queries))]
     ##log.debug("Hit lines for all queries are : {}".format(TabInList_p ))
     # If all_hits_tab doesn't have all queries will have ""
     TabInList_p = [Tab for Tab in TabInList_p if Tab != ""]
@@ -365,14 +364,14 @@

 All_combo = OrderedDict()  # Output peaks with same order as All_hits_tab
 mydict = All_hits_tab
-for k in mydict[0].iterkeys():
-    All_combo[k] = [pid[k] for q, pid in mydict.items()]
+for k in mydict[0].keys():
+    All_combo[k] = [pid[k] for q, pid in list(mydict.items())]

 # Best hits #
 Best_combo = OrderedDict()
 mybestD = Best_hits_tab
-for k in mybestD[0].iterkeys():
-    Best_combo[k] = [pid[k] for q, pid in mybestD.items()]
+for k in mybestD[0].keys():
+    Best_combo[k] = [pid[k] for q, pid in list(mybestD.items())]

 def all_same(items):
     """Returns true if all items of a list are the same."""
@@ -383,34 +382,32 @@ def all_same(items):
 BestBest_hits = OrderedDict()
 for k in Best_combo:
     # [ [] , [] , [] ]-> can be of same query, same distance
-    records = map(lambda s: s.split("\n"), Best_combo[k])
+    records = [s.split("\n") for s in Best_combo[k]]
     # split also internally each query's string to see if it
     # contains more >1 hits(when same distance, more >1 are conc.
     # to same query)
-    spl_rec = map(lambda r: map(
-        lambda t: t if t != '' else None, r), records)
+    spl_rec = [[t if t != '' else None for t in r] for r in records]
     recs = [x + "\n" for s in spl_rec for x in s if x != None]

     if len(recs) > 1:  # Only when more than two hits, and no NAs here
-        splitted_hits = map(
-            lambda h: recs[h].split("\t"), range(len(recs)))
+        splitted_hits = [recs[h].split("\t") for h in range(len(recs))]
         # s= each hit line in string
         splitted_hits = [s for s in splitted_hits if s != [""]]
-        featOfHit = map(lambda s: splitted_hits[s][
-            5], range(len(splitted_hits)))
-        startPos = map(lambda s: splitted_hits[s][
-            6], range(len(splitted_hits)))
-        endPos = map(lambda s: splitted_hits[s][
-            7], range(len(splitted_hits)))
-        Dist_hits = map(lambda s: float(splitted_hits[s][10]) if splitted_hits[s][
-            10] != "NA" else float("Inf"), range(len(splitted_hits)))  # 9th col= Distance
+        featOfHit = [splitted_hits[s][
+            5] for s in range(len(splitted_hits))]
+        startPos = [splitted_hits[s][
+            6] for s in range(len(splitted_hits))]
+        endPos = [splitted_hits[s][
+            7] for s in range(len(splitted_hits))]
+        Dist_hits = [float(splitted_hits[s][10]) if splitted_hits[s][
+            10] != "NA" else float("Inf") for s in range(len(splitted_hits))]  # 9th col= Distance
         # Dist_hit = [Dist_hits[0] if all_same(Dist_hits) else
         # Dist_hits] #Either same dist or All NAs

         # When All "NAs" for all queries-> keep only line from 1st
         # query
-        NA_hits = map(lambda s: "NA" if splitted_hits[s][
-            10] == "NA" else None, range(len(splitted_hits)))
+        NA_hits = ["NA" if splitted_hits[s][
+            10] == "NA" else None for s in range(len(splitted_hits))]
         # If None means ALL queries have a feature
         all_have_feat = all(x is None for x in NA_hits)

43 changes: 22 additions & 21 deletions uropa/config.py
@@ -60,7 +60,7 @@ def parse_json(infile):
 assert isinstance(infile, str), 'Argument {0} of wrong type ({1}), should be {2}!'.format(
     'column', type(infile), 'str')

-with open(infile, 'r') as f:
+with open(infile, 'r', encoding='utf-8') as f:
     return ast.literal_eval(json.dumps(json.load(f)))


@@ -73,8 +73,9 @@ def column_from_file(file, column, log=None):

 cmd = 'cut -f' + str(column) + ' ' + str(file) + \
     ' | sort | uniq | grep -v "^#"'
+
 try:
-    vals = subprocess.check_output(cmd, shell=True)
+    vals = str(subprocess.check_output(cmd, stderr=subprocess.STDOUT, shell=True), encoding='utf-8')
 except subprocess.CalledProcessError:
     if log is not None:
         log.warning("File {} might be empty or has not enough columns".format(file))
@@ -86,16 +87,16 @@ def parse_parameters(config, log=None):
"""Fills the configuration with default values. Writes a warning to logs, if unknown keys are detected."""
defaults = {"priority": "False", "bed": "no_peaks.bed",
"gtf": "no_annotation.gtf"} # , "bigwig": "none.bw"
keys = defaults.keys()
values = map(lambda x: config[x] if x in config else defaults[x], keys)
keys = list(defaults.keys())
values = [config[x] if x in config else defaults[x] for x in keys]

if log is not None:
unknown = [k for k in config.keys() if k not in keys and k !=
unknown = [k for k in list(config.keys()) if k not in keys and k !=
"queries"]
if any(unknown):
log.warning(
"Unknown keys detected in configuration: {}".format(unknown))
parameters = dict(zip(keys, values))
parameters = dict(list(zip(keys, values)))
return parameters
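2to3 wraps `zip()` in `list()` defensively, but here `dict()` consumes the pairs immediately, so the plain iterator would suffice; the wrapper is harmless either way. A sketch of the same defaults-merging scheme with an illustrative config:

```python
defaults = {"priority": "False", "bed": "no_peaks.bed", "gtf": "no_annotation.gtf"}
config = {"bed": "my_peaks.bed"}

keys = list(defaults.keys())
values = [config[x] if x in config else defaults[x] for x in keys]

parameters = dict(zip(keys, values))   # zip() is lazy; dict() drains it
assert parameters == {"priority": "False", "bed": "my_peaks.bed",
                      "gtf": "no_annotation.gtf"}
```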


@@ -104,7 +105,7 @@ def parse_queries(config, gtf_feat, log=None):
defaults = {"feature": gtf_feat, "strand": "ignore", "show.attributes": "None", "filter.attribute": "None",
"attribute.value": "None", "distance": 100000, "feature.anchor": ["start", "center", "end"],
"direction": "any_direction", "internals": "False", "priority": "False"}
keys = defaults.keys()
keys = list(defaults.keys())

try:
query_list = config["queries"]
@@ -122,49 +123,49 @@
         query_list = [query_list]

     def give_val(q):
-        return map(lambda x: q[x] if x in q and q[x] != "" else defaults[x], keys)
+        return [q[x] if x in q and q[x] != "" else defaults[x] for x in keys]

     def make_list(l):
-        return map(lambda v: [v] if not isinstance(v, list) else v, l)
+        return [[v] if not isinstance(v, list) else v for v in l]

-    vals = map(lambda l: give_val(l), query_list)
-    values = map(lambda l: make_list(l), vals)
+    vals = [give_val(l) for l in query_list]
+    values = [make_list(l) for l in vals]

     if not log is None:
-        unknown = [k for k in config.keys() if k not in list(set(['gtf', 'bed', 'queries']).union(keys))]
+        unknown = [k for k in list(config.keys()) if k not in list(set(['gtf', 'bed', 'queries']).union(keys))]
         if any(unknown):
             log.warning("Unknown keys detected in configuration: {}".format(unknown))

-    queries = map(lambda x: dict(zip(keys, x)), values)
+    queries = [dict(list(zip(keys, x))) for x in values]
     return queries


 def remove_invalid_queries(queries, log=None):
     """Removes queries that have multiple attribute.filter or filter.value values and more than two distance constraints."""
     # Validate distance constraints
-    has_valid_distance = map(lambda q: len(q["distance"]) < 3, queries)
+    has_valid_distance = [len(q["distance"]) < 3 for q in queries]
     if not all(has_valid_distance) and log is not None:
         log.warning("Queries with invalid distances present! Affected queries: {}".format(
             [i for i, x in enumerate(has_valid_distance) if not x]))

     # Validate query attributes
-    has_valid_attributes = map(lambda q: False if (q["filter.attribute"] != ['None'] and q["attribute.value"] == [
-        'None']) or (q["filter.attribute"] == ['None'] and q["attribute.value"] != ['None']) else True, queries)
+    has_valid_attributes = [False if (q["filter.attribute"] != ['None'] and q["attribute.value"] == [
+        'None']) or (q["filter.attribute"] == ['None'] and q["attribute.value"] != ['None']) else True for q in queries]

     if not all(has_valid_attributes) and log is not None:
         log.warning("Queries with invalid filter.attribute and attribute.value pairings present! Affected queries: {}".format(
             [i for i, x in enumerate(has_valid_attributes) if not x]))

     # Validate strand attribute
-    has_valid_strand = map(lambda q: True if q["strand"] in [["ignore"],["same"],["opposite"]] else False, queries)
+    has_valid_strand = [True if q["strand"] in [["ignore"], ["same"], ["opposite"]] else False for q in queries]

     if not all(has_valid_strand) and log is not None:
         log.warning("Queries with invalid strand values present! Affected queries: {}".format(
             [i for i, x in enumerate(has_valid_strand) if not x]))

     # Validate query attribute lengths
-    has_valid_attribute_lengths = map(lambda q: False if len(
-        q["filter.attribute"]) > 1 or len(q["attribute.value"]) > 1 else True, queries)
+    has_valid_attribute_lengths = [False if len(
+        q["filter.attribute"]) > 1 or len(q["attribute.value"]) > 1 else True for q in queries]
     if not all(has_valid_attribute_lengths) and log is not None:
         log.warning("Queries with more than one value for either filter.attribute or attribute.value present! Affected queries: {}".format(
             [i for i, x in enumerate(has_valid_attribute_lengths) if not x]))
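These validation conversions are the clearest case in the commit where `list` versus `map` changes behavior rather than style: `all()` exhausts a Python 3 `map` iterator, so the follow-up `enumerate` pass over the same name would see nothing and report no affected queries. A sketch with hypothetical queries:

```python
queries = [{"distance": [1000]}, {"distance": [1, 5, 10]}]

# Old form: flags = map(lambda q: len(q["distance"]) < 3, queries)
# all(flags) would consume the iterator, and the enumerate() below
# would then walk an empty sequence and flag no offenders.
flags = [len(q["distance"]) < 3 for q in queries]

if not all(flags):
    bad = [i for i, ok in enumerate(flags) if not ok]
    assert bad == [1]   # the second query has three distance values
```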
@@ -176,7 +177,7 @@ def remove_invalid_queries(queries, log=None):

 def parse_first_gtf_line(gtf):
     """Removes comment lines, reads first line to check for prefix chr. Returns if chr was found and the number of columns."""
-    f = open(gtf, "r")
+    f = open(gtf, "r", encoding='utf-8')
     is_comment = True
     while is_comment:
         line = f.readline()
@@ -192,7 +193,7 @@ def cut_gtf_perFeat(gtf, features, prefix):
 gtf_per_feat = prefix + os.path.basename(gtf).split(".gtf")[0] + "_cut_per_feat.gtf"
 feat2cut = np.unique(features)

-f = open(gtf, "r")
+f = open(gtf, "r", encoding='utf-8')
 is_comment = True
 while is_comment:
     line = f.readline()
30 changes: 15 additions & 15 deletions uropa/overlaps.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 """Contains functions for UROPA overlap evaluation."""
-from __future__ import division
+
 import os
 import re
 import sys
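Dropping `from __future__ import division` is safe because Python 3 makes true division the default; the `//` operator remains for floor division. (The stale `# import "division" allows decimals` comment later in this file could be removed in a follow-up.) In brief:

```python
# Python 3 semantics (what Python 2 only got via the __future__ import):
assert 1 / 2 == 0.5     # true division by default
assert 7 // 2 == 3      # floor division is always spelled //
```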
@@ -134,11 +134,11 @@ def parse_peak(peakstr, extend=0, delim='\t'):

 defaults = {'id': p['chr'] + ":" + p['start'] + "-" + p['end'], 'name': p['chr'] + ":" + p['start'] + "-" + p['end'], 'score': 0, 'strand': ".",
             'center': int(np.around(np.mean([int(p['start']), int(p['end'])]))), 'estart': max(1, int(p['start']) - extend), 'eend': int(p['end']) + extend, 'length': abs(int(p['end']) - int(p['start']))}
-if 'name' in p.keys():
+if 'name' in list(p.keys()):
     p['id'] = p['chr'] + ":" + p['start'] + "-" + p['end'] + "_" + p['name']

-values = [p[k] if k in p else defaults[k] for k in defaults.keys()]
-peak = dict(zip(p.keys() + defaults.keys(), p.values() + values))
+values = [p[k] if k in p else defaults[k] for k in list(defaults.keys())]
+peak = dict(list(zip(list(p.keys()) + list(defaults.keys()), list(p.values()) + values)))
 return peak
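The heavily wrapped `dict(list(zip(...)))` line exists because Python 2's `keys()`/`values()` returned lists that supported `+`; Python 3's views do not, so each side must be materialized. A sketch of the merge with an illustrative peak, plus a flatter alternative available on Python 3.5+:

```python
p = {"chr": "chr1", "start": "100", "end": "200"}
defaults = {"score": 0, "strand": "."}

values = [p[k] if k in p else defaults[k] for k in list(defaults.keys())]
peak = dict(list(zip(list(p.keys()) + list(defaults.keys()),
                     list(p.values()) + values)))

# On Python 3.5+ the same merge reads as:
assert peak == {**p, **dict(zip(defaults.keys(), values))}
```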


@@ -170,7 +170,7 @@ def distance_to_peak_center(p_center, hit3, hit4, strand, feat_pos):

 hit_center = np.mean([feat_start, feat_end])
 hit_pos = {"start": feat_start, "center": hit_center, "end": feat_end}
-d_pos = map(lambda i: abs(hit_pos[i] - p_center), feat_pos)
+d_pos = [abs(hit_pos[i] - p_center) for i in feat_pos]
 pos, dmin = min(enumerate(d_pos), key=operator.itemgetter(1))

 min_pos = feat_pos[pos]
@@ -233,7 +233,7 @@ def get_distance_by_dir(inputD, genom_loc, intern_loc, Dhit, internals_allowed=[
if internals_allowed == ["True"]:
return(any(intern_loc))
else:
best_dist = map(lambda l: Dhit <= D_upstr if l == "upstream" else Dhit <= D_downstr, intern_loc)
best_dist = [Dhit <= D_upstr if l == "upstream" else Dhit <= D_downstr for l in intern_loc]
return(any(best_dist))


@@ -261,8 +261,8 @@ def round_up(val):
# import "division" allows decimals
def calculate_overlap(pstart, pend, peakL, featL, hit_start, hit_end):
"""[Local] Returns value of overlap btw {0,1} representing percentage of length covered by the peak and that covered by the feature """
p_range = range(pstart, pend)
feat_range = range(hit_start, hit_end)
p_range = list(range(pstart, pend))
feat_range = list(range(hit_start, hit_end))
pset = set(p_range)

ovl_range = pset.intersection(feat_range)
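`range()` in Python 3 is a lazy sequence rather than a list, which is why 2to3 adds the `list()` wrappers; here `set()` and `intersection()` accept any iterable, so the wrappers are not strictly needed. A sketch of the overlap computation with illustrative coordinates:

```python
pstart, pend = 100, 200          # peak
hit_start, hit_end = 150, 400    # feature

pset = set(range(pstart, pend))              # range() works directly
ovl_range = pset.intersection(range(hit_start, hit_end))

ovl_pk = len(ovl_range) / (pend - pstart)    # true division in Python 3
assert ovl_pk == 0.5
```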
@@ -315,10 +315,10 @@ def get_hit_attribute(hit, attribute):
"""Splits attributes from hit lines"""
pos_match = [i if re.match(
attribute, hit_a) else None for i, hit_a in enumerate(hit[8].split("; "))]
pos_match = filter(lambda x: x is not None, pos_match)
pos_match = [x for x in pos_match if x is not None]
if len(pos_match) > 0:
attr_val = map(lambda m: hit[8].split("; ")[m].split(" ")[
1].strip('"\'').rstrip('\";'), pos_match)
attr_val = [hit[8].split("; ")[m].split(" ")[
1].strip('"\'').rstrip('\";') for m in pos_match]
val = [av for av in attr_val if av is not None][0]
val = val.replace("\t", "").replace("\r", "").replace("\n", "")
return val
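The `filter()` conversion is another place where the list form is required rather than cosmetic: Python 3's `filter()` returns an iterator, and the very next line calls `len(pos_match)`. A sketch:

```python
raw = [0, None, 2, None]          # positions; None marks non-matches

lazy = filter(lambda x: x is not None, raw)
# len(lazy) raises TypeError: object of type 'filter' has no len()

pos_match = [x for x in raw if x is not None]
assert len(pos_match) == 2 and pos_match == [0, 2]
```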
@@ -334,7 +334,7 @@ def get_besthit(q, len_q_dist, p_nm, hit, attrib_keys, Dhit):
if attrib_keys != ["None"]:
#attr_val = map(lambda k: re.split(k+" ", hit[8])[1].split(';')[0].strip('"\'').rstrip('\"'), attrib_keys)
# list of all values if multiple keys
attr_val = map(lambda a: get_hit_attribute(hit, a), attrib_keys)
attr_val = [get_hit_attribute(hit, a) for a in attrib_keys]
ret = [Dhit, [hit[2], hit[3], hit[4], hit[6], attr_val]]

elif attrib_keys == ["None"]:
@@ -367,7 +367,7 @@ def create_table(peak_id, chrom, pstart, pend, p_center, min_dist_hit, attrib_ke
 def write_hit_to_All(All_hits_tab, p_name, attrib_k, Dhit, hit, peak_id, chrom, pstart, pend, p_center, min_pos, genomic_location, ovl_pk, ovl_feat, j):
     """Writes an output table."""
     if attrib_k != ["None"]:
-        attr_val = map(lambda a: get_hit_attribute(hit, a), attrib_k)
+        attr_val = [get_hit_attribute(hit, a) for a in attrib_k]
         hit2add = [Dhit, [hit[2], hit[3], hit[4], hit[6], attr_val]]
         the_res = create_table(peak_id, chrom, pstart, pend, p_center,
                                hit2add, attrib_k, min_pos, genomic_location, ovl_pk, ovl_feat, j)
@@ -422,8 +422,8 @@ def write_partial_file(outfile, Table):

 def merge_queries(Best_combo_k):
     """Merges queries."""
-    merged_q = map(lambda l: Best_combo_k[l].split(
-        "\t")[-1].strip("\n"), range(len(Best_combo_k)))
+    merged_q = [Best_combo_k[l].split(
+        "\t")[-1].strip("\n") for l in range(len(Best_combo_k))]
     # Read if combo_k is "" and give it the q.numb, so I have it ready for
     # merging when one query =""
     merged_q[-1] = merged_q[-1] + "\n"
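A possible follow-up simplification, not part of this commit: the `range(len(...))` shape is a literal translation of the old `map` call, and iterating the list directly is the more idiomatic Python 3 with identical behavior here:

```python
Best_combo_k = ["peak1\tq1\n", "peak1\tq2\n"]   # illustrative rows

merged_q = [row.split("\t")[-1].strip("\n") for row in Best_combo_k]
assert merged_q == ["q1", "q2"]
```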
(The sixth changed file in this commit is not rendered on this page.)
