Commit eead1aa

update UROPA to python 3
afust committed Nov 8, 2017
1 parent 767ed42 commit eead1aa
Showing 6 changed files with 463 additions and 462 deletions.
6 changes: 3 additions & 3 deletions README.md
@@ -40,10 +40,10 @@ Installation and Command-line usage
 We recommend to install UROPA using the conda package manager. Make sure to have `conda` installed, e.g. via

 - [Miniconda](https://conda.io/miniconda.html)
-  - download the Miniconda installer for **Python 2.7**
-  - run ```bash Miniconda2-latest-Linux-x86_64.sh``` to install Miniconda
+  - download the Miniconda installer for **Python 3**
+  - run ```bash Miniconda3-latest-Linux-x86_64.sh``` to install Miniconda
 - Answer the question "Do you wish the installer to prepend the Miniconda install location to PATH in your /home/.../.bashrc ?" with yes
-  OR do ```PATH=dir/to/miniconda2:$PATH``` after installation process
+  OR do ```PATH=dir/to/miniconda3:$PATH``` after installation process

 The UROPA installation is now as easy as ```conda install -c bioconda uropa```.

6 changes: 3 additions & 3 deletions setup.py
@@ -1,11 +1,11 @@
 from setuptools import setup

 def readme():
-    with open('README.md') as f:
+    with open('README.md', encoding='utf-8') as f:
         return f.read()

 setup(name='uropa',
-      version='1.2.1',
+      version='2.0.0-alpha',
       description='UROPA is a command line based tool, intended for genomic region annotation',
       long_description=readme(),
       url='https://github.molgen.mpg.de/loosolab/UROPA',
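A note on the `encoding='utf-8'` addition: Python 3's `open()` defaults to the locale's preferred encoding, so a README containing non-ASCII characters can raise `UnicodeDecodeError` on some hosts. A minimal sketch of the difference (file name illustrative):

```python
import locale

# Python 3: open() without an explicit encoding uses the locale default,
# e.g. cp1252 on many Windows machines.
print(locale.getpreferredencoding(False))

# Pinning the encoding makes the read deterministic across platforms:
with open('README.md', encoding='utf-8') as f:
    long_description = f.read()
```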
@@ -25,7 +25,7 @@ def readme():
       'License :: OSI Approved :: MIT License',
       'Intended Audience :: Science/Research',
       'Topic :: Scientific/Engineering :: Bio-Informatics',
-      'Programming Language :: Python :: 2.7'
+      'Programming Language :: Python :: 3.4'
       ],
       zip_safe=False,
       include_package_data=True)
55 changes: 26 additions & 29 deletions uropa/annotation.py
@@ -3,7 +3,7 @@
 import operator
 import numpy as np
 import pysam
-import overlaps as ovls
+from . import overlaps as ovls


 def annotation_process(input_args, peak_file, log=None):
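This import change is required, not stylistic: Python 3 removed implicit relative imports (PEP 328), so a bare `import overlaps` inside the `uropa` package no longer resolves. A sketch of the spellings that still work:

```python
# Inside uropa/annotation.py, Python 2 resolved "import overlaps" against
# the containing package; Python 3 searches sys.path only, so it raises
# ImportError (ModuleNotFoundError on 3.6+). Working forms:
from . import overlaps as ovls      # explicit relative import (PEP 328)
# import uropa.overlaps as ovls     # absolute form, same effect
```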
@@ -113,7 +113,7 @@ def annotation_process(input_args, peak_file, log=None):

 # Find hits with valid values for the queries (Search all
 # queries ,Not only PRIORITY)
-v_fsa = map(lambda q: ovls.valid_fsa(h, hit, q, peak['strand']), queries)
+v_fsa = [ovls.valid_fsa(h, hit, q, peak['strand']) for q in queries]

 # Pair the Valid query values(fsb) with valid_distance and
 # valid strand for each query
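This pattern repeats throughout the commit: Python 2's `map()` returned a list, while Python 3's returns a one-shot iterator, so any later `len()`, indexing, or second pass would fail. A small sketch with illustrative data:

```python
hits = ["geneA", "geneB", "geneC"]

lazy = map(len, hits)             # Python 3: an iterator, consumed once
eager = [len(h) for h in hits]    # eager list, as the diff now uses

assert eager == [5, 5, 5]
assert list(lazy) == eager        # a second list(lazy) would yield []
```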
@@ -138,7 +138,7 @@ def annotation_process(input_args, peak_file, log=None):
 NAsList_q = list(np.repeat("NA", nas_len))

 # Search hits with only Prior or Secondary Queries
-for hitj, vq in dict_vqp.items():
+for hitj, vq in list(dict_vqp.items()):
     hit = hitj.split("\t")
     hit_len = abs(int(hit[4]) - int(hit[3]))
     strand = hit[6]
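The `list(dict_vqp.items())` wrapper is the mechanical 2to3 translation: `items()` returns a live view in Python 3, and a snapshot is only strictly necessary if the dict is mutated while looping. A sketch of the failure mode the wrapper guards against (illustrative data):

```python
d = {"hit1": ["q0"], "hit2": []}

for hitj, vq in list(d.items()):   # snapshot: safe to mutate d below
    if not vq:
        del d[hitj]

# Iterating d.items() directly while deleting would raise
# "RuntimeError: dictionary changed size during iteration".
assert d == {"hit1": ["q0"]}
```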
@@ -252,7 +252,7 @@ def annotation_process(input_args, peak_file, log=None):

 # > Add to Best Hits the Best Internal features(when internals=True, D>config."distance"),
 # after having filled-in the All Hits for the peak.
-for hitj, vq in dict_vqp.items():
+for hitj, vq in list(dict_vqp.items()):
     for j in vq:
         if Best_hits_tab[j][peak['id']] != "" and All_hits_tab[j][peak['id']] != "":
             if (Best_hits_tab[j][peak['id']].split("\t")[10] == "NA" and
@@ -261,10 +261,9 @@

 hit_line = All_hits_tab[j][peak['id']].split("\n")
 hit_line = [h for h in hit_line if h != '']
-internal = map(
-    lambda h: h.split("\t")[11], hit_line)
-hit_dist = map(lambda h: int(
-    h.split("\t")[10]), hit_line)
+internal = [h.split("\t")[11] for h in hit_line]
+hit_dist = [int(
+    h.split("\t")[10]) for h in hit_line]
 mv_pos, min_val = min(
     enumerate(hit_dist), key=operator.itemgetter(1))
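The list forms matter here because the results are indexed and reused below, which an exhausted iterator would not allow. The index-of-minimum idiom itself is unchanged; a sketch with illustrative distances:

```python
import operator

hit_dist = [1200, 45, 300]   # hypothetical distances per hit line
mv_pos, min_val = min(enumerate(hit_dist), key=operator.itemgetter(1))
assert (mv_pos, min_val) == (1, 45)   # position and value of closest hit
```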

@@ -343,8 +342,8 @@ def annotation_process(input_args, peak_file, log=None):

 if all(isNA):
     #logg.debug("\nNo query has any Hit.No replacement of Priority Query possible-> NAs will be filled in the Output.")
-    TabInList_p = map(lambda l: All_hits_tab[l][
-        peak['id']], range(len(queries)))
+    TabInList_p = [All_hits_tab[l][
+        peak['id']] for l in range(len(queries))]
     ##log.debug("Hit lines for all queries are : {}".format(TabInList_p ))
     # If all_hits_tab doesn't have all queries will have ""
     TabInList_p = [Tab for Tab in TabInList_p if Tab != ""]
@@ -365,14 +364,14 @@

 All_combo = OrderedDict()  # Output peaks with same order as All_hits_tab
 mydict = All_hits_tab
-for k in mydict[0].iterkeys():
-    All_combo[k] = [pid[k] for q, pid in mydict.items()]
+for k in mydict[0].keys():
+    All_combo[k] = [pid[k] for q, pid in list(mydict.items())]

 # Best hits #
 Best_combo = OrderedDict()
 mybestD = Best_hits_tab
-for k in mybestD[0].iterkeys():
-    Best_combo[k] = [pid[k] for q, pid in mybestD.items()]
+for k in mybestD[0].keys():
+    Best_combo[k] = [pid[k] for q, pid in list(mybestD.items())]

 def all_same(items):
     """Returns true if all items of a list are the same."""
@@ -383,34 +382,32 @@ def all_same(items):
 BestBest_hits = OrderedDict()
 for k in Best_combo:
     # [ [] , [] , [] ]-> can be of same query, same distance
-    records = map(lambda s: s.split("\n"), Best_combo[k])
+    records = [s.split("\n") for s in Best_combo[k]]
     # split also internally each query's string to see if it
     # contains more >1 hits(when same distance, more >1 are conc.
     # to same query)
-    spl_rec = map(lambda r: map(
-        lambda t: t if t != '' else None, r), records)
+    spl_rec = [[t if t != '' else None for t in r] for r in records]
     recs = [x + "\n" for s in spl_rec for x in s if x != None]

     if len(recs) > 1:  # Only when more than two hits, and no NAs here
-        splitted_hits = map(
-            lambda h: recs[h].split("\t"), range(len(recs)))
+        splitted_hits = [recs[h].split("\t") for h in range(len(recs))]
         # s= each hit line in string
         splitted_hits = [s for s in splitted_hits if s != [""]]
-        featOfHit = map(lambda s: splitted_hits[s][
-            5], range(len(splitted_hits)))
-        startPos = map(lambda s: splitted_hits[s][
-            6], range(len(splitted_hits)))
-        endPos = map(lambda s: splitted_hits[s][
-            7], range(len(splitted_hits)))
-        Dist_hits = map(lambda s: float(splitted_hits[s][10]) if splitted_hits[s][
-            10] != "NA" else float("Inf"), range(len(splitted_hits)))  # 9th col= Distance
+        featOfHit = [splitted_hits[s][
+            5] for s in range(len(splitted_hits))]
+        startPos = [splitted_hits[s][
+            6] for s in range(len(splitted_hits))]
+        endPos = [splitted_hits[s][
+            7] for s in range(len(splitted_hits))]
+        Dist_hits = [float(splitted_hits[s][10]) if splitted_hits[s][
+            10] != "NA" else float("Inf") for s in range(len(splitted_hits))]  # 9th col= Distance
         # Dist_hit = [Dist_hits[0] if all_same(Dist_hits) else
         # Dist_hits] #Either same dist or All NAs

         # When All "NAs" for all queries-> keep only line from 1st
         # query
-        NA_hits = map(lambda s: "NA" if splitted_hits[s][
-            10] == "NA" else None, range(len(splitted_hits)))
+        NA_hits = ["NA" if splitted_hits[s][
+            10] == "NA" else None for s in range(len(splitted_hits))]
         # If None means ALL queries have a feature
         all_have_feat = all(x is None for x in NA_hits)

43 changes: 22 additions & 21 deletions uropa/config.py
@@ -60,7 +60,7 @@ def parse_json(infile):
 assert isinstance(infile, str), 'Argument {0} of wrong type ({1}), should be {2}!'.format(
     'column', type(infile), 'str')

-with open(infile, 'r') as f:
+with open(infile, 'r', encoding='utf-8') as f:
     return ast.literal_eval(json.dumps(json.load(f)))


@@ -73,8 +73,9 @@ def column_from_file(file, column, log=None):

 cmd = 'cut -f' + str(column) + ' ' + str(file) + \
     ' | sort | uniq | grep -v "^#"'
+
 try:
-    vals = subprocess.check_output(cmd, shell=True)
+    vals = str(subprocess.check_output(cmd, stderr=subprocess.STDOUT, shell=True), encoding='utf-8')
 except subprocess.CalledProcessError:
     if log is not None:
         log.warning("File {} might be empty or has not enough columns".format(file))
@@ -86,16 +87,16 @@ def parse_parameters(config, log=None):
"""Fills the configuration with default values. Writes a warning to logs, if unknown keys are detected."""
defaults = {"priority": "False", "bed": "no_peaks.bed",
"gtf": "no_annotation.gtf"} # , "bigwig": "none.bw"
keys = defaults.keys()
values = map(lambda x: config[x] if x in config else defaults[x], keys)
keys = list(defaults.keys())
values = [config[x] if x in config else defaults[x] for x in keys]

if log is not None:
unknown = [k for k in config.keys() if k not in keys and k !=
unknown = [k for k in list(config.keys()) if k not in keys and k !=
"queries"]
if any(unknown):
log.warning(
"Unknown keys detected in configuration: {}".format(unknown))
parameters = dict(zip(keys, values))
parameters = dict(list(zip(keys, values)))
return parameters
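2to3 wraps `zip()` in `list()` defensively, but here `dict()` consumes the pairs immediately, so the plain iterator would suffice; the wrapper is harmless either way. A sketch of the same defaults-merging scheme with an illustrative config:

```python
defaults = {"priority": "False", "bed": "no_peaks.bed", "gtf": "no_annotation.gtf"}
config = {"bed": "my_peaks.bed"}

keys = list(defaults.keys())
values = [config[x] if x in config else defaults[x] for x in keys]

parameters = dict(zip(keys, values))   # zip() is lazy; dict() drains it
assert parameters == {"priority": "False", "bed": "my_peaks.bed",
                      "gtf": "no_annotation.gtf"}
```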


@@ -104,7 +105,7 @@ def parse_queries(config, gtf_feat, log=None):
defaults = {"feature": gtf_feat, "strand": "ignore", "show.attributes": "None", "filter.attribute": "None",
"attribute.value": "None", "distance": 100000, "feature.anchor": ["start", "center", "end"],
"direction": "any_direction", "internals": "False", "priority": "False"}
keys = defaults.keys()
keys = list(defaults.keys())

try:
query_list = config["queries"]
@@ -122,49 +123,49 @@
         query_list = [query_list]

     def give_val(q):
-        return map(lambda x: q[x] if x in q and q[x] != "" else defaults[x], keys)
+        return [q[x] if x in q and q[x] != "" else defaults[x] for x in keys]

     def make_list(l):
-        return map(lambda v: [v] if not isinstance(v, list) else v, l)
+        return [[v] if not isinstance(v, list) else v for v in l]

-    vals = map(lambda l: give_val(l), query_list)
-    values = map(lambda l: make_list(l), vals)
+    vals = [give_val(l) for l in query_list]
+    values = [make_list(l) for l in vals]

     if not log is None:
-        unknown = [k for k in config.keys() if k not in list(set(['gtf', 'bed', 'queries']).union(keys))]
+        unknown = [k for k in list(config.keys()) if k not in list(set(['gtf', 'bed', 'queries']).union(keys))]
         if any(unknown):
             log.warning("Unknown keys detected in configuration: {}".format(unknown))

-    queries = map(lambda x: dict(zip(keys, x)), values)
+    queries = [dict(list(zip(keys, x))) for x in values]
     return queries


 def remove_invalid_queries(queries, log=None):
     """Removes queries that have multiple attribute.filter or filter.value values and more than two distance constraints."""
     # Validate distance constraints
-    has_valid_distance = map(lambda q: len(q["distance"]) < 3, queries)
+    has_valid_distance = [len(q["distance"]) < 3 for q in queries]
     if not all(has_valid_distance) and log is not None:
         log.warning("Queries with invalid distances present! Affected queries: {}".format(
             [i for i, x in enumerate(has_valid_distance) if not x]))

     # Validate query attributes
-    has_valid_attributes = map(lambda q: False if (q["filter.attribute"] != ['None'] and q["attribute.value"] == [
-        'None']) or (q["filter.attribute"] == ['None'] and q["attribute.value"] != ['None']) else True, queries)
+    has_valid_attributes = [False if (q["filter.attribute"] != ['None'] and q["attribute.value"] == [
+        'None']) or (q["filter.attribute"] == ['None'] and q["attribute.value"] != ['None']) else True for q in queries]

     if not all(has_valid_attributes) and log is not None:
         log.warning("Queries with invalid filter.attribute and attribute.value pairings present! Affected queries: {}".format(
             [i for i, x in enumerate(has_valid_attributes) if not x]))

     # Validate strand attribute
-    has_valid_strand = map(lambda q: True if q["strand"] in [["ignore"],["same"],["opposite"]] else False, queries)
+    has_valid_strand = [True if q["strand"] in [["ignore"], ["same"], ["opposite"]] else False for q in queries]

     if not all(has_valid_strand) and log is not None:
         log.warning("Queries with invalid strand values present! Affected queries: {}".format(
             [i for i, x in enumerate(has_valid_strand) if not x]))

     # Validate query attribute lengths
-    has_valid_attribute_lengths = map(lambda q: False if len(
-        q["filter.attribute"]) > 1 or len(q["attribute.value"]) > 1 else True, queries)
+    has_valid_attribute_lengths = [False if len(
+        q["filter.attribute"]) > 1 or len(q["attribute.value"]) > 1 else True for q in queries]
     if not all(has_valid_attribute_lengths) and log is not None:
         log.warning("Queries with more than one value for either filter.attribute or attribute.value present! Affected queries: {}".format(
             [i for i, x in enumerate(has_valid_attribute_lengths) if not x]))
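These validation conversions are the clearest case in the commit where `list` versus `map` changes behavior rather than style: `all()` exhausts a Python 3 `map` iterator, so the follow-up `enumerate` pass over the same name would see nothing and report no affected queries. A sketch with hypothetical queries:

```python
queries = [{"distance": [1000]}, {"distance": [1, 5, 10]}]

# Old form: flags = map(lambda q: len(q["distance"]) < 3, queries)
# all(flags) would consume the iterator, and the enumerate() below
# would then walk an empty sequence and flag no offenders.
flags = [len(q["distance"]) < 3 for q in queries]

if not all(flags):
    bad = [i for i, ok in enumerate(flags) if not ok]
    assert bad == [1]   # the second query has three distance values
```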
@@ -176,7 +177,7 @@ def remove_invalid_queries(queries, log=None):

 def parse_first_gtf_line(gtf):
     """Removes comment lines, reads first line to check for prefix chr. Returns if chr was found and the number of columns."""
-    f = open(gtf, "r")
+    f = open(gtf, "r", encoding='utf-8')
     is_comment = True
     while is_comment:
         line = f.readline()
@@ -192,7 +193,7 @@ def cut_gtf_perFeat(gtf, features, prefix):
 gtf_per_feat = prefix + os.path.basename(gtf).split(".gtf")[0] + "_cut_per_feat.gtf"
 feat2cut = np.unique(features)

-f = open(gtf, "r")
+f = open(gtf, "r", encoding='utf-8')
 is_comment = True
 while is_comment:
     line = f.readline()
30 changes: 15 additions & 15 deletions uropa/overlaps.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 """Contains functions for UROPA overlap evaluation."""
-from __future__ import division
+
 import os
 import re
 import sys
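Dropping `from __future__ import division` is safe because Python 3 makes true division the default; the `//` operator remains for floor division. (The stale `# import "division" allows decimals` comment later in this file could be removed in a follow-up.) In brief:

```python
# Python 3 semantics (what Python 2 only got via the __future__ import):
assert 1 / 2 == 0.5     # true division by default
assert 7 // 2 == 3      # floor division is always spelled //
```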
@@ -134,11 +134,11 @@ def parse_peak(peakstr, extend=0, delim='\t'):

 defaults = {'id': p['chr'] + ":" + p['start'] + "-" + p['end'], 'name': p['chr'] + ":" + p['start'] + "-" + p['end'], 'score': 0, 'strand': ".",
             'center': int(np.around(np.mean([int(p['start']), int(p['end'])]))), 'estart': max(1, int(p['start']) - extend), 'eend': int(p['end']) + extend, 'length': abs(int(p['end']) - int(p['start']))}
-if 'name' in p.keys():
+if 'name' in list(p.keys()):
     p['id'] = p['chr'] + ":" + p['start'] + "-" + p['end'] + "_" + p['name']

-values = [p[k] if k in p else defaults[k] for k in defaults.keys()]
-peak = dict(zip(p.keys() + defaults.keys(), p.values() + values))
+values = [p[k] if k in p else defaults[k] for k in list(defaults.keys())]
+peak = dict(list(zip(list(p.keys()) + list(defaults.keys()), list(p.values()) + values)))
 return peak
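The heavily wrapped `dict(list(zip(...)))` line exists because Python 2's `keys()`/`values()` returned lists that supported `+`; Python 3's views do not, so each side must be materialized. A sketch of the merge with an illustrative peak, plus a flatter alternative available on Python 3.5+:

```python
p = {"chr": "chr1", "start": "100", "end": "200"}
defaults = {"score": 0, "strand": "."}

values = [p[k] if k in p else defaults[k] for k in list(defaults.keys())]
peak = dict(list(zip(list(p.keys()) + list(defaults.keys()),
                     list(p.values()) + values)))

# On Python 3.5+ the same merge reads as:
assert peak == {**p, **dict(zip(defaults.keys(), values))}
```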


@@ -170,7 +170,7 @@ def distance_to_peak_center(p_center, hit3, hit4, strand, feat_pos):

 hit_center = np.mean([feat_start, feat_end])
 hit_pos = {"start": feat_start, "center": hit_center, "end": feat_end}
-d_pos = map(lambda i: abs(hit_pos[i] - p_center), feat_pos)
+d_pos = [abs(hit_pos[i] - p_center) for i in feat_pos]
 pos, dmin = min(enumerate(d_pos), key=operator.itemgetter(1))

 min_pos = feat_pos[pos]
@@ -233,7 +233,7 @@ def get_distance_by_dir(inputD, genom_loc, intern_loc, Dhit, internals_allowed=[
if internals_allowed == ["True"]:
return(any(intern_loc))
else:
best_dist = map(lambda l: Dhit <= D_upstr if l == "upstream" else Dhit <= D_downstr, intern_loc)
best_dist = [Dhit <= D_upstr if l == "upstream" else Dhit <= D_downstr for l in intern_loc]
return(any(best_dist))


@@ -261,8 +261,8 @@ def round_up(val):
# import "division" allows decimals
def calculate_overlap(pstart, pend, peakL, featL, hit_start, hit_end):
"""[Local] Returns value of overlap btw {0,1} representing percentage of length covered by the peak and that covered by the feature """
p_range = range(pstart, pend)
feat_range = range(hit_start, hit_end)
p_range = list(range(pstart, pend))
feat_range = list(range(hit_start, hit_end))
pset = set(p_range)

ovl_range = pset.intersection(feat_range)
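`range()` in Python 3 is a lazy sequence rather than a list, which is why 2to3 adds the `list()` wrappers; here `set()` and `intersection()` accept any iterable, so the wrappers are not strictly needed. A sketch of the overlap computation with illustrative coordinates:

```python
pstart, pend = 100, 200          # peak
hit_start, hit_end = 150, 400    # feature

pset = set(range(pstart, pend))              # range() works directly
ovl_range = pset.intersection(range(hit_start, hit_end))

ovl_pk = len(ovl_range) / (pend - pstart)    # true division in Python 3
assert ovl_pk == 0.5
```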
@@ -315,10 +315,10 @@ def get_hit_attribute(hit, attribute):
"""Splits attributes from hit lines"""
pos_match = [i if re.match(
attribute, hit_a) else None for i, hit_a in enumerate(hit[8].split("; "))]
pos_match = filter(lambda x: x is not None, pos_match)
pos_match = [x for x in pos_match if x is not None]
if len(pos_match) > 0:
attr_val = map(lambda m: hit[8].split("; ")[m].split(" ")[
1].strip('"\'').rstrip('\";'), pos_match)
attr_val = [hit[8].split("; ")[m].split(" ")[
1].strip('"\'').rstrip('\";') for m in pos_match]
val = [av for av in attr_val if av is not None][0]
val = val.replace("\t", "").replace("\r", "").replace("\n", "")
return val
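The `filter()` conversion is another place where the list form is required rather than cosmetic: Python 3's `filter()` returns an iterator, and the very next line calls `len(pos_match)`. A sketch:

```python
raw = [0, None, 2, None]          # positions; None marks non-matches

lazy = filter(lambda x: x is not None, raw)
# len(lazy) raises TypeError: object of type 'filter' has no len()

pos_match = [x for x in raw if x is not None]
assert len(pos_match) == 2 and pos_match == [0, 2]
```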
@@ -334,7 +334,7 @@ def get_besthit(q, len_q_dist, p_nm, hit, attrib_keys, Dhit):
if attrib_keys != ["None"]:
#attr_val = map(lambda k: re.split(k+" ", hit[8])[1].split(';')[0].strip('"\'').rstrip('\"'), attrib_keys)
# list of all values if multiple keys
attr_val = map(lambda a: get_hit_attribute(hit, a), attrib_keys)
attr_val = [get_hit_attribute(hit, a) for a in attrib_keys]
ret = [Dhit, [hit[2], hit[3], hit[4], hit[6], attr_val]]

elif attrib_keys == ["None"]:
@@ -367,7 +367,7 @@ def create_table(peak_id, chrom, pstart, pend, p_center, min_dist_hit, attrib_ke
 def write_hit_to_All(All_hits_tab, p_name, attrib_k, Dhit, hit, peak_id, chrom, pstart, pend, p_center, min_pos, genomic_location, ovl_pk, ovl_feat, j):
     """Writes an output table."""
     if attrib_k != ["None"]:
-        attr_val = map(lambda a: get_hit_attribute(hit, a), attrib_k)
+        attr_val = [get_hit_attribute(hit, a) for a in attrib_k]
         hit2add = [Dhit, [hit[2], hit[3], hit[4], hit[6], attr_val]]
         the_res = create_table(peak_id, chrom, pstart, pend, p_center,
                                hit2add, attrib_k, min_pos, genomic_location, ovl_pk, ovl_feat, j)
@@ -422,8 +422,8 @@ def write_partial_file(outfile, Table):

 def merge_queries(Best_combo_k):
     """Merges queries."""
-    merged_q = map(lambda l: Best_combo_k[l].split(
-        "\t")[-1].strip("\n"), range(len(Best_combo_k)))
+    merged_q = [Best_combo_k[l].split(
+        "\t")[-1].strip("\n") for l in range(len(Best_combo_k))]
     # Read if combo_k is "" and give it the q.numb, so I have it ready for
     # merging when one query =""
     merged_q[-1] = merged_q[-1] + "\n"
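A possible follow-up simplification, not part of this commit: the `range(len(...))` shape is a literal translation of the old `map` call, and iterating the list directly is the more idiomatic Python 3 with identical behavior here:

```python
Best_combo_k = ["peak1\tq1\n", "peak1\tq2\n"]   # illustrative rows

merged_q = [row.split("\t")[-1].strip("\n") for row in Best_combo_k]
assert merged_q == ["q1", "q2"]
```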
(The sixth changed file in this commit is not rendered on this page.)
