Skip to content
This repository has been archived by the owner. It is now read-only.

Commit

Permalink
Browse files Browse the repository at this point in the history
3.1.0: fixes to gtf reading and filtering, internal speed-up, threads…
… available from config
  • Loading branch information
msbentsen committed Apr 1, 2019
1 parent 23f08cb commit cc25665
Show file tree
Hide file tree
Showing 9 changed files with 448 additions and 278 deletions.
8 changes: 8 additions & 0 deletions CHANGES
@@ -1,3 +1,11 @@
## 3.1.0 (2019-03-31)
- fixed missing sorting of gtf
- added handling of comment lines in gtf
- fixed insufficient backwards compatibility for feature.anchor -> feature_anchor and other keys
- internal speed-up of priority queries by fetching hits from queries separately
- keys such as threads and prefix can now be given via config file
- re-added --summary

## 3.0.0 (2019-03-18)
- revision of functionality to fix several bugs and to simplify future development
- new functionality:
Expand Down
1 change: 1 addition & 0 deletions MANIFEST.in
@@ -1,2 +1,3 @@
include README.md
include LICENSE
include uropa/__init__.py
2 changes: 1 addition & 1 deletion sample_config.json
@@ -1,7 +1,7 @@
{
"queries":[
{"name":"", "feature":"", "distance":"", "feature_anchor": "", "strand":"",
"filter_attribute":"", "attribute_value":"","direction":"", "internals":"" }
"filter_attribute":"", "attribute_values":"","direction":"", "internals":"" }
],
"show_attributes":"",
"priority": "",
Expand Down
3 changes: 1 addition & 2 deletions setup.py
Expand Up @@ -18,7 +18,7 @@ def readme():
return f.read()

setup(name='uropa',
version='3.0.1',
version=find_version(os.path.join(setupdir, "uropa", "__init__.py")),
description='UROPA is a command line based tool, intended for genomic region annotation',
long_description=readme(),
url='https://github.molgen.mpg.de/loosolab/UROPA',
Expand All @@ -33,7 +33,6 @@ def readme():
install_requires=[
'numpy',
'pysam',
'pandas'
],
classifiers = [
'License :: OSI Approved :: MIT License',
Expand Down
2 changes: 1 addition & 1 deletion uropa/__init__.py
@@ -1 +1 @@
__version__ = "3.0.2"
__version__ = "3.1.0"
257 changes: 148 additions & 109 deletions uropa/annotation.py

Large diffs are not rendered by default.

245 changes: 138 additions & 107 deletions uropa/uropa.py

Large diffs are not rendered by default.

206 changes: 149 additions & 57 deletions uropa/utils.py
Expand Up @@ -69,92 +69,179 @@ def parse_json(infile):
return ast.literal_eval(json.dumps(json.load(f)))


def format_config(cfg_dict, logger):
    """ Format input config to catch any errors

    Validates and normalizes the parsed config dictionary in place:
    checks upper-level and query-level keys, converts truthy/falsy
    spellings to booleans, fills defaults and auto-names queries.

    Parameters:
        cfg_dict (dict): parsed config containing at least a "queries" list
        logger: logging object used to report errors/warnings

    Returns:
        dict: the normalized cfg_dict (same object, modified in place)

    Calls sys.exit() after logging an error for any invalid key or value.
    """

    query_prefix = "query_"  # used to auto-name unnamed queries

    # Canonical casing for relative_location values (input is case-insensitive)
    case_conversion = {"peakinsidefeature": "PeakInsideFeature", "featureinsidepeak": "FeatureInsidePeak",
                       "upstream": "Upstream", "downstream": "Downstream", "overlapstart": "OverlapStart", "overlapend": "OverlapEnd"}

    # Accepted truthy/falsy spellings (compared lowercase) mapped to booleans
    convert_values = {value: True for value in [True, 't', 'true', 'yes', 'y']}
    convert_values.update({value: False for value in [False, 'f', 'false', 'no', 'n']})

    ###################### Upper level config keys ####################
    # Check existence of upper-level keys
    valid_config_keys = set(["queries", "priority", "show_attributes", "gtf", "bed", "prefix", "outdir", "threads"])
    given_config_keys = set(cfg_dict.keys())
    invalid_config_keys = given_config_keys - valid_config_keys
    if len(invalid_config_keys) > 0:
        logger.error("Error in config file: Key(s) {0} not known to UROPA.".format(invalid_config_keys))
        sys.exit()

    # Delete empty keys so that defaults apply
    for key in list(cfg_dict.keys()):
        if is_empty(cfg_dict[key]):
            del cfg_dict[key]

    # Check format of values
    if "show_attributes" not in cfg_dict:
        cfg_dict["show_attributes"] = []

    cfg_dict["priority"] = convert_values[str(cfg_dict.get("priority", False)).lower()]  # per default false

    ####################### Query level keys ##########################
    # Convert keys for backwards compatibility {old: new, (...)}
    conversion = {"show.attributes": "show_attributes", "feature.anchor": "feature_anchor", "filter.attribute": "filter_attribute",
                  "attribute.values": "attribute_values", "attribute.value": "attribute_values", "attribute_value": "attribute_values",
                  "direction": "relative_location"}
    valid_query_keys = set(["feature", "feature_anchor", "distance", "strand", "relative_location", "internals", "filter_attribute", "attribute_values", "name"])

    # Check and fill queries with needed defaults such as distance
    for i, query in enumerate(cfg_dict["queries"]):

        # Convert keys for backwards compatibility
        for key in list(query.keys()):
            if key in conversion:
                query[conversion[key]] = query[key]
                del query[key]

        # Move per-query show_attributes to upper level if given
        if "show_attributes" in query:
            if type(query["show_attributes"]) == list:
                for att in query["show_attributes"]:
                    if att not in cfg_dict["show_attributes"]:
                        cfg_dict["show_attributes"].append(att)
            else:
                if query["show_attributes"] not in cfg_dict["show_attributes"]:
                    cfg_dict["show_attributes"].append(query["show_attributes"])
            del query["show_attributes"]

        # Check if there are attributes in query which are not in the "allowed" values
        query_keys = set(query.keys())
        invalid_keys = query_keys - valid_query_keys
        if len(invalid_keys) > 0:
            logger.error("Error in query {0}. Key(s) {1} are not known to UROPA.".format(i+1, invalid_keys))
            sys.exit()

        # Delete empty keys so that defaults apply
        for key in list(query.keys()):
            if is_empty(query[key]):
                del query[key]

        # If filter_attribute is set, attribute_values has to be set as well (and vice versa)
        if ("filter_attribute" in query and "attribute_values" not in query) or ("attribute_values" in query and "filter_attribute" not in query):
            logger.error("Error in query {0}. Keys for filter_attribute/attribute_values have to be set together.".format(i+1))
            sys.exit()

        ##### Check content of individual keys #####
        if "distance" in query:
            try:
                if type(query["distance"]) == list:
                    query["distance"] = [int(query["distance"][0]), int(query["distance"][-1])]
                else:
                    query["distance"] = [int(query["distance"]), int(query["distance"])]  # same distance in both positions
            except Exception:
                logger.error("Error trying to convert distance \"{0}\" to integer - please check the format of this value.".format(query["distance"]))
                sys.exit()
        else:
            # BUGFIX: message previously read "No 'distance' not given" (double negative)
            logger.error("No 'distance' given in query: {0}".format(query))
            sys.exit()

        # Internals: accepts true/false spellings or a fraction given as float/str
        internalval = query.get("internals", False)
        query["internals"] = float(convert_values.get(str(internalval).lower(), internalval))  # if internals is a float, the value will not be converted
        if query["internals"] == 0:
            del query["internals"]  # internals == 0 means the option is off

        # Keys which should be collections to search in; single values become one-element sets
        for key in ["feature", "feature_anchor", "attribute_values", "relative_location"]:
            if type(query.get(key, [])) != list:
                query[key] = set([query[key]])

        # Validate strand
        if "strand" in query:
            query["strand"] = query["strand"].lower()
            valid = set(['ignore', 'same', 'opposite'])
            if query["strand"] not in valid:
                logger.error("Invalid strand ({0}) set in query {1}. Valid options are: {2}".format(query["strand"], i+1, valid))
                sys.exit()

        # Validate feature_anchor (case-insensitive)
        if "feature_anchor" in query:
            query["feature_anchor"] = set([element.lower() for element in query["feature_anchor"]])
            valid = set(["start", "center", "end"])
            invalid = query["feature_anchor"] - valid
            if len(invalid) > 0:
                logger.error("Invalid feature_anchor ({0}) set in query {1}. Valid options are: {2}".format(invalid, i+1, valid))
                sys.exit()

        # Validate relative_location (case-insensitive input mapped to canonical casing)
        if "relative_location" in query:
            query["relative_location"] = set([case_conversion.get(str(element).lower(), element) for element in query["relative_location"]])
            valid = set(["PeakInsideFeature", "FeatureInsidePeak", "Upstream", "Downstream", "OverlapStart", "OverlapEnd"])
            invalid = query["relative_location"] - valid
            if len(invalid) > 0:
                # BUGFIX: format index was {3} with only three arguments -> IndexError on this error path
                logger.error("Invalid relative_location ({0}) set in query {1}. Valid options are: {2}".format(invalid, i+1, valid))
                sys.exit()

        # Name the query if it was not already named
        if "name" not in query:
            query["name"] = query_prefix + str(i + 1)

    ############## Warnings related to config/queries #############
    # Check for general input in queries, such as no show_attributes
    if len(cfg_dict.get("show_attributes", [])) == 0:
        logger.warning("No show_attributes given in config - no attributes for annotations are displayed in output.")

    # Catch duplicates in query names
    query_names = [query["name"] for query in cfg_dict["queries"]]
    if len(query_names) != len(set(query_names)):
        logger.warning("Duplicates in query names: {0}".format(query_names))

    return(cfg_dict)



def config_string(cfg_dict):
    """ Pretty-print cfg_dict with one-line queries

    Builds a JSON-like string representation of the config, writing each
    query on its own line via json.dumps. Any set values inside queries are
    converted to lists in place so they are JSON-serializable.

    Parameters:
        cfg_dict (dict): config dictionary containing a "queries" list

    Returns:
        str: formatted multi-line string representation of the config
    """

    # Upper-level keys are printed in this fixed order (when present)
    upper_level = ["queries", "show_attributes", "priority", "gtf", "bed", "prefix", "outdir", "threads"]

    upper_lines = []
    for upper_key in upper_level:

        if upper_key == "queries":
            query_lines = "\"queries\":[\n"

            # Convert sets to lists so json.dumps can serialize the queries
            for query in cfg_dict["queries"]:
                for key in query:
                    if type(query[key]) == set:
                        query[key] = list(query[key])

            # One line per query; sort_keys gives deterministic output
            query_strings = [json.dumps(query, sort_keys=True) for query in cfg_dict["queries"]]
            query_lines += "    " + ",\n    ".join(query_strings) + "\n  ]"

            upper_lines.append(query_lines)

        elif upper_key == "show_attributes" and upper_key in cfg_dict:
            # Lists are serialized as JSON arrays
            upper_lines.append("\"{0}\": {1}".format(upper_key, json.dumps(cfg_dict[upper_key])))

        else:
            # Scalar keys are printed as quoted strings (skipped when absent)
            if upper_key in cfg_dict:
                upper_lines.append("\"{0}\": \"{1}\"".format(upper_key, cfg_dict[upper_key]))

    # Local renamed from "config_string" to avoid shadowing the function name
    cfg_string = "{\n" + ",\n".join(upper_lines) + "\n}\n"

    return(cfg_string)

def check_file_access(fil, logger):
""" Check if file exists and can be read """
Expand Down Expand Up @@ -203,7 +290,7 @@ def parse_bedfile(bedfile, gtf_has_chr):
if gtf_has_chr == True:
gtf_chr = "chr" + chrom if "chr" not in chrom else chrom

peak_dict = {"gtf_chr": gtf_chr, "peak_chr":chrom, "peak_start":start, "peak_end":end, "peak_id":name, "peak_score":score, "strand":strand, "internal_peak_id": "peak_{0}".format(i+1)}
peak_dict = {"gtf_chr": gtf_chr, "peak_chr":chrom, "peak_start":start, "peak_end":end, "peak_id":name, "peak_score":score, "peak_strand":strand, "internal_peak_id": i+1}
peak_dict.update(dict(zip(additional_header, additional)))
peaks.append(peak_dict)

Expand All @@ -230,14 +317,19 @@ def check_chr(file, lines=100):

def subset_gtf(gtf, features, sub_gtf):
    """Removes lines with features not in features from a gtf file.

    Writes a copy of *gtf* to *sub_gtf* keeping comment/header lines and
    all lines whose feature column (3rd tab-separated field) is in
    *features*.

    Parameters:
        gtf (str): path to input gtf file
        features (list): feature names to keep (e.g. ["gene", "exon"])
        sub_gtf (str): path to write the subsetted gtf to
    """

    feat2cut = set(np.unique(features))  # set gives O(1) membership tests

    with open(gtf, "r") as f:
        # Keep "#" comment lines as well - splitting them on "\t" would
        # raise an IndexError and they keep the subset a valid gtf header
        kept = [line for line in f if line.startswith("#") or line.split("\t")[2] in feat2cut]

    # List of lines containing only selected features from query
    with open(sub_gtf, "w") as f:
        f.write("".join(kept))


def is_empty(value):
    """Return True when *value* is a blank string or an empty list, else False."""
    value_type = type(value)
    if value_type == str:
        return(value.strip() == "")
    if value_type == list:
        return(len(value) == 0)
    return(False)
2 changes: 1 addition & 1 deletion utils/uropa_summary.R
Expand Up @@ -135,7 +135,7 @@ features <- c()
plot.new()
df.uropa.final <- read.table(final.hits, header=TRUE, sep="\t",stringsAsFactors = FALSE)
# number of peaks annoteted with uropa run
num.peaks <- length(unique(df.uropa.final$peak_id))
num.peaks <- length(df.uropa.final$peak_id)
# stats is based on annoted peaks -> remove na rows
df.uropa.final[,"distance"] <- as.numeric(df.uropa.final[,"distance"])
df.uropa.final <- df.uropa.final[complete.cases(df.uropa.final),]
Expand Down

0 comments on commit cc25665

Please sign in to comment.