Skip to content
This repository has been archived by the owner. It is now read-only.

Commit

Permalink
Browse files Browse the repository at this point in the history
3.1.0: fixes to gtf reading and filtering, internal speed-up, threads…
… available from config
  • Loading branch information
msbentsen committed Apr 1, 2019
1 parent 23f08cb commit cc25665
Show file tree
Hide file tree
Showing 9 changed files with 448 additions and 278 deletions.
8 changes: 8 additions & 0 deletions CHANGES
@@ -1,3 +1,11 @@
## 3.1.0 (2019-03-31)
- fixed missing sorting of gtf
- added handling of comment lines in gtf
- fixed insufficient backwards compatibility for feature.anchor -> feature_anchor and other keys
- internal speed-up of priority queries by fetching hits from queries separately
- keys such as threads and prefix can now be given via config file
- re-added --summary

## 3.0.0 (2019-03-18)
- revision of functionality to fix several bugs and to simplify future development
- new functionality:
Expand Down
1 change: 1 addition & 0 deletions MANIFEST.in
@@ -1,2 +1,3 @@
include README.md
include LICENSE
include uropa/__init__.py
2 changes: 1 addition & 1 deletion sample_config.json
@@ -1,7 +1,7 @@
{
"queries":[
{"name":"", "feature":"", "distance":"", "feature_anchor": "", "strand":"",
"filter_attribute":"", "attribute_value":"","direction":"", "internals":"" }
"filter_attribute":"", "attribute_values":"","direction":"", "internals":"" }
],
"show_attributes":"",
"priority": "",
Expand Down
3 changes: 1 addition & 2 deletions setup.py
Expand Up @@ -18,7 +18,7 @@ def readme():
return f.read()

setup(name='uropa',
version='3.0.1',
version=find_version(os.path.join(setupdir, "uropa", "__init__.py")),
description='UROPA is a command line based tool, intended for genomic region annotation',
long_description=readme(),
url='https://github.molgen.mpg.de/loosolab/UROPA',
Expand All @@ -33,7 +33,6 @@ def readme():
install_requires=[
'numpy',
'pysam',
'pandas'
],
classifiers = [
'License :: OSI Approved :: MIT License',
Expand Down
2 changes: 1 addition & 1 deletion uropa/__init__.py
@@ -1 +1 @@
__version__ = "3.0.2"
__version__ = "3.1.0"
257 changes: 148 additions & 109 deletions uropa/annotation.py

Large diffs are not rendered by default.

245 changes: 138 additions & 107 deletions uropa/uropa.py

Large diffs are not rendered by default.

206 changes: 149 additions & 57 deletions uropa/utils.py
Expand Up @@ -69,92 +69,179 @@ def parse_json(infile):
return ast.literal_eval(json.dumps(json.load(f)))


def format_config(cfg_dict, logger):
    """ Format input config to catch any errors

    Validates and normalizes the parsed config dictionary in place:
    checks upper-level and query-level keys, converts truthy/falsy
    spellings to booleans, fills defaults and auto-names queries.

    Parameters:
        cfg_dict (dict): parsed config containing at least a "queries" list
        logger: logging object used to report errors/warnings

    Returns:
        dict: the normalized cfg_dict (same object, modified in place)

    Calls sys.exit() after logging an error for any invalid key or value.
    """

    query_prefix = "query_"  # used to auto-name unnamed queries

    # Canonical casing for relative_location values (input is case-insensitive)
    case_conversion = {"peakinsidefeature": "PeakInsideFeature", "featureinsidepeak": "FeatureInsidePeak",
                       "upstream": "Upstream", "downstream": "Downstream", "overlapstart": "OverlapStart", "overlapend": "OverlapEnd"}

    # Accepted truthy/falsy spellings (compared lowercase) mapped to booleans
    convert_values = {value: True for value in [True, 't', 'true', 'yes', 'y']}
    convert_values.update({value: False for value in [False, 'f', 'false', 'no', 'n']})

    ###################### Upper level config keys ####################
    # Check existence of upper-level keys
    valid_config_keys = set(["queries", "priority", "show_attributes", "gtf", "bed", "prefix", "outdir", "threads"])
    given_config_keys = set(cfg_dict.keys())
    invalid_config_keys = given_config_keys - valid_config_keys
    if len(invalid_config_keys) > 0:
        logger.error("Error in config file: Key(s) {0} not known to UROPA.".format(invalid_config_keys))
        sys.exit()

    # Delete empty keys so that defaults apply
    for key in list(cfg_dict.keys()):
        if is_empty(cfg_dict[key]):
            del cfg_dict[key]

    # Check format of values
    if "show_attributes" not in cfg_dict:
        cfg_dict["show_attributes"] = []

    cfg_dict["priority"] = convert_values[str(cfg_dict.get("priority", False)).lower()]  # per default false

    ####################### Query level keys ##########################
    # Convert keys for backwards compatibility {old: new, (...)}
    conversion = {"show.attributes": "show_attributes", "feature.anchor": "feature_anchor", "filter.attribute": "filter_attribute",
                  "attribute.values": "attribute_values", "attribute.value": "attribute_values", "attribute_value": "attribute_values",
                  "direction": "relative_location"}
    valid_query_keys = set(["feature", "feature_anchor", "distance", "strand", "relative_location", "internals", "filter_attribute", "attribute_values", "name"])

    # Check and fill queries with needed defaults such as distance
    for i, query in enumerate(cfg_dict["queries"]):

        # Convert keys for backwards compatibility
        for key in list(query.keys()):
            if key in conversion:
                query[conversion[key]] = query[key]
                del query[key]

        # Move per-query show_attributes to upper level if given
        if "show_attributes" in query:
            if type(query["show_attributes"]) == list:
                for att in query["show_attributes"]:
                    if att not in cfg_dict["show_attributes"]:
                        cfg_dict["show_attributes"].append(att)
            else:
                if query["show_attributes"] not in cfg_dict["show_attributes"]:
                    cfg_dict["show_attributes"].append(query["show_attributes"])
            del query["show_attributes"]

        # Check if there are attributes in query which are not in the "allowed" values
        query_keys = set(query.keys())
        invalid_keys = query_keys - valid_query_keys
        if len(invalid_keys) > 0:
            logger.error("Error in query {0}. Key(s) {1} are not known to UROPA.".format(i+1, invalid_keys))
            sys.exit()

        # Delete empty keys so that defaults apply
        for key in list(query.keys()):
            if is_empty(query[key]):
                del query[key]

        # If filter_attribute is set, attribute_values has to be set as well (and vice versa)
        if ("filter_attribute" in query and "attribute_values" not in query) or ("attribute_values" in query and "filter_attribute" not in query):
            logger.error("Error in query {0}. Keys for filter_attribute/attribute_values have to be set together.".format(i+1))
            sys.exit()

        ##### Check content of individual keys #####
        if "distance" in query:
            try:
                if type(query["distance"]) == list:
                    query["distance"] = [int(query["distance"][0]), int(query["distance"][-1])]
                else:
                    query["distance"] = [int(query["distance"]), int(query["distance"])]  # same distance in both positions
            except Exception:
                logger.error("Error trying to convert distance \"{0}\" to integer - please check the format of this value.".format(query["distance"]))
                sys.exit()
        else:
            # BUGFIX: message previously read "No 'distance' not given" (double negative)
            logger.error("No 'distance' given in query: {0}".format(query))
            sys.exit()

        # Internals: accepts true/false spellings or a fraction given as float/str
        internalval = query.get("internals", False)
        query["internals"] = float(convert_values.get(str(internalval).lower(), internalval))  # if internals is a float, the value will not be converted
        if query["internals"] == 0:
            del query["internals"]  # internals == 0 means the option is off

        # Keys which should be collections to search in; single values become one-element sets
        for key in ["feature", "feature_anchor", "attribute_values", "relative_location"]:
            if type(query.get(key, [])) != list:
                query[key] = set([query[key]])

        # Validate strand
        if "strand" in query:
            query["strand"] = query["strand"].lower()
            valid = set(['ignore', 'same', 'opposite'])
            if query["strand"] not in valid:
                logger.error("Invalid strand ({0}) set in query {1}. Valid options are: {2}".format(query["strand"], i+1, valid))
                sys.exit()

        # Validate feature_anchor (case-insensitive)
        if "feature_anchor" in query:
            query["feature_anchor"] = set([element.lower() for element in query["feature_anchor"]])
            valid = set(["start", "center", "end"])
            invalid = query["feature_anchor"] - valid
            if len(invalid) > 0:
                logger.error("Invalid feature_anchor ({0}) set in query {1}. Valid options are: {2}".format(invalid, i+1, valid))
                sys.exit()

        # Validate relative_location (case-insensitive input mapped to canonical casing)
        if "relative_location" in query:
            query["relative_location"] = set([case_conversion.get(str(element).lower(), element) for element in query["relative_location"]])
            valid = set(["PeakInsideFeature", "FeatureInsidePeak", "Upstream", "Downstream", "OverlapStart", "OverlapEnd"])
            invalid = query["relative_location"] - valid
            if len(invalid) > 0:
                # BUGFIX: format index was {3} with only three arguments -> IndexError on this error path
                logger.error("Invalid relative_location ({0}) set in query {1}. Valid options are: {2}".format(invalid, i+1, valid))
                sys.exit()

        # Name the query if it was not already named
        if "name" not in query:
            query["name"] = query_prefix + str(i + 1)

    ############## Warnings related to config/queries #############
    # Check for general input in queries, such as no show_attributes
    if len(cfg_dict.get("show_attributes", [])) == 0:
        logger.warning("No show_attributes given in config - no attributes for annotations are displayed in output.")

    # Catch duplicates in query names
    query_names = [query["name"] for query in cfg_dict["queries"]]
    if len(query_names) != len(set(query_names)):
        logger.warning("Duplicates in query names: {0}".format(query_names))

    return(cfg_dict)



def config_string(cfg_dict):
    """ Pretty-print cfg_dict with one-line queries

    Builds a JSON-like string representation of the config, writing each
    query on its own line via json.dumps. Any set values inside queries are
    converted to lists in place so they are JSON-serializable.

    Parameters:
        cfg_dict (dict): config dictionary containing a "queries" list

    Returns:
        str: formatted multi-line string representation of the config
    """

    # Upper-level keys are printed in this fixed order (when present)
    upper_level = ["queries", "show_attributes", "priority", "gtf", "bed", "prefix", "outdir", "threads"]

    upper_lines = []
    for upper_key in upper_level:

        if upper_key == "queries":
            query_lines = "\"queries\":[\n"

            # Convert sets to lists so json.dumps can serialize the queries
            for query in cfg_dict["queries"]:
                for key in query:
                    if type(query[key]) == set:
                        query[key] = list(query[key])

            # One line per query; sort_keys gives deterministic output
            query_strings = [json.dumps(query, sort_keys=True) for query in cfg_dict["queries"]]
            query_lines += "    " + ",\n    ".join(query_strings) + "\n  ]"

            upper_lines.append(query_lines)

        elif upper_key == "show_attributes" and upper_key in cfg_dict:
            # Lists are serialized as JSON arrays
            upper_lines.append("\"{0}\": {1}".format(upper_key, json.dumps(cfg_dict[upper_key])))

        else:
            # Scalar keys are printed as quoted strings (skipped when absent)
            if upper_key in cfg_dict:
                upper_lines.append("\"{0}\": \"{1}\"".format(upper_key, cfg_dict[upper_key]))

    # Local renamed from "config_string" to avoid shadowing the function name
    cfg_string = "{\n" + ",\n".join(upper_lines) + "\n}\n"

    return(cfg_string)

def check_file_access(fil, logger):
""" Check if file exists and can be read """
Expand Down Expand Up @@ -203,7 +290,7 @@ def parse_bedfile(bedfile, gtf_has_chr):
if gtf_has_chr == True:
gtf_chr = "chr" + chrom if "chr" not in chrom else chrom

peak_dict = {"gtf_chr": gtf_chr, "peak_chr":chrom, "peak_start":start, "peak_end":end, "peak_id":name, "peak_score":score, "strand":strand, "internal_peak_id": "peak_{0}".format(i+1)}
peak_dict = {"gtf_chr": gtf_chr, "peak_chr":chrom, "peak_start":start, "peak_end":end, "peak_id":name, "peak_score":score, "peak_strand":strand, "internal_peak_id": i+1}
peak_dict.update(dict(zip(additional_header, additional)))
peaks.append(peak_dict)

Expand All @@ -230,14 +317,19 @@ def check_chr(file, lines=100):

def subset_gtf(gtf, features, sub_gtf):
    """Removes lines with features not in features from a gtf file.

    Writes a copy of *gtf* to *sub_gtf* keeping comment/header lines and
    all lines whose feature column (3rd tab-separated field) is in
    *features*.

    Parameters:
        gtf (str): path to input gtf file
        features (list): feature names to keep (e.g. ["gene", "exon"])
        sub_gtf (str): path to write the subsetted gtf to
    """

    feat2cut = set(np.unique(features))  # set gives O(1) membership tests

    with open(gtf, "r") as f:
        # Keep "#" comment lines as well - splitting them on "\t" would
        # raise an IndexError and they keep the subset a valid gtf header
        kept = [line for line in f if line.startswith("#") or line.split("\t")[2] in feat2cut]

    # List of lines containing only selected features from query
    with open(sub_gtf, "w") as f:
        f.write("".join(kept))


def is_empty(value):
    """Return True when *value* is a blank string or an empty list, else False."""
    value_type = type(value)
    if value_type == str:
        return(value.strip() == "")
    if value_type == list:
        return(len(value) == 0)
    return(False)
2 changes: 1 addition & 1 deletion utils/uropa_summary.R
Expand Up @@ -135,7 +135,7 @@ features <- c()
plot.new()
df.uropa.final <- read.table(final.hits, header=TRUE, sep="\t",stringsAsFactors = FALSE)
# number of peaks annoteted with uropa run
num.peaks <- length(unique(df.uropa.final$peak_id))
num.peaks <- length(df.uropa.final$peak_id)
# stats is based on annoted peaks -> remove na rows
df.uropa.final[,"distance"] <- as.numeric(df.uropa.final[,"distance"])
df.uropa.final <- df.uropa.final[complete.cases(df.uropa.final),]
Expand Down

0 comments on commit cc25665

Please sign in to comment.