From cc25665bae62a51087856bfa1834c1f82bb14bb3 Mon Sep 17 00:00:00 2001 From: msbentsen Date: Mon, 1 Apr 2019 09:54:02 +0200 Subject: [PATCH] 3.1.0: fixes to gtf reading and filtering, internal speed-up, threads available from config --- CHANGES | 8 ++ MANIFEST.in | 1 + sample_config.json | 2 +- setup.py | 3 +- uropa/__init__.py | 2 +- uropa/annotation.py | 257 ++++++++++++++++++++++++------------------ uropa/uropa.py | 245 ++++++++++++++++++++++------------------ uropa/utils.py | 206 +++++++++++++++++++++++---------- utils/uropa_summary.R | 2 +- 9 files changed, 448 insertions(+), 278 deletions(-) diff --git a/CHANGES b/CHANGES index 344f4c5..13e835c 100644 --- a/CHANGES +++ b/CHANGES @@ -1,3 +1,11 @@ +## 3.1.0 (2019-03-31) +- fixed missing sorting of gtf +- added handling of comment lines in gtf +- fixed insufficient backwards functionality for feature.anchor -> feature_anchor and other keys +- internal speed-up of priority queries by fetching hits from queries separately +- keys such as threads and prefix can now be given via config file +- re-added --summary + ## 3.0.0 (2019-03-18) - revision of functionality to fix several bugs and to simplify future problems - new functionality: diff --git a/MANIFEST.in b/MANIFEST.in index 04f196a..6ac52f9 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,2 +1,3 @@ include README.md include LICENSE +include uropa/__init__.py diff --git a/sample_config.json b/sample_config.json index 3f50bd5..c34c093 100644 --- a/sample_config.json +++ b/sample_config.json @@ -1,7 +1,7 @@ { "queries":[ {"name":"", "feature":"", "distance":"", "feature_anchor": "", "strand":"", - "filter_attribute":"", "attribute_value":"","direction":"", "internals":"" } + "filter_attribute":"", "attribute_values":"","direction":"", "internals":"" } ], "show_attributes":"", "priority": "", diff --git a/setup.py b/setup.py index 82acbae..a03cd24 100644 --- a/setup.py +++ b/setup.py @@ -18,7 +18,7 @@ def readme(): return f.read() setup(name='uropa', - version='3.0.1', + version=find_version(os.path.join(setupdir, "uropa", "__init__.py")), description='UROPA is a command line based tool, intended for genomic region annotation', long_description=readme(), url='https://github.molgen.mpg.de/loosolab/UROPA', @@ -33,7 +33,6 @@ def readme(): install_requires=[ 'numpy', 'pysam', - 'pandas' ], classifiers = [ 'License :: OSI Approved :: MIT License', diff --git a/uropa/__init__.py b/uropa/__init__.py index e72968d..5e6690c 100644 --- a/uropa/__init__.py +++ b/uropa/__init__.py @@ -1 +1 @@ -__version__ = "3.0.2" +__version__ = "3.1.0" diff --git a/uropa/annotation.py b/uropa/annotation.py index 7f5ae22..750b3b1 100644 --- a/uropa/annotation.py +++ b/uropa/annotation.py @@ -6,46 +6,46 @@ import re import numpy as np import logging +import datetime +def get_frozenset(adict): + return frozenset((key, set_from_dict(val) if isinstance(val, dict) else val) for key, val in adict.items()) -def create_anno_dict(peak, hit=None): +def create_anno_dict(peak, hit): """ Returns a dictionary containing information on the hit from gtf """ - #Initialize dict with NAs - keys = ["feature", "feat_strand", "feat_start", "feat_end", "feat_center", "feat_length", "feat_attributes", "distance", - "feat_anchor", "query", "query_name", "best_hit", "relative_location", "feat_ovl_peak", "peak_ovl_feat"] - anno_dict = {key:None for key in keys} - #Add peak information + anno_dict = {} anno_dict.update(peak) #fills out peak chr/start/end/id/score/strand anno_dict["peak_center"] = int((anno_dict["peak_end"] + 
anno_dict["peak_start"])/2) anno_dict["peak_length"] = anno_dict["peak_end"] - anno_dict["peak_start"] - #If hit was given, parse this as well - if hit is not None: - - #Parse info from gtf string + #Parse info from gtf string + try: pairs = re.split(";\s*", hit.attributes) #regex remove 0 to n spaces - pairs.remove("") - attribute_dict = {pair.split()[0]:pair.split()[1].replace("\"", "") for pair in pairs} # parse " of attribute values - - # Fill in with feature info - anno_dict["feature"] = hit.feature - anno_dict["feat_strand"] = hit.strand - anno_dict["feat_start"] = hit.start - anno_dict["feat_end"] = hit.end - anno_dict["feat_center"] = int((anno_dict["feat_end"] + anno_dict["feat_start"])/2) - anno_dict["feat_length"] = anno_dict["feat_end"] - anno_dict["feat_start"] - anno_dict["feat_attributes"] = attribute_dict - - #Look-up keys for annotation - anno_dict["anchor_pos"] = {"start": anno_dict["feat_start"] if anno_dict["feat_strand"] != "-" else anno_dict["feat_end"], - "center": anno_dict["feat_center"], - "end": anno_dict["feat_end"] if anno_dict["feat_strand"] != "-" else anno_dict["feat_start"]} - - #Change with each query - anno_dict["query"] = 0 #query for which the hit was valid - anno_dict["best_hit"] = 0 + pairs = [pair.replace("\"", "") for pair in pairs] + attribute_dict = {pair.split()[0]:pair.split()[1] for pair in pairs if pair != ""} # parse " of attribute values + except: + print("Error reading attributes: {0}".format(hit.attributes)) + attribute_dict = {} + + # Fill in with feature info + anno_dict["feature"] = hit.feature + anno_dict["feat_strand"] = hit.strand + anno_dict["feat_start"] = int(hit.start) + anno_dict["feat_end"] = int(hit.end) + anno_dict["feat_center"] = int((anno_dict["feat_end"] + anno_dict["feat_start"])/2) + anno_dict["feat_length"] = int(anno_dict["feat_end"] - anno_dict["feat_start"]) + anno_dict["feat_attributes"] = attribute_dict + + #Look-up keys for annotation + anno_dict["anchor_pos"] = {"start": anno_dict["feat_start"] if anno_dict["feat_strand"] != "-" else anno_dict["feat_end"], + "center": anno_dict["feat_center"], + "end": anno_dict["feat_end"] if anno_dict["feat_strand"] != "-" else anno_dict["feat_start"]} + + #Change with each query + anno_dict["query"] = 0 #query for which the hit was valid + anno_dict["best_hit"] = 0 return(anno_dict) @@ -53,15 +53,21 @@ def create_anno_dict(peak, hit=None): def distance_to_peak_center(anno_dict, query_anchor): """ Assigns the distance of peak center to best query anchor """ + anchor_list = list(query_anchor) + + #Set default if anchor list is empty + if len(query_anchor) == 0: + anchor_list = ["start", "center", "end"] + #Calculate distances to each possible anchor - raw_distances = [anno_dict["peak_center"] - anno_dict["anchor_pos"][anchor] for anchor in query_anchor] + raw_distances = [anno_dict["peak_center"] - anno_dict["anchor_pos"][anchor] for anchor in anchor_list] abs_distances = [abs(dist) for dist in raw_distances] min_dist_i = abs_distances.index(min(abs_distances)) #Set minimum distance as best anchor anno_dict["raw_distance"] = raw_distances[min_dist_i] - anno_dict["distance"] = abs(raw_distances[min_dist_i]) - anno_dict["feat_anchor"] = query_anchor[min_dist_i] + anno_dict["distance"] = int(abs(raw_distances[min_dist_i])) + anno_dict["feat_anchor"] = anchor_list[min_dist_i] return(anno_dict) @@ -70,12 +76,16 @@ def distance_to_peak_center(anno_dict, query_anchor): def calculate_overlap(anno_dict): """ Calculates percentage of length covered by the peak/feature """ - peak_range = 
list(range(anno_dict["peak_start"], anno_dict["peak_end"])) - feature_range = list(range(anno_dict["feat_start"], anno_dict["feat_end"])) - ovl_range = set(peak_range).intersection(feature_range) + #beds exclude first position, therefore +1 for starts. Range excludes last position in range - therefore +1 for end + ovl_range = range(max(anno_dict["peak_start"]+1, anno_dict["feat_start"]+1), min(anno_dict["peak_end"], anno_dict["feat_end"])+1) + ovl_bp = len(ovl_range) + + #peak_range = list(range(anno_dict["peak_start"], anno_dict["peak_end"])) + #feature_range = list(range(anno_dict["feat_start"], anno_dict["feat_end"])) + #ovl_range = set(peak_range).intersection(feature_range) - ovl_pk = round(len(ovl_range) / anno_dict["peak_length"], 3) - ovl_feat = round(len(ovl_range) / anno_dict["feat_length"], 3) + ovl_pk = round(ovl_bp / anno_dict["peak_length"], 3) + ovl_feat = round(ovl_bp / anno_dict["feat_length"], 3) anno_dict["feat_ovl_peak"] = ovl_feat anno_dict["peak_ovl_feat"] = ovl_pk @@ -96,13 +106,19 @@ def get_relative_location(anno_dict): if anno_dict["feat_ovl_peak"] > 0: location = "OverlapStart" else: - location = "Upstream" + if anno_dict["feat_strand"] == "+": + location = "Upstream" + else: + location = "Downstream" elif anno_dict["feat_anchor"] == "end": if anno_dict["feat_ovl_peak"] > 0: location = "OverlapEnd" else: - location = "Downstream" + if anno_dict["feat_strand"] == "+": + location = "Downstream" + else: + location = "Upstream" else: location = "NA" @@ -132,100 +148,123 @@ def annotate_peaks(peaks, gtf_gz, gtf_index, cfg_dict, logger=None): #For each peak in input peaks, collect all_valid_annotations all_valid_annotations = [] for peak in peaks: + logger.debug("\n\n") logger.debug("Annotating peak: {0}".format(peak)) - #Fetch all possible hits from tabix - extend_start = int(max(1, peak["peak_start"] - max_distance)) - extend_end = peak["peak_end"] + max_distance - tabix_query = "{0}:{1}-{2}".format(peak["gtf_chr"], extend_start, extend_end) - logger.debug("Tabix query: {0}".format(tabix_query)) - try: - hits = tabix_obj.fetch(peak["peak_chr"], extend_start, extend_end, parser=pysam.asGTF()) - except ValueError: - print("Could not fetch any hits for tabix {0}:{1}-{2}. 
Continueing.".format(peak["gtf_chr"], extend_start, extend_end)) - continue - - #Go through each hit from tabix and establish whether they are valid for queries - valid_annotations = [] + valid_annotations = [] #for this peak stop_searching = False - while stop_searching == False: # Loop will stop when stop_searching is true - for hit in hits: - logger.debug("Validating hit: {0}".format(hit)) + query_i = -1 + while query_i+1 < n_queries and stop_searching == False: + + query_i += 1 #First query is 0 + query = queries[query_i] #current query to check + + logger.debug("Finding hits for query: {0}".format(query)) + + #Extend and fetch possible hits from tabix + max_distance = max(query["distance"]) + extend_start = int(max(1, peak["peak_start"] - max_distance)) + extend_end = peak["peak_end"] + max_distance + tabix_query = "{0}:{1}-{2}".format(peak["gtf_chr"], extend_start, extend_end) + logger.debug("Tabix query for query {0} ({1}): {2}".format(query_i, query["name"], tabix_query)) + + try: + begin = datetime.datetime.now() + hits = list(tabix_obj.fetch(peak["peak_chr"], extend_start, extend_end, parser=pysam.asGTF())) #hits for this query + end = datetime.datetime.now() + logger.debug("Fetched {0} hits in {1}".format(len(hits), end - begin)) + except: + #exception if no hits could be fetched from tabix, for example if the contig does not exist in the gtf index. + #print("ERROR: Could not create iterator for peak: {0} \n with tabix query {1}".format(peak, tabix_query)) + stop_searching = True + hits = [] + + begin = datetime.datetime.now() + for hit in hits: + + #If feature is not the right one, we do not have to go further - saves computation of distances + if "feature" in query: + if hit.feature not in query["feature"]: + #logger.debug("{0} not in {1} - continuing to next hit".format(hit.feature, query["feature"])) + continue + anno_dict = create_anno_dict(peak, hit) + anno_dict["query"] = query_i + anno_dict["query_name"] = query["name"] + + #Calculate distances/relative location + anno_dict = distance_to_peak_center(anno_dict, query.get("feature_anchor", [])) + anno_dict = calculate_overlap(anno_dict) + anno_dict = get_relative_location(anno_dict) + + ##### Test validity of the hit to query_i #### + checks = {} + #if "feature" in query: + # checks["feature"] = anno_dict["feature"] in query["feature"] #feature - #Validate annotation for each peakquery - stop_searching = False - query_i = -1 #first query will be index 0 - while query_i+1 < n_queries and stop_searching == False: - - #Query information - query_i += 1 - query = queries[query_i] #current query to check - anno_dict["query"] = query_i - anno_dict["query_name"] = query["name"] - - #Calculate distances/relative location - anno_dict = distance_to_peak_center(anno_dict, query["feature_anchor"]) - anno_dict = calculate_overlap(anno_dict) - anno_dict = get_relative_location(anno_dict) - - ##### Test validity of the hit to query_i #### - checks = {} - checks["feature"] = anno_dict["feature"] in query["feature"] #feature - - #Check feature anchor + #Check feature anchor + if "feature_anchor" in query: checks["feature_anchor"] = anno_dict["feat_anchor"] in query["feature_anchor"] - #Peak strand relative to feature strand + #Peak strand relative to feature strand + if "strand" in query: if query["strand"] != "ignore" and anno_dict["peak_strand"] != ".": - checks["strand"].append((anno_dict["peak_strand"] == anno_dict["feat_strand"] and query["strand"] == "same") or - (anno_dict["peak_strand"] != anno_dict["feat_strand"] and 
query["strand"] == "opposite")) + checks["strand"] = ((anno_dict["peak_strand"] == anno_dict["feat_strand"] and query["strand"] == "same") or + (anno_dict["peak_strand"] != anno_dict["feat_strand"] and query["strand"] == "opposite")) - #Check whether distance was valid - if anno_dict["feat_strand"] == "+": - checks["distance"] = anno_dict["raw_distance"] in range(-query["distance"][0], query["distance"][1]) - else: - checks["distance"] = anno_dict["raw_distance"] in range(-query["distance"][1], query["distance"][0]) + #Check whether distance was valid + if anno_dict["feat_strand"] == "+": + checks["distance"] = anno_dict["raw_distance"] > -query["distance"][0] and anno_dict["raw_distance"] < query["distance"][1] + else: + checks["distance"] = anno_dict["raw_distance"] > -query["distance"][1] and anno_dict["raw_distance"] < query["distance"][0] - #Check distance (Distance can still be valid if PeakInsideFeature/FeatureInsidePeak and internals flag is set) + #Check distance (Distance can still be valid if PeakInsideFeature/FeatureInsidePeak and internals flag is set) + if "internals" in query: max_overlap = max(anno_dict["feat_ovl_peak"], anno_dict["peak_ovl_feat"]) checks["distance"] = checks["distance"] or (query["internals"]*1.0 > 0 and max_overlap >= query["internals"]*1.0) #if internals is set to more than 0 overlap - #Filter on relative location + #Filter on relative location + if "relative_location" in query: checks["relative_location"] = anno_dict["relative_location"] in query["relative_location"] - - #Filter on attribute if any was set - if query["filter_attribute"] != None: - checks["attribute"] = anno_dict["feat_attributes"].get(query["filter_attribute"], None) in query["attribute_values"] + + #Filter on attribute if any was set + if "filter_attribute" in query and "attribute_values" in query: #query["filter_attribute"] != None: + checks["attribute"] = anno_dict["feat_attributes"].get(query["filter_attribute"], None) in query["attribute_values"] + + ##### All checks are done -> establish if hit is a valid annotation ##### + valid = sum(checks.values()) == len(checks.values()) #all checks must be valid + if valid: + valid_annotations.append(anno_dict.copy()) + logger.debug("Validity: {0} | Checks: {1} | Annotation dict: {2}".format(valid, checks, {key:anno_dict[key] for key in anno_dict})) + + end = datetime.datetime.now() + logger.debug("Validated hits in {0}".format(end-begin)) - ##### All checks are done -> establish if hit is a valid annotation ##### - logger.debug("Query {0} | Checks: {1} | Annotation: {2}".format(query_i+1, checks, {key:anno_dict[key] for key in anno_dict if key != "feat_attributes"})) + #All tabix hits for this query were checked - if priority, stop searching if any valid hit was found -> else, check next query + stop_searching = (len(valid_annotations) > 0 and cfg_dict["priority"]) or stop_searching #or if stop_searching was already set previously - valid = sum(checks.values()) == len(checks.values()) #all checks must be valid - if valid: - valid_annotations.append(anno_dict.copy()) + if stop_searching == True: + logger.debug("{0} valid hit(s) were found for query_i = {1} and priority is true - stopping search.".format(len(valid_annotations), query_i)) + else: + logger.debug("A total of {0} valid hits were found. 
Incrementing query_i.".format(len(valid_annotations), query_i)) + + logger.debug("") #create empty line in debugger output for easy overview - #Stop searching for queries fitting to hits if a hit was found and priority is true - stop_searching = (valid and cfg_dict["priority"]) - - #All tabix hits hits were checked - stop_searching = True - - #After all hits have been checked; make final checks and set best flag + #After all hits have been checked for peak; make final checks and set best flag if len(valid_annotations) > 0: #If priority == True, find the highest ranked annotations for this peak - if cfg_dict["priority"] == True: - highest_priority_query = min([anno_dict["query"] for anno_dict in valid_annotations]) - logger.debug("Highest priority hit for {0} was {1}".format(peak, highest_priority_query)) - valid_annotations = [anno_dict for anno_dict in valid_annotations if anno_dict["query"] == highest_priority_query] + #if cfg_dict["priority"] == True: + # highest_priority_query = min([anno_dict["query"] for anno_dict in valid_annotations]) + # #logger.debug("Highest priority hit for {0} was {1}".format(peak, highest_priority_query)) + # valid_annotations = [anno_dict for anno_dict in valid_annotations if anno_dict["query"] == highest_priority_query] distances = [anno_dict["distance"] for anno_dict in valid_annotations] valid_annotations[distances.index(min(distances))]["best_hit"] = 1 else: - valid_annotations = [create_anno_dict(peak)] #empty hit - valid_annotations[0]["best_hit"] = 1 #the empty hit is the best hit + valid_annotations = [peak] + valid_annotations[0]["best_hit"] = 1 #the empty hit for the peak is the best hit #Add result to all the valid annotations all_valid_annotations.extend(valid_annotations) diff --git a/uropa/uropa.py b/uropa/uropa.py index 8c33e90..cebcc85 100644 --- a/uropa/uropa.py +++ b/uropa/uropa.py @@ -18,11 +18,11 @@ import time import subprocess import gzip +import copy import logging import multiprocessing as mp import pysam -import pandas as pd #Import internal functions from uropa.utils import * @@ -41,6 +41,7 @@ def main(): #################################################### INPUT ################################################# ############################################################################################################ + start_time = datetime.datetime.now() cmd = " ".join(sys.argv) #----------------------------------------------------------------------------------------------------------# @@ -58,14 +59,15 @@ def main(): one_query.add_argument("-b", "--bed", metavar="", help="Filename of .bed-file to annotate", action="store") one_query.add_argument("-g", "--gtf", metavar="", help="Filename of .gtf-file with features", action="store") one_query.add_argument("--feature", help="Feature for annotation", metavar="", nargs="*", default=[]) - one_query.add_argument("--feature_anchor", help="Feature anchor to annotate to", metavar="", nargs="*", default=["start", "center", "end"]) + one_query.add_argument("--feature_anchor", help="Specific feature anchor to annotate to", metavar="", choices=["start", "center", "end"], nargs="*", default=[]) one_query.add_argument("--distance", help="Maximum permitted distance from feature (1 or 2 arguments)", metavar="", nargs="*", type=int, default=[1000,10000]) - one_query.add_argument("--strand", metavar="", help="Desired strand of annotated feature relative to peak", nargs="*", choices=['ignore', 'same', 'opposite'], default='ignore') - one_query.add_argument("--relative_location", metavar="", 
help="Peak locaion relative to feature location", nargs="*", choices=["PeakInsideFeature", "FeatureInsidePeak", "Upstream", "Downstream", "OverlapStart", "OverlapEnd"], default=["PeakInsideFeature", "FeatureInsidePeak", "Upstream", "Downstream", "OverlapStart", "OverlapEnd"]) + one_query.add_argument("--strand", metavar="", help="Desired strand of annotated feature relative to peak", choices=['ignore', 'same', 'opposite'], default='ignore') + one_query.add_argument("--relative_location", metavar="", help="Peak location relative to feature location", nargs="*", choices=["PeakInsideFeature", "FeatureInsidePeak", "Upstream", "Downstream", "OverlapStart", "OverlapEnd"], default=[]) one_query.add_argument("--internals", metavar="", help="Set minimum overlap fraction for internal feature annotations. 0 equates to internals=False and 1 equates to internals=True. Default is False.", type=lambda x: restricted_float(x, 0, 1), default=False) - one_query.add_argument("--filter_attribute",metavar="", help="Filter on 9th column of GTF", default=None) + one_query.add_argument("--filter_attribute",metavar="", help="Filter on 9th column of GTF", default="") one_query.add_argument("--attribute_values", help="Value(s) of attribute corresponding to --filter_attribute", nargs="*", metavar="", default=[]) one_query.add_argument("--show_attributes", help="A list of attributes to show in output", metavar="", nargs="*", default=[]) + one_query.add_argument("--priority", help="argparse.SUPPRESS", action="store_true", default=False) #Or configuration arguments for multiple queries (overwrites) multi_query = parser.add_argument_group("Multi-query configuration file") @@ -76,9 +78,8 @@ def main(): additional.add_argument("-p", "--prefix", metavar="", help="Prefix for result file names (defaults to basename of .bed-file)") additional.add_argument("-o", "--outdir", metavar="", help="Output directory for output files (default: current dir)", default=".") #additional.add_argument("-r","--reformat", help="create an additional compact and line-reduced table as result file", action="store_true") - additional.add_argument("-s","--summary", help="Filename of additional visualisation of results in graphical format", action="store_true") + additional.add_argument("-s","--summary", help="Create additional visualisation of results in graphical format", action="store_true") additional.add_argument("-t","--threads", help="Multiprocessed run: n = number of threads to run annotation process", type=int, action="store", metavar="n", default=1) - #additional.add_argument("--add-comments",help="add comment lines to output files", action="store_true") additional.add_argument("-l","--log", help="Log file name for messages and warnings (default: log is written to stdout)", action="store", metavar="uropa.log") additional.add_argument("-d","--debug",help="Print verbose messages (for debugging)", action="store_true") additional.add_argument("-v","--version", help="Prints the version and exits", action="version", version="%(prog)s " + VERSION) @@ -153,7 +154,7 @@ def main(): logger.info("Reading configuration from commandline/input config") #First, fill in parameters from commandline - default_query = {"feature":args.feature, + cmd_query = {"feature":args.feature, "feature_anchor":args.feature_anchor, "distance": [args.distance[0], args.distance[0]] if len(args.distance) == 1 else args.distance, "strand": args.strand, @@ -163,16 +164,22 @@ def main(): "attribute_values": args.attribute_values, } + valid_query_keys = set(list(cmd_query.keys()) + 
["name"]) + #create cfg_dict like it would have been parsed from config .json - cfg_dict = {"queries": [default_query], + cfg_dict = {"queries": [cmd_query], "show_attributes": args.show_attributes, - "priority": False, + "priority": args.priority, "gtf": args.gtf, - "bed": args.bed + "bed": args.bed, + "prefix": args.prefix, + "outdir": args.outdir, + "threads": args.threads } logger.debug("Config from command-line arguments: {0}".format(cfg_dict)) + #### Read from config file #Next, overwrite with config arguments if given, otherwise the arguments fall back to commandline default config = args.input if config != None: @@ -180,7 +187,7 @@ def main(): json_cfg_dict = parse_json(config) logger.debug("Config from json: {0}".format(json_cfg_dict)) for key in json_cfg_dict: - cfg_dict[key] = json_cfg_dict[key] #config values always win over commandline + cfg_dict[key] = json_cfg_dict[key] #config values always win over commandline input except IOError: logger.error("File %s does not exists or is not readable.", config) sys.exit() @@ -188,26 +195,25 @@ def main(): logger.error("File %s contains malformed JSON. %s", config, e) sys.exit() - #Fill each key with defaults if they were not set - for query in cfg_dict["queries"]: - for key in default_query: - query[key] = query.get(key, default_query[key]) + #Set prefix if not set + if cfg_dict["prefix"] == None: + cfg_dict["prefix"] = os.path.splitext(os.path.basename(cfg_dict["bed"]))[0] + output_prefix = os.path.join(cfg_dict["outdir"], cfg_dict["prefix"]) + logger.debug("Output_prefix set to: {0}".format(output_prefix)) - #Format keys in cfg_dict - cfg_dict = format_config(cfg_dict) + #Format keys in cfg_dict and exit if error + cfg_dict = format_config(cfg_dict, logger) logger.debug("Formatted config: {0}".format(cfg_dict)) - #Check for general input in queries, such as no show_attributes - if len(cfg_dict["show_attributes"]) == 0: - logger.warning("No show_attributes given - no attributes for annotations are displayed in output.") - - #catch duplicates in query names - query_names = [query["name"] for query in cfg_dict["queries"]] - if len(query_names) != len(set(query_names)): - logger.warning("Duplicates in query names: {0}".format(query_names)) - + #Write out formatted config file + cfg_dict_filled = copy.deepcopy(cfg_dict) + for query in cfg_dict_filled["queries"]: + query["relative_location"] = query.get("relative_location", ["PeakInsideFeature", "FeatureInsidePeak", "Upstream", "Downstream", "OverlapStart", "OverlapEnd"]) + query["strand"] = query.get("strand", 'ignore') + query["feature_anchor"] = query.get("feature_anchor", ["start", "center", "end"]) + #----------------------------------------------------------------------------------------------------------# - # Validate gtf / bed input + # Validate existance of gtf / bed input and writability of output #----------------------------------------------------------------------------------------------------------# #Check if bed & gtf files exists @@ -218,13 +224,6 @@ def main(): logger.error("No .{0}-file given as input - please check that a .{0}-file is given either via the commandline option --{0} or in the configuration file.".format(key)) sys.exit() - #Check prefix - if args.prefix == None: - args.prefix = os.path.splitext(os.path.basename(cfg_dict["bed"]))[0] - - output_prefix = os.path.join(args.outdir, args.prefix) - logger.debug("Output_prefix set to: {0}".format(output_prefix)) - #Check whether output files can be written output_files = [os.path.join(output_prefix + suffix) for 
suffix in ["_allhits.txt", "_allhits.bed", "_finalhits.txt", "_finalhits.bed"]] for f in output_files: @@ -247,22 +246,17 @@ def main(): check_bed_format(cfg_dict["bed"], logger) gtf_has_chr = check_chr(cfg_dict["gtf"]) #True/False - peaks = parse_bedfile(cfg_dict["bed"], gtf_has_chr) + peaks = parse_bedfile(cfg_dict["bed"], gtf_has_chr) #list of peak-dictionaries logger.debug("Read {0} peaks from {1}".format(len(peaks), cfg_dict["bed"])) #Establish order of peaks internal_peak_ids = [peak["internal_peak_id"] for peak in peaks] - #Split bed into chunks - n_chunks = 100 - peak_chunks = [peaks[i::n_chunks] for i in range(n_chunks)] - - #----------------------------------------------------------------------------------------------------------# # Prepare GTF and extract chosen features to a subset gtf-file if needed #----------------------------------------------------------------------------------------------------------# - logger.info("Preparing .gtf for fast access") + logger.info("Preparing .gtf-file for fast access") #Check all possible features in gtf logger.debug("Finding all possible features in gtf") @@ -277,7 +271,6 @@ def main(): sys.exit() feature = columns[2] - if feature not in gtf_feat_count: gtf_feat_count[feature] = 0 gtf_feat_count[feature] += 1 @@ -303,37 +296,58 @@ def main(): #Subset gtf if needed logger.debug("Subsetting gtf if needed") - gtf_specific = list(set(gtf_feat) - set(query_feat)) #features specific for gtf but which are not taken into account in queries + gtf_specific = list(set(gtf_feat) - set(query_feat)) #features in gtf which are not taken into account in queries if len(gtf_specific) > 0: sub_gtf = output_prefix + "_feature_subset.gtf" logger.debug("Subsetting {0} -> {1} with features {2}".format(cfg_dict["gtf"], sub_gtf, query_feat)) - subset_gtf(cfg_dict["gtf"], gtf_feat, sub_gtf) + subset_gtf(cfg_dict["gtf"], query_feat, sub_gtf) anno_gtf = sub_gtf temp_files.append(sub_gtf) else: anno_gtf = cfg_dict["gtf"] - #Read in and sort gtf file - logger.debug("Sorting gtf") - anno_gtf_sorted = output_prefix + "_sorted.gtf" - temp_files.append(anno_gtf_sorted) - sort_call = "sort -k1,1 -k4,4n {0} > {1}".format(anno_gtf, anno_gtf_sorted) - try: - sub = subprocess.check_output(sort_call, shell=True) - except subprocess.CalledProcessError: - logger.error("Could not sort GTF file using command-line call: {0}".format(sort_call)) - sys.exit() - #Compress and index using gzip/tabix logger.debug("Tabix compress") - anno_gtf_gz = output_prefix + "_sorted.gtf.gz" - anno_gtf_index = anno_gtf_gz + "_sorted.tbi" - pysam.tabix_compress(anno_gtf_sorted, anno_gtf_gz, force=True) - anno_gtf_gz = pysam.tabix_index(anno_gtf_gz, index=anno_gtf_index, seq_col=0, start_col=3, end_col=4, keep_original=True, force=True) - temp_files.extend([anno_gtf_gz, anno_gtf_index]) + anno_gtf_gz = output_prefix + ".gtf.gz" + anno_gtf_index = anno_gtf_gz + ".tbi" - #Write out formatted config file - json_string = config_string(cfg_dict) + success = 0 + sort_done = 0 + while success == 0: + try: + pysam.tabix_compress(anno_gtf, anno_gtf_gz, force=True) + anno_gtf_gz = pysam.tabix_index(anno_gtf_gz, index=anno_gtf_index, seq_col=0, start_col=3, end_col=4, keep_original=True, force=True, meta_char='#') + temp_files.extend([anno_gtf_gz, anno_gtf_index]) + success = 1 + if sort_done == 1: + logger.info("Sorting and indexing was successful") + + except Exception as e: + + #Exit if we already tried to sort file once + if sort_done == 1: + logger.error("Could not index .gtf-file - please check whether the 
file has the correct 9-column format.") + sys.exit() + + #Read in and sort gtf file + anno_gtf_sorted = output_prefix + "_sorted.gtf" + temp_files.append(anno_gtf_sorted) + sort_call = "grep -v \"^#\" {0} | sort -k1,1 -k4,4n > {1}".format(anno_gtf, anno_gtf_sorted) + + logger.warning("Indexing failed - the GTF is probably unsorted") + logger.warning("Attempting to sort with call: {0}".format(sort_call)) + + try: + sub = subprocess.check_output(sort_call, shell=True) + except subprocess.CalledProcessError: + logger.error("Could not sort GTF file using command-line call: {0}".format(sort_call)) + sys.exit() + + anno_gtf = anno_gtf_sorted + sort_done = 1 + + #Write config used for annotation + json_string = config_string(cfg_dict_filled) f = open(output_prefix + ".json", "w") f.write(json_string) f.close() @@ -343,10 +357,17 @@ def main(): ############################################################################################################ logger.info("Started annotation") + threads = int(cfg_dict["threads"]) + + #Split bed into chunks + n_chunks = 100 + chunk_size = int(np.ceil(len(peaks)/n_chunks)) + peak_chunks = [peaks[i:i+chunk_size] for i in range(0, len(peaks), chunk_size)] n_chunks = len(peak_chunks) - if args.threads > 1: - pool = mp.Pool(args.threads) + if cfg_dict["threads"] > 1: + + pool = mp.Pool(threads) task_list = [pool.apply_async(annotate_peaks, args=(chunk, anno_gtf_gz, anno_gtf_index, cfg_dict, )) for chunk in peak_chunks] pool.close() #done sending jobs to pool @@ -367,81 +388,89 @@ def main(): else: results = [] + logger.info("Progress: {0:.0f}%".format(0/float(n_chunks)*100)) for i, chunk in enumerate(peak_chunks): + logger.debug("Chunk {0}".format(i+1)) + logger.debug("First peak: {0}".format(chunk[0])) + logger.debug("Last peak: {0}".format(chunk[-1])) results.append(annotate_peaks(chunk, anno_gtf_gz, anno_gtf_index, cfg_dict, logger)) logger.info("Progress: {0:.0f}%".format((i+1)/float(n_chunks)*100)) - #Join results from threads to one dictionary + #Join results from threads to one list all_annotations = sum(results, []) - ############################################################################################################ ################################################ POSTPROCESSING ############################################ ############################################################################################################ logger.info("Processing annotated peaks") - #Add attribute columns + #Add attribute columns to output #The keys are different internally vs. 
the output columns - attribute_columns = cfg_dict["show_attributes"] - main = ["peak_chr", "peak_start", "peak_end", "peak_id", "feature", "feat_start", "feat_end", "feat_strand", "feat_anchor", "distance", "relative_location", "feat_ovl_peak", "peak_ovl_feat"] - header_internal = main + ["show_" + col for col in attribute_columns] + ["query_name"] + attribute_columns = cfg_dict.get("show_attributes", []) + main = ["peak_chr", "peak_start", "peak_end", "peak_id", "peak_score", "peak_strand", "feature", "feat_start", "feat_end", "feat_strand", "feat_anchor", "distance", "relative_location", "feat_ovl_peak", "peak_ovl_feat"] + header_internal = main + ["attribute_" + col for col in attribute_columns] + ["query_name"] header_output = main + attribute_columns + ["name"] + logger.debug("Adding attribute columns") for annotation in all_annotations: - attributes_dict = annotation["feat_attributes"] - - if attributes_dict is None: - attributes_dict = {} - - if type(attributes_dict) == dict: - annotation.update({"show_" + key: attributes_dict.get(key, None) for key in attribute_columns}) + attributes_dict = annotation.get("feat_attributes", {}) + for key in attribute_columns: + annotation["attribute_" + key] = attributes_dict.get(key, "NA") #Check if no annotations were found all_NA = 0 for anno in all_annotations: - if anno["feature"] != None: + if "feature" in anno: all_NA = 1 - if all_NA == 0: #This is 0 coming out of the loop if no features were found logger.warning("No annotations were found for input regions (all hits are NA). If this is unexpected, please check the configuration of your input queries.") + ##### Write output files ##### - all_hits = pd.DataFrame(all_annotations) - - #Sort on peak_ids - all_hits["original_order"] = [internal_peak_ids.index(peak_id) for peak_id in all_hits["internal_peak_id"]] #map position - all_hits.sort_values(by=["original_order", "feat_start"], inplace=True) + logger.info("Writing output files") + + #Make list of all hits in right order + all_hits_sorted = sorted(all_annotations, key= lambda d: (d["internal_peak_id"], d.get("feat_start", 0))) #use get because not all hits have feat_start + + header_str = "\t".join(header_output) + "\n" + allhits_str = "\n".join(["\t".join([str(hit.get(key, "NA")) for key in header_internal]) for hit in all_hits_sorted]) + "\n" + besthits_str = "\n".join(["\t".join([str(hit.get(key, "NA")) for key in header_internal]) for hit in all_hits_sorted if hit["best_hit"] == 1]) + "\n" #All hits logger.debug("Writing _allhits.txt") - all_hits.to_csv(os.path.join(output_prefix + "_allhits.txt"), na_rep="NA", sep="\t", index=False, columns=header_internal, header=header_output) - all_hits.to_csv(os.path.join(output_prefix + "_allhits.bed"), na_rep="NA", sep="\t", index=False, columns=header_internal, header=False) + with open(os.path.join(output_prefix + "_allhits.txt"), "w") as f: + f.write(header_str + allhits_str) + with open(os.path.join(output_prefix + "_allhits.bed"), "w") as f: + f.write(allhits_str) #Best hits - best_hits = all_hits.loc[all_hits["best_hit"] == 1] - best_hits.to_csv(os.path.join(output_prefix + "_finalhits.txt"), na_rep="NA", sep="\t", index=False, columns=header_internal, header=header_output) - best_hits.to_csv(os.path.join(output_prefix + "_finalhits.bed"), na_rep="NA", sep="\t", index=False, columns=header_internal, header=False) + logger.debug("Writing _besthits.txt") + with open(os.path.join(output_prefix + "_finalhits.txt"), "w") as f: + f.write(header_str + besthits_str) + with 
open(os.path.join(output_prefix + "_finalhits.bed"), "w") as f: + f.write(besthits_str) ##### Visual summary ##### - logger.info("Creating the Summary graphs of the results...") - summary_script = "uropa_summary.R" - summary_output = output_prefix + "_summary.pdf" - - #cmd is the command-line call str - call = [summary_script, "-f", os.path.join(output_prefix + "_finalhits.txt"), "-c", output_prefix + ".json", "-o", summary_output, "-b", os.path.join(output_prefix + "_allhits.txt"), "-a \'", cmd, "\'"] - call_str = ' '.join(call) - - try: - logger.debug('Summary output call is {}'.format(call_str)) - sum_pr = subprocess.check_output(call_str, shell=True) - except subprocess.CalledProcessError: - logger.warning("Visualized summary output could not be created from: %s", call_str) - except OSError: - logger.warning("Rscript command not available for summary output.") + if args.summary: + logger.info("Creating the Summary graphs of the results...") + summary_script = "uropa_summary.R" + summary_output = output_prefix + "_summary.pdf" + + #cmd is the command-line call str + call = [summary_script, "-f", os.path.join(output_prefix + "_finalhits.txt"), "-c", output_prefix + ".json", "-o", summary_output, "-b", os.path.join(output_prefix + "_allhits.txt"), "-a \'", cmd, "\'"] + call_str = ' '.join(call) + + try: + logger.debug('Summary output call is {}'.format(call_str)) + sum_pr = subprocess.check_output(call_str, shell=True) + except subprocess.CalledProcessError: + logger.warning("Visualized summary output could not be created from: %s", call_str) + except OSError: + logger.warning("Rscript command not available for summary output.") ##### Cleanup ##### - logger.info("Cleaning up temporary files.") + logger.info("Cleaning up temporary files") if args.debug == False: for f in temp_files: try: @@ -449,4 +478,6 @@ def main(): except: logger.warning("Could not remove temporary file {0}".format(f)) - logger.info("UROPA run ended successfully!") + end_time = datetime.datetime.now() + total_time = end_time - start_time + logger.info("UROPA run finished in {0}!".format(str(total_time).split('.', 2)[0])) diff --git a/uropa/utils.py b/uropa/utils.py index 46421d0..4b9867e 100644 --- a/uropa/utils.py +++ b/uropa/utils.py @@ -69,92 +69,179 @@ def parse_json(infile): return ast.literal_eval(json.dumps(json.load(f))) -def format_config(cfg_dict): +def format_config(cfg_dict, logger): """ Format input config to catch any errors """ - convert_values = {value: True for value in [True, 'T', 'True', 'TRUE', 'Yes', 'YES', 'Y', 'yes']} - convert_values.update({value: False for value in [False, 'F', 'False', 'FALSE', 'No', 'NO', 'N', 'no']}) query_prefix = "query_" + case_conversion = {"peakinsidefeature": "PeakInsideFeature", "featureinsidepeak": "FeatureInsidePeak", + "upstream": "Upstream", "downstream":"Downstream", "overlapstart":"OverlapStart", "overlapend": "OverlapEnd"} + convert_values = {value: True for value in [True, 't', 'true', 'yes', 'y']} + convert_values.update({value: False for value in [False, 'f', 'false', 'no', 'n']}) + + + ###################### Upper level config keys #################### + #Check existance of upper-level keys + valid_config_keys = set(["queries", "priority", "show_attributes", "gtf", "bed", "prefix", "outdir", "threads"]) + given_config_keys = set(cfg_dict.keys()) + invalid_config_keys = given_config_keys - valid_config_keys + if len(invalid_config_keys) > 0: + logger.error("Error in config file: Key(s) {0} not known to UROPA.".format(invalid_config_keys)) + sys.exit() + + 
#Delete empty keys + for key in list(cfg_dict.keys()): + if is_empty(cfg_dict[key]): + del cfg_dict[key] - #Set upper-level keys - cfg_dict["priority"] = convert_values[cfg_dict.get("priority", False)] + #Check format of values + if "show_attributes" not in cfg_dict: + cfg_dict["show_attributes"] = [] + cfg_dict["priority"] = convert_values[str(cfg_dict.get("priority", False)).lower()] #per default false + + ####################### Query level keys ########################## + #Convert keys for backwards compatibility {old:new, (...)} + conversion = {"show.attributes":"show_attributes", "feature.anchor": "feature_anchor", "filter.attribute":"filter_attribute", + "attribute.values": "attribute_values", "attribute.value": "attribute_values", "attribute_value": "attribute_values", + "direction": "relative_location"} + valid_query_keys = set(["feature", "feature_anchor", "distance", "strand", "relative_location", "internals", "filter_attribute", "attribute_values", "name"]) + + #Check and fill queries with needed defaults such as distance for i, query in enumerate(cfg_dict["queries"]): - #Feature - if type(query.get("feature", [])) != list: - query["feature"] = [query["feature"]] + #Convert keys for backwards compatibility + for key in list(query.keys()): + if key in conversion: + query[conversion[key]] = query[key] + del query[key] + + #Move show_attributes to upper_level if given + if "show_attributes" in query: + if type(query["show_attributes"]) == list: + for att in query["show_attributes"]: + if att not in cfg_dict["show_attributes"]: + cfg_dict["show_attributes"].append(att) + else: + if query["show_attributes"] not in cfg_dict["show_attributes"]: + cfg_dict["show_attributes"].append(query["show_attributes"]) + del query["show_attributes"] + + #Check if there are attributes in query which are not in the "allowed" values + query_keys = set(query.keys()) + invalid_keys = query_keys - valid_query_keys + if len(invalid_keys) > 0: + logger.error("Error in query {0}. Key(s) {1} are not known to UROPA.".format(i+1, invalid_keys)) + sys.exit() - if type(query.get("feature_anchor", [])) != list: - query["feature_anchor"] = [query["feature_anchor"]] + #Delete empty keys + for key in list(query.keys()): + if is_empty(query[key]): + del query[key] - if type(query.get("attribute_values", [])) != list: - query["attribute_values"] = [query["attribute_values"]] + # If filter_attribute is set, attribute_values has to be set as well + if ("filter_attribute" in query and "attribute_values" not in query) or ("attribute_values" in query and "filter_attribute" not in query): + logger.error("Error in query {0}. 
Keys for filter_attribute/attribute_values have to be set together.".format(i+1)) + sys.exit() + + ##### Check content of individual keys ##### + if "distance" in query: + try: + if type(query["distance"]) == list: + query["distance"] = [int(query["distance"][0]), int(query["distance"][-1])] + else: + query["distance"] = [int(query["distance"]), int(query["distance"])] #same distance in both positions + except: + logger.error("Error trying to convert distance \"{0}\" to integer - please check the format of this value.".format(query["distance"])) + sys.exit() + else: + logger.error("No 'distance' not given in query: {0}".format(query)) + sys.exit() #Internals internalval = query.get("internals", False) - query["internals"] = float(convert_values.get(internalval, internalval)) #if internals is a float, the value will not be converted + query["internals"] = float(convert_values.get(str(internalval).lower(), internalval)) #if internals is a float, the value will not be converted + if query["internals"] == 0: + del query["internals"] + + #Keys which should be lists (sets) to search in + for key in ["feature", "feature_anchor", "attribute_values", "relative_location"]: + if type(query.get(key, [])) != list: + query[key] = set([query[key]]) + + #Check the default values of input + if "strand" in query: + query["strand"] = query["strand"].lower() + valid = set(['ignore', 'same', 'opposite']) + if query["strand"] not in valid: + logger.error("Invalid strand ({0}) set in query {1}. Valid options are: {2}".format(query["strand"], i+1, valid)) + sys.exit() - if "distance" in query: - if type(query["distance"]) == list: - query["distance"] = [int(query["distance"][0]), int(query["distance"][1])] - else: - query["distance"] = [int(query["distance"]), int(query["distance"])] #same distance in both positions + if "feature_anchor" in query: + query["feature_anchor"] = set([element.lower() for element in query["feature_anchor"]]) + valid = set(["start", "center", "end"]) + invalid = query["feature_anchor"] - valid + if len(invalid) > 0: + logger.error("Invalid feature_anchor ({0}) set in query {1}. Valid options are: {2}".format(invalid, i+1, valid)) + sys.exit() + + if "relative_location" in query: + query["relative_location"] = set([case_conversion.get(str(element).lower(), element) for element in query["relative_location"]]) + valid = set(["PeakInsideFeature", "FeatureInsidePeak", "Upstream", "Downstream", "OverlapStart", "OverlapEnd"]) + invalid = query["relative_location"] - valid + if len(invalid) > 0: + logger.error("Invalid relative_location ({0}) set in query {1}. 
Valid options are: {3}".format(invalid, i+1, valid)) + sys.exit() #Name the query if it was not already named if "name" not in query: query["name"] = query_prefix + str(i + 1) + ############## Warnings related to config/queries ############# + #Check for general input in queries, such as no show_attributes + if len(cfg_dict.get("show_attributes", [])) == 0: + logger.warning("No show_attributes given in config - no attributes for annotations are displayed in output.") + + #Catch duplicates in query names + query_names = [query["name"] for query in cfg_dict["queries"]] + if len(query_names) != len(set(query_names)): + logger.warning("Duplicates in query names: {0}".format(query_names)) + return(cfg_dict) + + def config_string(cfg_dict): """ Pretty-print cfg_dict with one-line queries """ - upper_level = ["queries", "show_attributes", "priority", "gtf", "bed"] + upper_level = ["queries", "show_attributes", "priority", "gtf", "bed", "prefix", "outdir", "threads"] query_level = ["feature", "feature_anchor", "distance", "strand", "relative_location", "filter_attribute", "attribute_values", "internals", "name"] upper_lines = [] for upper_key in upper_level: - upper_str = "\"{0}\":".format(upper_key) - if upper_key == "queries": + query_lines = "\"queries\":[\n" - queries = [] - for query in cfg_dict[upper_key]: - - one_query = [] - for query_key in query_level: - query_string = "" - - if query_key in query: - value = None - if type(query[query_key]) == list: - if len(query[query_key]) > 0: - value = "[" + ",".join(["\"{0}\"".format(value) for value in query[query_key]]) + "]" - else: - if query[query_key] != None: - value = "\"{0}\"".format(query[query_key]) - - if value is not None: - query_string += "\"{0}\": {1}".format(query_key, value) - one_query.append(query_string) - - querystr = " {" + ", ".join(one_query) + "}" - queries.append(querystr) - - upper_str += "[\n" + ",\n".join(queries) + "\n]" - - elif upper_key == "show_attributes": - upper_str += "[" + ",".join(["\"{0}\"".format(value) for value in cfg_dict[upper_key]]) + "]" + #Convert sets to lists + for query in cfg_dict["queries"]: + for key in query: + if type(query[key]) == set: + query[key] = list(query[key]) - else: - upper_str += "\"{0}\"".format(cfg_dict[upper_key]) + query_strings = [json.dumps(query, sort_keys=True) for query in cfg_dict["queries"]] + query_lines += " " + ",\n ".join(query_strings) + "\n ]" - upper_lines.append(upper_str ) + upper_lines.append(query_lines) + + elif upper_key == "show_attributes" and upper_key in cfg_dict: + upper_lines.append("\"{0}\": {1}".format(upper_key, json.dumps(cfg_dict[upper_key]))) - fstr = "{\n" + ",\n".join(upper_lines) + "\n}\n" + else: + if upper_key in cfg_dict: + upper_lines.append("\"{0}\": \"{1}\"".format(upper_key, cfg_dict[upper_key])) + + config_string = "{\n" + ",\n".join(upper_lines) + "\n}\n" - return(fstr) + return(config_string) def check_file_access(fil, logger): """ Check if file exists and can be read """ @@ -203,7 +290,7 @@ def parse_bedfile(bedfile, gtf_has_chr): if gtf_has_chr == True: gtf_chr = "chr" + chrom if "chr" not in chrom else chrom - peak_dict = {"gtf_chr": gtf_chr, "peak_chr":chrom, "peak_start":start, "peak_end":end, "peak_id":name, "peak_score":score, "strand":strand, "internal_peak_id": "peak_{0}".format(i+1)} + peak_dict = {"gtf_chr": gtf_chr, "peak_chr":chrom, "peak_start":start, "peak_end":end, "peak_id":name, "peak_score":score, "peak_strand":strand, "internal_peak_id": i+1} peak_dict.update(dict(zip(additional_header, additional))) 
peaks.append(peak_dict) @@ -230,14 +317,19 @@ def check_chr(file, lines=100): def subset_gtf(gtf, features, sub_gtf): """Removes lines with features not in features from a gtf file.""" - feat2cut = np.unique(features) + feat2cut = set(np.unique(features)) with open(gtf, "r") as f: lines = f.readlines() - gtf_query_feat = [line for line in lines if line.startswith("#") or line.split("\t")[2] in feat2cut] + gtf_query_feat = [line for line in lines if line.split("\t")[2] in feat2cut] # List of lines containing only selected features from query with open(sub_gtf, "w") as f: f.write("".join(gtf_query_feat)) - +def is_empty(value): + """ Check whether value is empty """ + if (type(value) == str and value.strip() == "") or (type(value) == list and len(value) == 0): + return(True) + else: + return(False) \ No newline at end of file diff --git a/utils/uropa_summary.R b/utils/uropa_summary.R index 701807c..519ec8b 100755 --- a/utils/uropa_summary.R +++ b/utils/uropa_summary.R @@ -135,7 +135,7 @@ features <- c() plot.new() df.uropa.final <- read.table(final.hits, header=TRUE, sep="\t",stringsAsFactors = FALSE) # number of peaks annoteted with uropa run - num.peaks <- length(unique(df.uropa.final$peak_id)) + num.peaks <- length(df.uropa.final$peak_id) # stats is based on annoted peaks -> remove na rows df.uropa.final[,"distance"] <- as.numeric(df.uropa.final[,"distance"]) df.uropa.final <- df.uropa.final[complete.cases(df.uropa.final),]
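
Note on the reworked calculate_overlap in annotation.py: the per-base set intersection is replaced by a direct interval intersection. A minimal standalone sketch of that arithmetic (the helper name is illustrative and not part of the patch):

    def overlap_bp(peak_start, peak_end, feat_start, feat_end):
        # BED excludes the first position, hence +1 on both starts; len(range(a, b))
        # is max(0, b - a), so disjoint intervals give 0 overlapping bp.
        return len(range(max(peak_start + 1, feat_start + 1), min(peak_end, feat_end) + 1))

    # Example: a peak at 100-200 overlapping a feature at 150-400 shares 50 bp,
    # i.e. half of the 100 bp peak is covered.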
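
The internal speed-up listed in CHANGES comes from fetching tabix hits once per query inside annotate_peaks and, when "priority" is true, stopping after the first query that yields valid hits. A condensed sketch of that control flow, assuming a tabix-indexed GTF and the peak/query dictionaries used above (the distance/strand/attribute validity checks are omitted for brevity; the function name is illustrative):

    import pysam

    def fetch_hits_per_query(gtf_gz, peak, queries, priority=False):
        tabix = pysam.TabixFile(gtf_gz)
        valid = []
        for query_i, query in enumerate(queries):
            max_dist = max(query["distance"])
            start = max(1, peak["peak_start"] - max_dist)
            end = peak["peak_end"] + max_dist
            try:
                hits = list(tabix.fetch(peak["peak_chr"], start, end, parser=pysam.asGTF()))
            except ValueError:  # e.g. contig not present in the index
                break
            for hit in hits:
                # wrong feature type -> skip before any distance computation
                if "feature" in query and hit.feature not in query["feature"]:
                    continue
                valid.append((query_i, hit))
            if priority and valid:
                break  # lower-priority queries are never searched
        return valid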
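
The indexing block in uropa.py now attempts to compress and index the GTF directly and only sorts it (dropping '#' comment lines) when indexing fails, retrying once. A trimmed sketch of that fallback, assuming pysam is available (file paths and the helper name are illustrative):

    import subprocess
    import pysam

    def compress_and_index(gtf, gtf_gz):
        def index(path):
            pysam.tabix_compress(path, gtf_gz, force=True)
            return pysam.tabix_index(gtf_gz, seq_col=0, start_col=3, end_col=4,
                                     keep_original=True, force=True, meta_char='#')
        try:
            return index(gtf)  # succeeds if the GTF is already coordinate-sorted
        except Exception:
            sorted_gtf = gtf + "_sorted.gtf"
            cmd = "grep -v \"^#\" {0} | sort -k1,1 -k4,4n > {1}".format(gtf, sorted_gtf)
            subprocess.check_output(cmd, shell=True)
            return index(sorted_gtf)  # a second failure propagates to the caller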